In [29]:
import sagemaker 
from sklearn.model_selection import train_test_split
import boto3
import pandas as pd 

# Specify your AWS region
region = "us-east-1"

sm_boto3= boto3.client("sagemaker")
# Create SageMaker session with the specified region
sess = sagemaker.Session(boto_session=boto3.Session(region_name=region))

# bucket name
bucket = 'healthcarefrauddetection'
print("Using Bucket: " + bucket)



INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


Using Bucket: healthcarefrauddetection


In [2]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [3]:
# let's load the dataset into the dataframe 

df = pd.read_csv(r"insuranceFraud.csv")


In [4]:
# There are only 91 values that are missing in the 'authorities contacted column', since it is a categorical column , we will remove this feature from our dataset
df = df.drop('authorities_contacted',axis=1)

In [5]:
# Let's count the classes of the dependent variable 
count_classes = pd.value_counts(df['fraud_reported'], sort= True)

# Let's now visylize the count_classes

fig = px.bar(count_classes, x= count_classes.index, y= count_classes.values)

fig.update_layout(xaxis=dict(
    tickvals=[0, 1],
    ticktext=["Normal", "Fraud"]
), 
title = "Class Distribution",
yaxis_title="Frequency"
)
fig.show()

In [6]:
# Here we can use SMOTE technique to upsample the fraud class in order to rectify the problem of imbalanced dataset but for now we are making this project very simple 

In [7]:
import sagemaker

print(sagemaker.__file__)

d:\Projects and Work\Projects\ML PROJECTS\Insurance Fraud detection\virtual\Lib\site-packages\sagemaker\__init__.py


In [8]:
# let's drop the unique policy number and the policy_number column
df = df.drop(['policy_number'],axis=1)

In [9]:
## let's segregate the dependent and the independent variable 
X = df.drop('fraud_reported',axis=1)
y= df['fraud_reported']

In [10]:
# Creating categorical and numerical columns 
categorical_cols= X.select_dtypes(include="object").columns
numerical_cols= X.select_dtypes(exclude="object").columns

In [11]:
categorical_cols

Index(['policy_bind_date', 'policy_state', 'policy_csl', 'insured_sex',
       'insured_education_level', 'insured_occupation', 'insured_hobbies',
       'insured_relationship', 'incident_date', 'incident_type',
       'collision_type', 'incident_severity', 'incident_state',
       'incident_city', 'incident_location', 'property_damage',
       'police_report_available', 'auto_make', 'auto_model'],
      dtype='object')

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [13]:
# Encode the categorical columns
encoder = LabelEncoder()

for cols in categorical_cols:
    X[cols] = encoder.fit_transform(X[cols])

In [14]:
X.head()

Unnamed: 0,months_as_customer,age,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,insured_sex,...,bodily_injuries,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year
0,328,48,99,2,1,1000,1406.91,0,466132,1,...,1,2,2,71610,6510,13020,52080,10,1,2004
1,228,42,684,1,1,2000,1197.22,5000000,468176,1,...,0,0,0,5070,780,780,3510,8,12,2007
2,134,29,945,2,0,2000,1413.14,5000000,430632,0,...,2,3,1,34650,7700,3850,23100,4,30,2007
3,256,41,604,0,1,2000,1415.74,6000000,608117,0,...,1,2,1,63400,6340,6340,50720,3,34,2014
4,228,44,705,0,2,1000,1583.91,6000000,610706,1,...,0,1,1,6500,1300,650,4550,0,31,2009


In [15]:
# let's split the data into train -test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
y_train

29     N
535    Y
695    N
557    N
836    N
      ..
106    Y
270    N
860    N
435    N
102    N
Name: fraud_reported, Length: 800, dtype: object

In [17]:
X_train 

Unnamed: 0,months_as_customer,age,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,insured_sex,...,bodily_injuries,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year
29,241,45,476,0,2,2000,1104.50,0,432211,0,...,2,2,1,91650,14100,14100,63450,0,33,2011
535,65,23,247,2,1,1000,1099.95,0,473109,0,...,1,0,2,52400,6550,6550,39300,0,23,2005
695,289,45,131,2,1,2000,1221.41,0,466289,0,...,2,1,1,2700,300,300,2100,6,6,2006
557,63,26,87,0,2,500,1500.04,6000000,613826,1,...,0,2,2,5160,860,860,3440,0,33,2004
836,257,43,590,1,0,1000,974.84,0,468984,0,...,0,1,2,85320,21330,7110,56880,9,29,2006
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,464,61,149,2,2,1000,1125.37,0,604450,0,...,0,2,2,79800,6650,19950,53200,10,3,2000
270,369,55,507,2,1,2000,1589.54,0,444734,1,...,2,0,2,85300,17060,8530,59710,12,18,2003
860,230,42,178,1,0,1000,1023.11,0,476130,0,...,1,2,2,58850,10700,10700,37450,0,23,1999
435,102,28,849,2,1,500,1075.41,0,445648,1,...,1,0,1,73400,7340,7340,58720,4,27,1996


In [18]:
# creating new training data for the sagemaker training

trainX = pd.DataFrame(X_train)
trainX['fraud_reported']= y_train

In [19]:
testX = pd.DataFrame(X_test)
testX['fraud_reported']= y_test

In [20]:
y_test

521    N
737    N
740    N
660    N
411    N
      ..
408    N
332    N
208    N
613    N
78     N
Name: fraud_reported, Length: 200, dtype: object

In [21]:
testX['fraud_reported']

521    N
737    N
740    N
660    N
411    N
      ..
408    N
332    N
208    N
613    N
78     N
Name: fraud_reported, Length: 200, dtype: object

In [22]:
y_train

29     N
535    Y
695    N
557    N
836    N
      ..
106    Y
270    N
860    N
435    N
102    N
Name: fraud_reported, Length: 800, dtype: object

In [23]:
trainX

Unnamed: 0,months_as_customer,age,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,insured_sex,...,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported
29,241,45,476,0,2,2000,1104.50,0,432211,0,...,2,1,91650,14100,14100,63450,0,33,2011,N
535,65,23,247,2,1,1000,1099.95,0,473109,0,...,0,2,52400,6550,6550,39300,0,23,2005,Y
695,289,45,131,2,1,2000,1221.41,0,466289,0,...,1,1,2700,300,300,2100,6,6,2006,N
557,63,26,87,0,2,500,1500.04,6000000,613826,1,...,2,2,5160,860,860,3440,0,33,2004,N
836,257,43,590,1,0,1000,974.84,0,468984,0,...,1,2,85320,21330,7110,56880,9,29,2006,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,464,61,149,2,2,1000,1125.37,0,604450,0,...,2,2,79800,6650,19950,53200,10,3,2000,Y
270,369,55,507,2,1,2000,1589.54,0,444734,1,...,0,2,85300,17060,8530,59710,12,18,2003,N
860,230,42,178,1,0,1000,1023.11,0,476130,0,...,2,2,58850,10700,10700,37450,0,23,1999,N
435,102,28,849,2,1,500,1075.41,0,445648,1,...,0,1,73400,7340,7340,58720,4,27,1996,N


In [24]:
# Saving the data to be able to push these files into the s3 bucket 
trainX.to_csv("train-V-1.csv", index= False)
testX.to_csv("test-V-1.csv", index= False)

## Data Ingestion into Sagemaker using boto3


In [25]:
# sending data to S3 sagemaker will take training data from s3

# creating a prefix for the s3 bucket 
sk_prefix = "sagemaker/fraud_detection_classification/sklearcontainer"

## using the session to upload the training and test data path with the file name into s3
trainpath = sess.upload_data(
    path= "train-V-1.csv", bucket = bucket, key_prefix = sk_prefix

)

testpath = sess.upload_data(
    path = "test-V-1.csv", bucket =bucket, key_prefix = sk_prefix
)

print(trainpath)
print(testpath)

s3://healthcarefrauddetection/sagemaker/fraud_detection_classification/sklearcontainer/train-V-1.csv
s3://healthcarefrauddetection/sagemaker/fraud_detection_classification/sklearcontainer/test-V-1.csv


**We have the training and the test data added to the AWS S3 bucket**

In [26]:
%%writefile script.py
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib
import sklearn
import boto3
import pathlib
from io import StringIO
import argparse
import os
import numpy as np
import pandas as pd

def model_fn(model_dir):
    clf = joblib.load(os.path.join(model_dir, "model.joblib"))
    return clf

if __name__ == "__main__":
    print("[INFO] Extracting arguments")
    parser = argparse.ArgumentParser()

    # Hyperparameters sent by the client and are passed as command line arguments to the script.
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--random_state", type=int, default=0)

    # Data, Model and output directories
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="train-V-1.csv")
    parser.add_argument("--test-file", type=str, default="test-V-1.csv")

    args, _ = parser.parse_known_args()
    print("SKLearn Version: ", sklearn.__version__)
    print("Joblib Version: ", joblib.__version__)

    print("[INFO] Reading Data")
    print()

    # Creating the Training and Testing dataset by reading the files using the args.train and args.test values using args.train_file and args.test_file
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))
    print("Building the training and testing dataset")

    features = list(train_df.columns)
    label = features.pop(-1)

    X_train = train_df[features]
    X_test = test_df[features]
    y_train = train_df[label]
    y_test = test_df[label]

    print("Column order :")
    print(features)
    print()

    print("Label Column is: ", label)
    print()

    print("Data Shape : ")
    print()

    print("-----Shape of the Training Data----")
    print(X_train.shape)
    print(y_train.shape)
    print()

    print("-----Shape of the Testing  Data----")
    print(X_test.shape)
    print(y_test.shape)
    print()

    print("Training RandomForest Model")
    print()

    model = RandomForestClassifier(n_estimators=args.n_estimators, random_state=args.random_state)
    # model = RandomForestClassifier()
    model.fit(X_train, y_train)
    print()

    # dumping the model
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, model_path)
    print("Model persisted at" + model_path)
    print()

    y_pred_test = model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred_test)
    test_rep = classification_report(y_test, y_pred_test)

    print()
    print("-----Metrics Results for Testing Data----")
    print()

    print("Total Rows are: ", X_test.shape[0])
    print("[Testing] Model accuracy is: ", test_acc)
    print("[Testing] Testing Report is: ")
    print(test_rep)


Overwriting script.py


In [27]:
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "0.23-1"
sklearn_estimator = SKLearn(
    entry_point='script.py',
    role = "arn:aws:iam::358363118429:role/SageMakerRole",
    instance_count=1,# number of instances within the Sagemaker instance
    instance_type="ml.m5.large", # instance type within the Sagemaker instance
    framework_version= FRAMEWORK_VERSION, # framework version

    base_job_name= "RF-custom-Insurance-Fraud", # base job name
    hyperparameters={"n_estimators": 100, "random_state": 0},
    use_spot_instances=True,
    max_wait=7200,
    max_run=3600
)

In [28]:
# launch training jobs, with asynchronous execution

sklearn_estimator.fit({
    "train": trainpath, "test": testpath}, wait= True)

Using provided s3_resource


INFO:sagemaker:Creating training-job with name: RF-custom-Insurance-Fraud-2023-09-05-09-44-58-981


2023-09-05 09:45:03 Starting - Starting the training job...
2023-09-05 09:45:19 Starting - Preparing the instances for training.........
2023-09-05 09:47:03 Downloading - Downloading input data...
2023-09-05 09:47:34 Training - Downloading the training image..2023-09-05 09:48:12,327 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2023-09-05 09:48:12,331 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2023-09-05 09:48:12,378 sagemaker_sklearn_container.training INFO     Invoking user training script.
2023-09-05 09:48:12,563 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2023-09-05 09:48:12,576 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2023-09-05 09:48:12,589 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2023-09-05 09:48:12,598 sagemaker-training-toolkit INFO     Invoking user script
Training Env:
{
    

In [30]:
sklearn_estimator.latest_training_job.wait(logs= "None")
artifact = sm_boto3.describe_training_job(TrainingJobName=sklearn_estimator.latest_training_job.name)["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifact location: {}".format(artifact))


2023-09-05 09:48:30 Starting - Preparing the instances for training
2023-09-05 09:48:30 Downloading - Downloading input data
2023-09-05 09:48:30 Training - Training image download completed. Training in progress.
2023-09-05 09:48:30 Uploading - Uploading generated training model
2023-09-05 09:48:30 Completed - Training job completed
Model artifact location: s3://sagemaker-us-east-1-358363118429/RF-custom-Insurance-Fraud-2023-09-05-09-44-58-981/output/model.tar.gz


In [31]:
artifact

's3://sagemaker-us-east-1-358363118429/RF-custom-Insurance-Fraud-2023-09-05-09-44-58-981/output/model.tar.gz'

In [48]:
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

# creating new folder for deployment 
model_name ="Custom-sklearn-model-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())

model =SKLearnModel(
    name = model_name,
    model_data= artifact,
    role = "arn:aws:iam::358363118429:role/SageMakerRole",
    entry_point='script.py',
    framework_version= FRAMEWORK_VERSION


)

In [49]:
model

<sagemaker.sklearn.model.SKLearnModel at 0x284ee550910>

**Endpoint deployment**

In [50]:

endpoint_name= "Custom-sklearn-model-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print(f"EndpointName={endpoint_name}")

predictor= model.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    endpoint_name= endpoint_name

)

EndpointName=Custom-sklearn-model-2023-09-05-10-24-23


INFO:sagemaker:Creating model with name: Custom-sklearn-model-2023-09-05-10-24-08
INFO:sagemaker:Creating endpoint-config with name Custom-sklearn-model-2023-09-05-10-24-23
INFO:sagemaker:Creating endpoint with name Custom-sklearn-model-2023-09-05-10-24-23


------!

In [51]:
endpoint_name

'Custom-sklearn-model-2023-09-05-10-24-23'

In [52]:
predictor

<sagemaker.sklearn.model.SKLearnPredictor at 0x284f5fb12d0>

In [53]:
testX.head()

Unnamed: 0,months_as_customer,age,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,insured_sex,...,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported
521,5,26,762,0,1,2000,1137.02,0,468872,0,...,3,0,88110,16020,16020,56070,1,5,2003,N
737,160,33,409,0,2,1000,1422.78,0,616583,0,...,3,2,52800,5280,5280,42240,9,29,2006,N
740,385,51,742,1,1,1000,976.37,0,602842,0,...,3,0,67600,13520,6760,47320,11,21,2007,N
660,446,57,320,1,0,2000,1373.21,0,478486,1,...,0,1,62800,6280,12560,43960,7,36,2012,N
411,84,29,568,2,1,1000,1117.17,0,473645,0,...,0,2,6820,620,1240,4960,2,0,2005,N


In [54]:
features = list(testX.columns)

In [55]:
features

['months_as_customer',
 'age',
 'policy_bind_date',
 'policy_state',
 'policy_csl',
 'policy_deductable',
 'policy_annual_premium',
 'umbrella_limit',
 'insured_zip',
 'insured_sex',
 'insured_education_level',
 'insured_occupation',
 'insured_hobbies',
 'insured_relationship',
 'capital-gains',
 'capital-loss',
 'incident_date',
 'incident_type',
 'collision_type',
 'incident_severity',
 'incident_state',
 'incident_city',
 'incident_location',
 'incident_hour_of_the_day',
 'number_of_vehicles_involved',
 'property_damage',
 'bodily_injuries',
 'witnesses',
 'police_report_available',
 'total_claim_amount',
 'injury_claim',
 'property_claim',
 'vehicle_claim',
 'auto_make',
 'auto_model',
 'auto_year',
 'fraud_reported']

In [56]:
testX[features][0:2].values.tolist()

[[5,
  26,
  762,
  0,
  1,
  2000,
  1137.02,
  0,
  468872,
  0,
  6,
  4,
  16,
  1,
  31500,
  0,
  17,
  2,
  2,
  2,
  6,
  6,
  297,
  22,
  1,
  2,
  1,
  3,
  0,
  88110,
  16020,
  16020,
  56070,
  1,
  5,
  2003,
  'N'],
 [160,
  33,
  409,
  0,
  2,
  1000,
  1422.78,
  0,
  616583,
  0,
  2,
  3,
  8,
  0,
  61600,
  0,
  12,
  0,
  1,
  2,
  0,
  5,
  111,
  17,
  3,
  0,
  2,
  3,
  2,
  52800,
  5280,
  5280,
  42240,
  9,
  29,
  2006,
  'N']]

In [57]:
print(predictor.predict(testX[features][0:2].values.tolist()))

ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received server error (500) from primary with message "<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<title>500 Internal Server Error</title>
<h1>Internal Server Error</h1>
<p>The server encountered an internal error and was unable to complete your request. Either the server is overloaded or there is an error in the application.</p>
". See https://us-east-1.console.aws.amazon.com/cloudwatch/home?region=us-east-1#logEventViewer:group=/aws/sagemaker/Endpoints/Custom-sklearn-model-2023-09-05-10-24-23 in account 358363118429 for more information.

In [58]:
sm_boto3.delete_endpoint(EndpointName=endpoint_name)

{'ResponseMetadata': {'RequestId': '6276c4a7-e670-4bf6-9c71-3264d86dcd30',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '6276c4a7-e670-4bf6-9c71-3264d86dcd30',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Tue, 05 Sep 2023 10:28:49 GMT'},
  'RetryAttempts': 0}}