In [1]:
import boto3
import sagemaker
from sagemaker import Model
import os
import json

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
# Open a SageMaker session with the SDK's default settings.
sagemaker_session = sagemaker.Session()

# Look up the execution role of the current SageMaker environment.
role = sagemaker.get_execution_role()
# NOTE(review): the printed ARN embeds the AWS account ID; consider clearing
# this cell's output before sharing the notebook.
print(f"SageMaker Role ARN: {role}")

SageMaker Role ARN: arn:aws:iam::796932308591:role/service-role/SageMaker-ExecutionRole-20250214T145019


In [3]:
# import packages
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, cross_val_score
# from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report, roc_curve, roc_auc_score
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedKFold

from utils import feature_engineering, additional_feature_engineering
# Set the aesthetic style of the plots. A bare sns.set_style() only re-applies
# the style parameters that are already in effect, so the style is named
# explicitly here ("darkgrid" is seaborn's default theme style).
sns.set_style("darkgrid")

# Filter warning messages: every warning is silenced for the rest of the
# session to keep cell output short.
# NOTE(review): this also hides library deprecation warnings; narrow the filter
# with category=/module= if specific warnings should stay visible.
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Where the preprocessed bank data lives in S3.
bucket = 'mitrailabs-personaclassification'
model_prefix = 'risk_prediction/data/'

# Fetch the CSV object; the prefix already ends with '/', so the file name is
# appended to it directly.
s3 = boto3.client('s3')
response = s3.get_object(Bucket=bucket, Key=f"{model_prefix}preprocessed_bank_data.csv")

# Parse the streamed response body into a DataFrame and print its schema summary.
df_credit = pd.read_csv(response['Body'])
df_credit.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41741 entries, 0 to 41740
Data columns (total 27 columns):
 #   Column                                           Non-Null Count  Dtype  
---  ------                                           --------------  -----  
 0   target_default                                   41741 non-null  bool   
 1   score_1                                          41741 non-null  object 
 2   score_2                                          41741 non-null  object 
 3   score_3                                          41741 non-null  float64
 4   score_4                                          41741 non-null  float64
 5   score_5                                          41741 non-null  float64
 6   score_6                                          41741 non-null  float64
 7   risk_rate                                        41741 non-null  float64
 8   last_amount_borrowed                             41741 non-null  float64
 9   last_borrowed_in_months     

In [5]:
# Run the project helper feature_engineering (imported from utils) on the
# loaded credit frame. In the recorded run the result has 42 columns versus
# 27 in df_credit.
# NOTE(review): whether the helper modifies df_credit in place is not visible
# here; confirm in utils.
X_processed = feature_engineering(df_credit)
# Print the column / dtype summary of the engineered frame.
X_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41741 entries, 0 to 41740
Data columns (total 42 columns):
 #   Column                                           Non-Null Count  Dtype   
---  ------                                           --------------  -----   
 0   target_default                                   41741 non-null  bool    
 1   score_1                                          41741 non-null  object  
 2   score_2                                          41741 non-null  object  
 3   score_3                                          41741 non-null  float64 
 4   score_4                                          41741 non-null  float64 
 5   score_5                                          41741 non-null  float64 
 6   score_6                                          41741 non-null  float64 
 7   risk_rate                                        41741 non-null  float64 
 8   last_amount_borrowed                             41741 non-null  float64 
 9   last_borrowed_in_

In [6]:
import pickle
import joblib

# Every column that is neither float64 nor int64 (object, bool, category, ...)
# is treated as categorical and label-encoded below. In the recorded run this
# includes the boolean target column 'target_default'.
categorical_cols = X_processed.select_dtypes(exclude=['float64', 'int64']).columns.tolist()
print(categorical_cols)

# This cell overwrites X_processed in place. Running it a second time on the
# already encoded frame would typically find no categorical columns and would
# replace the saved encoders, locally and on S3, with an empty dict. Stop
# instead of silently clobbering the artifact.
if not categorical_cols:
    raise RuntimeError(
        "No categorical columns found in X_processed; it looks like it has "
        "already been label-encoded. Re-run the feature_engineering cell "
        "before running this cell again."
    )

# Fit one LabelEncoder per categorical column and keep it, keyed by column
# name, so the same mapping can be re-applied later.
label_encoders = {}
for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    X_processed[col] = label_encoders[col].fit_transform(X_processed[col])

print(label_encoders)

# Persist the fitted encoders locally; create the target directory first so
# the dump does not fail on a fresh checkout.
label_encoders_path = 'saved/label_encoders.joblib'
os.makedirs(os.path.dirname(label_encoders_path), exist_ok=True)
joblib.dump(label_encoders, label_encoders_path)

# Upload the encoder artifact to S3 next to the other intermediate states.
target_bucket = 'mitrailabs-personaclassification'
target_prefix = 'risk_prediction/Intermediate_states'
model_path = f'{target_prefix}/label_encoders.joblib'
s3.upload_file(
    label_encoders_path,
    target_bucket,
    model_path
)

['target_default', 'score_1', 'score_2', 'facebook_profile', 'state', 'real_state', 'shipping_state', 'fraud_score_bin']
{'target_default': LabelEncoder(), 'score_1': LabelEncoder(), 'score_2': LabelEncoder(), 'facebook_profile': LabelEncoder(), 'state': LabelEncoder(), 'real_state': LabelEncoder(), 'shipping_state': LabelEncoder(), 'fraud_score_bin': LabelEncoder()}


In [7]:
# Run the second project helper, additional_feature_engineering (imported from
# utils), on the label-encoded frame.
# NOTE(review): whether the helper modifies X_processed in place is not visible
# here; confirm in utils.
X_processed2 = additional_feature_engineering(X_processed)
# Bare last expression: renders the resulting frame as the cell output.
X_processed2

Unnamed: 0,target_default,score_1,score_2,score_3,score_4,score_5,score_6,risk_rate,last_amount_borrowed,last_borrowed_in_months,...,score_1_bin_x_score_2_bin,score_1_bin_x_fraud_score_bin,score_2_bin_x_fraud_score_bin,state_x_real_state,state_x_shipping_state,state_real_state_avg_score_1,state_shipping_state_avg_score_2,score_1_sq,score_2_sq,facebook_profile_sq
0,0,0,10,350.0,101.800832,0.259555,108.427273,0.40,25033.92,36.0,...,0,0,0,45,56,1.896886,17.223859,0,100,1
1,0,3,16,370.0,97.062615,0.942655,92.002546,0.24,0.00,0.0,...,0,0,0,48,67,1.868128,17.316279,9,256,0
2,1,3,9,360.0,100.027073,0.351918,112.892453,0.29,7207.92,36.0,...,0,0,0,35,51,1.832704,17.004016,9,81,0
3,0,0,21,510.0,101.599485,0.987673,94.902491,0.32,0.00,0.0,...,0,0,0,26,40,1.805825,17.203866,0,441,0
4,0,2,1,500.0,98.474289,0.532539,118.126207,0.18,0.00,0.0,...,0,0,0,50,59,1.946789,17.227197,4,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41736,0,3,16,280.0,96.379531,0.416693,103.667082,0.17,14766.42,36.0,...,0,0,0,47,54,1.882734,16.993333,9,256,0
41737,0,6,31,370.0,96.124977,0.692196,97.977973,0.27,0.00,0.0,...,0,0,0,37,54,2.029155,16.993333,36,961,0
41738,0,4,24,280.0,102.377780,0.530938,93.687747,0.30,0.00,0.0,...,0,0,0,51,70,1.891652,17.386712,16,576,0
41739,1,6,5,240.0,100.476090,0.214697,86.759074,0.37,0.00,0.0,...,0,0,0,48,70,1.868128,17.386712,36,25,1


In [8]:

# Save the feature-engineered frame locally and upload it to S3.
#
# The frame written here is X_processed2 (the output of
# additional_feature_engineering), because the S3 object is named
# feature_engineered_data.csv. Writing df_credit instead would upload the
# loaded input data under that name.
# The variable name preprocessed_path is kept in case later cells refer to it.
preprocessed_path = 'saved/feature_engineered_data.csv'
os.makedirs(os.path.dirname(preprocessed_path), exist_ok=True)  # 'saved/' may not exist yet
X_processed2.to_csv(preprocessed_path, index=False)

# target_bucket is defined in the label-encoder cell above.
target_prefix = 'risk_prediction/data'
model_path = f'{target_prefix}/feature_engineered_data.csv'
s3.upload_file(
    preprocessed_path,
    target_bucket,
    model_path
)
