In [1]:
import boto3
import sagemaker
from sagemaker import Model
import os
import json

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
!pip install scikit-learn==1.5.2
# !pip install imblearn




In [3]:
# import requests

# url = "http://dl.dropboxusercontent.com/s/xn2a4kzf0zer0xu/acquisition_train.csv?dl=0"
# file_name = "acquisition_train.csv"

# response = requests.get(url)
# with open(file_name, "wb") as file:
#     file.write(response.content)

# print(f"Downloaded {file_name}")


In [4]:
# Create the SageMaker session object used by this notebook.
sagemaker_session = sagemaker.Session()

# Look up the execution role ARN for this environment and echo it so the
# reader can see which role the notebook is running under.
role = sagemaker.get_execution_role()
print(f"SageMaker Role ARN: {role}")

SageMaker Role ARN: arn:aws:iam::796932308591:role/service-role/SageMaker-ExecutionRole-20250214T145019


In [5]:
# s3 = boto3.client('s3')


# target_bucket = 'mitrailabs-personaclassification'
# target_prefix = 'risk_prediction/data'

# model_path = f'{target_prefix}/acquisition_train.csv'
# s3.upload_file(
#     'acquisition_train2.csv',
#     target_bucket,
#     model_path
# )

In [6]:
# import packages
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, cross_val_score
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report, roc_curve, roc_auc_score
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# set the aesthetic style of the plots
# NOTE(review): set_style() is called without a style name, so it presumably
# just re-applies the current defaults -- confirm whether a named style
# (e.g. "whitegrid") was intended.
sns.set_style()

# filter warning messages
# NOTE(review): this silences every warning category for the rest of the
# notebook, including deprecation/future warnings raised by pandas and sklearn.
import warnings
warnings.filterwarnings('ignore')

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [7]:
# Location of the raw training CSV in S3.
s3_path = 's3://mitrailabs-personaclassification/risk_prediction/data/acquisition_train.csv'
# Read the CSV straight from S3 into the working DataFrame used by the following cells.
df_credit = pd.read_csv(s3_path)

In [8]:
# data frame shape
# Report the dimensions of the loaded frame.
n_rows, n_cols = df_credit.shape
print('Number of rows: ', n_rows)
print('Number of columns: ', n_cols)

Number of rows:  45000
Number of columns:  43


In [9]:
# How many columns does each dtype account for?
dtype_counts = df_credit.dtypes.value_counts()

print("Number of columns by data type:")
print(dtype_counts)

# Column names grouped by dtype family (float / object / int / bool).
float_features, object_features, int_features, bool_features = (
    df_credit.select_dtypes(include=[kind]).columns
    for kind in ('float', 'object', 'int', 'bool')
)

# List the column names of every group.
for label, columns in (
    ("Float features:", float_features),
    ("Object features:", object_features),
    ("Integer features:", int_features),
    ("Bool features:", bool_features),
):
    print(label, list(columns))

Number of columns by data type:
object     21
float64    18
int64       4
Name: count, dtype: int64
Float features: ['score_3', 'score_4', 'score_5', 'score_6', 'risk_rate', 'last_amount_borrowed', 'last_borrowed_in_months', 'credit_limit', 'income', 'ok_since', 'n_bankruptcies', 'n_defaulted_loans', 'n_accounts', 'n_issues', 'external_data_provider_credit_checks_last_2_year', 'external_data_provider_credit_checks_last_year', 'external_data_provider_email_seen_before', 'reported_income']
Object features: ['ids', 'target_default', 'score_1', 'score_2', 'reason', 'facebook_profile', 'state', 'zip', 'channel', 'job_name', 'real_state', 'application_time_applied', 'email', 'external_data_provider_first_name', 'lat_lon', 'marketing_channel', 'profile_phone_number', 'shipping_state', 'profile_tags', 'user_agent', 'target_fraud']
Integer features: ['application_time_in_funnel', 'external_data_provider_credit_checks_last_month', 'external_data_provider_fraud_score', 'shipping_zip_code']
Bool features: []

In [10]:
# Discard rows that have no value in the target column.
df_credit.dropna(subset=['target_default'], inplace=True)

# Columns removed from the working frame in this step, dropped in one call.
columns_to_drop = [
    'target_fraud',
    'channel',
    'external_data_provider_credit_checks_last_2_year',
    'email',
    'reason',
    'zip',
    'job_name',
    'external_data_provider_first_name',
    'lat_lon',
    'shipping_zip_code',
    'user_agent',
    'profile_tags',
    'marketing_channel',
    'profile_phone_number',
    'application_time_applied',
    'ids',
]
df_credit.drop(columns=columns_to_drop, inplace=True)

In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) of the remaining columns.
df_credit.describe()

Unnamed: 0,score_3,score_4,score_5,score_6,risk_rate,last_amount_borrowed,last_borrowed_in_months,credit_limit,income,ok_since,n_bankruptcies,n_defaulted_loans,n_accounts,n_issues,application_time_in_funnel,external_data_provider_credit_checks_last_month,external_data_provider_credit_checks_last_year,external_data_provider_email_seen_before,external_data_provider_fraud_score,reported_income
count,41741.0,41741.0,41741.0,41741.0,41741.0,14133.0,14133.0,28632.0,41741.0,17276.0,41606.0,41729.0,41741.0,30818.0,41741.0,41741.0,27720.0,39656.0,41741.0,41741.0
mean,346.459836,100.00682,0.499416,99.919399,0.294451,13328.104095,40.58841,33877.220453,71080.12,35.192174,0.076696,0.004625,10.639108,11.023882,247.748545,1.504396,0.504185,12.731188,500.491771,inf
std,110.102271,3.183821,0.288085,10.022703,0.101561,7918.698433,9.437936,36141.985884,52259.78,21.629577,0.27482,0.080157,4.588175,4.596036,146.326172,1.114207,0.499992,125.711218,287.993121,
min,0.0,86.191572,3.5e-05,60.663039,0.0,1005.18,36.0,0.0,4821.18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-999.0,0.0,403.0
25%,270.0,97.862546,0.251595,93.182517,0.22,7210.28,36.0,9975.0,44019.58,17.0,0.0,0.0,7.0,8.0,120.0,1.0,0.0,11.0,252.0,50910.0
50%,340.0,100.01795,0.500174,99.977774,0.29,12011.05,36.0,25213.0,60044.09,32.0,0.0,0.0,10.0,10.0,248.0,2.0,1.0,27.0,502.0,101623.0
75%,420.0,102.1431,0.74763,106.630991,0.36,18030.16,36.0,46492.5,85032.89,50.0,0.0,0.0,13.0,14.0,375.0,2.0,1.0,43.0,747.0,151248.0
max,990.0,113.978234,0.999973,142.1924,0.9,35059.6,60.0,448269.0,5000028.0,141.0,5.0,5.0,49.0,49.0,500.0,3.0,1.0,59.0,1000.0,inf


In [12]:
# How many rows carry the value -999 in "external_data_provider_email_seen_before"?
email_seen_before = df_credit['external_data_provider_email_seen_before']
email_seen_before[email_seen_before == -999].value_counts()

external_data_provider_email_seen_before
-999.0    591
Name: count, dtype: int64

In [13]:
# Turn "inf" entries of reported_income into missing values.
reported_income = df_credit['reported_income']
df_credit['reported_income'] = reported_income.replace(to_replace=np.inf, value=np.nan)

# Turn "-999" entries of external_data_provider_email_seen_before into missing values.
is_sentinel = df_credit['external_data_provider_email_seen_before'] == -999
df_credit.loc[is_sentinel, 'external_data_provider_email_seen_before'] = np.nan

In [14]:
# Keep only rows whose income and reported_income are within the upper bounds.
# (Rows where either comparison is False, including missing values, are removed.)
income_ok = df_credit['income'] <= 700000
reported_income_ok = df_credit['reported_income'] <= 210000
df_credit = df_credit[income_ok & reported_income_ok]

# Remove the two count columns together with score_2
# (score_2 is highly correlated with score_1).
df_credit.drop(columns=["n_bankruptcies", "n_defaulted_loans", "score_2"], inplace=True)


In [15]:
# Persist the column names of the frame as it stands before imputation,
# both to a local JSON file and to S3.
import json

prev_feature_path = 'saved/before_feature.json'

# open(..., 'w') does not create missing parent directories, so make sure the
# local output folder exists (e.g. on a fresh checkout or a new instance).
os.makedirs(os.path.dirname(prev_feature_path), exist_ok=True)

# Serialize once so that the local file and the S3 object hold the same content.
json_data = df_credit.columns.tolist()
json_str = json.dumps(json_data)
with open(prev_feature_path, 'w') as f:
    f.write(json_str)

s3 = boto3.client('s3')
target_bucket = 'mitrailabs-personaclassification'
target_prefix = 'risk_prediction/Intermediate_states/'

# S3 object key for the feature list (target_prefix already ends with "/").
model_path = f'{target_prefix}before_feature.json'

# Upload the JSON string directly; the response dict is the cell output.
s3.put_object(Bucket=target_bucket, Key=model_path, Body=json_str, ContentType="application/json")


{'ResponseMetadata': {'RequestId': '1Q0952Y6HH0KHA03',
  'HostId': 'YowmacGSyBa5jy0WJgiKFYTPiVDn963BpfnF6+8udumy+cyGFXrajVvOmkPumw/GlEKzMtLG25VydIc039yKvSDmQnuJKT+e',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'YowmacGSyBa5jy0WJgiKFYTPiVDn963BpfnF6+8udumy+cyGFXrajVvOmkPumw/GlEKzMtLG25VydIc039yKvSDmQnuJKT+e',
   'x-amz-request-id': '1Q0952Y6HH0KHA03',
   'date': 'Thu, 27 Feb 2025 04:15:40 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"79385c60434f92513c42f7ff19344a81"',
   'x-amz-checksum-crc32': 's251dg==',
   'x-amz-checksum-type': 'FULL_OBJECT',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"79385c60434f92513c42f7ff19344a81"',
 'ChecksumCRC32': 's251dg==',
 'ChecksumType': 'FULL_OBJECT',
 'ServerSideEncryption': 'AES256'}

In [16]:
# Impute missing values, persist the fitted imputers with joblib, write the
# preprocessed frame to CSV and upload all three artifacts to S3.
import pickle
import joblib

# Column groups: everything that is not object dtype vs. object dtype.
df_credit_num = df_credit.select_dtypes(exclude='object').columns
df_credit_cat = df_credit.select_dtypes(include='object').columns
# NOTE(review): 'target_default' is an object column that has not been dropped
# in the cells above, so it appears to be part of df_credit_cat and the
# categorical imputer is fitted on it as well -- confirm this is intended
# before using the saved imputer at inference time.

# fill missing values for "last_amount_borrowed", "last_borrowed_in_months" and "n_issues"
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: that chained in-place form warns in pandas 2.2 and no longer
# updates the frame once Copy-on-Write is enabled.
zero_fill_columns = ['last_amount_borrowed', 'last_borrowed_in_months', 'n_issues']
df_credit[zero_fill_columns] = df_credit[zero_fill_columns].fillna(value=0)

# fill missing values for numerical variables (median of each column)
nimputer = SimpleImputer(missing_values=np.nan, strategy='median')
nimputer = nimputer.fit(df_credit.loc[:, df_credit_num])
df_credit.loc[:, df_credit_num] = nimputer.transform(df_credit.loc[:, df_credit_num])

# fill missing values for categorical variables (most frequent value of each column)
cimputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
cimputer = cimputer.fit(df_credit.loc[:, df_credit_cat])
df_credit.loc[:, df_credit_cat] = cimputer.transform(df_credit.loc[:, df_credit_cat])

# joblib.dump and DataFrame.to_csv do not create missing parent directories.
os.makedirs('saved', exist_ok=True)

target_bucket = 'mitrailabs-personaclassification'
target_prefix = 'risk_prediction/Intermediate_states'

# Fitted numeric imputer -> local joblib file -> S3.
nimputer_path = 'saved/nimputer.joblib'
joblib.dump(nimputer, nimputer_path)
model_path = f'{target_prefix}/nimputer.joblib'
s3.upload_file(nimputer_path, target_bucket, model_path)

# Fitted categorical imputer -> local joblib file -> S3.
cimputer_path = 'saved/cimputer.joblib'
joblib.dump(cimputer, cimputer_path)
model_path = f'{target_prefix}/cimputer.joblib'
s3.upload_file(cimputer_path, target_bucket, model_path)

# Imputed frame -> local CSV (without the index) -> S3, under the data prefix.
preprocessed_path = 'saved/preprocessed_bank_data.csv'
df_credit.to_csv(preprocessed_path, index=False)

target_prefix = 'risk_prediction/data'
model_path = f'{target_prefix}/preprocessed_bank_data.csv'
s3.upload_file(preprocessed_path, target_bucket, model_path)

# The local copies under saved/ are not removed here.
