In [1]:
import boto3
import sagemaker
from sagemaker import Model
import os
import json

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
# Create a SageMaker session object for this notebook.
sagemaker_session = sagemaker.Session()

# Look up the IAM execution role and echo its ARN.
# NOTE(review): the printed ARN embeds the AWS account id and ends up in the
# saved cell output -- consider clearing the output before sharing the notebook.
role = sagemaker.get_execution_role()
print(f"SageMaker Role ARN: {role}")

SageMaker Role ARN: arn:aws:iam::796932308591:role/service-role/SageMaker-ExecutionRole-20250214T145019


In [3]:
# import packages
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import TargetEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, cross_val_score
# from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report, roc_curve, roc_auc_score
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedKFold

from utils import feature_engineering, additional_feature_engineering
# set the aesthetic style of the plots
# NOTE(review): no style name is passed here -- confirm whether a specific
# seaborn style preset was intended.
sns.set_style()

# filter warning messages
# NOTE(review): this suppresses every warning for the rest of the session,
# including deprecation and convergence warnings from pandas / scikit-learn.
# Consider narrowing the filter to specific categories or modules.
import warnings
warnings.filterwarnings('ignore')

In [4]:
# --- Load the preprocessed dataset from S3 ---------------------------------
# Location of the input object.
bucket = 'mitrailabs-personaclassification'
model_prefix = 'risk_prediction/data/'

# S3 client; later cells reuse it for their uploads.
s3 = boto3.client('s3')

# Fetch the object and parse its streamed body directly as CSV.
response = s3.get_object(Bucket=bucket, Key=f"{model_prefix}preprocessed_bank_data.csv")
df_credit = pd.read_csv(response['Body'])

# Print column names, dtypes and non-null counts.
df_credit.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41346 entries, 0 to 41345
Data columns (total 24 columns):
 #   Column                                           Non-Null Count  Dtype  
---  ------                                           --------------  -----  
 0   target_default                                   41346 non-null  bool   
 1   score_1                                          41346 non-null  object 
 2   score_3                                          41346 non-null  float64
 3   score_4                                          41346 non-null  float64
 4   score_5                                          41346 non-null  float64
 5   score_6                                          41346 non-null  float64
 6   risk_rate                                        41346 non-null  float64
 7   last_amount_borrowed                             41346 non-null  float64
 8   last_borrowed_in_months                          41346 non-null  float64
 9   credit_limit                

In [5]:
# Run the project's first feature-engineering step (imported from utils) on the
# loaded frame, then print the resulting schema.
# NOTE(review): whether feature_engineering mutates df_credit in place or
# returns a new frame is not visible from here -- confirm in utils.
X_processed = feature_engineering(df_credit)
X_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41346 entries, 0 to 41345
Data columns (total 40 columns):
 #   Column                                           Non-Null Count  Dtype  
---  ------                                           --------------  -----  
 0   target_default                                   41346 non-null  bool   
 1   score_1                                          41346 non-null  object 
 2   score_3                                          41346 non-null  float64
 3   score_4                                          41346 non-null  float64
 4   score_5                                          41346 non-null  float64
 5   score_6                                          41346 non-null  float64
 6   risk_rate                                        41346 non-null  float64
 7   last_amount_borrowed                             41346 non-null  float64
 8   last_borrowed_in_months                          41346 non-null  float64
 9   credit_limit                

In [6]:
# ! pip install category_encoders

In [7]:
import pandas as pd
import numpy as np

class ManualTargetEncoder:
    """
    Smoothed target (mean) encoder for categorical columns.

    For every category ``c`` of every fitted column the encoding is

        (mean_c * count_c + global_mean * smoothing) / (count_c + smoothing)

    where ``mean_c`` / ``count_c`` are the target mean and the number of
    non-null target values observed for that category, and ``global_mean`` is
    the target mean over the whole fitting set. Categories that were not seen
    during ``fit`` (including missing values) are encoded as ``global_mean``.
    """

    def __init__(self, smoothing=1.0):
        """
        Initialize the encoder.
        :param smoothing: Smoothing parameter to balance between category mean and global mean.
        """
        self.smoothing = smoothing
        self.encodings = {}  # Store encodings for each categorical column
        self.global_mean = None  # Store the global mean of the target

    def fit(self, X, y):
        """
        Fit the encoder on the training data.
        :param X: DataFrame containing categorical columns.
        :param y: Target variable. A pandas Series is aligned with X on its
                  index; any other array-like is taken positionally and must
                  have one value per row of X.
        :return: The fitted encoder itself, so calls can be chained.
        """
        if not isinstance(y, pd.Series):
            # Lists / ndarrays have no index: attach X's so groupby can align.
            y = pd.Series(np.asarray(y), index=X.index)

        self.global_mean = y.mean()
        # Start from a clean slate so a refit never keeps stale columns.
        self.encodings = {}

        for col in X.columns:
            # Group once and reuse it for both statistics.
            grouped = y.groupby(X[col])
            category_means = grouped.mean()
            category_counts = grouped.count()
            # Shrink each category mean towards the global mean; categories
            # with few observations are pulled harder.
            self.encodings[col] = (
                category_means * category_counts + self.global_mean * self.smoothing
            ) / (category_counts + self.smoothing)

        return self

    def transform(self, X):
        """
        Transform the categorical columns using the learned encodings.
        :param X: DataFrame containing categorical columns.
        :return: Transformed DataFrame (a copy; X itself is left untouched).
        :raises KeyError: If X contains a column that was not present during fit
                          (which includes calling transform before fit).
        """
        unknown_cols = [col for col in X.columns if col not in self.encodings]
        if unknown_cols:
            raise KeyError(
                f"No fitted encoding for column(s) {unknown_cols}; "
                "fit the encoder on these columns first."
            )

        X_transformed = X.copy()
        for col in X.columns:
            # Replace categories with their encodings; unseen categories and
            # missing values fall back to the global target mean.
            X_transformed[col] = X[col].map(self.encodings[col]).fillna(self.global_mean)
        return X_transformed

    def fit_transform(self, X, y):
        """
        Fit the encoder and transform the data in one step.
        :param X: DataFrame containing categorical columns.
        :param y: Target variable.
        :return: Transformed DataFrame.
        """
        return self.fit(X, y).transform(X)

In [8]:
import joblib

# Cast the target to integers so it can be averaged by the encoder.
X_processed["target_default"] = X_processed["target_default"].astype(int)

X = X_processed.drop(columns=["target_default"])
y = X_processed["target_default"]

# Pick the categorical columns from the feature frame (X), not from
# X_processed: the list is used to index X, which no longer has the target.
categorical_cols = X.select_dtypes(exclude=['float64', 'int64']).columns.tolist()
print(categorical_cols)

# The encoded values are written back into X_processed below, so running this
# cell a second time finds no categorical columns left. Stop here instead of
# fitting an empty encoder and overwriting the saved / uploaded artifact.
if not categorical_cols:
    raise RuntimeError(
        "No categorical columns found in X_processed - it appears to be encoded "
        "already. Re-create X_processed before running this cell again."
    )

# NOTE(review): the encoder is fitted on the full dataset; if a train/test
# split happens later, the encodings will have seen the held-out targets.
encoder = ManualTargetEncoder(smoothing=1.0)
X_processed[categorical_cols] = encoder.fit_transform(X[categorical_cols], y)

# Persist the fitted encoder locally, creating the output folder if needed.
# NOTE(review): the pickled object refers to ManualTargetEncoder as defined in
# this notebook; whatever loads the file needs the same class to be available.
label_encoders_path = 'saved/label_encoders.joblib'
os.makedirs(os.path.dirname(label_encoders_path), exist_ok=True)
joblib.dump(encoder, label_encoders_path)

# Upload the artifact to S3.
target_bucket = 'mitrailabs-personaclassification'
target_prefix = 'risk_prediction/Intermediate_states'
model_path = f'{target_prefix}/label_encoders.joblib'
s3.upload_file(
    label_encoders_path,
    target_bucket,
    model_path
)

['score_1', 'facebook_profile', 'state', 'real_state', 'shipping_state']


In [9]:
# Run the second feature-engineering step (imported from utils) on the
# target-encoded frame and display the result.
# NOTE(review): the inner workings of additional_feature_engineering are not
# visible here; the displayed output contains very large ratio values in rows
# where credit_limit is 0.0 -- confirm that is intended.
X_processed2 = additional_feature_engineering(X_processed)
X_processed2

Unnamed: 0,target_default,score_1,score_3,score_4,score_5,score_6,risk_rate,last_amount_borrowed,last_borrowed_in_months,credit_limit,...,credit_available,income_per_account,loan_amount_to_income,n_accounts_to_credit_limit,credit_utilization_x_fraud_score,real_state_avg_facebook_profile,shipping_state_avg_facebook_profile,state_x_real_state,state_x_shipping_state,state_real_state_avg_score_1
0,0,0.181345,350.0,101.800832,0.259555,108.427273,0.40,25033.92,36.0,0.0,...,-25033.92,60.099048,0.620527,134164.078650,3.227187e+09,0.159239,0.158735,0.020228,0.022130,0.155778
1,0,0.116848,370.0,97.062615,0.942655,92.002546,0.24,0.00,0.0,39726.0,...,39726.00,84.523416,0.000000,0.018773,0.000000e+00,0.159239,0.158596,0.022775,0.019892,0.146465
2,1,0.116848,360.0,100.027073,0.351918,112.892453,0.29,7207.92,36.0,25212.0,...,18004.08,80.637243,0.332943,0.019916,3.475479e+01,0.160467,0.163200,0.026219,0.033583,0.157287
3,0,0.181345,510.0,101.599485,0.987673,94.902491,0.32,0.00,0.0,54591.0,...,54591.00,60.188305,0.000000,0.018656,0.000000e+00,0.160467,0.159774,0.022531,0.019610,0.163695
4,0,0.232212,500.0,98.474289,0.532539,118.126207,0.18,0.00,0.0,25212.0,...,25212.00,73.861843,0.000000,0.020888,0.000000e+00,0.160467,0.158735,0.026611,0.024916,0.166215
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41341,0,0.116848,280.0,96.379531,0.416693,103.667082,0.17,14766.42,36.0,10627.0,...,-4139.42,119.530773,0.384246,0.025665,8.605084e+02,0.159239,0.157965,0.026022,0.026707,0.150927
41342,0,0.388041,370.0,96.124977,0.692196,97.977973,0.27,0.00,0.0,36262.0,...,36262.00,77.065998,0.000000,0.021006,0.000000e+00,0.159239,0.159803,0.021760,0.021884,0.154300
41343,0,0.300101,280.0,102.377780,0.530938,93.687747,0.30,0.00,0.0,0.0,...,0.00,132.929116,0.000000,77459.666924,0.000000e+00,0.159078,0.160440,0.024557,0.026397,0.159704
41344,1,0.388041,240.0,100.476090,0.214697,86.759074,0.37,0.00,0.0,0.0,...,0.00,136.021348,0.000000,100000.000000,0.000000e+00,0.159239,0.160440,0.022775,0.026397,0.146465


In [10]:

# Persist the feature-engineered frame locally and upload it to S3.
# The frame written here must be X_processed2 (the output of the feature
# engineering cells); writing df_credit would upload the unmodified input under
# the "feature_engineered_data.csv" key. The local file name matches the
# uploaded object; the variable name is kept for compatibility.
preprocessed_path = 'saved/feature_engineered_data.csv'
os.makedirs(os.path.dirname(preprocessed_path), exist_ok=True)
X_processed2.to_csv(preprocessed_path, index=False)

target_prefix = 'risk_prediction/data'

# S3 object key of the uploaded dataset.
model_path = f'{target_prefix}/feature_engineered_data.csv'
s3.upload_file(preprocessed_path,
               target_bucket,
               model_path
              )
