In [21]:
import json
import os

import boto3
import joblib
import numpy as np
import pandas as pd
import sagemaker
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder


In [None]:
# SageMaker session; the cells below use it for their S3 uploads.
sagemaker_session = sagemaker.Session()
# Region name taken from the boto3 session that backs the SageMaker session.
region = sagemaker_session.boto_session.region_name
# Low-level boto3 client for the SageMaker service API.
client = boto3.client("sagemaker")



In [26]:
# Load the raw loan dataset from the parent directory (path is relative to the notebook's working directory).
df_credit = pd.read_csv('../Loan.csv')


In [None]:
# Remove the LoanApproved column; a new frame is returned and rebound to df_credit.
df_credit = df_credit.drop(labels="LoanApproved", axis="columns")


In [None]:

# Persist the column names of df_credit as they stand at this point (before any
# feature engineering) and publish the file to S3.

# open(..., 'w') does not create parent directories, so make sure saved/ exists;
# without this the cell raises FileNotFoundError when the directory is missing.
os.makedirs('saved', exist_ok=True)

with open('saved/before_feature.json', 'w') as f:
    json.dump(df_credit.columns.tolist(), f)

bucket = 'mitrailabs-personaclassification'
prefix = 'risk_prediction/regression/intermediate'

# Last expression of the cell: the returned S3 URI is shown as the cell output.
sagemaker_session.upload_data(path='saved/before_feature.json',
                              bucket=bucket,
                              key_prefix=prefix)

's3://mitrailabs-personaclassification/risk_prediction/regression/intermediate/before_feature.json'

In [28]:
# Keep only rows whose TotalAssets is at or below the cap. Rows where TotalAssets
# is NaN are removed as well, because the comparison evaluates to False for NaN.
MAX_TOTAL_ASSETS = 1200000

# .copy() makes the filtered frame independent of the original, so the in-place
# `.loc[...] = ...` assignments performed on df_credit later in the notebook do
# not raise SettingWithCopyWarning.
df_credit = df_credit[df_credit['TotalAssets'] <= MAX_TOTAL_ASSETS].copy()


In [29]:
# Split the column labels by dtype: object columns are handled as categorical,
# every other dtype as numerical.
df_credit_cat = df_credit.select_dtypes(include='object').columns
df_credit_num = df_credit.select_dtypes(exclude='object').columns

# Numerical columns: replace NaN with the column median.
# fit() returns the imputer itself, so construction and fitting are chained.
nimputer = SimpleImputer(missing_values=np.nan, strategy='median').fit(
    df_credit.loc[:, df_credit_num]
)
df_credit.loc[:, df_credit_num] = nimputer.transform(
    df_credit.loc[:, df_credit_num]
)

# Categorical columns: replace NaN with the most frequent value of the column.
cimputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent').fit(
    df_credit.loc[:, df_credit_cat]
)
df_credit.loc[:, df_credit_cat] = cimputer.transform(
    df_credit.loc[:, df_credit_cat]
)

# Serialize both fitted imputers locally ...
joblib.dump(nimputer, 'saved/nimputer.joblib')
joblib.dump(cimputer, 'saved/cimputer.joblib')

# ... and publish them to S3.
bucket = 'mitrailabs-personaclassification'
prefix = 'risk_prediction/regression/intermediate'

sagemaker_session.upload_data(
    path='saved/nimputer.joblib', bucket=bucket, key_prefix=prefix
)

# Kept as the final expression so the cell still displays this upload's S3 URI.
sagemaker_session.upload_data(
    path='saved/cimputer.joblib', bucket=bucket, key_prefix=prefix
)

's3://mitrailabs-personaclassification/risk_prediction/regression/intermediate/cimputer.joblib'

In [None]:
from utils import feature_engineering
from sklearn.preprocessing import OneHotEncoder


In [31]:
# Run the project's feature_engineering helper (imported from utils) on the imputed frame.
# NOTE(review): the helper is not visible here; the encoder cell below expects its result to
# contain columns such as 'HighDebtToIncome', 'AgeBin' and 'CreditScoreBin' — confirm in utils.
X_processed = feature_engineering(df_credit)


In [32]:
# Columns of X_processed that are one-hot encoded.
c_columns = [
    'HighDebtToIncome',
    'AgeBin',
    'CreditScoreBin',
    'EmploymentStatus',
    'MaritalStatus',
    'HomeOwnershipStatus',
    'EducationLevel',
    'LoanPurpose',
]

# Build the encoder and fit it in one step (fit() returns the encoder itself).
# Dense output, first category of each column dropped, unknown categories ignored.
encoder = OneHotEncoder(
    sparse_output=False,
    drop='first',
    handle_unknown='ignore',
).fit(X_processed[c_columns])

# Names of the encoded output columns, as reported by the fitted encoder.
feature_names = encoder.get_feature_names_out(c_columns)
# During training
def transform_with_encoder(X, categorical_cols, other_cols, encoder):
    """Encode the categorical columns of ``X`` and re-attach the remaining columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing both ``categorical_cols`` and ``other_cols``.
    categorical_cols : list of str
        Columns handed to ``encoder.transform`` and ``encoder.get_feature_names_out``.
    other_cols : sequence of str or None
        Columns copied through unchanged. May be a list, tuple, pandas ``Index``,
        NumPy array, ``None`` or empty; in the last two cases only the encoded
        columns are returned.
    encoder : object
        Fitted encoder exposing ``transform`` and ``get_feature_names_out``.
        ``transform`` must return something ``pandas.DataFrame`` can wrap with
        one column per output feature name (e.g. a dense 2-D array).

    Returns
    -------
    pandas.DataFrame
        Indexed like ``X``: ``other_cols`` first (if any), followed by the
        encoded columns named by the encoder.
    """
    # Apply encoding to categorical columns
    encoded_array = encoder.transform(X[categorical_cols])

    # Wrap the encoded values in a DataFrame aligned with X's index so the
    # column-wise concat below pairs rows up correctly.
    encoded_df = pd.DataFrame(
        encoded_array,
        columns=encoder.get_feature_names_out(categorical_cols),
        index=X.index
    )

    # A plain truthiness test (`if other_cols:`) raises ValueError for a pandas
    # Index or NumPy array, so check for None / emptiness explicitly.
    has_passthrough = other_cols is not None and len(other_cols) > 0

    # Combine with non-categorical columns
    if has_passthrough:
        result = pd.concat([X[other_cols], encoded_df], axis=1)
    else:
        result = encoded_df

    return result

# Every column of X_processed that is not one-hot encoded is passed through as-is.
other_columns = [name for name in X_processed.columns if name not in c_columns]

# Show all columns, the encoded ones, and the pass-through ones (one per line).
print(X_processed.columns, c_columns, other_columns, sep='\n')

# Replace X_processed with its encoded version.
X_processed = transform_with_encoder(
    X_processed, c_columns, other_columns, encoder
)

# Serialize the fitted encoder locally, then publish it to S3.
joblib.dump(encoder, 'saved/encoder.joblib')

bucket = 'mitrailabs-personaclassification'
prefix = 'risk_prediction/regression/intermediate'

# Final expression of the cell: the returned S3 URI is displayed as its output.
sagemaker_session.upload_data(
    path='saved/encoder.joblib', bucket=bucket, key_prefix=prefix
)


Index(['ApplicationDate', 'CreditScore', 'EmploymentStatus', 'EducationLevel',
       'Experience', 'LoanAmount', 'LoanDuration', 'MaritalStatus',
       'NumberOfDependents', 'HomeOwnershipStatus', 'MonthlyDebtPayments',
       'CreditCardUtilizationRate', 'NumberOfOpenCreditLines',
       'NumberOfCreditInquiries', 'DebtToIncomeRatio', 'BankruptcyHistory',
       'LoanPurpose', 'PreviousLoanDefaults', 'PaymentHistory',
       'LengthOfCreditHistory', 'CheckingAccountBalance', 'TotalAssets',
       'TotalLiabilities', 'MonthlyIncome', 'UtilityBillsPaymentHistory',
       'JobTenure', 'BaseInterestRate', 'MonthlyLoanPayment',
       'TotalDebtToIncomeRatio', 'RiskScore', 'IncomeToDebtRatio',
       'SavingsToIncomeRatio', 'NetWorthToIncomeRatio',
       'HighCreditUtilization', 'HighDebtToIncome', 'JobStability',
       'PreviousLoanDefaultRate', 'UtilityBillsPaymentHistoryRatio',
       'InterestRateSpread', 'AgeExperienceInteraction', 'LogMonthlyIncome',
       'LogSavingsAccountBala

's3://mitrailabs-personaclassification/risk_prediction/regression/intermediate/encoder.joblib'

In [33]:
# Separate the regression target (RiskScore) from the features.
X = X_processed.drop(columns=['RiskScore'])
y = X_processed['RiskScore']

# 90/10 shuffled split. A fixed random_state makes the split, and therefore the
# files uploaded below, reproducible across kernel restarts and re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=True, test_size=0.1, random_state=42
)

# NOTE(review): only the feature frames are written; y_train / y_test are never
# saved, and index=False drops the row index, so the uploaded CSVs carry no
# RiskScore values. Confirm whether the consumer of these files expects the target.
X_train.to_csv('saved/train-v1.csv', index=False)
X_test.to_csv('saved/test-v1.csv', index=False)


bucket = 'mitrailabs-personaclassification'
prefix = 'risk_prediction/regression/intermediate'

sagemaker_session.upload_data(path='saved/train-v1.csv',
                              bucket=bucket,
                              key_prefix=prefix)

# Last expression of the cell: the returned S3 URI is shown as the cell output.
sagemaker_session.upload_data(path='saved/test-v1.csv',
                              bucket=bucket,
                              key_prefix=prefix)


's3://mitrailabs-personaclassification/risk_prediction/regression/intermediate/test-v1.csv'