In [1]:
# import packages
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, cross_val_score
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report, roc_curve, roc_auc_score
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedKFold

from utils import feature_engineering, additional_feature_engineering
# set the aesthetic style of the plots
# NOTE(review): no style name is passed here, so this presumably leaves the
# current seaborn/matplotlib style unchanged — confirm, or pass an explicit
# style name such as 'whitegrid'.
sns.set_style()

# filter warning messages
# The 'ignore' filter is installed without any category or module restriction,
# so it applies to every warning raised by later cells in this kernel.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the preprocessed bank data from disk and print a schema summary
# (column names, non-null counts, dtypes) as a quick sanity check.
preprocessed_csv_path = 'saved/preprocessed_bank_data.csv'
df_credit = pd.read_csv(preprocessed_csv_path)
df_credit.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41741 entries, 0 to 41740
Data columns (total 27 columns):
 #   Column                                           Non-Null Count  Dtype  
---  ------                                           --------------  -----  
 0   target_default                                   41741 non-null  bool   
 1   score_1                                          41741 non-null  object 
 2   score_2                                          41741 non-null  object 
 3   score_3                                          41741 non-null  float64
 4   score_4                                          41741 non-null  float64
 5   score_5                                          41741 non-null  float64
 6   score_6                                          41741 non-null  float64
 7   risk_rate                                        41741 non-null  float64
 8   last_amount_borrowed                             41741 non-null  float64
 9   last_borrowed_in_months     

In [3]:
# Run the first feature-engineering pass. `feature_engineering` is imported
# from the project-local `utils` module; its implementation is not visible
# in this notebook. The recorded output below reports 43 columns, versus 27
# in the frame as loaded.
# NOTE(review): whether the helper returns a new frame or modifies
# `df_credit` in place cannot be told from here — verify in utils.
X_processed = feature_engineering(df_credit)
# Print the schema of the result to inspect the added columns and their dtypes.
X_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41741 entries, 0 to 41740
Data columns (total 43 columns):
 #   Column                                           Non-Null Count  Dtype   
---  ------                                           --------------  -----   
 0   target_default                                   41741 non-null  bool    
 1   score_1                                          41741 non-null  object  
 2   score_2                                          41741 non-null  object  
 3   score_3                                          41741 non-null  float64 
 4   score_4                                          41741 non-null  float64 
 5   score_5                                          41741 non-null  float64 
 6   score_6                                          41741 non-null  float64 
 7   risk_rate                                        41741 non-null  float64 
 8   last_amount_borrowed                             41741 non-null  float64 
 9   last_borrowed_in_

In [4]:
import pickle

# Pick every column whose dtype is neither float64 nor int64. The selection is
# purely dtype-based, so besides the object-typed score columns it also picks
# up the boolean `target_default` column.
categorical_cols = X_processed.select_dtypes(exclude=['float64', 'int64']).columns.tolist()

# Fit one LabelEncoder per selected column, keyed by column name, and replace
# the column in X_processed with its integer codes. This modifies X_processed
# in place, so the cell is not idempotent.
label_encoders = {}
for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    X_processed[col] = label_encoders[col].fit_transform(X_processed[col])

# Save label encoders to disk.
# Only write when at least one encoder was fitted: if this cell is re-run after
# the columns have already been replaced by integer codes, the selection above
# can come back empty, and dumping the empty dict would overwrite the encoders
# fitted on the original values.
if label_encoders:
    with open('saved/label_encoders.pkl', 'wb') as f:
        pickle.dump(label_encoders, f)
else:
    print("No columns selected for label encoding; 'saved/label_encoders.pkl' was not rewritten.")

In [5]:
# Run the second feature-engineering pass on the label-encoded frame.
# `additional_feature_engineering` comes from the project-local `utils`
# module; its implementation is not visible in this notebook. The recorded
# output below shows derived columns such as `score_1_div_score_2`,
# `score_1_minus_score_2`, `score_1_sq` and `score_2_sq`.
X_processed2 = additional_feature_engineering(X_processed)
# Bare last expression: displays the resulting frame as the cell output.
X_processed2

Unnamed: 0,target_default,score_1,score_2,score_3,score_4,score_5,score_6,risk_rate,last_amount_borrowed,last_borrowed_in_months,...,real_state_avg_facebook_profile,shipping_state_avg_facebook_profile,score_1_div_score_2,score_1_minus_score_2,state_x_real_state,state_x_shipping_state,state_real_state_avg_score_1,state_shipping_state_avg_score_2,score_1_sq,score_2_sq
0,0,0,10,350.0,101.800832,0.259555,108.427273,0.40,10.128027,3.610918,...,0.329297,0.324643,0.000000,-10,44,528,1.896319,17.306748,0,100
1,0,3,16,370.0,97.062615,0.942655,92.002546,0.24,0.000000,0.000000,...,0.329297,0.322504,0.187500,-13,47,940,1.859049,17.294872,9,256
2,1,3,9,360.0,100.027073,0.351918,112.892453,0.29,8.883074,3.610918,...,0.340317,0.361389,0.333333,-6,96,608,1.906542,16.590909,9,81
3,0,0,21,510.0,101.599485,0.987673,94.902491,0.32,0.000000,0.000000,...,0.340317,0.333609,0.000000,-21,69,391,1.915888,17.650000,0,441
4,0,2,1,500.0,98.474289,0.532539,118.126207,0.18,0.000000,0.000000,...,0.340317,0.324643,2.000000,1,141,564,1.947590,17.600000,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41736,0,3,16,280.0,96.379531,0.416693,103.667082,0.17,9.600179,3.610918,...,0.329297,0.319314,0.187500,-13,46,368,1.804416,17.637410,9,256
41737,0,6,31,370.0,96.124977,0.692196,97.977973,0.27,0.000000,0.000000,...,0.329297,0.334390,0.193548,-25,36,648,1.921519,19.615385,36,961
41738,0,4,24,280.0,102.377780,0.530938,93.687747,0.30,0.000000,0.000000,...,0.327586,0.339492,0.166667,-20,188,1081,1.922551,17.456029,16,576
41739,1,6,5,240.0,100.476090,0.214697,86.759074,0.37,0.000000,0.000000,...,0.329297,0.339492,1.200000,1,47,1081,1.859049,17.456029,36,25


In [6]:
# DISABLED EXPERIMENT: every line in this cell is commented out, so running
# the cell does nothing. It is kept only as a record of a SMOTE oversampling
# step that was tried on `df_credit`.
# NOTE(review): if re-enabled, it writes to 'saved/feature_engineered_data.csv',
# the same path the following cell writes to, so that cell would overwrite
# its output.
# NOTE(review): `df_credit` is shown above with object-typed columns
# (score_1, score_2); presumably SMOTE needs the encoded frame instead — verify
# before re-enabling.

# import pandas as pd
# from imblearn.over_sampling import SMOTE


# # Separate features (X) and target (y)
# X = df_credit.drop('target_default', axis=1)
# y = df_credit['target_default']

# # Apply SMOTE
# smote = SMOTE(random_state=42)
# X_resampled, y_resampled = smote.fit_resample(X, y)

# # Create a new DataFrame with resampled data
# df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
# df_resampled['target_default'] = y_resampled

# Save the resampled data to a new CSV file
# df_resampled.to_csv('saved/feature_engineered_data.csv', index=False)

# print("SMOTE applied and saved to 'saved/feature_engineered_data.csv'")

In [7]:
# Persist the output of the last feature-engineering step. Write
# `X_processed2` rather than `df_credit` (the frame read from the preprocessed
# CSV) so the saved file actually contains the encoded columns and derived
# features its name promises. `index=False` keeps the row index out of the CSV.
X_processed2.to_csv('saved/feature_engineered_data.csv', index=False)