In [1]:
# import packages
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, cross_val_score
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report, roc_curve, roc_auc_score
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# set the aesthetic style of the plots
# NOTE(review): set_style() is called without a style name, so no named seaborn
# style is requested here -- confirm whether a specific style was intended.
sns.set_style()

# filter warning messages
# NOTE: 'ignore' is passed without a category, so the filter is not limited to
# one kind of warning; deprecation notices from the libraries used below are
# hidden as well.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the raw acquisition data set; the path is relative to the notebook's
# working directory.
acquisition_csv = '../acquisition_train.csv'
df_credit = pd.read_csv(acquisition_csv)

In [3]:
# report the dimensions of the loaded data frame
n_rows, n_cols = df_credit.shape
print('Number of rows: ', n_rows)
print('Number of columns: ', n_cols)

Number of rows:  45000
Number of columns:  43


In [4]:
# How many columns does each dtype account for?
dtype_counts = df_credit.dtypes.value_counts()
print("Number of columns by data type:")
print(dtype_counts)

# Column names grouped by dtype family.
float_features = df_credit.select_dtypes(include=['float']).columns
object_features = df_credit.select_dtypes(include=['object']).columns
int_features = df_credit.select_dtypes(include=['int']).columns
bool_features = df_credit.select_dtypes(include=['bool']).columns

# Print one line per dtype family, in the same order as the groups above.
for heading, feature_index in (
    ("Float features:", float_features),
    ("Object features:", object_features),
    ("Integer features:", int_features),
    ("Bool features:", bool_features),
):
    print(heading, list(feature_index))

Number of columns by data type:
object     21
float64    18
int64       4
Name: count, dtype: int64
Float features: ['score_3', 'score_4', 'score_5', 'score_6', 'risk_rate', 'last_amount_borrowed', 'last_borrowed_in_months', 'credit_limit', 'income', 'ok_since', 'n_bankruptcies', 'n_defaulted_loans', 'n_accounts', 'n_issues', 'external_data_provider_credit_checks_last_2_year', 'external_data_provider_credit_checks_last_year', 'external_data_provider_email_seen_before', 'reported_income']
Object features: ['ids', 'target_default', 'score_1', 'score_2', 'reason', 'facebook_profile', 'state', 'zip', 'channel', 'job_name', 'real_state', 'application_time_applied', 'email', 'external_data_provider_first_name', 'lat_lon', 'marketing_channel', 'profile_phone_number', 'shipping_state', 'profile_tags', 'user_agent', 'target_fraud']
Integer features: ['application_time_in_funnel', 'external_data_provider_credit_checks_last_month', 'external_data_provider_fraud_score', 'shipping_zip_code']
Bool f

In [5]:
# Columns removed before modelling. They are grouped exactly as in the original
# three drop calls so the history of the decision stays readable.
columns_to_drop = [
    # second target, not used in this notebook
    'target_fraud',
    # dropped together in the original second call
    'channel', 'external_data_provider_credit_checks_last_2_year',
    # dropped together in the original third call
    'email', 'reason', 'zip', 'job_name', 'external_data_provider_first_name', 'lat_lon',
    'shipping_zip_code', 'user_agent', 'profile_tags', 'marketing_channel',
    'profile_phone_number', 'application_time_applied', 'ids',
]

# Rows without a label cannot be used, so they are removed first.
# The frame is rebound instead of mutated in place, and errors='ignore' lets the
# cell be re-run after the columns are already gone (the in-place version raised
# KeyError on a second execution).
df_credit = (
    df_credit
    .dropna(subset=['target_default'])
    .drop(columns=columns_to_drop, errors='ignore')
)

In [6]:
# df_credit.drop('facebook_profile', axis=1, inplace=True)
# df_credit.drop('external_data_provider_credit_checks_last_year', axis=1, inplace=True)



In [7]:
# Summary statistics of the remaining columns; used below to spot sentinel
# values (the output shows `inf` in reported_income and a minimum of -999 in
# external_data_provider_email_seen_before).
df_credit.describe()

Unnamed: 0,score_3,score_4,score_5,score_6,risk_rate,last_amount_borrowed,last_borrowed_in_months,credit_limit,income,ok_since,n_bankruptcies,n_defaulted_loans,n_accounts,n_issues,application_time_in_funnel,external_data_provider_credit_checks_last_month,external_data_provider_credit_checks_last_year,external_data_provider_email_seen_before,external_data_provider_fraud_score,reported_income
count,41741.0,41741.0,41741.0,41741.0,41741.0,14133.0,14133.0,28632.0,41741.0,17276.0,41606.0,41729.0,41741.0,30818.0,41741.0,41741.0,27720.0,39656.0,41741.0,41741.0
mean,346.459836,100.00682,0.499416,99.919399,0.294451,13328.104095,40.58841,33877.220453,71080.12,35.192174,0.076696,0.004625,10.639108,11.023882,247.748545,1.504396,0.504185,12.731188,500.491771,inf
std,110.102271,3.183821,0.288085,10.022703,0.101561,7918.698433,9.437936,36141.985884,52259.78,21.629577,0.27482,0.080157,4.588175,4.596036,146.326172,1.114207,0.499992,125.711218,287.993121,
min,0.0,86.191572,3.5e-05,60.663039,0.0,1005.18,36.0,0.0,4821.18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-999.0,0.0,403.0
25%,270.0,97.862546,0.251595,93.182517,0.22,7210.28,36.0,9975.0,44019.58,17.0,0.0,0.0,7.0,8.0,120.0,1.0,0.0,11.0,252.0,50910.0
50%,340.0,100.01795,0.500174,99.977774,0.29,12011.05,36.0,25213.0,60044.09,32.0,0.0,0.0,10.0,10.0,248.0,2.0,1.0,27.0,502.0,101623.0
75%,420.0,102.1431,0.74763,106.630991,0.36,18030.16,36.0,46492.5,85032.89,50.0,0.0,0.0,13.0,14.0,375.0,2.0,1.0,43.0,747.0,151248.0
max,990.0,113.978234,0.999973,142.1924,0.9,35059.6,60.0,448269.0,5000028.0,141.0,5.0,5.0,49.0,49.0,500.0,3.0,1.0,59.0,1000.0,inf


In [8]:
# How often does the -999 sentinel occur in "external_data_provider_email_seen_before"?
email_seen = df_credit['external_data_provider_email_seen_before']
email_seen[email_seen == -999].value_counts()

external_data_provider_email_seen_before
-999.0    591
Name: count, dtype: int64

In [9]:
# turn positive-infinity incomes into missing values
reported_income = df_credit['reported_income']
df_credit['reported_income'] = reported_income.mask(reported_income == np.inf)

# turn the -999 sentinel into a missing value
email_seen_col = 'external_data_provider_email_seen_before'
df_credit[email_seen_col] = df_credit[email_seen_col].replace(-999, np.nan)

In [10]:
import json
import os

# Persist the column names as they are before imputation/encoding.
# The output directory is created first so the cell also works on a fresh
# checkout where 'saved/' does not exist yet.
os.makedirs('saved', exist_ok=True)

with open('saved/before_feature.json', 'w') as f:
    json.dump(df_credit.columns.tolist(), f)

# To read the list back (note: json.load, not json.dump):
# with open('saved/before_feature.json', 'r') as f:
#     loaded_list = json.load(f)

In [11]:
import pickle

# Split the columns by dtype: everything that is not `object` is treated as
# numerical, the rest as categorical.
# NOTE(review): 'target_default' is still an object column at this point, so it
# is part of df_credit_cat and the categorical imputer is fitted on it too --
# presumably the pickled imputer will then expect that column at transform
# time; confirm against the code that loads 'saved/cimputer.pkl'.
df_credit_num = df_credit.select_dtypes(exclude='object').columns
df_credit_cat = df_credit.select_dtypes(include='object').columns

# fill missing values for "last_amount_borrowed", "last_borrowed_in_months" and "n_issues"
# Assign the filled columns back explicitly. The previous
# `df_credit[col].fillna(..., inplace=True)` form is chained assignment: it is
# deprecated in pandas 2.x and does not modify the frame under Copy-on-Write,
# in which case these columns would silently be median-imputed below instead.
zero_fill_columns = ['last_amount_borrowed', 'last_borrowed_in_months', 'n_issues']
df_credit[zero_fill_columns] = df_credit[zero_fill_columns].fillna(value=0)

# fill missing values for numerical variables
nimputer = SimpleImputer(missing_values=np.nan, strategy='median')
nimputer = nimputer.fit(df_credit.loc[:, df_credit_num])
df_credit.loc[:, df_credit_num] = nimputer.transform(df_credit.loc[:, df_credit_num])

# fill missing values for categorical variables
cimputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
cimputer = cimputer.fit(df_credit.loc[:, df_credit_cat])
df_credit.loc[:, df_credit_cat] = cimputer.transform(df_credit.loc[:, df_credit_cat])

# Persist both fitted imputers so the same fill values can be reused later.
with open('saved/nimputer.pkl', 'wb') as f:
    pickle.dump(nimputer, f)
with open('saved/cimputer.pkl', 'wb') as f:
    pickle.dump(cimputer, f)

# Show the final column set and store the imputed frame.
print(df_credit.columns)
df_credit.to_csv('saved/preprocessed_bank_data.csv', index = False)


Index(['target_default', 'score_1', 'score_2', 'score_3', 'score_4', 'score_5',
       'score_6', 'risk_rate', 'last_amount_borrowed',
       'last_borrowed_in_months', 'credit_limit', 'income', 'facebook_profile',
       'state', 'real_state', 'ok_since', 'n_bankruptcies',
       'n_defaulted_loans', 'n_accounts', 'n_issues',
       'application_time_in_funnel',
       'external_data_provider_credit_checks_last_month',
       'external_data_provider_credit_checks_last_year',
       'external_data_provider_email_seen_before',
       'external_data_provider_fraud_score', 'reported_income',
       'shipping_state'],
      dtype='object')


## Extra: exploratory feature analysis

In [12]:
# Separate the label from the predictors and preview the predictors.
y = df_credit['target_default']
X = df_credit.drop('target_default', axis=1)
X.head()

Unnamed: 0,score_1,score_2,score_3,score_4,score_5,score_6,risk_rate,last_amount_borrowed,last_borrowed_in_months,credit_limit,...,n_defaulted_loans,n_accounts,n_issues,application_time_in_funnel,external_data_provider_credit_checks_last_month,external_data_provider_credit_checks_last_year,external_data_provider_email_seen_before,external_data_provider_fraud_score,reported_income,shipping_state
0,1Rk8w4Ucd5yR3KcqZzLdow==,IOVu8au3ISbo6+zmfnYwMg==,350.0,101.800832,0.259555,108.427273,0.4,25033.92,36.0,0.0,...,0.0,18.0,18.0,444,2,0.0,51.0,645,57849.0,BR-MT
1,DGCQep2AE5QRkNCshIAlFQ==,SaamrHMo23l/3TwXOWgVzw==,370.0,97.062615,0.942655,92.002546,0.24,0.0,0.0,39726.0,...,0.0,14.0,14.0,346,1,0.0,17.0,243,4902.0,BR-RS
2,DGCQep2AE5QRkNCshIAlFQ==,Fv28Bz0YRTVAT5kl1bAV6g==,360.0,100.027073,0.351918,112.892453,0.29,7207.92,36.0,25213.0,...,0.0,10.0,0.0,6,2,1.0,9.0,65,163679.0,BR-RR
3,1Rk8w4Ucd5yR3KcqZzLdow==,dCm9hFKfdRm7ej3jW+gyxw==,510.0,101.599485,0.987673,94.902491,0.32,0.0,0.0,54591.0,...,0.0,19.0,19.0,406,3,1.0,38.0,815,1086.0,BR-RN
4,8k8UDR4Yx0qasAjkGrUZLw==,+CxEO4w7jv3QPI/BQbyqAA==,500.0,98.474289,0.532539,118.126207,0.18,0.0,0.0,25213.0,...,0.0,11.0,0.0,240,2,1.0,46.0,320,198618.0,BR-MT


In [13]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import f_classif
from sklearn.preprocessing import LabelEncoder




# Handle categorical features using Label Encoding.
# Every object/categorical column of df_credit is replaced, in place, by integer
# codes; the fitted encoder of each column is kept in `label_encoders`.
# The dtype test uses isinstance(..., pd.CategoricalDtype) because
# pd.api.types.is_categorical_dtype is deprecated since pandas 2.1.
# NOTE: 'target_default' is included in the loop, so it is encoded as well if it
# is still an object column.
label_encoders = {}
for column in df_credit.columns:
    column_dtype = df_credit[column].dtype
    if isinstance(column_dtype, pd.CategoricalDtype) or pd.api.types.is_object_dtype(column_dtype):
        encoder = LabelEncoder()
        df_credit[column] = encoder.fit_transform(df_credit[column])
        label_encoders[column] = encoder

X = df_credit.drop(columns=["target_default"])
y = df_credit['target_default']

# Calculate F-scores and p-values (one ANOVA F-test per feature against y)
f_scores, p_values = f_classif(X, y)

# Create a DataFrame to display feature importance
yo = pd.DataFrame({'Feature': X.columns, 'F-Score': f_scores, 'P-Value': p_values})

# Sort by F-Score (higher F-Score means more important)
feature_importance = yo.sort_values(by='F-Score', ascending=False)

# Print the feature importance
print(feature_importance)

                                            Feature     F-Score        P-Value
11                                 facebook_profile  989.975837  8.871783e-215
6                                         risk_rate  394.664283   2.028374e-87
22         external_data_provider_email_seen_before  218.145792   3.057232e-49
0                                           score_1  208.944111   3.037402e-47
2                                           score_3   92.801384   6.093787e-22
10                                           income   78.194594   9.694242e-19
13                                       real_state   33.440592   7.399712e-09
18                                         n_issues   20.954957   4.715639e-06
19                       application_time_in_funnel   20.832297   5.027319e-06
8                           last_borrowed_in_months   19.479501   1.019397e-05
12                                            state   13.731873   2.111179e-04
7                              last_amount_borrowed 

In [14]:
from imblearn.over_sampling import SMOTE

# Oversample with SMOTE and recompute the per-feature F-scores on the
# resampled data. A fixed random_state makes the resampled rows, and therefore
# the scores printed below, reproducible across runs.
# NOTE(review): at this point the categorical columns appear to hold label-
# encoded integers, which SMOTE treats as ordinary numeric values -- confirm
# that this is acceptable for the comparison being made here.
X, y = SMOTE(random_state=42).fit_resample(
    df_credit.drop(columns=['target_default']),
    df_credit['target_default'],
)

# Number of rows after resampling
print(len(X))
f_scores, p_values = f_classif(X, y)

# Create a DataFrame to display feature importance
yo = pd.DataFrame({'Feature': X.columns, 'F-Score': f_scores, 'P-Value': p_values})

# Sort by F-Score (higher F-Score means more important)
feature_importance = yo.sort_values(by='F-Score', ascending=False)

# Print the feature importance
print(feature_importance)


70160
                                            Feature      F-Score  \
6                                         risk_rate  1397.959008   
20  external_data_provider_credit_checks_last_month  1380.700579   
22         external_data_provider_email_seen_before   819.733464   
2                                           score_3   334.585732   
13                                       real_state   294.731965   
10                                           income   290.419198   
12                                            state   119.421668   
19                       application_time_in_funnel    78.568413   
1                                           score_2    53.628972   
18                                         n_issues    28.015260   
17                                       n_accounts    27.576137   
15                                   n_bankruptcies    20.182886   
25                                   shipping_state    18.388157   
9                                      cre

In [15]:
import seaborn as sns
import matplotlib.pyplot as plt

# Compute the pairwise correlation matrix of the feature frame X
correlation_matrix = X.corr()

# The heatmap is currently disabled. The figure creation is disabled together
# with the plotting calls: leaving plt.figure(...) active on its own only
# produced an empty 1200x800 figure in the cell output.
# plt.figure(figsize=(12, 8))

# Create heatmap with bigger annotations
# sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5, annot_kws={"size": 10})

# # Show plot
# plt.title("Feature Correlation Heatmap", fontsize=14)
# plt.xticks(rotation=45, ha='right', fontsize=10)
# plt.yticks(fontsize=10)
# plt.show()



<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

In [16]:
import numpy as np

# Absolute correlation of every column with 'target_default', strongest first
# (the target's correlation with itself is included in the listing).
target_corr = df_credit.corr()['target_default']
correlations = target_corr.abs().sort_values(ascending=False)
print("Feature Correlations with Target:\n", correlations)


Feature Correlations with Target:
 target_default                                     1.000000
facebook_profile                                   0.152213
risk_rate                                          0.096783
external_data_provider_email_seen_before           0.072106
score_1                                            0.070576
score_3                                            0.047100
income                                             0.043243
real_state                                         0.028294
n_issues                                           0.022401
application_time_in_funnel                         0.022335
last_borrowed_in_months                            0.021598
state                                              0.018135
last_amount_borrowed                               0.017141
n_accounts                                         0.015127
n_bankruptcies                                     0.014014
external_data_provider_credit_checks_last_year     0.009363
credi

In [17]:
import pandas as pd

# Work on the existing frame (this is an alias of df_credit, not a fresh load)
df = df_credit
facebook_col = "facebook_profile"  # The feature to filter on
target_col = "target_default"  # The target variable

# Row masks for the two facebook_profile values.
# NOTE(review): the comparison is against 1 and 0, presumably the label-encoded
# True/False values -- confirm the encoder's class order.
has_profile = df[facebook_col] == 1
no_profile = df[facebook_col] == 0

# Target distribution within each group
true_set = df.loc[has_profile, target_col].value_counts()
false_set = df.loc[no_profile, target_col].value_counts()

# Report both distributions, separated by a rule
print(f"Target counts when {facebook_col} is True:")
print(true_set)
print("\n" + "-"*40 + "\n")
print(f"Target counts when {facebook_col} is False:")
print(false_set)


Target counts when facebook_profile is True:
target_default
0    10614
1     3321
Name: count, dtype: int64

----------------------------------------

Target counts when facebook_profile is False:
target_default
0    24466
1     3340
Name: count, dtype: int64
