In [None]:
# import packages
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, cross_val_score
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report, roc_curve, roc_auc_score
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Seaborn style hook for the plots.
# NOTE(review): no style name is passed here — confirm whether a named style
# (e.g. 'darkgrid' or 'whitegrid') was intended.
sns.set_style(style=None)

# Suppress every warning category for the rest of the session.
import warnings
warnings.simplefilter('ignore')

In [None]:
# Read the raw training data; the relative path is resolved against the current working directory.
raw_csv_path = '../acquisition_train.csv'
df_credit = pd.read_csv(raw_csv_path)

In [None]:
# Report the dimensions of the loaded frame.
n_rows, n_cols = df_credit.shape
print('Number of rows: ', n_rows)
print('Number of columns: ', n_cols)

In [None]:
# Tally how many columns exist for each dtype and show the tally.
dtype_counts = df_credit.dtypes.value_counts()
print("Number of columns by data type:", dtype_counts, sep="\n")

# Collect the column names matched by each dtype selector.
float_features, object_features, int_features, bool_features = (
    df_credit.select_dtypes(include=[selector]).columns
    for selector in ('float', 'object', 'int', 'bool')
)

# List the column names found for each selector.
for heading, feature_index in (
    ("Float features:", float_features),
    ("Object features:", object_features),
    ("Integer features:", int_features),
    ("Bool features:", bool_features),
):
    print(heading, list(feature_index))

In [None]:
# Columns excluded from the rest of the notebook, kept in the same three groups
# in which they were originally removed.
unused_columns = (
    ['target_fraud']
    + ['channel', 'external_data_provider_credit_checks_last_2_year']
    + ['email', 'reason', 'zip', 'job_name', 'external_data_provider_first_name', 'lat_lon',
       'shipping_zip_code', 'user_agent', 'profile_tags', 'marketing_channel',
       'profile_phone_number', 'application_time_applied', 'ids']
)

# Keep only rows where target_default is present, then remove the unused columns.
df_credit = (
    df_credit
    .dropna(subset=['target_default'])
    .drop(columns=unused_columns)
)

In [None]:
# df_credit.drop('facebook_profile', axis=1, inplace=True)
# df_credit.drop('external_data_provider_credit_checks_last_year', axis=1, inplace=True)



In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the current frame;
# with default arguments describe() reports on the numeric columns.
df_credit.describe()

In [None]:
# How many rows hold the value -999 in "external_data_provider_email_seen_before"?
email_seen = df_credit['external_data_provider_email_seen_before']
email_seen[email_seen == -999].value_counts()

In [None]:
# Treat non-finite reported incomes as missing. Both +inf and -inf are replaced:
# handling only +inf would let a -inf value be carried forward as a real number.
df_credit['reported_income'] = df_credit['reported_income'].replace([np.inf, -np.inf], np.nan)

# Convert the -999 values in "external_data_provider_email_seen_before" to NaN.
email_seen_col = 'external_data_provider_email_seen_before'
df_credit.loc[df_credit[email_seen_col] == -999, email_seen_col] = np.nan

In [None]:
# Upper bounds applied to the two income columns.
INCOME_CAP = 700000
REPORTED_INCOME_CAP = 210000

# NOTE(review): a comparison against NaN evaluates to False, so rows whose income or
# reported_income is missing are removed by this filter too — confirm that is intended.
within_caps = (df_credit['income'] <= INCOME_CAP) & (df_credit['reported_income'] <= REPORTED_INCOME_CAP)

df_credit = df_credit.loc[within_caps].drop(
    columns=[
        "n_bankruptcies",
        "n_defaulted_loans",
        "score_2",  # highly correlated with score 1
    ]
)


In [None]:
import json
from pathlib import Path

# Make sure the output directory exists; open(..., 'w') does not create missing
# parent directories and would raise FileNotFoundError on a fresh checkout.
Path('saved').mkdir(parents=True, exist_ok=True)

# Persist the current column names, in order, as a JSON list.
with open('saved/before_feature.json', 'w') as f:
    json.dump(df_credit.columns.tolist(), f)

# To read the list back later:
# with open('saved/before_feature.json', 'r') as f:
#     loaded_list = json.load(f)

In [None]:
# Split the columns by dtype so that each group gets its own imputation strategy.
df_credit_num = df_credit.select_dtypes(exclude='object').columns
df_credit_cat = df_credit.select_dtypes(include='object').columns

# Missing values in "last_amount_borrowed", "last_borrowed_in_months" and "n_issues"
# are filled with 0 before the median imputer runs.
# The result is assigned back explicitly: calling fillna(inplace=True) on a column
# selection is chained assignment, which does not update the frame when pandas
# Copy-on-Write is active, and these columns would then silently receive the median.
zero_fill_cols = ['last_amount_borrowed', 'last_borrowed_in_months', 'n_issues']
df_credit[zero_fill_cols] = df_credit[zero_fill_cols].fillna(0)

# Remaining gaps in non-object columns are filled with the column median.
nimputer = SimpleImputer(missing_values=np.nan, strategy='median')
nimputer = nimputer.fit(df_credit.loc[:, df_credit_num])
df_credit.loc[:, df_credit_num] = nimputer.transform(df_credit.loc[:, df_credit_num])

# Gaps in object columns are filled with the most frequent value of the column.
cimputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
cimputer = cimputer.fit(df_credit.loc[:, df_credit_cat])
df_credit.loc[:, df_credit_cat] = cimputer.transform(df_credit.loc[:, df_credit_cat])

import pickle

# Serialize both fitted imputers, numeric first and categorical second.
for pkl_path, fitted_imputer in (
    ('saved/nimputer.pkl', nimputer),
    ('saved/cimputer.pkl', cimputer),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(fitted_imputer, f)

# Show the final column set, then write the imputed frame without its index.
print(df_credit.columns)
df_credit.to_csv('saved/preprocessed_bank_data.csv', index=False)


In [None]:
# Number of distinct values per column, sorted ascending (nunique() skips NaN by default).
df_credit.nunique().sort_values()

In [None]:
# Preview the first five rows of the frame.
df_credit.head(5)

## Extra: exploratory feature-relevance checks

In [None]:
# Separate the label column from the predictors and preview the predictors.
y = df_credit['target_default']
X = df_credit.drop('target_default', axis=1)
X.head()

In [None]:
from sklearn.feature_selection import f_classif

# Label-encode every categorical or object column of df_credit in place (this includes
# target_default when it has object dtype), keeping each fitted encoder by column name.
# The dtype is checked with isinstance(..., pd.CategoricalDtype) because
# pd.api.types.is_categorical_dtype is deprecated.
label_encoders = {}
for column in df_credit.columns:
    column_dtype = df_credit[column].dtype
    if isinstance(column_dtype, pd.CategoricalDtype) or pd.api.types.is_object_dtype(column_dtype):
        label_encoders[column] = LabelEncoder()
        df_credit[column] = label_encoders[column].fit_transform(df_credit[column])

X = df_credit.drop(columns=["target_default"])
y = df_credit['target_default']

# ANOVA F-test of every feature against the target: one F-score and p-value per column of X.
f_scores, p_values = f_classif(X, y)

# Tabulate the scores next to the feature names.
yo = pd.DataFrame({'Feature': X.columns, 'F-Score': f_scores, 'P-Value': p_values})

# Highest F-score first.
feature_importance = yo.sort_values(by='F-Score', ascending=False)

print(feature_importance)

In [None]:
from imblearn.over_sampling import SMOTE

# SMOTE generates synthetic samples at random; a fixed seed makes the resampled data,
# and therefore the F-scores below, identical on every run.
SMOTE_SEED = 42

X, y = SMOTE(random_state=SMOTE_SEED).fit_resample(
    df_credit.drop(columns=['target_default']),
    df_credit['target_default'],
)

# Row count after resampling.
print(len(X))

# ANOVA F-test on the resampled data: one F-score and p-value per column of X.
f_scores, p_values = f_classif(X, y)

# Tabulate the scores next to the feature names.
yo = pd.DataFrame({'Feature': X.columns, 'F-Score': f_scores, 'P-Value': p_values})

# Highest F-score first.
feature_importance = yo.sort_values(by='F-Score', ascending=False)

print(feature_importance)


In [None]:
# import seaborn as sns
# import matplotlib.pyplot as plt

# # Compute correlation matrix
# correlation_matrix = X.corr()

# # Set figure size
# plt.figure(figsize=(12, 8))

# # Create heatmap with bigger annotations
# sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5, annot_kws={"size": 10})

# # Show plot
# plt.title("Feature Correlation Heatmap", fontsize=14)
# plt.xticks(rotation=45, ha='right', fontsize=10)
# plt.yticks(fontsize=10)
# plt.show()



In [None]:
# Absolute pairwise correlation of every column with target_default, strongest first
# (the target's own entry is part of the listing).
target_corr = df_credit.corr()['target_default']
correlations = target_corr.abs().sort_values(ascending=False)
print("Feature Correlations with Target:\n", correlations)


In [None]:
# Work on the current frame under a shorter alias (no copy is made).
df = df_credit
facebook_col = "facebook_profile"  # feature used to split the rows
target_col = "target_default"      # label whose values are counted

# Row masks for the two values of the feature.
has_profile = df[facebook_col] == 1
no_profile = df[facebook_col] == 0

# Label distribution within each group.
true_set = df.loc[has_profile, target_col].value_counts()
false_set = df.loc[no_profile, target_col].value_counts()

# Print both distributions, separated by a dashed rule.
separator = "\n" + "-"*40 + "\n"
print(f"Target counts when {facebook_col} is True:")
print(true_set)
print(separator)
print(f"Target counts when {facebook_col} is False:")
print(false_set)
