In [None]:
# import packages
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, cross_val_score
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report, roc_curve, roc_auc_score
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# set the aesthetic style of the plots
# NOTE(review): set_style() is called without a style name, so no specific
# seaborn style (e.g. 'darkgrid') is requested here — confirm this is intended.
sns.set_style()

# filter warning messages
# No category is given, so this silences every warning for the rest of the
# session, including library deprecation warnings.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the loan dataset (CSV one directory above the notebook) into a DataFrame.
loan_csv_path = '../Loan.csv'
df_credit = pd.read_csv(loan_csv_path)

In [None]:
# Report the dimensions of the loaded frame.
n_rows, n_cols = df_credit.shape
print('Number of rows: ', n_rows)
print('Number of columns: ', n_cols)

In [None]:
# Display the column labels of the loaded frame.
df_credit.columns

In [None]:
# Tally how many columns share each dtype and show the tally.
dtype_counts = df_credit.dtypes.value_counts()
print("Number of columns by data type:")
print(dtype_counts)

# Column labels grouped by dtype family.
float_features = df_credit.select_dtypes(include=['float']).columns
object_features = df_credit.select_dtypes(include=['object']).columns
int_features = df_credit.select_dtypes(include=['int']).columns
bool_features = df_credit.select_dtypes(include=['bool']).columns

# List the member columns of every group, in the same order as defined above.
for group_label, group_columns in (
    ("Float features:", float_features),
    ("Object features:", object_features),
    ("Integer features:", int_features),
    ("Bool features:", bool_features),
):
    print(group_label, list(group_columns))

In [None]:
# Remove the LoanApproved column from the working frame.
# NOTE(review): presumably dropped because RiskScore is the column analysed
# later in this notebook — confirm.
# errors="ignore" keeps the cell re-runnable: without it, executing the cell a
# second time raises KeyError because the column has already been removed.
df_credit = df_credit.drop(columns=["LoanApproved"], errors="ignore")

In [None]:
# Print the summary produced by DataFrame.info() for the current frame.
df_credit.info()


In [None]:
# Display descriptive statistics for the current frame.
df_credit.describe()

In [None]:
# Count distinct values per column and show them in ascending order.
distinct_counts = df_credit.nunique()
distinct_counts.sort_values()


In [None]:
# Keep only rows whose TotalAssets value is at most 1,200,000.
# Rows where the comparison is False (including missing TotalAssets) are removed.
within_asset_cap = df_credit['TotalAssets'].le(1200000)
df_credit = df_credit[within_asset_cap]

## Correlation

Correlated feature pairs:

- age – experience
- annual income – monthly income
- net worth – total assets
- interest rate – base interest rate
- monthly loan payment – loan amount
- base interest rate – credit score


Features to drop:

- base interest rate
- annual income
- monthly loan payment
- net worth
- experience



In [None]:
# Columns removed from the working frame before further processing.
# NOTE(review): presumably dropped as redundant / correlated features — confirm.
columns_to_drop = [
    'DebtToIncomeRatio',
    'TotalDebtToIncomeRatio',
    'InterestRate',
    'BaseInterestRate',
    'MonthlyLoanPayment',
    'NetWorth',
    'Experience',
    'AnnualIncome',
]

# errors='ignore' keeps the cell re-runnable: without it, executing the cell a
# second time raises KeyError because the columns have already been removed.
df_credit = df_credit.drop(columns=columns_to_drop, errors='ignore')

In [None]:
# Display descriptive statistics for the frame as it stands at this point.
df_credit.describe()

In [None]:
import json
import os

# Persist the current column names as a JSON list in saved/before_feature.json.
# Create the output directory first: open(..., 'w') raises FileNotFoundError
# when 'saved/' does not exist (e.g. on a fresh checkout).
os.makedirs('saved', exist_ok=True)

with open('saved/before_feature.json', 'w') as f:
    json.dump(df_credit.columns.tolist(), f)

In [None]:
# Split the column labels into object-typed (categorical) and everything else
# (treated as numerical below).
df_credit_cat = df_credit.select_dtypes(include='object').columns
df_credit_num = df_credit.select_dtypes(exclude='object').columns
print(df_credit_cat)

# Numerical columns: replace NaN with the per-column median.
nimputer = SimpleImputer(missing_values=np.nan, strategy='median')
nimputer.fit(df_credit[df_credit_num])
df_credit.loc[:, df_credit_num] = nimputer.transform(df_credit[df_credit_num])

# Categorical columns: replace NaN with the per-column most frequent value.
cimputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
cimputer.fit(df_credit[df_credit_cat])
df_credit.loc[:, df_credit_cat] = cimputer.transform(df_credit[df_credit_cat])

import joblib

# Store both fitted imputers under saved/ so they can be loaded again later.
joblib.dump(nimputer, 'saved/nimputer.joblib')
joblib.dump(cimputer, 'saved/cimputer.joblib')


In [None]:
from utils import plot_credit_correlation

# Two groups of categorical columns handed to the plotting helper.
# NOTE(review): presumably ordinal_cols are ordinal-encoded and target_cols are
# target-encoded inside the helper — confirm against utils.plot_credit_correlation.
ordinal_cols = ['EmploymentStatus', 'EducationLevel']
target_cols = ['MaritalStatus', 'LoanPurpose', 'HomeOwnershipStatus']

# Call the helper on the imputed frame with 'RiskScore' as the last argument.
plot_credit_correlation(
    df_credit,
    ordinal_cols,
    target_cols,
    'RiskScore',
)


In [None]:

# Show the column set being exported, then write the frame to CSV without the index.
print(df_credit.columns)
preprocessed_csv_path = 'saved/preprocessed_bank_data.csv'
df_credit.to_csv(preprocessed_csv_path, index=False)

In [None]:
# Count distinct values in each object-typed column, in ascending order.
df_credit.select_dtypes(include='object').nunique().sort_values()


In [None]:
# Preview the first five rows (head() returns five rows by default).
df_credit.head()

In [None]:
# Map every object/category column to the array of its distinct values and print the mapping.
categorical_columns = df_credit.select_dtypes(include=['object', 'category']).columns
unique_values_by_column = {name: df_credit[name].unique() for name in categorical_columns}
print(unique_values_by_column)