In [None]:
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, confusion_matrix

In [None]:
# Load the main table from main.csv (path is relative to the working directory)
df_main = pd.read_csv("main.csv")

In [None]:
# Display df_main as loaded from main.csv
df_main

In [None]:
# Load the address table from address.csv (path is relative to the working directory)
df_address = pd.read_csv("address.csv")

In [None]:
# Display df_address as loaded from address.csv
df_address

In [None]:
# Load the payment records from paid_record.csv (path is relative to the working directory)
df_paid = pd.read_csv("paid_record.csv")

In [None]:
# Display df_paid as loaded from paid_record.csv
df_paid

In [None]:
# Number of distinct ID values in the payment records (a missing ID, if any, counts as one value)
df_paid['ID'].nunique(dropna=False)

# Data cleaning

In [None]:
# Look at df_main again before the cleaning steps
df_main

In [None]:
# Build New_ID by appending the Letter column to the string form of Main_ID
main_id_text = df_main["Main_ID"].astype(str)
df_main["New_ID"] = main_id_text + df_main["Letter"]

In [None]:
# Display df_main with the newly added New_ID column
df_main

In [None]:
# Count rows whose Main_ID already appeared in an earlier row
df_main.duplicated(subset='Main_ID').sum()

In [None]:
# Show the rows whose Main_ID repeats an earlier row
df_main[df_main.duplicated(subset='Main_ID')]

In [None]:
# Inspect every row recorded for Main_ID 7052783
df_main[df_main['Main_ID'].eq(7052783)]

In [None]:
# Number of distinct New_ID values (a missing New_ID, if any, counts as one value)
df_main['New_ID'].nunique(dropna=False)

In [None]:
# Count the rows of df_address that are exact duplicates of an earlier row
df_address.duplicated().sum()

In [None]:
# Remove the fully duplicated rows from df_address in place (the first occurrence is kept)
df_address.drop_duplicates(inplace=True)

In [None]:
# Display df_address after removing duplicated rows
df_address

In [None]:
# Inner-join the main and address tables into a single frame.
# NOTE(review): no `on=` is given, so pandas joins on every column name the two
# frames share — confirm that this is the intended key.
df = df_main.merge(df_address, how='inner')

In [None]:
# Display the merged frame
df

In [None]:
# Count rows of the merged frame whose Main_ID already appeared in an earlier row
df.duplicated(subset='Main_ID').sum()

In [None]:
# Drop duplicates: keep the first row for each Main_ID.
# The call has to be made on the DataFrame with `subset=`; calling drop_duplicates
# on the extracted df['Main_ID'] column does not remove any rows from df.
df.drop_duplicates(subset='Main_ID', inplace=True)

In [None]:
# Display df after the duplicate-removal step
df

In [None]:
# Summary statistics of the merged frame
df.describe()

In [None]:
# Frequency of each value in FLAG_MOBIL
df['FLAG_MOBIL'].value_counts()

In [None]:
# Number of distinct values in each column
df.nunique()

In [None]:
# Display the current state of df
df

In [None]:
# Count missing values per column
df.isna().sum()

In [None]:
import seaborn as sns

In [None]:
# Visualise where the missing values sit: the heatmap is drawn from the boolean
# null mask of df, with the colour bar hidden
sns.heatmap(df.isnull(), cbar=False)

In [None]:
# Impute the null values of occupation with the string 'None'.
# The result is assigned back to the column: fillna(inplace=True) on a selected
# column is chained assignment, which pandas warns about and which leaves df
# unchanged when Copy-on-Write is enabled.
df['OCCUPATION_TYPE'] = df['OCCUPATION_TYPE'].fillna('None')

In [None]:
# Frequency of each occupation category after imputation
df['OCCUPATION_TYPE'].value_counts()

In [None]:
# Re-count missing values per column
df.isna().sum()

In [None]:
# Pairwise correlation of the numeric columns only. df still holds text columns
# at this point, and corr() raises on them in pandas >= 2.0 unless they are
# excluded explicitly.
df.corr(numeric_only=True)

In [None]:
# Heatmap of the correlations between numeric columns. numeric_only=True keeps
# corr() from raising on the text columns still present in df (pandas >= 2.0).
sns.heatmap(df.corr(numeric_only=True))

In [None]:
# Display the current state of df
df

# Cleaning Process

In [None]:
# Rows missing any of these fields cannot be imputed, so they are removed
required_columns = ['CODE_GENDER', 'DAYS_EMPLOYED', 'DAYS_BIRTH_CLEAN']
df.dropna(subset=required_columns, inplace=True)

In [None]:
# Drop every remaining row that still has a null in any column
df.dropna(inplace=True)

In [None]:
# Display df after dropping rows with nulls
df

In [None]:
# Re-count missing values per column (all counts should be zero after the dropna above)
df.isna().sum()

In [None]:
# Remove the columns that the rest of the analysis does not use
unused_columns = [
    'CNT_FAM_MEMBERS',
    'FLAG_WORK_PHONE',
    'FLAG_MOBIL',
    'FLAG_PHONE',
    'FLAG_EMAIL',
    'NAME_HOUSING_TYPE',
]
df.drop(columns=unused_columns, inplace=True)

In [None]:
# Display df after removing the unused columns
df

### Fill NA

In [None]:
# Impute the null values of occupation with the string 'None'.
# The result is assigned back to the column: fillna(inplace=True) on a selected
# column is chained assignment, which pandas warns about and which leaves df
# unchanged when Copy-on-Write is enabled.
df['OCCUPATION_TYPE'] = df['OCCUPATION_TYPE'].fillna('None')

In [None]:
# Add a binary Children flag: 1 when CNT_CHILDREN is non-zero, otherwise 0.
# The comparison is vectorised instead of applying a Python lambda row by row.
df['Children'] = (df['CNT_CHILDREN'] != 0).astype(int)

## Feature Engineering

In [None]:
# Display df with the new Children column
df

In [None]:
# Fold the finer-grained income types into the single 'Working' category
income_type_groups = {
    'Commercial associate': 'Working',
    'State servant': 'Working',
}
df.replace({'NAME_INCOME_TYPE': income_type_groups}, inplace=True)

In [None]:
# Convert the DOB into an Age column (difference in calendar years).
# The reference year is kept at the original fixed value so that re-running the
# notebook later yields the same ages.
REFERENCE_YEAR = 2023
df['DAYS_BIRTH_CLEAN'] = pd.to_datetime(df['DAYS_BIRTH_CLEAN'])
df['Age'] = REFERENCE_YEAR - df['DAYS_BIRTH_CLEAN'].dt.year

# Drop DOB column now that Age has been derived from it
df.drop(columns=['DAYS_BIRTH_CLEAN'], inplace=True)

In [None]:
# Treat 'Civil marriage' as part of the 'Married' group
df['NAME_FAMILY_STATUS'] = df['NAME_FAMILY_STATUS'].replace({'Civil marriage': 'Married'})


In [None]:
# Display df after regrouping the categorical values
df

In [None]:
# One-hot encode the categorical columns
categorical_columns = [
    'CODE_GENDER',
    'FLAG_OWN_CAR',
    'FLAG_OWN_REALTY',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS',
    'OCCUPATION_TYPE',
]
df = pd.get_dummies(df, columns=categorical_columns)

In [None]:
# Column dtypes and non-null counts after one-hot encoding
df.info()

## Joining the datasets

In [None]:
# Column dtypes and non-null counts of the payment records
df_paid.info()

In [None]:
# Display the payment records
df_paid

In [None]:
# Force STATUS to strings so the later comparison with "X" and the membership
# test against "0".."5" behave consistently
df_paid['STATUS']= df_paid['STATUS'].astype(str)

In [None]:
# Keep only records whose STATUS is not "X" and whose MONTHS_BALANCE is greater
# than -13. The filtered frame is copied explicitly because its STATUS column is
# overwritten below; assigning into an un-copied slice raises
# SettingWithCopyWarning.
df_paid = df_paid[(df_paid["STATUS"] != "X") & (df_paid["MONTHS_BALANCE"] > -13)].copy()
def replacing(x):
    """Map a STATUS code to a binary flag.

    Returns 0 when ``x`` is one of the strings "0" through "5" and 1 for
    any other value.
    """
    return 0 if x in ("0", "1", "2", "3", "4", "5") else 1
# Collapse each STATUS code to a 0/1 flag with `replacing`, then take the most
# frequent flag per ID and turn the result into a two-column frame.
# NOTE(review): pd.Series.mode can return more than one value when the counts
# tie — confirm how such IDs should be handled downstream.
df_paid["STATUS"] = df_paid["STATUS"].map(replacing)
status_by_id = df_paid[["ID", "STATUS"]].groupby(["ID"])["STATUS"]
paid_df3 = (
    status_by_id.agg(pd.Series.mode)
    .to_frame()
    .reset_index()
    .rename(columns={0: "STATUS"})
)

In [None]:
# Column dtypes and non-null counts of the per-ID status table
paid_df3.info()

In [None]:
# Coerce STATUS to a numeric dtype and discard the rows left with missing values
numeric_status = pd.to_numeric(paid_df3['STATUS'], errors='coerce')
paid_df3 = paid_df3.assign(STATUS=numeric_status).dropna()

In [None]:
# Display the per-ID status table after the numeric conversion
paid_df3

In [None]:
# Attach the per-ID status to the applicant features (paid_df3.ID matches df.Main_ID)
df_final = pd.merge(paid_df3, df, left_on="ID", right_on="Main_ID")

In [None]:
# Split the joined frame by label: STATUS 1 goes to `good`, STATUS 0 to `bad`
status_flag = df_final["STATUS"]
good = df_final[status_flag == 1]
bad = df_final[status_flag == 0]

In [None]:
# Summary statistics for the rows with STATUS == 1
good.describe()

In [None]:
# Summary statistics for the rows with STATUS == 0
bad.describe()

# Model Training

In [None]:
import lazypredict
from sklearn.model_selection import train_test_split
from lazypredict.Supervised import LazyClassifier

In [None]:
# Build the feature matrix and target, split them, and benchmark a range of
# classifiers with LazyClassifier.
# "ID" is dropped together with the other identifier columns: the merge that
# produced df_final keeps both join keys ("ID" and "Main_ID"), and a record
# identifier carries no signal a model should learn from.
identifier_columns = ["ID", "Letter", "Main_ID", "New_ID", "ADDRESS"]
X = df_final.drop(columns=["STATUS"] + identifier_columns)
Y = df_final["STATUS"]
# Stratify on the label so both splits keep the same class balance
X_train, X_test, y_train, y_test = train_test_split(X, Y, stratify=Y, test_size=0.2, random_state=64)
clf = LazyClassifier(verbose=0, ignore_warnings=False, custom_metric=None)
models, pred = clf.fit(X_train, X_test, y_train, y_test)
models

In [None]:
# Fit a random forest on the training split.
# random_state is pinned (same seed as the train/test split) so that the fitted
# model and the metrics reported below are reproducible across runs.
rf = RandomForestClassifier(random_state=64)
rf.fit(X_train, y_train)

In [None]:
# Predict the label for every row of the held-out test features
y_pred = rf.predict(X_test)

In [None]:
# Share of test rows whose predicted label matches the true label
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
# Create the confusion matrix for the random forest's test-set predictions.
# y_pred from the prediction cell above is reused instead of calling predict a
# second time, and the class order is pinned to rf.classes_ so the matrix rows
# and columns line up with the displayed tick labels.
cm = confusion_matrix(y_test, y_pred, labels=rf.classes_)

ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=rf.classes_).plot();