# 3. Machine Learning Models

### Libraries

In [2]:
## pandas and Numpy
import pandas as pd
import numpy as np
## Visualizations
import matplotlib.pyplot as plt
import seaborn as sns
## Machine Learning
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
## Others
import warnings
# Notebook-wide settings: mute every warning category and let pandas render
# all columns and all rows of a frame without truncation.
warnings.simplefilter("ignore")
pd.options.display.max_columns = None
pd.options.display.max_rows = None

### Importing the data

In [8]:
# Load the cleaned dataset (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='clean_data_for_ml.csv')

In [9]:
# Peek at the first three rows to sanity-check the load.
df.iloc[:3]

Unnamed: 0,Applicant_ID,Applicant_Gender,Owned_Car,Owned_Realty,Total_Children,Total_Income,Total_Income_Catg,Income_Type,Education_Type,Family_Status,Housing_Type,Owned_Mobile_Phone,Owned_Work_Phone,Owned_Phone,Owned_Email,Job_Title,Total_Family_Members,Applicant_Age,Years_of_Working,Total_Bad_Debt,Total_Bad_Debt_Catg,Total_Good_Debt,Debt_Score,Debit_Score_Catg,Status
0,5008806,M,1,1,0,112500,Poor,Working,Secondary / secondary special,Married,House / apartment,1,0,0,0,Security staff,2,59,4,0,No bad dept,30,30,Very Good,1
1,5008808,F,0,1,0,270000,Upper Middle Class,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,1,0,1,1,Sales staff,1,53,9,0,No bad dept,5,5,Poor,1
2,5008809,F,0,1,0,270000,Upper Middle Class,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,1,0,1,1,Sales staff,1,53,9,0,No bad dept,5,5,Poor,1


### Info and data description

In [10]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25128 entries, 0 to 25127
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Applicant_ID          25128 non-null  int64 
 1   Applicant_Gender      25128 non-null  object
 2   Owned_Car             25128 non-null  int64 
 3   Owned_Realty          25128 non-null  int64 
 4   Total_Children        25128 non-null  int64 
 5   Total_Income          25128 non-null  int64 
 6   Total_Income_Catg     25128 non-null  object
 7   Income_Type           25128 non-null  object
 8   Education_Type        25128 non-null  object
 9   Family_Status         25128 non-null  object
 10  Housing_Type          25128 non-null  object
 11  Owned_Mobile_Phone    25128 non-null  int64 
 12  Owned_Work_Phone      25128 non-null  int64 
 13  Owned_Phone           25128 non-null  int64 
 14  Owned_Email           25128 non-null  int64 
 15  Job_Title             25128 non-null

In [11]:
# Summary statistics, transposed so each column of df becomes one row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Applicant_ID,25128.0,5078835.0,41943.777736,5008806.0,5042225.75,5079004.0,5115603.25,5150487.0
Owned_Car,25128.0,0.4183779,0.493303,0.0,0.0,0.0,1.0,1.0
Owned_Realty,25128.0,0.6549268,0.475402,0.0,0.0,1.0,1.0,1.0
Total_Children,25128.0,0.5094715,0.762937,0.0,0.0,0.0,1.0,5.0
Total_Income,25128.0,194836.5,104521.1233,27000.0,135000.0,180000.0,225000.0,1575000.0
Owned_Mobile_Phone,25128.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
Owned_Work_Phone,25128.0,0.2737584,0.445895,0.0,0.0,0.0,1.0,1.0
Owned_Phone,25128.0,0.2927412,0.45503,0.0,0.0,0.0,1.0,1.0
Owned_Email,25128.0,0.1006845,0.300916,0.0,0.0,0.0,0.0,1.0
Total_Family_Members,25128.0,2.291309,0.928871,1.0,2.0,2.0,3.0,7.0


### Null value analysis

In [12]:
# Number of missing values in each column.
df.isna().sum()

Applicant_ID            0
Applicant_Gender        0
Owned_Car               0
Owned_Realty            0
Total_Children          0
Total_Income            0
Total_Income_Catg       0
Income_Type             0
Education_Type          0
Family_Status           0
Housing_Type            0
Owned_Mobile_Phone      0
Owned_Work_Phone        0
Owned_Phone             0
Owned_Email             0
Job_Title               0
Total_Family_Members    0
Applicant_Age           0
Years_of_Working        0
Total_Bad_Debt          0
Total_Bad_Debt_Catg     0
Total_Good_Debt         0
Debt_Score              0
Debit_Score_Catg        0
Status                  0
dtype: int64

### Distributions and outliers

In [13]:
def distributions(data=None, exclude=('Applicant_ID',)):
    """Plot a histogram (with KDE) and a boxplot for each non-object column.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame whose columns are plotted. When omitted, the notebook-level
        ``df`` is used, so the zero-argument call ``distributions()`` behaves
        as before.
    exclude : str or iterable of str, optional
        Column name(s) to skip. Defaults to the ``Applicant_ID`` column.

    Returns
    -------
    None
        Each figure is rendered with ``plt.show()``; nothing is returned.
    """
    if data is None:
        data = df  # fall back to the frame loaded earlier in the notebook
    if isinstance(exclude, str):
        exclude = (exclude,)  # a bare string would otherwise be split into characters
    skipped = set(exclude)

    for column in data.columns:
        # Only non-object columns are plotted; excluded names are skipped.
        if data[column].dtype == 'object' or column in skipped:
            continue

        print('\n')
        print(column)

        # Same two figures per column: blue histogram with KDE, then yellow boxplot.
        for plotter, plot_kwargs in (
            (sns.histplot, {'kde': True, 'color': 'b'}),
            (sns.boxplot, {'color': 'y'}),
        ):
            fig, ax = plt.subplots(figsize=(12, 5))
            plotter(data=data, x=column, ax=ax, **plot_kwargs)
            # grid(True) instead of a bare grid(): the no-argument form toggles,
            # which would switch the grid off under a grid-enabled theme.
            ax.grid(True)
            ax.set_title(column)
            plt.show()
            # Release the figure so a long column list does not accumulate open figures.
            plt.close(fig)

In [258]:
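# Disabled by default: renders two figures (histogram + boxplot) per numeric column.
# Uncomment the next line to draw them.
# distributions()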

### We can see that the data is imbalanced for the target (Status)

### How imbalanced is the target variable?

In [14]:
# Count applicants per Status class. value_counts() names its result after the
# column on pandas < 2.0 but 'count' on pandas >= 2.0, so the name is set
# explicitly; otherwise the 'Status' lookup below raises KeyError on 2.x.
# rename_axis(None) drops the index name that 2.x adds, keeping the table
# layout the same on both versions.
target_imbalance = (
    df['Status']
    .value_counts()
    .rename('Status')
    .rename_axis(None)
    .to_frame()
)
# Convert to percent first and round last, so the stored values are clean
# two-decimal numbers (rounding before multiplying can reintroduce float noise).
target_imbalance['Percentage %'] = (target_imbalance['Status'] / len(df) * 100).round(2)

In [16]:
# Expose the class label (currently the index) as a regular column.
target_imbalance = target_imbalance.assign(Class=target_imbalance.index)

In [17]:
# Display the per-class counts, percentages and class labels.
target_imbalance

Unnamed: 0,Status,Percentage %,Class
1,25007,99.52,1
0,121,0.48,0


### Inference: Basically, the credit card (CC) is given to almost everyone who applies for it (99.52% of applicants are in class 1)!

### Machine learning 

In [18]:
# Work on an independent deep copy so changes to ml do not affect df.
ml = df.copy(deep=True)