## Home Credit Risk

In this project,Home Credit is presently encountering the risk of loan defaults due to extending loans to individuals in undeserved demographics, who possess either inadequate or non-existent credit histories.We will use various analytics techniques to identify customers with low or no risk, enabling them the provision of loan. The binary target variable is the "target" column, indicating whether a customer is experiencing payment difficulties with their loan or not. Predictor includes columns like AMT_INCOME_TOTAL, AMT_CREDIT, FLAG_OWN_CAR, FLAG_OWN_REALTY.

### Importing Libraries

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
import lightgbm as lgb
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score


### Import dataset

In [3]:
#importing training and testing dataset
train=pd.read_csv("/content/sample_data/application_train.csv")
test=pd.read_csv("/content/sample_data/application_test.csv")

In [4]:
train.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
train.describe()

Unnamed: 0,SK_ID_CURR,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
count,307511.0,307511.0,307511.0,307511.0,307511.0,307499.0,307233.0,307511.0,307511.0,307511.0,...,307511.0,307511.0,307511.0,307511.0,265992.0,265992.0,265992.0,265992.0,265992.0,265992.0
mean,278180.518577,0.080729,0.417052,168797.9,599026.0,27108.573909,538396.2,0.020868,-16036.995067,63815.045904,...,0.00813,0.000595,0.000507,0.000335,0.006402,0.007,0.034362,0.267395,0.265474,1.899974
std,102790.175348,0.272419,0.722121,237123.1,402490.8,14493.737315,369446.5,0.013831,4363.988632,141275.766519,...,0.089798,0.024387,0.022518,0.018299,0.083849,0.110757,0.204685,0.916002,0.794056,1.869295
min,100002.0,0.0,0.0,25650.0,45000.0,1615.5,40500.0,0.00029,-25229.0,-17912.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,189145.5,0.0,0.0,112500.0,270000.0,16524.0,238500.0,0.010006,-19682.0,-2760.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,278202.0,0.0,0.0,147150.0,513531.0,24903.0,450000.0,0.01885,-15750.0,-1213.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,367142.5,0.0,1.0,202500.0,808650.0,34596.0,679500.0,0.028663,-12413.0,-289.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
max,456255.0,1.0,19.0,117000000.0,4050000.0,258025.5,4050000.0,0.072508,-7489.0,365243.0,...,1.0,1.0,1.0,1.0,4.0,9.0,8.0,27.0,261.0,25.0


In [6]:
#Data Cleaning - Replacing median value for 365243 days
a=train[train['DAYS_EMPLOYED']!=365243]
b=a['DAYS_EMPLOYED'].median()
b

-1648.0

In [7]:
train['DAYS_EMPLOYED'] = train['DAYS_EMPLOYED'].replace(365243, -1648.0)

In [8]:
train.shape, test.shape

((307511, 122), (48744, 121))

In [9]:
train.dtypes

SK_ID_CURR                      int64
TARGET                          int64
NAME_CONTRACT_TYPE             object
CODE_GENDER                    object
FLAG_OWN_CAR                   object
                               ...   
AMT_REQ_CREDIT_BUREAU_DAY     float64
AMT_REQ_CREDIT_BUREAU_WEEK    float64
AMT_REQ_CREDIT_BUREAU_MON     float64
AMT_REQ_CREDIT_BUREAU_QRT     float64
AMT_REQ_CREDIT_BUREAU_YEAR    float64
Length: 122, dtype: object

### Check for Duplicates

In [10]:
train[train.duplicated()].shape, test[test.duplicated()].shape

((0, 122), (0, 121))

In [11]:
train['TARGET'].value_counts()

TARGET
0    282686
1     24825
Name: count, dtype: int64

There are no duplicates in training and testing data

### Check for Missing Values

In [12]:
#Train
missing_value=train.isnull().sum()

missing_value[missing_value>0]
df_missing=pd.DataFrame({'Column_Name':missing_value.index,'Value':missing_value.values})

df_missing['Percentage']=(df_missing['Value']/len(train))*100


In [13]:
#Test
missing_value_test=test.isnull().sum()

missing_value_test[missing_value_test>0]
df_missing_test=pd.DataFrame({'Column_Name':missing_value_test.index,'Value':missing_value_test.values})

df_missing_test['Percentage']=(df_missing_test['Value']/len(train))*100


### Handling Missing Values


We will employ the following techniques to handle missing values:

For categorical columns, we will substitute missing values with the mode.
For numerical columns, we will replace missing values with the median.

In [14]:
#Create a copy of train and test dataset
df=train.copy()
df_test=test.copy()

In [15]:
#Train
for col in df.columns:
    if df[col].dtype=='O':
        mode_val=df[col].mode()[0]
        df[col].fillna(mode_val,inplace=True)
    else:
        median_val=df[col].median()
        df[col].fillna(median_val,inplace=True)

#Verify for missing values
(df.isnull().sum() >0).any()

False

In [16]:
#Test
for col in df_test.columns:
    if df_test[col].dtype=='O':
        mode_val=df_test[col].mode()[0]
        df_test[col].fillna(mode_val,inplace=True)
    else:
        median_val=df_test[col].median()
        df_test[col].fillna(median_val,inplace=True)

#Verify for missing values
(df_test.isnull().sum() >0).any()

False

### Feature Engineering

In order to enhance the model's performance, it's essential to convert raw datasets into features, thereby gaining a deeper understanding of the underlying problem.

Here, we will be using one-Hot encoding to convert categorical variable into binary vectors where each cateforical variable is a different feature.

In [17]:
#To verify if all columns with object data type are categorical or include text data.
#We should refrain from using one-hot encoding for text type columns.

#Train
for col in df.columns:
    if df[col].dtype=='O':
        unique_value=df[col].unique()
        number_value=len(unique_value)
        print(f"Column '{col}': (Unique Values: {number_value})")


# We observe that all the object datatype are categorical.

Column 'NAME_CONTRACT_TYPE': (Unique Values: 2)
Column 'CODE_GENDER': (Unique Values: 3)
Column 'FLAG_OWN_CAR': (Unique Values: 2)
Column 'FLAG_OWN_REALTY': (Unique Values: 2)
Column 'NAME_TYPE_SUITE': (Unique Values: 7)
Column 'NAME_INCOME_TYPE': (Unique Values: 8)
Column 'NAME_EDUCATION_TYPE': (Unique Values: 5)
Column 'NAME_FAMILY_STATUS': (Unique Values: 6)
Column 'NAME_HOUSING_TYPE': (Unique Values: 6)
Column 'OCCUPATION_TYPE': (Unique Values: 18)
Column 'WEEKDAY_APPR_PROCESS_START': (Unique Values: 7)
Column 'ORGANIZATION_TYPE': (Unique Values: 58)
Column 'FONDKAPREMONT_MODE': (Unique Values: 4)
Column 'HOUSETYPE_MODE': (Unique Values: 3)
Column 'WALLSMATERIAL_MODE': (Unique Values: 7)
Column 'EMERGENCYSTATE_MODE': (Unique Values: 2)


In [18]:
#Test
for col in df_test.columns:
    if df_test[col].dtype=='O':
        unique_value=df_test[col].unique()
        number_value=len(unique_value)
        print(f"Column '{col}': (Unique Values: {number_value})")


# We observe that all the object datatype are categorical.

Column 'NAME_CONTRACT_TYPE': (Unique Values: 2)
Column 'CODE_GENDER': (Unique Values: 2)
Column 'FLAG_OWN_CAR': (Unique Values: 2)
Column 'FLAG_OWN_REALTY': (Unique Values: 2)
Column 'NAME_TYPE_SUITE': (Unique Values: 7)
Column 'NAME_INCOME_TYPE': (Unique Values: 7)
Column 'NAME_EDUCATION_TYPE': (Unique Values: 5)
Column 'NAME_FAMILY_STATUS': (Unique Values: 5)
Column 'NAME_HOUSING_TYPE': (Unique Values: 6)
Column 'OCCUPATION_TYPE': (Unique Values: 18)
Column 'WEEKDAY_APPR_PROCESS_START': (Unique Values: 7)
Column 'ORGANIZATION_TYPE': (Unique Values: 58)
Column 'FONDKAPREMONT_MODE': (Unique Values: 4)
Column 'HOUSETYPE_MODE': (Unique Values: 3)
Column 'WALLSMATERIAL_MODE': (Unique Values: 7)
Column 'EMERGENCYSTATE_MODE': (Unique Values: 2)


In [19]:
#Perform One hot Encoding of Categorical Columns
categorical_columns = df.select_dtypes(include=['object']).columns
Numeric_columns=df.select_dtypes(include=['number']).columns
#Train
encoded_dfs = []

for col in df.columns:
    if df[col].dtype=='O':
        encoded_df= pd.get_dummies(df[col],prefix=col,dtype=int)
        encoded_dfs.append(encoded_df)
df_encoded=pd.concat([df]+encoded_dfs,axis=1)
df_encoded.drop(categorical_columns, axis=1, inplace=True)
df_encoded.drop(Numeric_columns,axis=1,inplace=True)
df_encoded

Unnamed: 0,NAME_CONTRACT_TYPE_Cash loans,NAME_CONTRACT_TYPE_Revolving loans,CODE_GENDER_F,CODE_GENDER_M,CODE_GENDER_XNA,FLAG_OWN_CAR_N,FLAG_OWN_CAR_Y,FLAG_OWN_REALTY_N,FLAG_OWN_REALTY_Y,NAME_TYPE_SUITE_Children,...,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes
0,1,0,0,1,0,1,0,0,1,0,...,0,0,0,0,0,0,1,0,1,0
1,1,0,1,0,0,1,0,1,0,0,...,0,1,0,0,0,0,0,0,1,0
2,0,1,0,1,0,0,1,0,1,0,...,0,0,0,0,0,1,0,0,1,0
3,1,0,1,0,0,1,0,0,1,0,...,0,0,0,0,0,1,0,0,1,0
4,1,0,0,1,0,1,0,0,1,0,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,1,0,0,1,0,1,0,1,0,0,...,0,0,0,0,0,0,1,0,1,0
307507,1,0,1,0,0,1,0,0,1,0,...,0,0,0,0,0,0,1,0,1,0
307508,1,0,1,0,0,1,0,0,1,0,...,0,0,0,0,0,1,0,0,1,0
307509,1,0,1,0,0,1,0,0,1,0,...,0,0,0,0,0,0,1,0,1,0


In [20]:
#Perform One hot Encoding of Categorical Columns
categorical_columns_test = df_test.select_dtypes(include=['object']).columns
Numeric_columns_test=df_test.select_dtypes(include=['number']).columns
#Test
encoded_dfs_test = []

for col in df_test.columns:
    if df_test[col].dtype=='O':
        encoded_df_test= pd.get_dummies(df_test[col],prefix=col,dtype=int)
        encoded_dfs_test.append(encoded_df_test)
df_encoded_test=pd.concat([df_test]+encoded_dfs_test,axis=1)
df_encoded_test.drop(categorical_columns_test, axis=1, inplace=True)
df_encoded_test.drop(Numeric_columns_test,axis=1,inplace=True)
df_encoded_test

Unnamed: 0,NAME_CONTRACT_TYPE_Cash loans,NAME_CONTRACT_TYPE_Revolving loans,CODE_GENDER_F,CODE_GENDER_M,FLAG_OWN_CAR_N,FLAG_OWN_CAR_Y,FLAG_OWN_REALTY_N,FLAG_OWN_REALTY_Y,NAME_TYPE_SUITE_Children,NAME_TYPE_SUITE_Family,...,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes
0,1,0,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,1,0,1,0
1,1,0,0,1,1,0,0,1,0,0,...,0,0,0,0,0,1,0,0,1,0
2,1,0,0,1,0,1,0,1,0,0,...,0,0,0,0,0,1,0,0,1,0
3,1,0,1,0,1,0,0,1,0,0,...,0,0,0,0,0,1,0,0,1,0
4,1,0,0,1,0,1,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48739,1,0,1,0,1,0,0,1,0,0,...,0,0,0,0,0,1,0,0,1,0
48740,1,0,1,0,1,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
48741,1,0,1,0,0,1,0,1,0,0,...,0,0,0,0,0,0,1,0,1,0
48742,1,0,0,1,1,0,1,0,0,1,...,0,0,0,0,0,1,0,0,1,0


In [21]:
# Standard scaling to numeric variables- Train

from sklearn.preprocessing import StandardScaler
Numeric_columns_drop=Numeric_columns.drop(['TARGET','SK_ID_CURR'])
scaler = StandardScaler()
df_train_numeric_scaled = pd.DataFrame(scaler.fit_transform(df[Numeric_columns_drop]), columns=Numeric_columns_drop)
df_train_numeric_scaled

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,-0.577538,0.142129,-0.478095,-0.166143,-0.507236,-0.149452,1.506880,0.755835,0.379837,0.579154,...,-0.090534,-0.024402,-0.022529,-0.018305,-0.070987,-0.058766,-0.155837,-0.269947,-0.30862,-0.440926
1,-0.577538,0.426792,1.725450,0.592683,1.600873,-1.252750,-0.166821,0.497899,1.078697,1.790855,...,-0.090534,-0.024402,-0.022529,-0.018305,-0.070987,-0.058766,-0.155837,-0.269947,-0.30862,-1.007331
2,-0.577538,-0.427196,-1.152888,-1.404669,-1.092145,-0.783451,-0.689509,0.948701,0.206116,0.306869,...,-0.090534,-0.024402,-0.022529,-0.018305,-0.070987,-0.058766,-0.155837,-0.269947,-0.30862,-1.007331
3,-0.577538,-0.142533,-0.711430,0.177874,-0.653463,-0.928991,-0.680114,-0.368597,-1.375829,0.369143,...,-0.090534,-0.024402,-0.022529,-0.018305,-0.070987,-0.058766,-0.155837,-0.269947,-0.30862,-0.440926
4,-0.577538,-0.199466,-0.213734,-0.361749,-0.068554,0.563570,-0.892535,-0.368129,0.191639,-0.307263,...,-0.090534,-0.024402,-0.022529,-0.018305,-0.070987,-0.058766,-0.155837,-0.269947,-0.30862,-1.007331
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,-0.577538,-0.047646,-0.855489,0.031015,-0.848433,0.845396,1.537586,0.943552,-0.984955,0.670578,...,-0.090534,-0.024402,-0.022529,-0.018305,-0.070987,-0.058766,-0.155837,-0.269947,-0.30862,-0.440926
307507,-0.577538,-0.408219,-0.818594,-1.042333,-0.848433,0.310593,-1.085707,0.282562,0.169782,-0.725959,...,-0.090534,-0.024402,-0.022529,-0.018305,-0.070987,-0.058766,-0.155837,-0.269947,-0.30862,-0.440926
307508,-0.577538,-0.066623,0.195379,0.198056,0.126415,-1.147120,0.245417,-2.653975,-0.497002,-1.428203,...,-0.090534,-0.024402,-0.022529,-0.018305,12.747224,-0.058766,-0.155837,0.897175,-0.30862,-0.440926
307509,-0.577538,0.009287,-0.568757,-0.476318,-0.592535,-1.124635,0.934008,-1.186408,0.688107,1.366859,...,-0.090534,-0.024402,-0.022529,-0.018305,-0.070987,-0.058766,-0.155837,-0.269947,-0.30862,-1.007331


In [22]:
df['AMT_ANNUITY'].describe()

count    307511.000000
mean      27108.487841
std       14493.461065
min        1615.500000
25%       16524.000000
50%       24903.000000
75%       34596.000000
max      258025.500000
Name: AMT_ANNUITY, dtype: float64

In [23]:
df_train_numeric_scaled['AMT_ANNUITY'].describe()

count    3.075110e+05
mean    -1.732969e-17
std      1.000002e+00
min     -1.758933e+00
25%     -7.302952e-01
50%     -1.521715e-01
75%      5.166140e-01
max      1.593252e+01
Name: AMT_ANNUITY, dtype: float64

In [24]:
df_concat=pd.concat([df_train_numeric_scaled,df_encoded,df['TARGET']],axis=1)
df_concat.head()

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,...,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,TARGET
0,-0.577538,0.142129,-0.478095,-0.166143,-0.507236,-0.149452,1.50688,0.755835,0.379837,0.579154,...,0,0,0,0,0,1,0,1,0,1
1,-0.577538,0.426792,1.72545,0.592683,1.600873,-1.25275,-0.166821,0.497899,1.078697,1.790855,...,1,0,0,0,0,0,0,1,0,0
2,-0.577538,-0.427196,-1.152888,-1.404669,-1.092145,-0.783451,-0.689509,0.948701,0.206116,0.306869,...,0,0,0,0,1,0,0,1,0,0
3,-0.577538,-0.142533,-0.71143,0.177874,-0.653463,-0.928991,-0.680114,-0.368597,-1.375829,0.369143,...,0,0,0,0,1,0,0,1,0,0
4,-0.577538,-0.199466,-0.213734,-0.361749,-0.068554,0.56357,-0.892535,-0.368129,0.191639,-0.307263,...,0,0,0,0,1,0,0,1,0,0


In [25]:
df_concat['TARGET'].value_counts()

TARGET
0    282686
1     24825
Name: count, dtype: int64

In [26]:
# Standard scaling to numeric variables- Test

Numeric_columns_drop_test=Numeric_columns_test.drop(['SK_ID_CURR'])
scaler = StandardScaler()
df_test_numeric_scaled = pd.DataFrame(scaler.fit_transform(df_test[Numeric_columns_drop_test]), columns=Numeric_columns_drop_test)
df_test_numeric_scaled

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,-0.559988,-0.427809,0.142475,-0.553580,-0.037477,-0.164654,-0.733477,-0.483656,-0.056958,1.427241,...,-0.039517,0.0,0.0,0.0,-0.042538,-0.036585,-0.048265,-0.078421,-0.711321,-1.153865
1,-0.559988,-0.782413,-0.804537,-0.752831,-0.839362,1.009586,-0.461392,-0.498482,-1.168264,0.910437,...,-0.039517,0.0,0.0,0.0,-0.042538,-0.036585,-0.048265,-0.078421,-0.711321,0.589324
2,-0.559988,0.237075,0.401002,2.520066,0.497113,-0.147258,-0.917718,-0.498406,0.786092,-0.287579,...,-0.039517,0.0,0.0,0.0,-0.042538,-0.036585,-0.048265,-0.078421,0.773588,1.170387
3,2.260729,1.345214,2.896221,1.223666,3.303709,0.358078,0.483623,-0.480449,0.835352,-0.736836,...,-0.039517,0.0,0.0,0.0,-0.042538,-0.036585,-0.048265,-0.078421,-0.711321,0.589324
4,0.850370,0.015447,0.297651,0.165019,0.483748,-0.775825,0.699997,-0.482700,0.272381,-0.771247,...,-0.039517,0.0,0.0,0.0,-0.042538,-0.036585,-0.048265,-0.078421,-0.711321,0.008261
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48739,-0.559988,-0.560785,-0.285119,-0.746368,-0.572067,-1.329608,-0.901998,-0.503331,-1.161509,-0.221306,...,-0.039517,0.0,0.0,0.0,-0.042538,-0.036585,-0.048265,-0.078421,-0.711321,-0.572802
48740,2.260729,-0.206181,0.289202,0.155183,0.096170,1.009586,1.128582,-0.475482,0.549644,0.031042,...,-0.039517,0.0,0.0,0.0,-0.042538,-0.036585,-0.048265,-0.078421,-0.711321,0.008261
48741,0.850370,0.237075,-0.552119,0.236120,-0.438420,0.358078,0.033770,-0.488561,0.643661,0.986269,...,-0.039517,0.0,0.0,0.0,-0.042538,-0.036585,-0.048265,-0.078421,3.743406,-0.572802
48742,-0.559988,0.458703,-0.182654,-0.268332,-0.037477,-0.164654,0.485473,-0.486441,0.987073,1.075483,...,-0.039517,0.0,0.0,0.0,-0.042538,-0.036585,-0.048265,-0.078421,-0.711321,0.008261


In [27]:
df_test_numeric_scaled['AMT_INCOME_TOTAL'].describe()

count    4.874400e+04
mean    -1.341087e-16
std      1.000010e+00
min     -1.492199e+00
25%     -6.494366e-01
50%     -2.061809e-01
75%      4.587026e-01
max      4.168148e+01
Name: AMT_INCOME_TOTAL, dtype: float64

In [28]:
df_concat_test=pd.concat([df_test_numeric_scaled,df_encoded_test],axis=1)
df_concat_test.head()

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,...,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes
0,-0.559988,-0.427809,0.142475,-0.55358,-0.037477,-0.164654,-0.733477,-0.483656,-0.056958,1.427241,...,0,0,0,0,0,0,1,0,1,0
1,-0.559988,-0.782413,-0.804537,-0.752831,-0.839362,1.009586,-0.461392,-0.498482,-1.168264,0.910437,...,0,0,0,0,0,1,0,0,1,0
2,-0.559988,0.237075,0.401002,2.520066,0.497113,-0.147258,-0.917718,-0.498406,0.786092,-0.287579,...,0,0,0,0,0,1,0,0,1,0
3,2.260729,1.345214,2.896221,1.223666,3.303709,0.358078,0.483623,-0.480449,0.835352,-0.736836,...,0,0,0,0,0,1,0,0,1,0
4,0.85037,0.015447,0.297651,0.165019,0.483748,-0.775825,0.699997,-0.4827,0.272381,-0.771247,...,0,0,0,0,0,1,0,0,1,0


In [53]:
train_columns= set(df_concat.columns)
test_columns= set(df_concat_test.columns)

missing_columns=train_columns-test_columns
missing_columns

drop_columns=['CODE_GENDER_XNA','NAME_FAMILY_STATUS_Unknown','NAME_INCOME_TYPE_Maternity leave']

df_concat.drop(drop_columns,inplace=True,axis=1)

### Model Process

In [54]:
X=df_concat.drop('TARGET',axis=1)
Y=df_concat['TARGET']

In [55]:
X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, test_size=.30, random_state= 42)

In [56]:
smote = SMOTE(random_state = 42)
X_train, Y_train = smote.fit_resample(X_train, Y_train)

In [57]:
Y_train.value_counts()

TARGET
0    197845
1    197845
Name: count, dtype: int64

In [58]:
print("X train Shape:",X_train.shape)
print("Y train Shape:",Y_train.shape)
print("X valid Shape:",X_valid.shape)
print("Y valid Shape:",Y_valid.shape)

X train Shape: (395690, 241)
Y train Shape: (395690,)
X valid Shape: (92254, 241)
Y valid Shape: (92254,)


### Decision Tree

In [59]:
# Decision Tree

clf=DecisionTreeClassifier()

clf.fit(X_train,Y_train)

y_pred=clf.predict(X_valid)

accuracy=accuracy_score(y_pred,Y_valid)
precision = precision_score(y_pred, Y_valid)
recall = recall_score(y_pred, Y_valid)
f1 = f1_score(y_pred, Y_valid)
conf_matrix = confusion_matrix(y_pred, Y_valid)
roc_auc = roc_auc_score(y_pred, Y_valid)

print("Accuracy:",accuracy)
print("Roc_auc:",roc_auc)


Accuracy: 0.8334164372276541
Roc_auc: 0.5282751909959242


In [60]:
#Logistic Regression

log_reg=LogisticRegression(max_iter=1000)

log_reg.fit(X_train,Y_train)

y_pred=log_reg.predict(X_valid)

accuracy=accuracy_score(y_pred,Y_valid)
roc_auc = roc_auc_score(y_pred, Y_valid)

print("Accuracy:",accuracy)
print("Roc_auc:",roc_auc)

Accuracy: 0.9195156849567498
Roc_auc: 0.6976165495991757


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [61]:
#XGBoost
xgb_model=xgb.XGBClassifier()
xgb_model.fit(X_train,Y_train)
y_pred=xgb_model.predict(X_valid)

accuracy=accuracy_score(y_pred,Y_valid)
roc_auc=roc_auc_score(y_pred,Y_valid)

print("Accuracy:",accuracy)
print("Roc_auc",roc_auc)

Accuracy: 0.9188111084614218
Roc_auc 0.6783937993421906


In [62]:
feature_names = X_train.columns
feature_importance = xgb_model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
sorted_feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
sorted_feature_importance_df

Unnamed: 0,Feature,Importance
160,WEEKDAY_APPR_PROCESS_START_FRIDAY,0.066320
127,NAME_EDUCATION_TYPE_Higher education,0.051527
166,WEEKDAY_APPR_PROCESS_START_WEDNESDAY,0.044760
108,FLAG_OWN_CAR_N,0.043036
129,NAME_EDUCATION_TYPE_Lower secondary,0.041911
...,...,...
11,FLAG_MOBIL,0.000000
210,ORGANIZATION_TYPE_Services,0.000000
124,NAME_INCOME_TYPE_Unemployed,0.000000
187,ORGANIZATION_TYPE_Industry: type 3,0.000000


In [66]:
n_folds=5
kf=KFold(n_splits=n_folds,shuffle=True,random_state=42)

def cross_model(model):
  cv_scores= cross_val_score(model,X,Y,cv=kf,scoring='roc_auc')
  return cv_scores.mean()

for m in [clf,log_reg,xgb_model]:
  print(cross_model(m))

0.5402344634081632


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7460029563248305
0.7493677633837798


In [48]:
train_columns= set(df_concat.columns)
test_columns= set(df_concat_test.columns)

missing_columns=train_columns-test_columns
missing_columns

{'CODE_GENDER_XNA',
 'NAME_FAMILY_STATUS_Unknown',
 'NAME_INCOME_TYPE_Maternity leave',
 'TARGET'}

In [63]:
# Train on the test data
y_pred_proba = xgb_model.predict_proba(df_concat_test)[:, 1]

In [64]:
submit = test.reset_index()[['SK_ID_CURR']]
submit['TARGET'] = y_pred_proba

submit.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.980888
1,100005,0.999956
2,100013,0.999874
3,100028,0.994741
4,100038,0.999988


In [65]:
submit.to_csv('submission.csv', index = False)