# House Loan Data Analysis

DESCRIPTION

For safe and secure lending experience, it's important to analyze the past data. In this project, you have to build a deep learning model to predict the chance of default for future loans using the historical data. As you will see, this dataset is highly imbalanced and includes a lot of features that make this problem more challenging.

Objective: Create a model that predicts whether or not an applicant will be able to repay a loan using historical data.

In [1]:
import pandas as pd
import numpy  as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing    import LabelEncoder
from sklearn.impute           import SimpleImputer
from sklearn.preprocessing    import StandardScaler
from sklearn.model_selection  import train_test_split
from sklearn.metrics          import accuracy_score 
from imblearn.over_sampling   import SMOTE
import matplotlib.pyplot as plt 
import matplotlib 
%matplotlib inline 

# ANN Modules
import keras
from keras.models      import Sequential
from keras.layers      import Dense, Dropout
from keras.optimizers  import adam

ImportError: cannot import name 'adam' from 'keras.optimizers' (C:\Users\Premalatha\anaconda3\Lib\site-packages\keras\optimizers\__init__.py)

In [None]:
df = pd.read_csv("loan_data.csv/loan_data.csv")
df= df.drop(['SK_ID_CURR'],axis=1)
df.head()

In [None]:
df = df[pd.notnull(df['EMERGENCYSTATE_MODE'])]
#EMERGENCYSTATE_MODE--> this column contains around 145755 of missing values in it  

In [None]:
df.shape

In [None]:
df = df.loc[df['CODE_GENDER'] != 'XNA']

### Filling the missing value in lable column

In [None]:
df['NAME_TYPE_SUITE'] = df['NAME_TYPE_SUITE'].replace(np.nan,'Other_C')
df['NAME_FAMILY_STATUS'] = df['NAME_FAMILY_STATUS'].replace('Unknown', 'Married')
df['OCCUPATION_TYPE'] = df['OCCUPATION_TYPE'].replace(np.nan,'Others')
df['WALLSMATERIAL_MODE'] = df['WALLSMATERIAL_MODE'].replace(np.nan,'Others')
df['HOUSETYPE_MODE'] = df['HOUSETYPE_MODE'].replace(np.nan,'Unkown')
df['FONDKAPREMONT_MODE'] = df['FONDKAPREMONT_MODE'].replace(np.nan,'not available')

In [None]:
df = df[pd.notnull(df['AMT_REQ_CREDIT_BUREAU_YEAR'])]

In [None]:
df.info()

In [None]:
labels = df.describe(include=['object']).columns.values
labels

### Label encoding

In [None]:
le = LabelEncoder()
for lab in labels:
    le.fit(df[lab].values)
    df[lab] = le.transform(df[lab])
df.info()

### Imputing the missing values 

In [None]:
null_column = df.columns[df.isnull().any()]
print('Percentage of nan values :       %')
print()
print(df[null_column].isnull().sum()/df.shape[0]*100)

In [None]:
df = df.drop(['EXT_SOURCE_1','OWN_CAR_AGE','COMMONAREA_AVG','FLOORSMIN_AVG','LIVINGAPARTMENTS_AVG','COMMONAREA_MODE','NONLIVINGAPARTMENTS_AVG','FLOORSMIN_MODE','LIVINGAPARTMENTS_MODE','NONLIVINGAPARTMENTS_MODE','COMMONAREA_MEDI','FLOORSMIN_MEDI','LIVINGAPARTMENTS_MEDI','NONLIVINGAPARTMENTS_MEDI'], axis=1)

#since these columns contains more tham 39% of nan values 

In [None]:
df.shape

In [None]:
df = df[pd.notnull(df['AMT_ANNUITY'])]

### Imputing the missing values

In [None]:
imp1 = SimpleImputer(missing_values= np.nan, strategy='mean')
imp2 = SimpleImputer(missing_values= np.nan, strategy='median')

In [None]:
df[['AMT_GOODS_PRICE','EXT_SOURCE_2',
    'EXT_SOURCE_3','APARTMENTS_AVG',
    'BASEMENTAREA_AVG','YEARS_BEGINEXPLUATATION_AVG',
    'YEARS_BUILD_AVG','ELEVATORS_AVG',
    'ENTRANCES_AVG','FLOORSMAX_AVG',
    'LANDAREA_AVG','LIVINGAREA_AVG',
    'NONLIVINGAREA_AVG','APARTMENTS_MODE',
    'BASEMENTAREA_MODE','YEARS_BEGINEXPLUATATION_MODE',
    'YEARS_BUILD_MODE','ELEVATORS_MODE','ENTRANCES_MODE',
    'FLOORSMAX_MODE','LANDAREA_MODE','LIVINGAREA_MODE',
    'NONLIVINGAREA_MODE','APARTMENTS_MEDI',
    'BASEMENTAREA_MEDI','BASEMENTAREA_MEDI',
    'YEARS_BEGINEXPLUATATION_MEDI','YEARS_BUILD_MEDI',
    'ELEVATORS_MEDI','ENTRANCES_MEDI','FLOORSMAX_MEDI',
    'LANDAREA_MEDI','LIVINGAREA_MEDI',
    'NONLIVINGAREA_MEDI','TOTALAREA_MODE',]]             = imp1.fit_transform(df[['AMT_GOODS_PRICE','EXT_SOURCE_2',
                                                                                  'EXT_SOURCE_3','APARTMENTS_AVG',
                                                                                  'BASEMENTAREA_AVG','YEARS_BEGINEXPLUATATION_AVG',
                                                                                  'YEARS_BUILD_AVG','ELEVATORS_AVG',
                                                                                  'ENTRANCES_AVG','FLOORSMAX_AVG',
                                                                                  'LANDAREA_AVG','LIVINGAREA_AVG',
                                                                                  'NONLIVINGAREA_AVG','APARTMENTS_MODE',
                                                                                  'BASEMENTAREA_MODE','YEARS_BEGINEXPLUATATION_MODE',
                                                                                  'YEARS_BUILD_MODE','ELEVATORS_MODE','ENTRANCES_MODE',
                                                                                  'FLOORSMAX_MODE','LANDAREA_MODE','LIVINGAREA_MODE',
                                                                                  'NONLIVINGAREA_MODE','APARTMENTS_MEDI',
                                                                                  'BASEMENTAREA_MEDI','BASEMENTAREA_MEDI',
                                                                                  'YEARS_BEGINEXPLUATATION_MEDI','YEARS_BUILD_MEDI',
                                                                                  'ELEVATORS_MEDI','ENTRANCES_MEDI','FLOORSMAX_MEDI',
                                                                                  'LANDAREA_MEDI','LIVINGAREA_MEDI',
                                                                                  'NONLIVINGAREA_MEDI','TOTALAREA_MODE',]]  )

In [None]:
df[['CNT_FAM_MEMBERS','OBS_30_CNT_SOCIAL_CIRCLE',
    'DEF_30_CNT_SOCIAL_CIRCLE','OBS_60_CNT_SOCIAL_CIRCLE',
    'OBS_60_CNT_SOCIAL_CIRCLE','DEF_60_CNT_SOCIAL_CIRCLE',]] = imp2.fit_transform(df[['CNT_FAM_MEMBERS','OBS_30_CNT_SOCIAL_CIRCLE',
                                                                                      'DEF_30_CNT_SOCIAL_CIRCLE','OBS_60_CNT_SOCIAL_CIRCLE',
                                                                                      'OBS_60_CNT_SOCIAL_CIRCLE','DEF_60_CNT_SOCIAL_CIRCLE',]])

In [None]:
null_columns=df.columns[df.isnull().any()]
print('Percentage of nan values :       %')
print()
print(df[null_columns].isnull().sum()/df.shape[0]*100)

### Finding column with zero variance 

In [None]:
var = df.var()[df.var()==0].index.values
print(var)

In [None]:
df = df.drop(['FLAG_DOCUMENT_2','FLAG_MOBIL'],axis=1)

#since this column contains only one categorical variable, ie zero variance

In [None]:
class_counts = df.TARGET.value_counts()

print('Counts of Class 0 :',class_counts[0])
print('Counts of Class 1 :',class_counts[1])
print()
print('Propotion ---> ',round(class_counts[0]/len(df.TARGET)*100),':',round(class_counts[1]/len(df.TARGET)*100))

In [None]:
sc = StandardScaler()
df[['AMT_INCOME_TOTAL','AMT_ANNUITY',
   'AMT_CREDIT','AMT_GOODS_PRICE',
   'DAYS_BIRTH','DAYS_EMPLOYED',
   'DAYS_REGISTRATION','DAYS_ID_PUBLISH',
   'DAYS_LAST_PHONE_CHANGE']]              = sc.fit_transform(df[['AMT_INCOME_TOTAL','AMT_ANNUITY',
                                                                  'AMT_CREDIT','AMT_GOODS_PRICE',
                                                                  'DAYS_BIRTH','DAYS_EMPLOYED',
                                                                  'DAYS_REGISTRATION','DAYS_ID_PUBLISH',
                                                                  'DAYS_LAST_PHONE_CHANGE']])

### Finding the highly correlated columns in dataset

In [None]:
corr = df.corr()

In [None]:
import seaborn as sns
sns.heatmap(corr, annot=False, cmap=plt.cm.Reds)
plt.show()

In [None]:
corr_matrix = df.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape),k=1).astype(np.bool))
to_drop = [col for col in upper.columns if any(upper[col]>0.90)]
to_drop

In [None]:
df = df.drop(df[to_drop], axis=1)

In [None]:
corr = df.corr()
sns.heatmap(corr, annot=False, cmap=plt.cm.Reds)
plt.show()

### Splitting data in Hold out method

In [None]:
x = df.drop('TARGET',axis=1)
y = df.TARGET

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y,test_size= 0.2, random_state= 10, stratify=y)
print(x_train.shape)
print(y_train.shape)
print()
print(y_train.value_counts())

### Upsampling the minimum class

In [None]:
smt = SMOTE(random_state= 10, n_jobs=-1,sampling_strategy='all' )


#sampling_strategy='minority' ----> resample only the minority class;
#sampling_strategy='not minority' ----> resample all classes but the minority class;
#sampling_strategy='not majority' ----> resample all classes but the majority class;
#sampling_strategy='all' ----> resample all classes;
#sampling_strategy='auto' ----> equivalent to 'not majority'.

In [None]:
x_train, y_train = smt.fit_resample(x_train,y_train)

In [None]:
print(x_train.shape)
print(y_train.shape)

### Building NN model

In [None]:
model = Sequential()
model.add(Dense(units= 53,activation = 'relu',input_dim=79)) # first hidden and first input layer
model.add(Dropout(0.2))
model.add(Dense(units= 53,activation = 'relu')) # second hidden layer
model.add(Dropout(0.2))
model.add(Dense(units= 1,activation = 'sigmoid')) # output layer 
model.summary()

In [None]:
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [None]:
model.fit(x_train,y_train,batch_size=10,epochs=20,validation_data=(x_test,y_test))

In [None]:
score = model.evaluate(x_test,y_test)

In [None]:
print('Test loss : ', score[0])
print('Test accuracy : ', score[1])

In [None]:
y_pred = model.predict(x_test) #predcting the classes

In [None]:
y_pred

In [None]:
y_test

### Calculating the accuracy score 

In [None]:
print('Accuracy of the model is : ',round(accuracy_score(y_pred.round(), y_test)*100),'%')