In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the training set (with Loan_Status) and the test set (without it),
# then preview the first training rows.
data, test = (pd.read_csv(csv_path) for csv_path in ('Train.csv', 'Test.csv'))
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [3]:
# Dimensions of the training frame as (rows, columns).
data.shape

(614, 13)

In [4]:
# Number of missing values per column in the training frame.
data.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [5]:
# Number of distinct non-null values per column in the training frame.
data.nunique()

Loan_ID              614
Gender                 2
Married                2
Dependents             4
Education              2
Self_Employed          2
ApplicantIncome      505
CoapplicantIncome    287
LoanAmount           203
Loan_Amount_Term      10
Credit_History         2
Property_Area          3
Loan_Status            2
dtype: int64

In [6]:
# Number of missing values per column in the test frame.
test.isnull().sum()

Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64

# Null Imputation

In [7]:
# Impute missing values in both frames using statistics taken from the
# training frame only, so the test set never influences the fill values.
#
# The results are assigned back to the frame instead of calling
# `frame.col.fillna(..., inplace=True)`: that form is chained assignment,
# which does not update the parent frame under pandas Copy-on-Write, and the
# notebook-wide warning filter would hide the resulting warning.

# Categorical / discrete columns: fill with the most frequent training value.
mode_imputed_columns = [
    'Gender',
    'Married',
    'Dependents',
    'Self_Employed',
    'Loan_Amount_Term',
]
for column in mode_imputed_columns:
    most_frequent = data[column].mode()[0]
    data[column] = data[column].fillna(most_frequent)
    test[column] = test[column].fillna(most_frequent)

# Continuous column: fill with the training median.
loan_amount_median = data['LoanAmount'].median()
data['LoanAmount'] = data['LoanAmount'].fillna(loan_amount_median)
test['LoanAmount'] = test['LoanAmount'].fillna(loan_amount_median)

# Credit_History: keep "missing" as its own category rather than guessing.
# Cast to object first so the string sentinel can sit next to the 1.0 / 0.0
# flags without relying on an implicit dtype upcast.
for frame in (data, test):
    frame['Credit_History'] = frame['Credit_History'].astype(object).fillna("Unknown")

# Outlier Removal

In [8]:
from scipy import stats

# Drop training rows whose ApplicantIncome or LoanAmount lies more than three
# standard deviations from the column mean. Both z-scores are computed on the
# frame as it stands before any row is removed, then combined into one mask,
# so no temporary z-score columns are written into `data`.
outlier_z_threshold = 3
income_within_range = np.abs(np.asarray(stats.zscore(data['ApplicantIncome']))) <= outlier_z_threshold
loan_within_range = np.abs(np.asarray(stats.zscore(data['LoanAmount']))) <= outlier_z_threshold

# .copy() makes the filtered result an independent frame, so the column
# assignments in later cells do not write into a slice of the original.
data = data.loc[income_within_range & loan_within_range].copy()

# Encoding

In [9]:
# Integer codes for every categorical column, applied identically to the
# training and test frames. Series.map leaves NaN for any label that is not a
# key of its mapping.
category_codes = {
    'Gender': {"Male": 0, "Female": 1},
    'Married': {"Yes": 1, "No": 0},
    'Dependents': {"0": 0, "1": 1, "2": 2, "3+": 3},
    'Education': {"Graduate": 1, 'Not Graduate': 0},
    'Self_Employed': {"Yes": 1, "No": 0},
    'Credit_History': {"1.0": 1, "0.0": 0, "Unknown": 2},
    'Property_Area': {'Urban': 0, 'Rural': 1, 'Semiurban': 2},
}

for frame in (data, test):
    # Credit_History is stringified first so its values match the
    # "1.0" / "0.0" / "Unknown" keys of its mapping.
    frame['Credit_History'] = frame['Credit_History'].astype(str)
    for column, codes in category_codes.items():
        frame[column] = frame[column].map(codes)

# Feature Engineering

In [10]:
# Add the engineered feature to both frames. Despite its name, the value is
# combined applicant and co-applicant income divided by LoanAmount.
for frame in (data, test):
    combined_income = frame['ApplicantIncome'] + frame['CoapplicantIncome']
    frame['Debt_Income_Ratio'] = combined_income / frame['LoanAmount']

In [11]:
# Encode the target as 1 for "Y" and 0 for "N"; any other label maps to NaN.
data['Loan_Status']=data['Loan_Status'].map({"Y":1,"N":0})

In [12]:
# Loan_ID is removed from the training frame in place. For the test set a new
# frame is built instead, so `test` keeps Loan_ID for the submission file.
id_columns = ['Loan_ID']
data.drop(columns=id_columns, inplace=True)
X_test = test.drop(columns=id_columns)

# Train-Validation Split

In [13]:
from sklearn.model_selection import train_test_split


def _features_and_target(frame, target_column='Loan_Status'):
    """Return ``(features, target)``: ``frame`` without the target column, and that column."""
    return frame.drop(columns=[target_column]), frame[target_column]


# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
train, validation = train_test_split(data, test_size=0.20, random_state=0)

X_train, y_train = _features_and_target(train)
X_validation, y_validation = _features_and_target(validation)

X_train.shape, y_train.shape, X_validation.shape, y_validation.shape

((475, 12), (475,), (119, 12), (119,))

# Training Model

In [14]:
from sklearn import metrics

In [15]:
from xgboost import XGBClassifier

# Fit a seeded XGBoost classifier with otherwise default settings and report
# accuracy and F1 on the validation split.
xgb = XGBClassifier(random_state=2)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_validation)

for label, scorer in (
    ("Accuracy for xgb:", metrics.accuracy_score),
    ("f1 score for xgb:", metrics.f1_score),
):
    print(label, scorer(y_validation, y_pred_xgb))

Accuracy for xgb: 0.8151260504201681
f1 score for xgb: 0.8735632183908046


In [16]:
from lightgbm import LGBMClassifier

# Fit a seeded LightGBM classifier with otherwise default settings and report
# accuracy and F1 on the validation split.
lgb = LGBMClassifier(random_state=2)
lgb.fit(X_train, y_train)
y_pred_lgb = lgb.predict(X_validation)

for label, scorer in (
    ("Accuracy for lgb:", metrics.accuracy_score),
    ("f1 score for lgb:", metrics.f1_score),
):
    print(label, scorer(y_validation, y_pred_lgb))

Accuracy for lgb: 0.7815126050420168
f1 score for lgb: 0.8505747126436781


In [17]:
from catboost import CatBoostClassifier

# Fit a seeded CatBoost classifier (training log silenced) and report
# accuracy and F1 on the validation split.
cat = CatBoostClassifier(random_state=100, logging_level='Silent')
cat.fit(X_train, y_train)
y_pred_cat = cat.predict(X_validation)

for label, scorer in (
    ("Accuracy for cat:", metrics.accuracy_score),
    ("f1 score for cat:", metrics.f1_score),
):
    print(label, scorer(y_validation, y_pred_cat))

Accuracy for cat: 0.8151260504201681
f1 score for cat: 0.8791208791208791


# Final Prediction

In [18]:
# Predict Loan_Status for the test features with the fitted CatBoost model.
# Note: despite its name, y_test holds model predictions, not ground-truth labels.
y_test=cat.predict(X_test)

In [19]:
# Build the submission frame: one row per test Loan_ID, with the numeric
# prediction turned back into a label (1 -> 'Y', anything else -> 'N').
predicted_labels = np.where(np.asarray(y_test) == 1, 'Y', 'N')
submission = pd.DataFrame({'Loan_ID': test["Loan_ID"], 'Loan_Status': predicted_labels})

In [20]:
# Write the submission to base.csv, omitting the DataFrame index column.
submission.to_csv('base.csv',index=False)