In [19]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier,StackingClassifier,AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler,OneHotEncoder,LabelBinarizer
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.model_selection import KFold
from sklearn.feature_selection import mutual_info_classif,chi2
from scipy.stats import ttest_rel

In [2]:
main_df=pd.read_csv("train_df.csv")

## Head of the data

In [3]:
main_df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
main_df.columns.nunique()

13

In [5]:
#614 rows with 13 columns
main_df.shape

(614, 13)

In [6]:
#Loan_status is target column 
#problem is classification 
#convert target column in to number 
target_dict={"Y":1,
            "N":0}

main_df["Loan_Status"]=main_df["Loan_Status"].map(target_dict)

In [7]:
main_df["Loan_Status"].value_counts(dropna=False)

1    422
0    192
Name: Loan_Status, dtype: int64

In [8]:
#422 loan has been approved but 192 has been rejected 
# dataset is imbalanced and check the ratio
pass_loan=main_df.Loan_Status.value_counts()[1]
rejected_loan=main_df.Loan_Status.value_counts()[0]

In [9]:
ratio=(rejected_loan/len(main_df))*100
print(ratio)
#out of 100 ,31 people's loan is rejected

31.27035830618892


In [10]:
pass_loan/len(main_df)*100
#out of 100 ,68 people's loan is rejected

68.72964169381108

In [14]:
categorical_col=main_df.select_dtypes(include="object").columns.tolist()
numerical_col=main_df.select_dtypes(exclude="object").columns.tolist()

In [18]:
print(categorical_col)
numerical_col

['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']


['ApplicantIncome',
 'CoapplicantIncome',
 'LoanAmount',
 'Loan_Amount_Term',
 'Credit_History',
 'Loan_Status']

In [17]:
main_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 62.5+ KB


# EDA

In [56]:
def values_counts(colname):
    temp_name=[]
    temp_value=[]
    temp_dict={}
    temp_value,temp_name=main_df[colname].value_counts(dropna=False).values.tolist(),main_df[colname].value_counts(dropna=False).index.tolist()
    for i,j in zip(temp_name,temp_value):
        temp_dict[i]=j
    return temp_dict

In [57]:
Gender_dict=values_counts("Gender")

In [58]:
Gender_dict

{'Male': 489, 'Female': 112, nan: 13}

In [64]:
list(Gender_dict.keys())

['Male', 'Female', nan]

In [72]:
main_df["Gender"].unique().tolist()

['Male', 'Female', nan]

In [68]:
main_df.corr()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
ApplicantIncome,1.0,-0.116605,0.570909,-0.045306,-0.014715,-0.00471
CoapplicantIncome,-0.116605,1.0,0.188619,-0.059878,-0.002056,-0.059187
LoanAmount,0.570909,0.188619,1.0,0.039447,-0.008433,-0.037318
Loan_Amount_Term,-0.045306,-0.059878,0.039447,1.0,0.00147,-0.021268
Credit_History,-0.014715,-0.002056,-0.008433,0.00147,1.0,0.561678
Loan_Status,-0.00471,-0.059187,-0.037318,-0.021268,0.561678,1.0


In [106]:
#credit history has high relation with target variable 
def convert_values_into_encoder(main_df,colname,target_name):
    temp_list=main_df[colname].unique().tolist()
    temp_dict={}
    for i,j in enumerate(temp_list):
        temp_dict[j]=i
    print(temp_dict)
    main_df[colname]=main_df[colname].map(temp_dict)
    chai2=chi2(main_df[[colname]],main_df[target_name])
    relation=mutual_info_classif(main_df[[colname]],main_df[target_name])
    
    if chai2[1][0]<0.05:
        print("chi2 approved")
        if relation>0.5:
            print("relation approved")
    return chai2,relation

In [97]:
Gender_dict=convert_values_into_encoder(main_df,"Gender","Loan_Status")

In [98]:
Gender_dict

((array([0.49896148]), array([0.47995679])), array([0.00460523]))

In [100]:
chi2(main_df[["Gender"]],main_df.Loan_Status)[1][0]

0.47995679396322755

In [101]:
mutual_info_classif(main_df[["Gender"]],main_df.Loan_Status)

array([0.00794327])

In [107]:
Married_dict=convert_values_into_encoder(main_df,"Married","Loan_Status")

{'No': 0, 'Yes': 1, nan: 2}


In [109]:
Married_dict

((array([2.0471442]), array([0.15249199])), array([0]))

In [108]:
main_df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,0,0,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,1
1,LP001003,0,1,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,0
2,LP001005,0,1,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,1
3,LP001006,0,1,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,1
4,LP001008,0,0,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,1


In [110]:
convert_values_into_encoder(main_df,"Dependents","Loan_Status")

{'0': 0, '1': 1, '2': 2, '3+': 3, nan: 4}


((array([0.04895328]), array([0.82489475])), array([0]))

In [111]:
convert_values_into_encoder(main_df,"Education","Loan_Status")

{'Graduate': 0, 'Not Graduate': 1}


((array([3.54050246]), array([0.05988732])), array([0.01499047]))

In [112]:
convert_values_into_encoder(main_df,"Self_Employed","Loan_Status")

{'No': 0, 'Yes': 1, nan: 2}


((array([0.08726114]), array([0.76768836])), array([0.01779331]))

In [113]:
convert_values_into_encoder(main_df,"Property_Area","Loan_Status")

{'Urban': 0, 'Rural': 1, 'Semiurban': 2}
chi2 approved


((array([4.39885443]), array([0.03596308])), array([0.00320842]))

In [114]:
main_df[numerical_col]

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
0,5849,0.0,,360.0,1.0,1
1,4583,1508.0,128.0,360.0,1.0,0
2,3000,0.0,66.0,360.0,1.0,1
3,2583,2358.0,120.0,360.0,1.0,1
4,6000,0.0,141.0,360.0,1.0,1
...,...,...,...,...,...,...
609,2900,0.0,71.0,360.0,1.0,1
610,4106,0.0,40.0,180.0,1.0,1
611,8072,240.0,253.0,360.0,1.0,1
612,7583,0.0,187.0,360.0,1.0,1


In [116]:
main_df.ApplicantIncome.isnull().sum()

0

In [135]:
main_df.LoanAmount.isnull().sum()

22

In [136]:
main_df.Loan_Amount_Term.isnull().sum()

14

In [137]:
main_df.Credit_History.isnull().sum()

50

In [138]:
main_df.isnull().sum()

Loan_ID               0
Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [139]:
main_df.dropna(inplace=True)

In [140]:
def identify_numeric_relation(main_df,numerical_col,target_col):
    p={}
    for col in numerical_col:
        _,p_value=ttest_rel(main_df[col],main_df[target_col])
        p[col]=p_value
        
        if p_value<0.05:
            print(f"{col} approved" )
    return p

In [141]:
identify_numeric_relation(main_df,numerical_col,"Loan_Status")

ApplicantIncome approved
CoapplicantIncome approved
LoanAmount approved
Loan_Amount_Term approved
Credit_History approved


{'ApplicantIncome': 1.450273066442599e-65,
 'CoapplicantIncome': 2.6094948885596837e-38,
 'LoanAmount': 1.40429954781519e-160,
 'Loan_Amount_Term': 0.0,
 'Credit_History': 1.5631621192074994e-18,
 'Loan_Status': nan}

In [142]:
#applicat income,
mutual_info_classif(main_df[numerical_col],main_df.Loan_Status)

array([0.02793705, 0.        , 0.        , 0.00253175, 0.14608319,
       0.61854136])

In [143]:
#Credit History,property valuation
main_df.corr()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Gender,1.0,-0.267226,-0.121249,-0.072985,0.038857,0.042558,-0.134343,-0.008853,0.067224,-0.036533,0.08725,-0.064047
Married,-0.267226,1.0,0.354413,0.009407,-0.036041,0.051737,0.110537,0.172137,-0.089838,0.013366,-0.007119,0.0986
Dependents,-0.121249,0.354413,1.0,0.040694,-0.00498,0.106932,-0.019422,0.158508,-0.054452,-0.082853,-0.04759,0.008312
Education,-0.072985,0.009407,0.040694,1.0,0.013549,-0.132015,-0.06052,-0.16939,-0.08091,-0.064098,0.0057,-0.078434
Self_Employed,0.038857,-0.036041,-0.00498,0.013549,1.0,0.110006,-0.019479,0.06481,-0.039429,0.050234,0.00878,0.03921
ApplicantIncome,0.042558,0.051737,0.106932,-0.132015,0.110006,1.0,-0.122631,0.570708,-0.062861,-0.023779,-0.0154,-0.006281
CoapplicantIncome,-0.134343,0.110537,-0.019422,-0.06052,-0.019479,-0.122631,1.0,0.159152,-0.00029,-0.010847,-0.007106,-0.043353
LoanAmount,-0.008853,0.172137,0.158508,-0.16939,0.06481,0.570708,0.159152,1.0,0.023239,-0.018156,0.032762,-0.036642
Loan_Amount_Term,0.067224,-0.089838,-0.054452,-0.08091,-0.039429,-0.062861,-0.00029,0.023239,1.0,0.008658,0.077699,-0.028601
Credit_History,-0.036533,0.013366,-0.082853,-0.064098,0.050234,-0.023779,-0.010847,-0.018156,0.008658,1.0,0.022635,0.547439


In [273]:
main_df.ApplicantIncome.isnull().sum()

0

In [274]:
main_df.CoapplicantIncome.isnull().sum()

0

In [276]:
main_df["Total_income"]=main_df.ApplicantIncome+main_df.CoapplicantIncome

In [277]:
main_df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Total_income
1,LP001003,0,1,1,0,0,4583,1508.0,128.0,360.0,1.0,1,0,6091.0
2,LP001005,0,1,0,0,1,3000,0.0,66.0,360.0,1.0,0,1,3000.0
3,LP001006,0,1,0,1,0,2583,2358.0,120.0,360.0,1.0,0,1,4941.0
4,LP001008,0,0,0,0,0,6000,0.0,141.0,360.0,1.0,0,1,6000.0
5,LP001011,0,1,2,0,1,5417,4196.0,267.0,360.0,1.0,0,1,9613.0


In [278]:
main_df.corr()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Total_income
Gender,1.0,-0.267226,-0.121249,-0.072985,0.038857,0.042558,-0.134343,-0.008853,0.067224,-0.036533,0.08725,-0.064047,-0.010103
Married,-0.267226,1.0,0.354413,0.009407,-0.036041,0.051737,0.110537,0.172137,-0.089838,0.013366,-0.007119,0.0986,0.092628
Dependents,-0.121249,0.354413,1.0,0.040694,-0.00498,0.106932,-0.019422,0.158508,-0.054452,-0.082853,-0.04759,0.008312,0.096486
Education,-0.072985,0.009407,0.040694,1.0,0.013549,-0.132015,-0.06052,-0.16939,-0.08091,-0.064098,0.0057,-0.078434,-0.151487
Self_Employed,0.038857,-0.036041,-0.00498,0.013549,1.0,0.110006,-0.019479,0.06481,-0.039429,0.050234,0.00878,0.03921,0.099452
ApplicantIncome,0.042558,0.051737,0.106932,-0.132015,0.110006,1.0,-0.122631,0.570708,-0.062861,-0.023779,-0.0154,-0.006281,0.924907
CoapplicantIncome,-0.134343,0.110537,-0.019422,-0.06052,-0.019479,-0.122631,1.0,0.159152,-0.00029,-0.010847,-0.007106,-0.043353,0.263902
LoanAmount,-0.008853,0.172137,0.158508,-0.16939,0.06481,0.570708,0.159152,1.0,0.023239,-0.018156,0.032762,-0.036642,0.615632
Loan_Amount_Term,0.067224,-0.089838,-0.054452,-0.08091,-0.039429,-0.062861,-0.00029,0.023239,1.0,0.008658,0.077699,-0.028601,-0.061205
Credit_History,-0.036533,0.013366,-0.082853,-0.064098,0.050234,-0.023779,-0.010847,-0.018156,0.008658,1.0,0.022635,0.547439,-0.027265


In [181]:
target_col=main_df.Loan_Status.copy()
feature_col=main_df["Credit_History"].copy()

In [182]:
target_col

1      0
2      1
3      1
4      1
5      1
      ..
609    1
610    1
611    1
612    1
613    0
Name: Loan_Status, Length: 529, dtype: int64

In [183]:
feature_col.shape

(529,)

In [184]:
clf=DecisionTreeClassifier()

In [187]:
dict_alphas=clf.cost_complexity_pruning_path(main_df[["Credit_History"]],target_col)

In [188]:
list(dict_alphas[ccp_alpha])

[0.0, 0.12777891359454963]

In [189]:
from sklearn.model_selection import cross_val_score

In [190]:
from sklearn.model_selection import train_test_split

In [195]:
x_train,x_test,y_train,y_test=train_test_split(main_df[["Credit_History"]],target_col,test_size=0.2,random_state=42)

In [196]:
x_train.shape

(423, 1)

In [197]:
x_test.shape

(106, 1)

In [198]:
train_score=[]
test_score=[]
for i in list(dict_alphas[ccp_alpha]):

    clf=DecisionTreeClassifier(ccp_alpha=i)
    clf.fit(x_train,y_train)
    train_score.append(clf.score(x_train,y_train))
    test_score.append(clf.score(x_test,y_test))

In [199]:
train_score

[0.8108747044917257, 0.6926713947990544]

In [200]:
test_score

[0.8301886792452831, 0.6886792452830188]

In [201]:
model=DecisionTreeClassifier(ccp_alpha=0.0004214403530210624)
model.fit(x_train,y_train)

DecisionTreeClassifier(ccp_alpha=0.0004214403530210624)

In [202]:
model.feature_importances_

array([1.])

In [203]:
model.score(x_test,y_test)

0.8301886792452831

In [204]:
model2=RandomForestClassifier(n_estimators=100,ccp_alpha=0.0004214403530210624)

In [205]:
model2.fit(x_train,y_train)

RandomForestClassifier(ccp_alpha=0.0004214403530210624)

In [206]:
model2.score(x_train,y_train)

0.8108747044917257

In [207]:
model2.feature_importances_

array([1.])

In [208]:
from sklearn.linear_model import LogisticRegression

In [211]:
estimators=[("model1",DecisionTreeClassifier(ccp_alpha=0.0004214403530210624)),
           ("model2",RandomForestClassifier(n_estimators=100,ccp_alpha=0.0004214403530210624)),
           ]

In [212]:
model3=StackingClassifier(estimators=estimators,final_estimator=LogisticRegression(),cv=5)

In [213]:
model3.fit(x_train,y_train)

StackingClassifier(cv=5,
                   estimators=[('model1',
                                DecisionTreeClassifier(ccp_alpha=0.0004214403530210624)),
                               ('model2',
                                RandomForestClassifier(ccp_alpha=0.0004214403530210624))],
                   final_estimator=LogisticRegression())

In [214]:
model3.score(x_train,y_train)

0.8108747044917257

In [215]:
model3.score(x_test,y_test)

0.8301886792452831

In [216]:
model4=AdaBoostClassifier(RandomForestClassifier(ccp_alpha=0.0004214403530210624),n_estimators=100)

In [217]:
model4.fit(x_train,y_train)

AdaBoostClassifier(base_estimator=RandomForestClassifier(ccp_alpha=0.0004214403530210624),
                   n_estimators=100)

In [218]:
model4.score(x_test,y_test)

0.8301886792452831

In [219]:
prediction=model4.predict(x_test)

In [220]:
print(classification_report(y_test,prediction))

              precision    recall  f1-score   support

           0       0.94      0.48      0.64        33
           1       0.81      0.99      0.89        73

    accuracy                           0.83       106
   macro avg       0.88      0.74      0.76       106
weighted avg       0.85      0.83      0.81       106



In [221]:
confusion_matrix(y_test,prediction)

array([[16, 17],
       [ 1, 72]], dtype=int64)

In [222]:
test_data=pd.read_csv("test_df.csv")

In [223]:
test_data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [245]:
test_data.shape

(367, 12)

In [224]:
test_data.Credit_History.unique()

array([ 1., nan,  0.])

In [225]:
test_data.Credit_History.isnull().sum()

29

In [233]:
test_data.Credit_History.value_counts(dropna=False)

1.0    279
0.0     59
NaN     29
Name: Credit_History, dtype: int64

In [235]:
test_data.Credit_History.fillna(0,inplace=True)

In [236]:
test_data.Credit_History.value_counts(dropna=False)

1.0    279
0.0     88
Name: Credit_History, dtype: int64

In [249]:
predict=model4.predict(test_data[["Credit_History"]])

In [250]:
len(predict)

367

In [246]:
len(prediction)

106

In [267]:
predicted=[]
for i in list(predict):
    if i==1:
        predicted.append("Y")
    else:
        predicted.append("N")

In [268]:
test_df=pd.DataFrame(predicted,index=test_data["Loan_ID"])

In [269]:
test_df.rename(columns={0:"Loan_Status"},inplace=True)

In [270]:
type(test_df)

pandas.core.frame.DataFrame

In [272]:
test_df.to_csv("submission1.csv")