In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from seaborn import *
from module import replacer, preprocessing, catconsep, CV_tune, model_builder
import matplotlib.pyplot as plt
import numpy as np

In [2]:
def catconsep(df):
    cat = []
    con = []
    for i in df.columns:
        if(df[i].dtypes == "object"):
            cat.append(i)
        else:
            con.append(i)
    return cat,con

In [3]:
def replacer(df):
    import pandas as pd
    Q = pd.DataFrame(df.isna().sum(),columns=["ct"])
    for i in Q[Q.ct > 0].index:
        if(df[i].dtypes == "object"):
            x = df[i].mode()[0]
            df[i] = df[i].fillna(x)
        else:
            x = df[i].mean()
            df[i] = df[i].fillna(x)

In [4]:
def preprocessing(df):
    import pandas as pd
    cat = []
    con = []
    for i in df.columns:
        if(df[i].dtypes == "object"):
            cat.append(i)
        else:
            con.append(i)
    X1 = pd.get_dummies(df[cat])
    from sklearn.preprocessing import StandardScaler
    ss = StandardScaler()
    X2 = pd.DataFrame(ss.fit_transform(df[con]),columns=con)
    X3 = X2.join(X1)
    return X3

In [5]:
def ANOVA(df,cat,con):
    import pandas as pd
    from statsmodels.formula.api import ols
    rel = str(con) + " ~ " + str(cat)
    model = ols(rel,df).fit()
    from statsmodels.stats.anova import anova_lm
    anova_results = anova_lm(model)
    Q = pd.DataFrame(anova_results)
    a = Q['PR(>F)'][cat]
    print(round(a,5))


# Problem Statement 1: Loan Eligibility Status

# Read the dataset

In [6]:
train = pd.read_csv('C:/Users/Lenovo/Downloads/Loan/training_set (1).csv')
test = pd.read_csv('C:/Users/Lenovo/Downloads/Loan/testing_set (1).csv')

In [7]:
train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849.0,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000.0,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583.0,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000.0,0.0,141.0,360.0,1.0,Urban,Y


# Missing data treatment

In [8]:
replacer(train)

In [9]:
#train.isna().sum()

In [10]:
train.shape

(614, 13)

# cat con separate

In [11]:
cat, con = catconsep(train)

In [12]:
cat

['Loan_ID',
 'Gender',
 'Married',
 'Dependents',
 'Education',
 'Self_Employed',
 'Property_Area',
 'Loan_Status']

In [13]:
con

['ApplicantIncome',
 'CoapplicantIncome',
 'LoanAmount',
 'Loan_Amount_Term',
 'Credit_History']

In [14]:
train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849.0,0.0,146.412162,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,5405.54085,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000.0,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583.0,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000.0,0.0,141.0,360.0,1.0,Urban,Y


# Declare x & y and remove unwanted columns

In [15]:
y = train[['Loan_Status']]
x = train.drop(labels=['Loan_Status','Loan_ID'],axis=1)

# Exploratory data analysis

In [16]:
for i in x.columns:
    if(x[i].dtypes == "object"):
        print(pd.crosstab(train['Loan_Status'],train[i]))
        print("------------------------------")
    else:
        print(i,ANOVA(train,"Loan_Status",i))
        print("------------------------------")

Gender       Female  Male
Loan_Status              
N                37   155
Y                75   347
------------------------------
Married       No  Yes
Loan_Status          
N             79  113
Y            134  288
------------------------------
Dependents     0   1   2  3+
Loan_Status                 
N            113  36  25  18
Y            247  66  76  33
------------------------------
Education    Graduate  Not Graduate
Loan_Status                        
N                 140            52
Y                 340            82
------------------------------
Self_Employed   No  Yes
Loan_Status            
N              166   26
Y              366   56
------------------------------
0.90252
ApplicantIncome None
------------------------------
0.14239
CoapplicantIncome None
------------------------------
0.36769
LoanAmount None
------------------------------
0.60396
Loan_Amount_Term None
------------------------------
0.0
Credit_History None
------------------------------
Prop

# preprocessing

In [17]:
x1 = preprocessing(x)

In [18]:
#y.isna().sum()

# train test split

In [19]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest = train_test_split(x1,y,test_size=0.2, random_state=31)

In [20]:
xtest.shape

(123, 20)

In [21]:
xtrain.shape

(491, 20)

# Models

# 1. Decision Tree Classifier

In [22]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(random_state=31)
model = dtc.fit(xtrain,ytrain)

pred_tr = model.predict(xtrain)
pred_ts = model.predict(xtest)

from sklearn.metrics import accuracy_score
print('training_accuracy: ',accuracy_score(ytrain,pred_tr))
print('testing_accuracy:  ', round(accuracy_score(ytest,pred_ts),3))

training_accuracy:  1.0
testing_accuracy:   0.724


In [23]:
from sklearn.model_selection import GridSearchCV
dtc = DecisionTreeClassifier(random_state=31)
tg = {'max_depth' : range(2,50,1)}
cv = GridSearchCV(dtc, tg, scoring = 'accuracy', cv=3)
cvmodel = cv.fit(xtrain,ytrain)
cvmodel.best_params_

{'max_depth': 2}

In [24]:
# MAX_DEPTH
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(random_state=31, max_depth=2)
model = dtc.fit(xtrain,ytrain)

pred_tr = model.predict(xtrain)
pred_ts = model.predict(xtest)

from sklearn.metrics import accuracy_score
print('training_accuracy: ',round(accuracy_score(ytrain,pred_tr),3))
print('testing_accuracy:  ', round(accuracy_score(ytest,pred_ts),3))

training_accuracy:  0.8
testing_accuracy:   0.854


In [25]:
from sklearn.model_selection import GridSearchCV
dtc = DecisionTreeClassifier(random_state=31)
tg = {'min_samples_leaf' : range(1,50,1)}
cv = GridSearchCV(dtc, tg, scoring = 'accuracy', cv=3)
cvmodel = cv.fit(xtrain,ytrain)
cvmodel.best_params_

{'min_samples_leaf': 35}

In [26]:
# min_samples_leaf
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(random_state=31, min_samples_leaf=35)
model = dtc.fit(xtrain,ytrain)

pred_tr = model.predict(xtrain)
pred_ts = model.predict(xtest)

from sklearn.metrics import accuracy_score
print('training_accuracy: ',round(accuracy_score(ytrain,pred_tr),3))
print('testing_accuracy:  ', round(accuracy_score(ytest,pred_ts),3))

training_accuracy:  0.798
testing_accuracy:   0.854


In [27]:
from sklearn.model_selection import GridSearchCV
dtc = DecisionTreeClassifier(random_state=31)
tg = {'min_samples_split' : range(2,50,1)}
cv = GridSearchCV(dtc, tg, scoring = 'accuracy', cv=3)
cvmodel = cv.fit(xtrain,ytrain)
cvmodel.best_params_

{'min_samples_split': 47}

In [28]:
# min_samples_split
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(random_state=31, min_samples_split=47)
model = dtc.fit(xtrain,ytrain)

pred_tr = model.predict(xtrain)
pred_ts = model.predict(xtest)

from sklearn.metrics import accuracy_score
print('training_accuracy: ',round(accuracy_score(ytrain,pred_tr),3))
print('testing_accuracy:  ', round(accuracy_score(ytest,pred_ts),3))

training_accuracy:  0.845
testing_accuracy:   0.829


# 2. Random forest classifier

In [29]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state=31)
model = rfc.fit(xtrain, ytrain)

pred_tr = model.predict(xtrain)
pred_ts = model.predict(xtest)

from sklearn.metrics import accuracy_score
print('training_accuracy: ',round(accuracy_score(ytrain,pred_tr),3))
print('testing_accuracy:  ', round(accuracy_score(ytest,pred_ts),3))

training_accuracy:  1.0
testing_accuracy:   0.829


In [30]:
from sklearn.model_selection import GridSearchCV
rfc = RandomForestClassifier(random_state=31)
tg = {'max_depth' : range(2,50,1)}
cv = GridSearchCV(rfc, tg, scoring = 'accuracy', cv=3)
cvmodel = cv.fit(xtrain,ytrain)
cvmodel.best_params_

{'max_depth': 3}

In [31]:
# max depth
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state=31,max_depth=3)
model = rfc.fit(xtrain, ytrain)

pred_tr = model.predict(xtrain)
pred_ts = model.predict(xtest)

from sklearn.metrics import accuracy_score
print('training_accuracy: ',round(accuracy_score(ytrain,pred_tr),3))
print('testing_accuracy:  ', round(accuracy_score(ytest,pred_ts),3))

training_accuracy:  0.804
testing_accuracy:   0.854


In [32]:
from sklearn.model_selection import GridSearchCV
rfc = RandomForestClassifier(random_state=31)
tg = {'min_samples_leaf' : range(1,50,1)}
cv = GridSearchCV(rfc, tg, scoring = 'accuracy', cv=3)
cvmodel = cv.fit(xtrain,ytrain)
cvmodel.best_params_

{'min_samples_leaf': 5}

In [33]:
# min_samples_leaf
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state=31,min_samples_leaf=5)
model = rfc.fit(xtrain, ytrain)

pred_tr = model.predict(xtrain)
pred_ts = model.predict(xtest)

from sklearn.metrics import accuracy_score
print('training_accuracy: ',round(accuracy_score(ytrain,pred_tr),3))
print('testing_accuracy:  ', round(accuracy_score(ytest,pred_ts),3))

training_accuracy:  0.815
testing_accuracy:   0.854


In [34]:
from sklearn.model_selection import GridSearchCV
rfc = RandomForestClassifier(random_state=31)
tg = {'min_samples_split' : range(2,50,1)}
cv = GridSearchCV(rfc, tg, scoring = 'accuracy', cv=3)
cvmodel = cv.fit(xtrain,ytrain)
cvmodel.best_params_

{'min_samples_split': 20}

In [35]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state=31,min_samples_split=20)
model = rfc.fit(xtrain, ytrain)

pred_tr = model.predict(xtrain)
pred_ts = model.predict(xtest)

from sklearn.metrics import accuracy_score
print('training_accuracy: ',round(accuracy_score(ytrain,pred_tr),3))
print('testing_accuracy:  ', round(accuracy_score(ytest,pred_ts),3))

training_accuracy:  0.835
testing_accuracy:   0.854


# 3. Adaboost

In [36]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(random_state=31)

from sklearn.ensemble import AdaBoostClassifier
abc = AdaBoostClassifier(dtc,random_state=31,n_estimators=25)

pred_tr = model.predict(xtrain)
pred_ts = model.predict(xtest)

from sklearn.metrics import accuracy_score
print('training_accuracy: ',round(accuracy_score(ytrain,pred_tr),3))
print('testing_accuracy:  ', round(accuracy_score(ytest,pred_ts),3))

training_accuracy:  0.835
testing_accuracy:   0.854


In [37]:
from sklearn.model_selection import GridSearchCV
tp = {'n_estimators' : range(2,50,1)}
cv = GridSearchCV(abc,tp,scoring="accuracy",cv=4)
cvmodel = cv.fit(xtrain,ytrain)
print(cvmodel.best_params_)

{'n_estimators': 2}


In [38]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(random_state=31)

from sklearn.ensemble import AdaBoostClassifier
abc = AdaBoostClassifier(dtc,random_state=31,n_estimators=2)

pred_tr = model.predict(xtrain)
pred_ts = model.predict(xtest)

from sklearn.metrics import accuracy_score
print('training_accuracy: ',round(accuracy_score(ytrain,pred_tr),3))
print('testing_accuracy:  ', round(accuracy_score(ytest,pred_ts),3))

training_accuracy:  0.835
testing_accuracy:   0.854


# 4. KNC

In [39]:
from sklearn.neighbors import KNeighborsClassifier
knc = KNeighborsClassifier(n_neighbors=3)

pred_tr = model.predict(xtrain)
pred_ts = model.predict(xtest)

from sklearn.metrics import accuracy_score
print('training_accuracy: ',round(accuracy_score(ytrain,pred_tr),3))
print('testing_accuracy:  ', round(accuracy_score(ytest,pred_ts),3))

training_accuracy:  0.835
testing_accuracy:   0.854


In [40]:
from sklearn.model_selection import GridSearchCV
tp = {'n_neighbors' : range(2,50,1)}
cv = GridSearchCV(knc,tp,scoring="accuracy",cv=4)
cvmodel = cv.fit(xtrain,ytrain)
print(cvmodel.best_params_)

{'n_neighbors': 11}


In [41]:
from sklearn.neighbors import KNeighborsClassifier
knc = KNeighborsClassifier(n_neighbors=11)

pred_tr = model.predict(xtrain)
pred_ts = model.predict(xtest)

from sklearn.metrics import accuracy_score
print('training_accuracy: ',round(accuracy_score(ytrain,pred_tr),3))
print('testing_accuracy:  ', round(accuracy_score(ytest,pred_ts),3))

training_accuracy:  0.835
testing_accuracy:   0.854


# Compare the models

In [42]:
Model_Comparison = pd.DataFrame({'Model': ['DTC_max_depth', 'DTC_min_samples_split', 'DTC_min_samples_leaf','RFC_max_depth', 
                         'RFC_min_samples_split', 'RFC_min_samples_split','Adaboost', 'KNeighbors'], 
               
                'training_accuracy': [0.8, 0.845 , 0.798 , 0.804, 0.835, 0.815, 0.835, 0.835],
                'testing_accuracy': [0.854, 0.829, 0.854, 0.854, 0.854, 0.854, 0.854, 0.854]})

In [43]:
Model_Comparison.sort_values(by='testing_accuracy', ascending=False)

Unnamed: 0,Model,training_accuracy,testing_accuracy
0,DTC_max_depth,0.8,0.854
2,DTC_min_samples_leaf,0.798,0.854
3,RFC_max_depth,0.804,0.854
4,RFC_min_samples_split,0.835,0.854
5,RFC_min_samples_split,0.815,0.854
6,Adaboost,0.835,0.854
7,KNeighbors,0.835,0.854
1,DTC_min_samples_split,0.845,0.829


# Logistic regression model

In [44]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr_model = lr.fit(xtrain,ytrain)

# Test data

In [45]:
test.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [46]:
test1 = test.drop(labels=['Loan_ID'],axis=1)

In [47]:
replacer(test1)

In [48]:
test2 = preprocessing(test1)

In [49]:
test2.shape

(367, 20)

# predictions

In [50]:
pred = lr_model.predict(test2)

In [51]:
pred

array(['Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y',
       'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'Y',
       'Y', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y',
       'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'N', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N',
       'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'N', 'N', 'Y', 'N',
       'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y

In [52]:
test["Predicted_loan_status"] = pred

In [53]:
result_1 = test[['Loan_ID','Predicted_loan_status']]

# Problem Statement 2: Loan amount prediction

# read the data

In [64]:
a1 = train[train['Loan_Status'] =='Y']
a2 = test[test['Predicted_loan_status'] == "Y"]

In [65]:
a1.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849.0,0.0,146.412162,360.0,1.0,Urban,Y
2,LP001005,Male,Yes,0,Graduate,Yes,3000.0,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583.0,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000.0,0.0,141.0,360.0,1.0,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,5417.0,4196.0,267.0,360.0,1.0,Urban,Y


In [66]:
a2.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Predicted_loan_status
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban,Y
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban,Y
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban,Y
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban,Y
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban,Y


# drop unnecessary columns

In [67]:
a1 = a1.drop(labels=['Loan_ID'],axis=1)
a2 = a2.drop(labels=['Loan_ID'],axis=1)

# missing data treatment

In [68]:
replacer(a1)
replacer(a2)

In [69]:
a1.shape

(422, 12)

In [70]:
a2

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Predicted_loan_status
0,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban,Y
2,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban,Y
3,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,1.0,Urban,Y
4,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
362,Male,Yes,3+,Not Graduate,Yes,4009,1777,113.0,360.0,1.0,Urban,Y
363,Male,Yes,0,Graduate,No,4158,709,115.0,360.0,1.0,Urban,Y
364,Male,No,0,Graduate,No,3250,1993,126.0,360.0,1.0,Semiurban,Y
365,Male,Yes,0,Graduate,No,5000,2393,158.0,360.0,1.0,Rural,Y


# concat the 'Y' datasets

In [71]:
a1.columns = a2.columns

In [72]:
a = pd.concat([a1,a2])

# separate cat-con columns

In [73]:
cat1,con1 = catconsep(a)

In [74]:
cat1

['Gender',
 'Married',
 'Dependents',
 'Education',
 'Self_Employed',
 'Property_Area',
 'Predicted_loan_status']

In [75]:
con1

['ApplicantIncome',
 'CoapplicantIncome',
 'LoanAmount',
 'Loan_Amount_Term',
 'Credit_History']

# Declare x and y

In [76]:
x2 = a.drop(labels=["LoanAmount"],axis=1)
y2 = a[["LoanAmount"]]

# preprocessing

In [77]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

for i in cat1:
    x2[i] = le.fit_transform(x2[i])

In [78]:
x2['Loan_Amount_Term'] = np.log(x2['Loan_Amount_Term'])

In [79]:
x2.shape

(724, 11)

In [80]:
y2.shape

(724, 1)

In [81]:
x2.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,Loan_Amount_Term,Credit_History,Property_Area,Predicted_loan_status
0,1,0,0,0,0,5849.0,0.0,5.886104,1.0,2,0
2,1,1,0,0,1,3000.0,0.0,5.886104,1.0,2,0
3,1,1,0,1,0,2583.0,2358.0,5.886104,1.0,2,0
4,1,0,0,0,0,6000.0,0.0,5.886104,1.0,2,0
5,1,1,2,0,1,5417.0,4196.0,5.886104,1.0,2,0


In [82]:
y2.head()

Unnamed: 0,LoanAmount
0,146.412162
2,66.0
3,120.0
4,141.0
5,267.0


# split the dataset 

In [83]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest = train_test_split(x2, y2, test_size=0.2, random_state=31)

# Regression models

In [84]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
linear_model = lr.fit(xtrain,ytrain)

# 1: Lasso

In [85]:
from sklearn.linear_model import Lasso
ls = Lasso()

tg = []          # tuning grid
x = 0.50

for i in range(0,100,1):
    x = x + 0.001
    x = round(x,3)
    tg.append(x)
    
tp = {'alpha': tg}     # tuning parameter

from sklearn.model_selection import GridSearchCV
cv = GridSearchCV(ls,tp, scoring='neg_mean_squared_error', cv = 4)
cvmodel = cv.fit(xtrain,ytrain)
l = cvmodel.best_params_['alpha']

ls = Lasso(alpha = l)
lasso_model = ls.fit(xtrain,ytrain)

# 2: Ridge

In [86]:
from sklearn.linear_model import Ridge
rg = Ridge()

tg = []          # tuning grid
x = 0.50

for i in range(0,100,1):
    x = x + 0.001
    x = round(x,3)
    tg.append(x)
    
tp = {'alpha': tg}     # tuning parameter

from sklearn.model_selection import GridSearchCV
cv = GridSearchCV(rg, tp, scoring='neg_mean_squared_error', cv = 4)
cvmodel = cv.fit(xtrain,ytrain)
r = cvmodel.best_params_['alpha']

rg = Ridge(alpha = r)
ridge_model = rg.fit(xtrain,ytrain)

# 2: Decision tree regressor

In [87]:
from sklearn.tree import DecisionTreeRegressor
dtr = DecisionTreeRegressor()
dtr_model = dtr.fit(xtrain,ytrain)

# 3: Adaboost regressor

In [88]:
from sklearn.ensemble import AdaBoostRegressor
abr = AdaBoostRegressor(dtr,random_state=31)

from sklearn.model_selection import GridSearchCV
tp = {'n_estimators' : range(2,50,1)}
cv = GridSearchCV(abr,tp,scoring="neg_mean_absolute_error",cv=4)
cvmodel = cv.fit(xtrain,ytrain)
ne = cvmodel.best_params_['n_estimators']

abr = AdaBoostRegressor(dtr, random_state=31, n_estimators = ne)
abr_model = abr.fit(xtrain,ytrain)

# 4: KNN regressor

In [89]:
from sklearn.neighbors import KNeighborsRegressor
knr = KNeighborsRegressor()

from sklearn.model_selection import GridSearchCV
tp = {'n_neighbors' : range(2,50,1)}
cv = GridSearchCV(knr,tp,scoring="neg_mean_absolute_error",cv=4)
cvmodel = cv.fit(xtrain,ytrain)
nn = cvmodel.best_params_['n_neighbors']

knr = KNeighborsRegressor(n_neighbors = nn)
knr_model = knr.fit(xtrain,ytrain)

# testing 'N' data

In [90]:
b = test[test['Predicted_loan_status'] == "N"]

In [91]:
b.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Predicted_loan_status
7,LP001056,Male,Yes,2,Not Graduate,No,3881,0,147.0,360.0,0.0,Rural,N
13,LP001094,Male,Yes,2,Graduate,,12173,0,166.0,360.0,0.0,Semiurban,N
25,LP001153,Male,No,0,Graduate,No,0,24000,148.0,360.0,0.0,Rural,N
35,LP001203,Male,No,0,Graduate,No,3150,0,176.0,360.0,0.0,Semiurban,N
55,LP001313,Male,No,0,Graduate,No,2750,0,130.0,360.0,0.0,Urban,N


In [92]:
b1 = b.drop(labels=['LoanAmount'],axis=1)

In [93]:
cat2,con2 = catconsep(b1)

In [94]:
cat2

['Loan_ID',
 'Gender',
 'Married',
 'Dependents',
 'Education',
 'Self_Employed',
 'Property_Area',
 'Predicted_loan_status']

In [95]:
con2

['ApplicantIncome', 'CoapplicantIncome', 'Loan_Amount_Term', 'Credit_History']

In [96]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

for i in cat2:
    b1[i] = le.fit_transform(b[i])

In [97]:
b1['Loan_Amount_Term'] = np.log(b1['Loan_Amount_Term'])

In [98]:
b1.shape

(65, 12)

In [99]:
b1.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,Loan_Amount_Term,Credit_History,Property_Area,Predicted_loan_status
7,0,1,1,2,1,0,3881,0,5.886104,0.0,0,0
13,1,1,1,2,0,2,12173,0,5.886104,0.0,1,0
25,2,1,0,0,0,0,0,24000,5.886104,0.0,0,0
35,3,1,0,0,0,0,3150,0,5.886104,0.0,1,0
55,4,1,0,0,0,0,2750,0,5.886104,0.0,2,0


In [100]:
b2 = b1.drop(labels=['Loan_ID'],axis=1)

In [101]:
replacer(b2)

# predictions

In [102]:
pred2 = linear_model.predict(b2)   # Linear 

In [103]:
pred3 =lasso_model.predict(b2) # lasso

In [104]:
pred4 = ridge_model.predict(b2)    # Ridge

In [105]:
pred5 = dtr_model.predict(b2)       # Decison Tree Regressor

In [106]:
pred6 = abr_model.predict(b2)        # Adaboost regressor

In [107]:
pred7 = knr_model.predict(b2)      # K neighbor regressor

In [108]:
b["Pred_loan_amt_Linear"] = pred2
b["Pred_loan_amt_lasso"] = pred3
b["Pred_loan_amt_ridge"] = pred4
b["Pred_loan_amt_dtr"] = pred5
b["Pred_loan_amt_abr"] = pred6
b["Pred_loan_amt_knr"] = pred7

In [109]:
result_2 = b[['LoanAmount','Pred_loan_amt_Linear','Pred_loan_amt_lasso','Pred_loan_amt_ridge','Pred_loan_amt_dtr','Pred_loan_amt_abr','Pred_loan_amt_knr']]

# Problem Statement 3: Loan term prediction

In [110]:
p1 = train[(train['Loan_Status'] == 'Y') & (train['Loan_Amount_Term'] <= 240)]
p2 = test[(test['Predicted_loan_status'] == 'Y') & (test['Loan_Amount_Term'] <= 240)]

In [111]:
p3 = p2.drop(labels=['Loan_ID'],axis=1)
p1 = p1.drop(labels=['Loan_ID'],axis=1)

In [112]:
replacer(p1)
replacer(p3)

In [113]:
p1.columns = p3.columns

In [114]:
p = pd.concat([p1,p3])

In [115]:
p.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Predicted_loan_status
14,Male,Yes,2,Graduate,No,1299.0,1086.0,17.0,120.0,1.0,Urban,Y
16,Male,No,1,Not Graduate,No,3596.0,0.0,100.0,240.0,0.842199,Urban,Y
68,Male,Yes,3+,Not Graduate,Yes,7100.0,0.0,125.0,60.0,1.0,Urban,Y
84,Male,Yes,1,Graduate,No,3988.0,0.0,50.0,240.0,1.0,Urban,Y
91,Male,Yes,2,Graduate,No,6250.0,5654.0,188.0,180.0,1.0,Semiurban,Y


In [116]:
p.shape

(69, 12)

In [117]:
cat3 , con3 = catconsep(p)

In [118]:
cat3

['Gender',
 'Married',
 'Dependents',
 'Education',
 'Self_Employed',
 'Property_Area',
 'Predicted_loan_status']

In [119]:
con3

['ApplicantIncome',
 'CoapplicantIncome',
 'LoanAmount',
 'Loan_Amount_Term',
 'Credit_History']

In [120]:
#define x and y
x3 = p.drop(labels=["Loan_Amount_Term"],axis=1)
y3 = p[["Loan_Amount_Term"]]

In [121]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

for i in cat3:
    x3[i] = le.fit_transform(x3[i])

In [122]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest = train_test_split(x3, y3, test_size=0.2, random_state=31)

# Models

In [123]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
linear_model_2 = lr.fit(xtrain,ytrain)

# 1. Lasso

In [124]:
from sklearn.linear_model import Lasso
ls = Lasso()

tg = []          # tuning grid
x = 0.50

for i in range(0,100,1):
    x = x + 0.001
    x = round(x,3)
    tg.append(x)
    
tp = {'alpha': tg}     # tuning parameter

from sklearn.model_selection import GridSearchCV
cv = GridSearchCV(ls,tp, scoring='neg_mean_squared_error', cv = 4)
cvmodel = cv.fit(xtrain,ytrain)
l = cvmodel.best_params_['alpha']

ls = Lasso(alpha = l)
lasso_model = ls.fit(xtrain,ytrain)

# 2. Ridge

In [125]:
from sklearn.linear_model import Ridge
rg = Ridge()

tg = []          # tuning grid
x = 0.50

for i in range(0,100,1):
    x = x + 0.001
    x = round(x,3)
    tg.append(x)
    
tp = {'alpha': tg}     # tuning parameter

from sklearn.model_selection import GridSearchCV
cv = GridSearchCV(rg, tp, scoring='neg_mean_squared_error', cv = 4)
cvmodel = cv.fit(xtrain,ytrain)
r = cvmodel.best_params_['alpha']

rg = Ridge(alpha = r)
ridge_model = rg.fit(xtrain,ytrain)

# 3. DecisionTreeRegressor

In [126]:
from sklearn.tree import DecisionTreeRegressor
dtr = DecisionTreeRegressor()
dtr_model = dtr.fit(xtrain,ytrain)

# 4. AdaBoostRegressor

In [127]:
from sklearn.ensemble import AdaBoostRegressor
abr = AdaBoostRegressor(dtr,random_state=31)

from sklearn.model_selection import GridSearchCV
tp = {'n_estimators' : range(2,50,1)}
cv = GridSearchCV(abr,tp,scoring="neg_mean_absolute_error",cv=4)
cvmodel = cv.fit(xtrain,ytrain)
ne = cvmodel.best_params_['n_estimators']

abr = AdaBoostRegressor(dtr, random_state=31, n_estimators = ne)
abr_model = abr.fit(xtrain,ytrain)

# 5. KNeighborsRegressor

In [128]:
from sklearn.neighbors import KNeighborsRegressor
knr = KNeighborsRegressor()

from sklearn.model_selection import GridSearchCV
tp = {'n_neighbors' : range(2,50,1)}
cv = GridSearchCV(knr,tp,scoring="neg_mean_absolute_error",cv=4)
cvmodel = cv.fit(xtrain,ytrain)
nn = cvmodel.best_params_['n_neighbors']

knr = KNeighborsRegressor(n_neighbors = nn)
knr_model = knr.fit(xtrain,ytrain)

# testing set

In [129]:
q = test[(test['Predicted_loan_status'] == 'N') & (test['Loan_Amount_Term'] <= 240)]

In [130]:
q1 = q.drop(labels=['Loan_Amount_Term'],axis=1)

In [131]:
q1

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Credit_History,Property_Area,Predicted_loan_status
84,LP001450,Male,Yes,0,Graduate,No,4456,0,131.0,0.0,Semiurban,N
164,LP001921,Male,No,1,Graduate,No,3180,2370,80.0,,Rural,N
173,LP001979,Male,No,0,Graduate,No,3017,2845,159.0,0.0,Urban,N
245,LP002355,,Yes,0,Graduate,No,3186,3145,150.0,0.0,Semiurban,N
325,LP002802,Male,No,0,Graduate,No,2875,2416,95.0,0.0,Semiurban,N
354,LP002921,Male,Yes,3+,Not Graduate,No,5316,187,158.0,0.0,Semiurban,N


In [132]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

for i in cat3:
    q1[i] = le.fit_transform(q1[i])

In [133]:
q1

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Credit_History,Property_Area,Predicted_loan_status
84,LP001450,0,1,0,0,0,4456,0,131.0,0.0,1,0
164,LP001921,0,0,1,0,0,3180,2370,80.0,,0,0
173,LP001979,0,0,0,0,0,3017,2845,159.0,0.0,2,0
245,LP002355,1,1,0,0,0,3186,3145,150.0,0.0,1,0
325,LP002802,0,0,0,0,0,2875,2416,95.0,0.0,1,0
354,LP002921,0,1,2,1,0,5316,187,158.0,0.0,1,0


In [134]:
q1 = q1.drop(labels=['Loan_ID'],axis=1)

In [135]:
q1.shape

(6, 11)

In [136]:
replacer(q1)

# prediction

In [137]:
pred = linear_model_2.predict(q1)
pred2 = lasso_model.predict(q1)
pred3 = ridge_model.predict(q1)
pred4 = dtr_model.predict(q1)
pred5 = abr_model.predict(q1)
pred6 = knr_model.predict(q1)

In [138]:
q["Pred_loan_amt_term_Linear"] = pred
q["Pred_loan_term_lasso"] = pred2
q["Pred_loan_amt_term_ridge"] = pred3
q["Pred_loan_amt_term_dtr"] = pred4
q["Pred_loan_amt_term_abr"] = pred5
q["Pred_loan_amt_term_knr"] = pred6

In [139]:
result_3 = q[['Loan_Amount_Term','Pred_loan_amt_term_Linear','Pred_loan_term_lasso','Pred_loan_amt_term_ridge','Pred_loan_amt_term_dtr','Pred_loan_amt_term_abr','Pred_loan_amt_term_knr']]

# Results

In [140]:
result_1             # Loan Status

Unnamed: 0,Loan_ID,Predicted_loan_status
0,LP001015,Y
1,LP001022,Y
2,LP001031,Y
3,LP001035,Y
4,LP001051,Y
...,...,...
362,LP002971,Y
363,LP002975,Y
364,LP002980,Y
365,LP002986,Y


In [141]:
result_2               # Loan amount

Unnamed: 0,LoanAmount,Pred_loan_amt_Linear,Pred_loan_amt_lasso,Pred_loan_amt_ridge,Pred_loan_amt_dtr,Pred_loan_amt_abr,Pred_loan_amt_knr
7,147.0,171.204801,127.843725,166.876274,135.0,121.0,97.760000
13,166.0,261.276911,206.447222,256.640629,260.0,260.0,227.916352
25,148.0,267.624255,225.809031,263.282532,103.0,103.0,188.720000
35,176.0,146.971543,103.515135,142.562796,100.0,100.0,92.560000
55,130.0,137.812364,94.995843,133.399618,71.0,70.0,81.560000
...,...,...,...,...,...,...,...
336,145.0,134.719421,95.309040,130.524191,84.0,110.0,121.040000
339,162.0,178.822876,132.598378,174.358331,111.0,158.0,146.520000
346,133.0,186.131755,139.083175,181.605311,110.0,116.0,129.280000
351,106.0,232.187534,191.162012,227.978746,103.0,103.0,189.920000


In [142]:
result_3                 # loan amount term

Unnamed: 0,Loan_Amount_Term,Pred_loan_amt_term_Linear,Pred_loan_term_lasso,Pred_loan_amt_term_ridge,Pred_loan_amt_term_dtr,Pred_loan_amt_term_abr,Pred_loan_amt_term_knr
84,180.0,114.508385,135.216569,121.438452,84.0,180.0,152.487805
164,240.0,142.572318,154.639895,147.447734,180.0,180.0,151.02439
173,180.0,112.168624,128.638464,118.0429,180.0,60.0,151.02439
245,180.0,130.209632,141.257271,133.500835,180.0,180.0,151.02439
325,6.0,123.580199,138.687231,129.178277,180.0,180.0,151.02439
354,180.0,138.643787,153.306788,144.826198,84.0,180.0,153.95122
