In [55]:
import os
import pickle
import warnings

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides convergence/deprecation warnings from the models below.
warnings.filterwarnings(action="ignore")

In [2]:
# Load the pre-cleaned loan dataset (path is relative to the notebook's working directory)
df = pd.read_csv(filepath_or_buffer="cleaned_data")
# Preview the first five rows
df.head(n=5)

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [3]:
# Remove the row identifier and the luxury-assets column from the feature table.
# NOTE: rebinding `df` makes this cell non-idempotent; re-running it raises KeyError.
df = df.drop(columns=['loan_id', 'luxury_assets_value'])

In [4]:
# Confirm the two dropped columns are gone
df.head(n=2)

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,bank_asset_value,loan_status
0,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,8000000,Approved
1,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,3300000,Rejected


In [7]:
# Boolean mask indexed by column name: True where the column dtype is `object`
categorical_feature = df.dtypes == object
# Names of the object-dtype columns, in their original order
categorical_cols = [name for name, is_object in categorical_feature.items() if is_object]

In [8]:
# import LabelEncoder, which maps each distinct label in a column to an integer code
from sklearn.preprocessing import LabelEncoder
# instantiate the encoder object used by the categorical-encoding step
le = LabelEncoder()

In [9]:
# Encode every categorical column with its OWN LabelEncoder.
# Refitting one shared encoder per column discards the mapping of every column
# except the last one, so the codes could not be inverted or re-applied to new
# data at prediction time. `label_encoders[col].classes_` keeps each mapping.
# LabelEncoder assigns codes in sorted label order (for loan_status the preview
# shows Approved -> 0, Rejected -> 1).
label_encoders = {}
for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])

# Backward compatibility: as before, `le` ends up fitted on the last encoded column.
if categorical_cols:
    le = label_encoders[categorical_cols[-1]]

df[categorical_cols].head(10)

Unnamed: 0,education,self_employed,loan_status
0,0,0,0
1,1,1,1
2,0,0,1
3,0,0,1
4,1,1,1
5,0,1,1
6,0,0,0
7,0,1,1
8,0,1,0
9,1,0,1


In [10]:
# Preview the frame now that the categorical columns hold integer codes
df.head(n=5)

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,bank_asset_value,loan_status
0,2,0,0,9600000,29900000,12,778,2400000,17600000,8000000,0
1,0,1,1,4100000,12200000,8,417,2700000,2200000,3300000,1
2,3,0,0,9100000,29700000,20,506,7100000,4500000,12800000,1
3,3,0,0,8200000,30700000,8,467,18200000,3300000,7900000,1
4,5,1,1,9800000,24200000,20,382,12400000,8200000,5000000,1


In [12]:
# Split the frame into the feature matrix (X) and the target vector (y)
X = df.drop('loan_status', axis='columns')
y = df.loc[:, 'loan_status']

In [14]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)
(X_train.shape, X_test.shape)

((3154, 10), (1052, 10))

In [16]:
##standard Scaling- Standardization
def scaler_standard(X_train, X_test):
    """Standardize train/test features and persist the fitted scaler.

    The StandardScaler is fitted on ``X_train`` only and then applied to
    ``X_test``, so test-set statistics never influence the transformation.

    Parameters
    ----------
    X_train : array-like
        Training features used to fit the scaler.
    X_test : array-like
        Held-out features, transformed with the training statistics.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled)`` as returned by the scaler.

    Side effects
    ------------
    Pickles the fitted scaler to ``Model/standardScalar.pkl`` (relative to the
    current working directory), creating the ``Model`` directory if needed.
    """
    #scaling the data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    #saving the fitted scaler; `with` closes the handle even if pickling fails
    scaler_path = 'Model/standardScalar.pkl'
    os.makedirs(os.path.dirname(scaler_path), exist_ok=True)
    with open(scaler_path, 'wb') as file:
        pickle.dump(scaler, file)

    return X_train_scaled, X_test_scaled

In [19]:
# Standardize both splits (this call also pickles the fitted scaler to disk)
X_train_scaled, X_test_scaled = scaler_standard(X_train=X_train, X_test=X_test)

In [20]:
# Display the standardized training matrix (NumPy abbreviates the repr of large arrays)
X_train_scaled

array([[-0.87791353, -1.01404922, -1.00190416, ..., -0.82263847,
        -0.37649305,  0.14482528],
       [ 0.89628933, -1.01404922,  0.99809946, ..., -0.09743472,
         0.40489634,  1.89290297],
       [-1.46931448, -1.01404922, -1.00190416, ..., -0.7904072 ,
        -0.03176243, -0.07368443],
       ...,
       [ 0.89628933, -1.01404922, -1.00190416, ..., -1.11271997,
        -0.85911591, -1.10380164],
       [ 0.89628933, -1.01404922, -1.00190416, ..., -1.14495125,
        -1.06595428, -1.41595837],
       [-1.46931448, -1.01404922,  0.99809946, ...,  0.62776903,
         1.89872901,  0.11360961]])

In [45]:
# Logistic regression with default hyper-parameters, scored on the held-out split
log_reg = LogisticRegression().fit(X_train_scaled, y_train)
y_pred = log_reg.predict(X_test_scaled)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9125475285171103


In [46]:
# k-nearest neighbours with default settings, scored on the held-out split
knn = KNeighborsClassifier().fit(X_train_scaled, y_train)
y_pred = knn.predict(X_test_scaled)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9068441064638784


In [47]:
# Support-vector classifier with default settings, scored on the held-out split
svc = SVC().fit(X_train_scaled, y_train)
y_pred = svc.predict(X_test_scaled)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9363117870722434


In [56]:
# Decision tree, scored on the held-out split.
# random_state is fixed because the tree randomly permutes features at each
# split; without a seed the fitted tree (and the model pickled from it) can
# differ between runs. 0 matches the seed used for train_test_split.
DT = DecisionTreeClassifier(random_state=0)
DT.fit(X_train_scaled,y_train)
y_pred = DT.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
# Per-class precision / recall / F1 (encoded labels: 0 = Approved, 1 = Rejected)
report = classification_report(y_test, y_pred)
print(report)

Accuracy: 0.9923954372623575
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       669
           1       0.99      0.99      0.99       383

    accuracy                           0.99      1052
   macro avg       0.99      0.99      0.99      1052
weighted avg       0.99      0.99      0.99      1052



In [49]:
# Random forest, scored on the held-out split.
# random_state is fixed because bootstrap sampling and per-split feature
# sub-sampling are random; without a seed the accuracy changes on every run.
# 0 matches the seed used for train_test_split.
rfc = RandomForestClassifier(random_state=0)
rfc.fit(X_train_scaled,y_train)
y_pred = rfc.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9866920152091255


In [50]:
# AdaBoost, scored on the held-out split.
# random_state seeds the base estimators built at each boosting round so the
# result is reproducible; 0 matches the seed used for train_test_split.
abc = AdaBoostClassifier(random_state=0)
abc.fit(X_train_scaled,y_train)
y_pred = abc.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9714828897338403


In [51]:
# Gradient boosting, scored on the held-out split.
# random_state seeds the individual trees (including their feature permutation
# at each split) so the result is reproducible; 0 matches the seed used for
# train_test_split.
gbc = GradientBoostingClassifier(random_state=0)
gbc.fit(X_train_scaled,y_train)
y_pred = gbc.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9809885931558935


In [52]:
# Gaussian naive Bayes with default settings, scored on the held-out split
gnb = GaussianNB().fit(X_train_scaled, y_train)
y_pred = gnb.predict(X_test_scaled)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9258555133079848


In [53]:
# XGBoost classifier with default settings, scored on the held-out split
xgb = XGBClassifier()
xgb.fit(X_train_scaled, y_train)
y_pred = xgb.predict(X_test_scaled)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9866920152091255


The Decision Tree classifier achieves the highest test-set accuracy of the models tried (about 99%), so it is the model saved for prediction below.

In [57]:
# Persist the fitted decision tree for later prediction.
# The target directory is created if missing, and `with` guarantees the file
# handle is closed even if pickling raises.
model_path = 'Model/modelForPrediction.pkl'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
with open(model_path, 'wb') as file:
    pickle.dump(DT, file)