# READ THE DATASET

In [1]:
import pandas as pd

data = pd.read_csv('dataset/train_u6lujuX_CVtuZ9i.csv')
print(data.head())

    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001002   Male      No          0      Graduate            No   
1  LP001003   Male     Yes          1      Graduate            No   
2  LP001005   Male     Yes          0      Graduate           Yes   
3  LP001006   Male     Yes          0  Not Graduate            No   
4  LP001008   Male      No          0      Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0         NaN             360.0   
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   
4             6000                0.0       141.0             360.0   

   Credit_History Property_Area Loan_Status  
0             1.0         Urban           Y  
1             1.0         Rural           N  
2             1.0   

Features

# Loan_ID : Unique Loan ID

# Gender : Male/ Female

# Married : Applicant married (Y/N)

# Dependents : Number of dependents

# Education : Applicant Education (Graduate/ Under Graduate)

# Self_Employed : Self employed (Y/N)

# ApplicantIncome : Applicant income

# CoapplicantIncome : Coapplicant income

# LoanAmount : Loan amount in thousands of dollars

# Loan_Amount_Term : Term of loan in months

# Credit_History : Credit history meets guidelines yes or no

# Property_Area : Urban/ Semi Urban/ Rural

# Loan_Status : Loan approved (Y/N) this is the target variable

In [2]:
data.tail()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y
613,LP002990,Female,No,0,Graduate,Yes,4583,0.0,133.0,360.0,0.0,Semiurban,N


# data.shape --> (Num of Row, Num of Col)

In [3]:
data.shape

(614, 13)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


# NULL VALUE COUNT IN COLUMNWISE 

In [5]:
data.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

# NULL VALUE PARCENTAGE IN COLUMNWISE

In [6]:
data.isnull().sum()*100/len(data)

Loan_ID              0.000000
Gender               2.117264
Married              0.488599
Dependents           2.442997
Education            0.000000
Self_Employed        5.211726
ApplicantIncome      0.000000
CoapplicantIncome    0.000000
LoanAmount           3.583062
Loan_Amount_Term     2.280130
Credit_History       8.143322
Property_Area        0.000000
Loan_Status          0.000000
dtype: float64

# DROP UNNECESSARY COLUMN

In [7]:
data=data.drop('Loan_ID',axis=1)
data.head(1)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y


# DROP ALL THE NULL VALUE CONTAINS ROW 

In [8]:
columns = ["Gender", "Dependents", "LoanAmount", "Loan_Amount_Term"]
data = data.dropna(subset=columns, axis=0)
data.isnull().sum()


Gender                0
Married               0
Dependents            0
Education             0
Self_Employed        30
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term      0
Credit_History       48
Property_Area         0
Loan_Status           0
dtype: int64

In [9]:
data.isnull().sum()*100/len(data)

Gender               0.000000
Married              0.000000
Dependents           0.000000
Education            0.000000
Self_Employed        5.424955
ApplicantIncome      0.000000
CoapplicantIncome    0.000000
LoanAmount           0.000000
Loan_Amount_Term     0.000000
Credit_History       8.679928
Property_Area        0.000000
Loan_Status          0.000000
dtype: float64

In [10]:
data.shape

(553, 12)

# REASSING VALUE IN THE NULL VALUE

In [11]:
data['Self_Employed'].mode()
# most frequent value in Self_Employed column 

0    No
Name: Self_Employed, dtype: object

In [12]:
data['Self_Employed']=data['Self_Employed'].fillna(data['Self_Employed'].mode()[0])

In [13]:
data.isnull().sum()


Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term      0
Credit_History       48
Property_Area         0
Loan_Status           0
dtype: int64

In [14]:
data['Credit_History'].unique()

array([ 1.,  0., nan])

In [15]:
data['Credit_History'].mode()

0    1.0
Name: Credit_History, dtype: float64

In [16]:
data['Credit_History']=data['Credit_History'].fillna(data['Credit_History'].mode()[0])

In [17]:
data.isnull().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

# RANDOM 5 SAMPLE

In [18]:
data.sample(5)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
149,Male,Yes,0,Graduate,No,4860,830.0,125.0,360.0,1.0,Semiurban,Y
110,Male,No,0,Graduate,No,5316,0.0,136.0,360.0,1.0,Urban,Y
146,Female,Yes,2,Graduate,No,14866,0.0,70.0,360.0,1.0,Urban,Y
268,Female,No,0,Graduate,No,3418,0.0,135.0,360.0,1.0,Rural,N
508,Male,Yes,0,Graduate,Yes,2479,3013.0,188.0,360.0,1.0,Urban,Y


In [19]:
data["Dependents"].unique()

array(['1', '0', '2', '3+'], dtype=object)

# DATA REPLACE

In [20]:
data["Dependents"]=data["Dependents"].replace(to_replace="3+", value="4")

In [21]:
data["Dependents"].unique()

array(['1', '0', '2', '4'], dtype=object)

# DATA UPDATE IN NUMBER

In [22]:
data["Gender"].unique()

array(['Male', 'Female'], dtype=object)

In [23]:
data["Gender"]=data["Gender"].map({'Male': 1, 'Female': 0}).astype(int)

In [24]:
data["Gender"].unique()

array([1, 0])

In [25]:
data["Married"]=data["Married"].map({'Yes': 1, 'No': 0}).astype(int)
data["Education"]=data["Education"].map({'Graduate': 1, 'Not Graduate': 0}).astype(int)
data["Self_Employed"]=data["Self_Employed"].map({'Yes': 1, 'No': 0}).astype(int)
data["Property_Area"]=data["Property_Area"].map({'Urban': 2, 'Rural': 0, 'Semiurban': 1}).astype(int)
data["Loan_Status"]=data["Loan_Status"].map({'Y': 1, 'N': 0}).astype(int)

In [26]:
data.tail()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
609,0,0,0,1,0,2900,0.0,71.0,360.0,1.0,0,1
610,1,1,4,1,0,4106,0.0,40.0,180.0,1.0,0,1
611,1,1,1,1,0,8072,240.0,253.0,360.0,1.0,2,1
612,1,1,2,1,0,7583,0.0,187.0,360.0,1.0,2,1
613,0,0,0,1,1,4583,0.0,133.0,360.0,0.0,1,0


# STORE FEATURE MATRIX

In [27]:
X=data.drop('Loan_Status', axis=1)
X

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
1,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0
2,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,2
3,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,2
4,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,2
5,1,1,2,1,1,5417,4196.0,267.0,360.0,1.0,2
...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,1,0,2900,0.0,71.0,360.0,1.0,0
610,1,1,4,1,0,4106,0.0,40.0,180.0,1.0,0
611,1,1,1,1,0,8072,240.0,253.0,360.0,1.0,2
612,1,1,2,1,0,7583,0.0,187.0,360.0,1.0,2


In [28]:
y=data['Loan_Status']
y

1      0
2      1
3      1
4      1
5      1
      ..
609    1
610    1
611    1
612    1
613    0
Name: Loan_Status, Length: 553, dtype: int64

# FEATURE SCALLING

In [29]:
cols = ["ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term"]


In [30]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X[cols]= scaler.fit_transform(X[cols])
X

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
1,1,1,1,1,0,-0.128694,-0.049699,-0.214368,0.279961,1.0,0
2,1,1,0,1,1,-0.394296,-0.545638,-0.952675,0.279961,1.0,2
3,1,1,0,0,0,-0.464262,0.229842,-0.309634,0.279961,1.0,2
4,1,0,0,1,0,0.109057,-0.545638,-0.059562,0.279961,1.0,2
5,1,1,2,1,1,0.011239,0.834309,1.440866,0.279961,1.0,2
...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,1,0,-0.411075,-0.545638,-0.893134,0.279961,1.0,0
610,1,1,4,1,0,-0.208727,-0.545638,-1.262287,-2.468292,1.0,0
611,1,1,1,1,0,0.456706,-0.466709,1.274152,0.279961,1.0,2
612,1,1,2,1,0,0.374659,-0.545638,0.488213,0.279961,1.0,2


# SPLIT DATASET AND K-FOLD CROSS VALIDATION

In [31]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
import numpy as np

model_df={}
def model_val(model, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"{model} Accuracy: {accuracy_score(y_true=y_test, y_pred=y_pred)}")
    score = cross_val_score(model, X, y, cv=5)
    print(f"{model} Cross-Validation Score: {np.mean(score)}")
    model_df[model]=round(np.mean(score)*100, 2)
    

# Logistic Regession

In [32]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model_val(model, X, y)

model_df

LogisticRegression() Accuracy: 0.7747747747747747
LogisticRegression() Cross-Validation Score: 0.802964782964783


{LogisticRegression(): np.float64(80.3)}

# SUPPORT VECTOR CLASSIFIER

In [33]:
from sklearn import svm
model = svm.SVC()
model_val(model, X, y)

model_df

SVC() Accuracy: 0.7477477477477478
SVC() Cross-Validation Score: 0.7938902538902539


{LogisticRegression(): np.float64(80.3), SVC(): np.float64(79.39)}

# DECISION TREE CLASSIFIER

In [34]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model_val(model, X, y)

model_df

DecisionTreeClassifier() Accuracy: 0.6666666666666666
DecisionTreeClassifier() Cross-Validation Score: 0.7124488124488125


{LogisticRegression(): np.float64(80.3),
 SVC(): np.float64(79.39),
 DecisionTreeClassifier(): np.float64(71.24)}

# RANDOM FOREST CLASSIFIER

In [35]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model_val(model, X, y)

model_df

RandomForestClassifier() Accuracy: 0.7837837837837838
RandomForestClassifier() Cross-Validation Score: 0.7830466830466831


{LogisticRegression(): np.float64(80.3),
 SVC(): np.float64(79.39),
 DecisionTreeClassifier(): np.float64(71.24),
 RandomForestClassifier(): np.float64(78.3)}

# GRADIENT BOOSTING CLASSIFIER  

In [36]:
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier()
model_val(model, X, y)

model_df

GradientBoostingClassifier() Accuracy: 0.7477477477477478
GradientBoostingClassifier() Cross-Validation Score: 0.7757739557739558


{LogisticRegression(): np.float64(80.3),
 SVC(): np.float64(79.39),
 DecisionTreeClassifier(): np.float64(71.24),
 RandomForestClassifier(): np.float64(78.3),
 GradientBoostingClassifier(): np.float64(77.58)}

# HYPERPARAMETER TUNING

In [37]:
from sklearn.model_selection import RandomizedSearchCV

# LOGISTIC REGRESSION

In [38]:
log_reg_grid = {
    "C": np.logspace(-4, 4, 20),
    "solver": ["liblinear"],
}

In [None]:
rs_log_reg = RandomizedSearchCV(LogisticRegression(), param_distributions=log_reg_grid, n_iter=10, cv=5, verbose=True)
rs_log_reg.fit(X, y)


Fitting 5 folds for each of 10 candidates, totalling 50 fits


0,1,2
,estimator,LogisticRegression()
,param_distributions,"{'C': array([1.0000...00000000e+04]), 'solver': ['liblinear']}"
,n_iter,10
,scoring,
,n_jobs,
,refit,True
,cv,5
,verbose,True
,pre_dispatch,'2*n_jobs'
,random_state,

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,np.float64(10000.0)
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,100


In [40]:
rs_log_reg.best_score_

np.float64(0.802964782964783)

In [41]:
rs_log_reg.best_params_

{'solver': 'liblinear', 'C': np.float64(10000.0)}

# SVC

In [45]:
svc_grid = {"C": [0.25, 0.5, 0.75, 1], "kernel":["linear"]}
rs_svc = RandomizedSearchCV(svm.SVC(), param_distributions=svc_grid, n_iter=10, cv=5, verbose=True)
rs_svc.fit(X, y)

Fitting 5 folds for each of 4 candidates, totalling 20 fits




0,1,2
,estimator,SVC()
,param_distributions,"{'C': [0.25, 0.5, ...], 'kernel': ['linear']}"
,n_iter,10
,scoring,
,n_jobs,
,refit,True
,cv,5
,verbose,True
,pre_dispatch,'2*n_jobs'
,random_state,

0,1,2
,C,0.25
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [46]:
rs_svc.best_score_

np.float64(0.8066011466011467)

In [47]:
rs_svc.best_params_

{'kernel': 'linear', 'C': 0.25}

# RANDOM FOREST CLASSIFIER

In [49]:
rf_grid = {
    "n_estimators": np.arange(10, 1000, 10),
    "max_features":["auto", "sqrt"],
    "max_depth": [None, 3, 5, 10, 20, 30],
    "min_samples_leaf": [1, 2, 4, 10],
    "min_samples_split": [2, 5, 10, 50, 100]
}

rs_rf = RandomizedSearchCV(RandomForestClassifier(), param_distributions=rf_grid, cv=5, n_iter=10, verbose=True)
rs_rf.fit(X,y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


15 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "d:\Reyad's Projects\Test Project\Loan_prediction\venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Reyad's Projects\Test Project\Loan_prediction\venv\Lib\site-packages\sklearn\base.py", line 1358, in wrapper
    estimator._validate_params()
    ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
  File "d:\Reyad's Projects\Test Project\Loan_prediction\venv\Lib\site-packages\sklearn\base.py", line 471, in _validate_params
    validate_parameter_

0,1,2
,estimator,RandomForestClassifier()
,param_distributions,"{'max_depth': [None, 3, ...], 'max_features': ['auto', 'sqrt'], 'min_samples_leaf': [1, 2, ...], 'min_samples_split': [2, 5, ...], ...}"
,n_iter,10
,scoring,
,n_jobs,
,refit,True
,cv,5
,verbose,True
,pre_dispatch,'2*n_jobs'
,random_state,

0,1,2
,n_estimators,np.int64(230)
,criterion,'gini'
,max_depth,20
,min_samples_split,50
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [50]:
rs_rf.best_score_

np.float64(0.8066175266175266)

In [51]:
rs_rf.best_params_

{'n_estimators': np.int64(230),
 'min_samples_split': 50,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 20}

# COMPARISON

Logistic regression without hyperparameter: 80.3
Logistic regression with hyperparameter: 80.3

----------------------------------------------------

SVC without hyperparameter: 79.3
SVC with hyperparameter: 80.6

-----------------------------------------------------

Random Forest Classifier without hyperparameter: 78.3
Random Forest Classifier with hyperparameter: 80.6

# BEST MODEL SELECTION BASED ON THE ACCURACY

In [68]:
rf = RandomForestClassifier(n_estimators=200, max_features='sqrt', max_depth=5, min_samples_leaf=5, min_samples_split=5)

In [69]:
rf.fit(X, y)

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,5
,min_samples_split,5
,min_samples_leaf,5
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


# TEST DATA PREPARE

In [53]:
import pandas as pd

test_data = pd.read_csv('dataset/test_Y3wMUE5_7gLdaTN.csv')
print(test_data.head())

    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001015   Male     Yes          0      Graduate            No   
1  LP001022   Male     Yes          1      Graduate            No   
2  LP001031   Male     Yes          2      Graduate            No   
3  LP001035   Male     Yes          2      Graduate            No   
4  LP001051   Male      No          0  Not Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5720                  0       110.0             360.0   
1             3076               1500       126.0             360.0   
2             5000               1800       208.0             360.0   
3             2340               2546       100.0             360.0   
4             3276                  0        78.0             360.0   

   Credit_History Property_Area  
0             1.0         Urban  
1             1.0         Urban  
2             1.0         Urban  
3             NaN     

In [55]:
test_data.shape

(367, 12)

In [56]:
test_data.isnull().sum()

Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64

In [57]:
test_data = test_data.drop('Loan_ID', axis=1)

In [58]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             356 non-null    object 
 1   Married            367 non-null    object 
 2   Dependents         357 non-null    object 
 3   Education          367 non-null    object 
 4   Self_Employed      344 non-null    object 
 5   ApplicantIncome    367 non-null    int64  
 6   CoapplicantIncome  367 non-null    int64  
 7   LoanAmount         362 non-null    float64
 8   Loan_Amount_Term   361 non-null    float64
 9   Credit_History     338 non-null    float64
 10  Property_Area      367 non-null    object 
dtypes: float64(3), int64(2), object(6)
memory usage: 31.7+ KB


In [59]:
test_data.isnull().sum()*100/len(test_data)

Gender               2.997275
Married              0.000000
Dependents           2.724796
Education            0.000000
Self_Employed        6.267030
ApplicantIncome      0.000000
CoapplicantIncome    0.000000
LoanAmount           1.362398
Loan_Amount_Term     1.634877
Credit_History       7.901907
Property_Area        0.000000
dtype: float64

In [60]:
test_cols = ["Gender", "Dependents", "LoanAmount", "Loan_Amount_Term"]
test_data = test_data.dropna(subset=test_cols, axis=0)
test_data.isnull().sum()*100/len(test_data)

Gender               0.000000
Married              0.000000
Dependents           0.000000
Education            0.000000
Self_Employed        6.824926
ApplicantIncome      0.000000
CoapplicantIncome    0.000000
LoanAmount           0.000000
Loan_Amount_Term     0.000000
Credit_History       7.715134
Property_Area        0.000000
dtype: float64

In [62]:
test_data["Self_Employed"].mode()[0]

'No'

In [63]:
test_data["Self_Employed"] = test_data["Self_Employed"].fillna(test_data["Self_Employed"].mode()[0])
test_data.isnull().sum()*100/len(test_data)

Gender               0.000000
Married              0.000000
Dependents           0.000000
Education            0.000000
Self_Employed        0.000000
ApplicantIncome      0.000000
CoapplicantIncome    0.000000
LoanAmount           0.000000
Loan_Amount_Term     0.000000
Credit_History       7.715134
Property_Area        0.000000
dtype: float64

In [64]:
test_data["Credit_History"] = test_data["Credit_History"].fillna(test_data["Credit_History"].mode()[0])
test_data.isnull().sum()*100/len(test_data)

Gender               0.0
Married              0.0
Dependents           0.0
Education            0.0
Self_Employed        0.0
ApplicantIncome      0.0
CoapplicantIncome    0.0
LoanAmount           0.0
Loan_Amount_Term     0.0
Credit_History       0.0
Property_Area        0.0
dtype: float64

In [65]:
test_data["Dependents"] = test_data["Dependents"].replace(to_replace="3+", value="4")
test_data["Dependents"].unique()

array(['0', '1', '2', '4'], dtype=object)

In [66]:
test_data["Gender"] = test_data["Gender"].map({"Male":1, "Female": 0}).astype(int)
test_data["Married"] = test_data["Married"].map({"Yes": 1, "No": 0}).astype(int)
test_data["Education"] = test_data["Education"].map({"Graduate": 1, "Not Graduate": 0}).astype(int)
test_data["Self_Employed"] = test_data["Self_Employed"].map({"Yes": 1, "No": 0}).astype(int)
test_data["Property_Area"] = test_data["Property_Area"].map({"Urban": 2, "Rural": 0, "Semiurban": 1}).astype(int)
test_data.tail()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
362,1,1,4,0,1,4009,1777,113.0,360.0,1.0,2
363,1,1,0,1,0,4158,709,115.0,360.0,1.0,2
364,1,0,0,1,0,3250,1993,126.0,360.0,1.0,1
365,1,1,0,1,0,5000,2393,158.0,360.0,1.0,0
366,1,0,0,1,1,9200,0,98.0,180.0,1.0,0


In [67]:
scalling_calls = ["ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term"]
test_scaller = StandardScaler()
test_data[scalling_calls] = test_scaller.fit_transform(test_data[scalling_calls])
test_data.head(2)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,1,0,1,0,0.203074,-0.654399,-0.433904,0.271195,1.0,2
1,1,1,1,1,0,-0.343098,-0.026659,-0.178404,0.271195,1.0,2


# SAVE THE MODEL AND PREDICT FROM THE MODEL

In [74]:
import joblib
joblib.dump(rf, 'loan_prediction_model')
model = joblib.load('loan_prediction_model')
model.predict(test_data)


array([1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,

In [80]:
import pandas as pd
df = pd.DataFrame({
    'Gender':1,
    'Married':1,
    'Dependents':2,
    'Education':0,
    'Self_Employed':0,
    'ApplicantIncome':5000,
    'CoapplicantIncome':4000,
    'LoanAmount':150,
    'Loan_Amount_Term':360,
    'Credit_History':0,
    'Property_Area':1
},index=[0])

In [81]:
df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,1,2,0,0,5000,4000,150,360,0,1


In [84]:
result = model.predict(df)
result

array([0])

In [85]:
if result[0] == 1:
    print("Loan Approved")
else:
    print("Loan Not Approved")

Loan Not Approved
