In [125]:
import numpy as np
import pandas as pd

from sklearn.model_selection import  train_test_split,RandomizedSearchCV
from sklearn.preprocessing import StandardScaler 
from sklearn.svm import SVC 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier 
from sklearn.metrics import accuracy_score,roc_auc_score 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn import metrics 

import matplotlib.pyplot as plt 




In [59]:
# Load the loan training data from a CSV file located next to the notebook.
loan_train = pd.read_csv('./train_csv.csv')
# Show (rows, columns) as a quick sanity check on the load.
loan_train.shape
                        

(614, 13)

In [60]:
# Preview the first 20 rows to get a feel for the columns and their raw values.
loan_train.head(20)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
6,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y
7,LP001014,Male,Yes,3+,Graduate,No,3036,2504.0,158.0,360.0,0.0,Semiurban,N
8,LP001018,Male,Yes,2,Graduate,No,4006,1526.0,168.0,360.0,1.0,Urban,Y
9,LP001020,Male,Yes,1,Graduate,No,12841,10968.0,349.0,360.0,1.0,Semiurban,N


In [61]:
# Count the missing entries in every column, then order the columns so the
# ones with the most gaps come first.
missing_per_column = loan_train.isna().sum()
total_null = missing_per_column.sort_values(ascending=False)

In [62]:
# Display the per-column missing-value counts (sorted descending).
total_null

Credit_History       50
Self_Employed        32
LoanAmount           22
Dependents           15
Loan_Amount_Term     14
Gender               13
Married               3
Loan_ID               0
Education             0
ApplicantIncome       0
CoapplicantIncome     0
Property_Area         0
Loan_Status           0
dtype: int64

In [63]:
# Fill the gaps reported by the missing-value summary:
#   * categorical / discrete columns receive their most frequent value,
#   * the continuous LoanAmount column receives its mean.
# NOTE(review): these statistics are computed on the full dataset, i.e. before
# the train/test split performed later in the notebook, so the held-out rows
# influence the fill values.
mode_filled_columns = [
    'Gender',
    'Married',
    'Dependents',
    'Self_Employed',
    'Loan_Amount_Term',
    'Credit_History',
]
for column in mode_filled_columns:
    most_frequent = loan_train[column].dropna().mode().values[0]
    loan_train[column] = loan_train[column].fillna(most_frequent)

loan_amount_mean = loan_train['LoanAmount'].dropna().mean()
loan_train['LoanAmount'] = loan_train['LoanAmount'].fillna(loan_amount_mean)
                                                           

In [64]:
# Print non-null counts and dtypes per column to verify the imputation left no gaps.
loan_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             614 non-null    object 
 2   Married            614 non-null    object 
 3   Dependents         614 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      614 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         614 non-null    float64
 9   Loan_Amount_Term   614 non-null    float64
 10  Credit_History     614 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [65]:
# Inspect the distinct values present in each categorical column (and the target)
# before deciding how to encode them.
columns_to_inspect = (
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'Property_Area',
    'Loan_Status',
)
for column in columns_to_inspect:
    print(set(loan_train[column].values.tolist()))

{'Female', 'Male'}
{'Yes', 'No'}
{'0', '3+', '2', '1'}
{'Not Graduate', 'Graduate'}
{'Yes', 'No'}
{'Semiurban', 'Urban', 'Rural'}
{'Y', 'N'}


In [75]:
# Encode the target as 0/1 and one-hot encode the categorical predictors.
# NOTE: this cell overwrites loan_train, so it can only be run once per load --
# on a second run the mapping finds no 'N'/'Y' values (yielding NaN, which the
# integer cast rejects) and the categorical columns no longer exist.
target_codes = {'N':0, 'Y':1}
loan_train['Loan_Status'] = loan_train['Loan_Status'].map(target_codes).astype(int)

categorical_columns = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'Property_Area',
]
loan_train = pd.get_dummies(loan_train, columns = categorical_columns)

In [118]:
# Separate the predictors from the target (the Loan_ID identifier is not a
# feature), then hold out 20% of the rows for testing with a fixed seed.
X = loan_train.drop(columns=['Loan_Status', 'Loan_ID'])
y = loan_train['Loan_Status']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [107]:
# Hyper-parameter search space for scikit-learn's GradientBoostingClassifier
# (the gradient-boosting estimator this notebook imports).
#
# Feature subsampling is keyed 'max_features', which is the scikit-learn name;
# the analogous XGBoost knob 'colsample_bytree' is not a parameter of
# GradientBoostingClassifier and would make a search fail with an
# "invalid parameter" error.
# The fractions must be floats: an integer 1 would mean "consider exactly one
# feature", whereas 1.0 means "consider all features".
gbm_param_grid = {
    'n_estimators':range(1,1000,10),
    'max_depth':range(1,20),
    'learning_rate':[.1,.4,.45,.5,.55,.6],
    'max_features':[.6,.7,.8,.9,1.0],
}

In [110]:
# Search space for the random forest: the number of trees, from 1 to 991 in
# steps of 10 (100 candidate values).
rf_param_grid = {
    'n_estimators':range(1,1000,10)
}
# Seed the forest so that every fit of a given configuration is repeatable;
# 42 matches the seed already used for train_test_split.
rf  = RandomForestClassifier(random_state = 42)
# 4-fold cross-validated search scored on accuracy.
# random_state fixes the order in which candidates are drawn, so re-running
# the notebook reproduces the same best_params_ and test accuracy.
# NOTE: n_iter equals the number of candidates in rf_param_grid, and
# candidates given as plain sequences are sampled without replacement, so this
# effectively evaluates the whole grid (100 candidates x 4 folds).
rf_random = RandomizedSearchCV(
    param_distributions = rf_param_grid,
    estimator = rf, scoring = 'accuracy',
    verbose = 0,
    n_iter = 100,
    cv = 4,
    random_state = 42)
    

In [114]:
# Run the cross-validated search on the training split only; the test split stays unseen.
rf_random.fit(X_train,y_train)

In [115]:
# Hyper-parameter setting that achieved the best cross-validated score in the search.
best_params = rf_random.best_params_

In [116]:
# Report the winning hyper-parameters.
print(f'Best Params: {best_params}')

Best Params: {'n_estimators': 381}


In [120]:
# Predict loan status for the held-out test split using the fitted search object.
y_pred1  = rf_random.predict(X_test)


In [122]:
# Test accuracy: the share of held-out rows whose predicted label matches the true one.
correct_predictions = (y_pred1 == y_test).sum()
accuracy = correct_predictions / y_test.size

In [124]:
# Report the accuracy measured on the held-out test split.
print(f'Accuracy:{accuracy}')

Accuracy:0.7723577235772358
