# Read the csv file

In [101]:
import pandas as pd
import numpy as np

data = pd.read_csv("train.csv")
data

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


# Cleaning data

In [102]:
class Cleaner:
    def __init__(self, df):
        self.df = df

    def clean(self):
        self.drop_unwanted()
        self.convert_text()
        self.convert_symbol()
        self.convert_int()
        return self.df
    
    def drop_unwanted(self):
        self.df = self.df.drop(["Loan_ID"], axis=1)

    def convert_text(self):
        tab = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status']
        for col in tab:
            properties = self.df[col].unique()
            property2index = {prop:i for (i, prop) in enumerate(properties)}
            max_index = max(list(property2index.values()))
            self.df.loc[:, col] = self.df[col].replace(property2index)
            self.df.loc[self.df[col].map(type).eq(str),col] = np.nan
            self.df[col].fillna(max_index + 1)

    def convert_symbol(self):
        self.df["Dependents"] = self.df["Dependents"].str.replace("+", "")

    def convert_int(self):
        self.df = self.df.fillna(0)
        self.df = self.df.astype("int64")

cleaner = Cleaner(data)
data = cleaner.clean()
data

  self.df.loc[:, col] = self.df[col].replace(property2index)
  self.df[col].fillna(max_index + 1)
  self.df = self.df.fillna(0)


Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0,0,0,0,0,5849,0,0,360,1,0,0
1,0,1,1,0,0,4583,1508,128,360,1,1,1
2,0,1,0,0,1,3000,0,66,360,1,0,0
3,0,1,0,1,0,2583,2358,120,360,1,0,0
4,0,0,0,0,0,6000,0,141,360,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
609,1,0,0,0,0,2900,0,71,360,1,1,0
610,0,1,3,0,0,4106,0,40,180,1,1,0
611,0,1,1,0,0,8072,240,253,360,1,0,0
612,0,1,2,0,0,7583,0,187,360,1,0,0


# Import Libriaries

In [103]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# DecisionTreeClassifier

In [104]:
from sklearn.tree import DecisionTreeClassifier

X = data.drop(["Loan_Status"], axis=1)
Y = data["Loan_Status"]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2,random_state=42)

dtc = DecisionTreeClassifier()
rfc.fit(X_train, Y_train)

Y_pred = rfc.predict(X_test)

print("test accuracy",accuracy_score(Y_test, Y_pred))

test accuracy 0.6504065040650406


# RandomForestClassifier

In [126]:
from sklearn.ensemble import RandomForestClassifier


X = data.drop(["Loan_Status"], axis=1)
Y = data["Loan_Status"]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2,random_state=42)

test_scores = []

tab = []
for i in range(1, 20):
    tab.append(i*5)

tab

tab1 = []
tab2 = []
tab3 = []

for max_dep in range(1, 10):
    # Iterate over different values of random_state
    for rand_state in tab:
        # Iterate over different values of n_estimators
        for n_est in tab:

            tab1.append(max_dep)
            tab2.append(rand_state)
            tab3.append(n_est)

            rfc = RandomForestClassifier(n_estimators=n_est, max_depth=max_dep, min_samples_split=5, random_state=rand_state)
            rfc.fit(X_train, Y_train)

            Y_pred = rfc.predict(X_test)

            test_scores.append(accuracy_score(Y_test, Y_pred))

            print("---------------")
            print("score")
            print(accuracy_score(Y_test, Y_pred))
            print("max_dep, rand_state, n_est")
            print(max_dep, rand_state, n_est)

print("--------------------------------------------------------------------")
print("test accuracy",max(test_scores))
print("parameter")
print("max_depth",tab1[test_scores.index(max(test_scores))])
print("random_state",tab2[test_scores.index(max(test_scores))])
print("n_estimators",tab3[test_scores.index(max(test_scores))])


---------------
score
0.6504065040650406
max_dep, rand_state, n_est
1 5 5
---------------
score
0.6504065040650406
max_dep, rand_state, n_est
1 5 10
---------------
score
0.6504065040650406
max_dep, rand_state, n_est
1 5 15
---------------
score
0.6504065040650406
max_dep, rand_state, n_est
1 5 20
---------------
score
0.6504065040650406
max_dep, rand_state, n_est
1 5 25
---------------
score
0.6504065040650406
max_dep, rand_state, n_est
1 5 30
---------------
score
0.6504065040650406
max_dep, rand_state, n_est
1 5 35
---------------
score
0.6504065040650406
max_dep, rand_state, n_est
1 5 40
---------------
score
0.6504065040650406
max_dep, rand_state, n_est
1 5 45
---------------
score
0.6504065040650406
max_dep, rand_state, n_est
1 5 50
---------------
score
0.6504065040650406
max_dep, rand_state, n_est
1 5 55
---------------
score
0.6504065040650406
max_dep, rand_state, n_est
1 5 60
---------------
score
0.6504065040650406
max_dep, rand_state, n_est
1 5 65
---------------
score
0.65

# Best model with best parameters

In [129]:
from sklearn.ensemble import RandomForestClassifier

X = data.drop(["Loan_Status"], axis=1)
Y = data["Loan_Status"]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2,random_state=0)

rfc = RandomForestClassifier(n_estimators=45, max_depth=3, min_samples_split=5, random_state=70)
rfc.fit(X_train, Y_train)

Y_pred = rfc.predict(X_test)

test_scores.append(accuracy_score(Y_test, Y_pred))

print("score")
print(accuracy_score(Y_test, Y_pred))

score
0.8130081300813008
