In [33]:
import pandas as pd
from sklearn import metrics, svm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier

In [34]:
df = pd.read_csv('loan_prediction_dataset.csv')

In [35]:
print(df.isnull().sum())
print(df.duplicated().sum())

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64
0


In [36]:
df = df.drop(["Loan_ID"], axis=1)

In [37]:
df['Gender'] = df['Gender'].fillna(df['Gender'].mode()[0])
df['Married'] = df['Married'].fillna(df['Married'].mode()[0])
df['Dependents'] = df['Dependents'].fillna(df['Dependents'].mode()[0])
df["Self_Employed"] = df["Self_Employed"].fillna(df["Self_Employed"].mode()[0])
df['LoanAmount'] = df['LoanAmount'].fillna(int(df['LoanAmount'].mean()))
df["Loan_Amount_Term"] = df["Loan_Amount_Term"].fillna(int(df['LoanAmount'].mean()))
df['Credit_History'] = df['Credit_History'].fillna(int(df['Credit_History'].mode()[0]))

In [38]:
print(df.isnull().sum())

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64


In [39]:
label = LabelEncoder()
df['Gender'] = label.fit_transform(df['Gender'])
df['Married'] = label.fit_transform(df['Married'])
df['Dependents'] = label.fit_transform(df['Dependents'])
df['Education'] = label.fit_transform(df['Education'])
df["Self_Employed"] = label.fit_transform(df["Self_Employed"])
df["Property_Area"] = label.fit_transform(df["Property_Area"])

In [40]:
X = df.drop(["Loan_Status"], axis=1).values
Y = df["Loan_Status"]

In [41]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=8)

In [42]:
st_x = StandardScaler()
X_train = st_x.fit_transform(X_train)
X_test = st_x.transform(X_test)

In [43]:
ID3 = DecisionTreeClassifier(criterion="gini", max_depth=1)
ID3 = ID3.fit(X_train, Y_train)
Y_pred_ID3 = ID3.predict(X_test)

In [44]:
clf = svm.SVC(kernel='linear')
clf.fit(X_train, Y_train)
Y_pred_SVM = clf.predict(X_test)

In [45]:
model = LogisticRegression(solver='liblinear')
model.fit(X_train, Y_train)
Y_pred_LogReg = model.predict(X_test)

In [46]:
RForest = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0)
RForest.fit(X_train, Y_train)
Y_pred_RForest = RForest.predict(X_test)

In [47]:
bnb = BernoulliNB()
bnb.fit(X_train, Y_train)
Y_pred_BNB = bnb.predict(X_test)

In [48]:
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, Y_train)
Y_pred_KNN = knn.predict(X_test)

In [49]:
print("ID3 Accuracy:", metrics.accuracy_score(Y_test, Y_pred_ID3))
print("SVM Accuracy:", metrics.accuracy_score(Y_test, Y_pred_SVM))
print("Logistic Regression Accuracy:", metrics.accuracy_score(Y_test, Y_pred_LogReg))
print("Random Forest Accuracy:", metrics.accuracy_score(Y_test, Y_pred_RForest))
print("Naive Bayes Accuracy:", metrics.accuracy_score(Y_test, Y_pred_BNB))
print("knn Accuracy:", metrics.accuracy_score(Y_test, Y_pred_KNN))


ID3 Accuracy: 0.8648648648648649
SVM Accuracy: 0.8648648648648649
Logistic Regression Accuracy: 0.8594594594594595
Random Forest Accuracy: 0.8594594594594595
Naive Bayes Accuracy: 0.8648648648648649
knn Accuracy: 0.8432432432432433
