In [1]:
import numpy as np
import pandas as pd

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides scikit-learn convergence / fit-failure
# warnings raised by the grid searches further down.
warnings.simplefilter(action="ignore")

In [2]:
# Read Loan_Analyzed.csv from the working directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer="Loan_Analyzed.csv")
df.head(n=5)

Unnamed: 0,Credit Policy,Purpose,Interest Rate,Installment,Annual Income,DTI,Fico,Days_CR_Line,Revolve Balance,Revolve Util,Inq_6months,Delinq_2years,Public Records,Not Fully Paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,75.099656,13.033213,52.1,0.0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,52.535702,13.541261,76.7,0.0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,68.62944,7.697642,25.6,1.0,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,51.961123,13.545689,73.2,1.0,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,63.765194,8.297449,39.5,0.0,1,0,0


In [3]:
# "Credit Policy" is the prediction target; every other column is a feature.
y = df["Credit Policy"]
X = df.drop(columns=["Credit Policy"])

# Train Test Split

In [4]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_absolute_error

***Finding Best Random State***

In [9]:
# Sweep 100 train/test split seeds and record how a baseline
# logistic-regression pipeline scores on each resulting partition.
train, test = [], []          # accuracy reported by pipe.score (train / test)
train_acc, test_acc = [], []  # accuracy reported by accuracy_score (train / test)
cv, mae = [], []              # mean 5-fold CV score on the train part, test-set MAE

for i in range(100):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=i
    )

    # Positional column 0 is one-hot encoded (presumably "Purpose" — confirm
    # against X.columns), columns 1-10 are standardised, and the remaining
    # columns pass through unchanged.
    # NOTE(review): `sparse=` was renamed to `sparse_output=` in scikit-learn
    # 1.2 and removed in 1.4; adjust when upgrading.
    step = ColumnTransformer(
        transformers=[
            ("col_tnf", OneHotEncoder(sparse=False, drop="first"), [0]),
            ("scaler", StandardScaler(), list(range(1, 11))),
        ],
        remainder="passthrough",
    )
    pipe = Pipeline(steps=[("step", step), ("model", LogisticRegression())])
    pipe.fit(X_train, y_train)

    pred_train = pipe.predict(X_train)
    pred_test = pipe.predict(X_test)

    # The classifier's .score() and accuracy_score() report the same quantity,
    # which is why the "... Model" and "... acc" columns below coincide.
    train_accuracy = pipe.score(X_train, y_train)
    test_accuracy = pipe.score(X_test, y_test)
    train_score = accuracy_score(y_train, pred_train)
    test_score = accuracy_score(y_test, pred_test)

    mae_score = mean_absolute_error(y_test, pred_test)
    cv_score = cross_val_score(pipe, X_train, y_train, cv=5).mean()

    for bucket, value in (
        (train, train_accuracy),
        (test, test_accuracy),
        (train_acc, train_score),
        (test_acc, test_score),
        (cv, cv_score),
        (mae, mae_score),
    ):
        bucket.append(value)

# One row per seed; the row index equals the random_state that produced it.
ff = pd.DataFrame(
    {
        "Train Model": train,
        "Test Model": test,
        "Train acc": train_acc,
        "Test acc": test_acc,
        "CV": cv,
        "MAE": mae,
    }
)

In [10]:
# Rank the seeds from lowest to highest test-set MAE (ascending is the default).
ff.sort_values("MAE")

Unnamed: 0,Train Model,Test Model,Train acc,Test acc,CV,MAE
82,0.876142,0.895094,0.876142,0.895094,0.874968,0.104906
70,0.873140,0.894572,0.873140,0.894572,0.871575,0.105428
65,0.877839,0.892484,0.877839,0.892484,0.878232,0.107516
51,0.876273,0.890919,0.876273,0.890919,0.874706,0.109081
55,0.877447,0.890397,0.877447,0.890397,0.875098,0.109603
...,...,...,...,...,...,...
99,0.882798,0.867432,0.882798,0.867432,0.881494,0.132568
98,0.883190,0.866910,0.883190,0.866910,0.881232,0.133090
2,0.883712,0.866388,0.883712,0.866388,0.881493,0.133612
54,0.886975,0.864301,0.886975,0.864301,0.884885,0.135699


In [10]:
# Re-create the 80/20 split with seed 82, the top row of the MAE-sorted sweep.
# NOTE(review): the seed was selected using test-set error, so test scores
# measured on this split are likely optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=82,
)

# ***GridSearchCV - SVC***

In [13]:
# Grid-search an SVC (regularisation strength x kernel) inside the shared
# preprocessing pipeline: one-hot encode column 0, standardise columns 1-10,
# pass the remaining columns through.
step = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(sparse = False, drop = "first"), [0]),
    ("scaler", StandardScaler(), [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
], remainder = "passthrough")

pipe = Pipeline([
    ("step", step),
    ("model", SVC())
])

# SVC requires C to be strictly positive. The grid previously contained 0,
# so those candidates could never be fitted or scored (and the resulting
# fit-failure warnings are hidden by the notebook-wide warning filter).
# The entry is replaced with 1, the value the 0.1 -> 10 step skipped.
svc_param_grid = {
    "model__C": [0.001, 0.1, 1, 10, 100],
    "model__kernel": ["linear", "poly", "sigmoid", "rbf"],
}

# 5-fold cross-validated accuracy for every (C, kernel) combination.
svc_grid = GridSearchCV(pipe, svc_param_grid, cv = 5, scoring = "accuracy")

svc_grid.fit(X_train, y_train)

svc_grid.best_params_

{'C': 10, 'kernel': 'rbf'}

# ***KNeighborsClassifier***

In [11]:
# Tune the neighbour count of a KNN classifier behind the shared preprocessing:
# one-hot encode column 0, standardise columns 1-10, pass the rest through.
step = ColumnTransformer(
    transformers=[
        ("col_tnf", OneHotEncoder(sparse=False, drop="first"), [0]),
        ("scaler", StandardScaler(), list(range(1, 11))),
    ],
    remainder="passthrough",
)
pipe = Pipeline(steps=[("step", step), ("model", KNeighborsClassifier())])

# Candidate neighbourhood sizes: 1 through 19 inclusive.
knn_param_grid = {"model__n_neighbors": list(range(1, 20))}

# 5-fold cross-validated accuracy for every candidate k.
knn_grid = GridSearchCV(
    estimator=pipe,
    param_grid=knn_param_grid,
    cv=5,
    scoring="accuracy",
)
knn_grid.fit(X_train, y_train)

knn_grid.best_params_

{'model__n_neighbors': 8}

# ***DecisionTreeClassifier***

In [19]:
# Grid-search a decision tree (depth x split criterion) inside the shared
# preprocessing pipeline: one-hot encode column 0, standardise columns 1-10,
# pass the remaining columns through.
step = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(sparse = False, drop = "first"), [0]),
    ("scaler", StandardScaler(), [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
], remainder = "passthrough")

# Features are randomly permuted at every split, so equally good splits can be
# resolved differently from run to run. Fixing random_state makes the search
# (and the reported best_params_) reproducible under Restart & Run All.
pipe = Pipeline([
    ("step", step),
    ("model", DecisionTreeClassifier(random_state = 0))
])

# Depths 19 through 39 inclusive, crossed with both impurity criteria.
decision_param_grid = {"model__max_depth": list(range(19, 40)), "model__criterion": ["gini", "entropy"]}

# 5-fold cross-validated accuracy for every (max_depth, criterion) pair.
decision_grid = GridSearchCV(pipe, decision_param_grid, cv = 5, scoring = "accuracy")

decision_grid.fit(X_train, y_train)

decision_grid.best_params_

{'criterion': 'entropy', 'max_depth': 29}

# ***RandomForestClassifier***

In [22]:
# Grid-search the number of trees of a random forest inside the shared
# preprocessing pipeline: one-hot encode column 0, standardise columns 1-10,
# pass the remaining columns through.
step = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(sparse = False, drop = "first"), [0]),
    ("scaler", StandardScaler(), [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
], remainder = "passthrough")

# An unseeded forest draws different bootstrap samples and feature subsets on
# every fit, so the ranking of nearby n_estimators values would change between
# runs. Fixing random_state makes the comparison reproducible.
pipe = Pipeline([
    ("step", step),
    ("model", RandomForestClassifier(random_state = 0))
])

# Forest sizes 19 through 39 inclusive.
random_param_grid = {"model__n_estimators": list(range(19, 40))}

# 5-fold cross-validated accuracy for every forest size.
random_grid = GridSearchCV(pipe, random_param_grid, cv = 5, scoring = "accuracy")

random_grid.fit(X_train, y_train)

random_grid.best_params_

{'n_estimators': 35}

# ***AdaBoostClassifier***

In [23]:
# Tune the number of boosting rounds of AdaBoost behind the shared
# preprocessing: one-hot encode column 0, standardise columns 1-10,
# pass the rest through.
step = ColumnTransformer(
    transformers=[
        ("col_tnf", OneHotEncoder(sparse=False, drop="first"), [0]),
        ("scaler", StandardScaler(), list(range(1, 11))),
    ],
    remainder="passthrough",
)
pipe = Pipeline(steps=[("step", step), ("model", AdaBoostClassifier())])

# Candidate ensemble sizes: 1 through 19 inclusive.
ada_param_grid = {"model__n_estimators": list(range(1, 20))}

# 5-fold cross-validated accuracy for every ensemble size.
ada_grid = GridSearchCV(
    estimator=pipe,
    param_grid=ada_param_grid,
    cv=5,
    scoring="accuracy",
)
ada_grid.fit(X_train, y_train)

ada_grid.best_params_

{'n_estimators': 16}

# ***GradientBoostingClassifier***

In [26]:
# Grid-search gradient boosting (number of stages x learning rate) inside the
# shared preprocessing pipeline: one-hot encode column 0, standardise columns
# 1-10, pass the remaining columns through.
step = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(sparse = False, drop = "first"), [0]),
    ("scaler", StandardScaler(), [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
], remainder = "passthrough")

pipe = Pipeline([
    ("step", step),
    ("model", GradientBoostingClassifier())
])

# learning_rate = 0 was dropped from the grid: it gives every boosting stage
# zero weight (and older scikit-learn releases reject it outright), so those
# 11 candidates x 5 folds could never yield a useful model.
gradient_param_grid = {
    "model__n_estimators": list(range(9, 20)),
    "model__learning_rate": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
}

# 5-fold cross-validated accuracy for every (n_estimators, learning_rate) pair.
gradient_grid = GridSearchCV(pipe, gradient_param_grid, cv = 5, scoring = "accuracy")

gradient_grid.fit(X_train, y_train)

gradient_grid.best_params_

{'learning_rate': 0.5, 'n_estimators': 15}

# ***XGBClassifier***

In [28]:
# Grid-search XGBoost (number of trees x learning rate x gamma) inside the
# shared preprocessing pipeline: one-hot encode column 0, standardise columns
# 1-10, pass the remaining columns through.
step = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(sparse = False, drop = "first"), [0]),
    ("scaler", StandardScaler(), [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
], remainder = "passthrough")

pipe = Pipeline([
    ("step", step),
    ("model", XGBClassifier())
])

# learning_rate = 0 was dropped from the grid: with zero shrinkage the trees
# contribute nothing to the prediction, so those 21 x 6 candidates (x 5 folds)
# were pure wasted compute. gamma = 0 is kept; it is the library default.
xgb_param_grid = {
    "model__n_estimators": list(range(19, 40)),
    "model__learning_rate": [0.1, 0.2, 0.3, 0.4, 0.5],
    "model__gamma": [0, 0.1, 0.2, 0.3, 0.4, 0.5],
}

# 5-fold cross-validated accuracy for every parameter combination.
xgb_grid = GridSearchCV(pipe, xgb_param_grid, cv = 5, scoring = "accuracy")

xgb_grid.fit(X_train, y_train)

xgb_grid.best_params_

{'gamma': 0.3, 'learning_rate': 0.3, 'n_estimators': 34}