In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter("ignore")

In [2]:
df = pd.read_csv("Loan_Analyzed.csv")
df.head()

Unnamed: 0,Credit Policy,Purpose,Interest Rate,Installment,Annual Income,DTI,Fico,Days_CR_Line,Revolve Balance,Revolve Util,Inq_6months,Delinq_2years,Public Records,Not Fully Paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,75.099656,13.033213,52.1,0.0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,52.535702,13.541261,76.7,0.0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,68.62944,7.697642,25.6,1.0,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,51.961123,13.545689,73.2,1.0,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,63.765194,8.297449,39.5,0.0,1,0,0


In [3]:
df.shape

(9578, 14)

In [7]:
X = df.drop("Credit Policy", axis = 1)
y = df["Credit Policy"]

# ***Libraries***

In [13]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_absolute_error

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

***Train_Test_Split***

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 82)

# ***LogisticRegression***

In [21]:
step1 = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(sparse = False, drop = "first"), [0]),
    ("scaler", StandardScaler(), [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
], remainder = "passthrough")

model = LogisticRegression()

pipe = Pipeline([
    ("scaler", step1),
    ("model", model)
])

pipe.fit(X_train, y_train)
pred_test = pipe.predict(X_test)
print("Training Accuracy :", pipe.score(X_train, y_train))
print("Testing Accuracy :", pipe.score(X_test, y_test))
print("CV Score :", cross_val_score(pipe, X_train, y_train, cv = 5).mean())
print("MAE :", mean_absolute_error(y_test, pred_test))

Training Accuracy : 0.8761419994779431
Testing Accuracy : 0.8950939457202505
CV Score : 0.874968108062997
MAE : 0.10490605427974947


In [22]:
confusion_matrix(y_test, pred_test)

array([[ 201,  149],
       [  52, 1514]], dtype=int64)

In [23]:
print(classification_report(y_test, pred_test))

              precision    recall  f1-score   support

           0       0.79      0.57      0.67       350
           1       0.91      0.97      0.94      1566

    accuracy                           0.90      1916
   macro avg       0.85      0.77      0.80      1916
weighted avg       0.89      0.90      0.89      1916



# ***SVM - SVC***

In [24]:
step1 = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(sparse = False, drop = "first"), [0]),
    ("scaler", StandardScaler(), [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
], remainder = "passthrough")

model = SVC(C = 10, kernel = "rbf")

pipe = Pipeline([
    ("scaler", step1),
    ("model", model)
])

pipe.fit(X_train, y_train)
pred_test = pipe.predict(X_test)
print("Training Accuracy :", pipe.score(X_train, y_train))
print("Testing Accuracy :", pipe.score(X_test, y_test))
print("CV Score :", cross_val_score(pipe, X_train, y_train, cv = 5).mean())
print("MAE :", mean_absolute_error(y_test, pred_test))

Training Accuracy : 0.9862960062646828
Testing Accuracy : 0.9634655532359081
CV Score : 0.9449243705493929
MAE : 0.03653444676409186


In [25]:
confusion_matrix(y_test, pred_test)

array([[ 306,   44],
       [  26, 1540]], dtype=int64)

In [26]:
print(classification_report(y_test, pred_test))

              precision    recall  f1-score   support

           0       0.92      0.87      0.90       350
           1       0.97      0.98      0.98      1566

    accuracy                           0.96      1916
   macro avg       0.95      0.93      0.94      1916
weighted avg       0.96      0.96      0.96      1916



# ***KNeighborsClassifier***

In [30]:
step1 = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(sparse = False, drop = "first"), [0]),
    ("scaler", StandardScaler(), [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
], remainder = "passthrough")

model = KNeighborsClassifier(n_neighbors = 8)

pipe = Pipeline([
    ("scaler", step1),
    ("model", model)
])

pipe.fit(X_train, y_train)
pred_test = pipe.predict(X_test)
print("Training Accuracy :", pipe.score(X_train, y_train))
print("Testing Accuracy :", pipe.score(X_test, y_test))
print("CV Score :", cross_val_score(pipe, X_train, y_train, cv = 5).mean())
print("MAE :", mean_absolute_error(y_test, pred_test))

Training Accuracy : 0.9250848342469329
Testing Accuracy : 0.9117954070981211
CV Score : 0.9006782891274469
MAE : 0.08820459290187892


In [31]:
confusion_matrix(y_test, pred_test)

array([[ 214,  136],
       [  33, 1533]], dtype=int64)

In [32]:
print(classification_report(y_test, pred_test))

              precision    recall  f1-score   support

           0       0.87      0.61      0.72       350
           1       0.92      0.98      0.95      1566

    accuracy                           0.91      1916
   macro avg       0.89      0.80      0.83      1916
weighted avg       0.91      0.91      0.91      1916



# ***DecisionTreeClassifier***

In [33]:
step1 = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(sparse = False, drop = "first"), [0]),
    ("scaler", StandardScaler(), [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
], remainder = "passthrough")

model = DecisionTreeClassifier(max_depth = 29, criterion = "entropy", random_state = 1)

pipe = Pipeline([
    ("scaler", step1),
    ("model", model)
])

pipe.fit(X_train, y_train)
pred_test = pipe.predict(X_test)
print("Training Accuracy :", pipe.score(X_train, y_train))
print("Testing Accuracy :", pipe.score(X_test, y_test))
print("CV Score :", cross_val_score(pipe, X_train, y_train, cv = 5).mean())
print("MAE :", mean_absolute_error(y_test, pred_test))

Training Accuracy : 1.0
Testing Accuracy : 0.9916492693110647
CV Score : 0.9899505057575805
MAE : 0.008350730688935281


In [34]:
confusion_matrix(y_test, pred_test)

array([[ 339,   11],
       [   5, 1561]], dtype=int64)

In [35]:
print(classification_report(y_test, pred_test))

              precision    recall  f1-score   support

           0       0.99      0.97      0.98       350
           1       0.99      1.00      0.99      1566

    accuracy                           0.99      1916
   macro avg       0.99      0.98      0.99      1916
weighted avg       0.99      0.99      0.99      1916



# ***RandomForestClassifier***

In [36]:
step1 = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(sparse = False, drop = "first"), [0]),
    ("scaler", StandardScaler(), [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
], remainder = "passthrough")

model = RandomForestClassifier(n_estimators = 35, random_state = 0)

pipe = Pipeline([
    ("scaler", step1),
    ("model", model)
])

pipe.fit(X_train, y_train)
pred_test = pipe.predict(X_test)
print("Training Accuracy :", pipe.score(X_train, y_train))
print("Testing Accuracy :", pipe.score(X_test, y_test))
print("CV Score :", cross_val_score(pipe, X_train, y_train, cv = 5).mean())
print("MAE :", mean_absolute_error(y_test, pred_test))

Training Accuracy : 1.0
Testing Accuracy : 0.9869519832985386
CV Score : 0.9881231701522127
MAE : 0.013048016701461378


In [37]:
confusion_matrix(y_test, pred_test)

array([[ 330,   20],
       [   5, 1561]], dtype=int64)

In [38]:
print(classification_report(y_test, pred_test))

              precision    recall  f1-score   support

           0       0.99      0.94      0.96       350
           1       0.99      1.00      0.99      1566

    accuracy                           0.99      1916
   macro avg       0.99      0.97      0.98      1916
weighted avg       0.99      0.99      0.99      1916



# ***AdaBoostClassifier***

In [39]:
step1 = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(sparse = False, drop = "first"), [0]),
    ("scaler", StandardScaler(), [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
], remainder = "passthrough")

model = AdaBoostClassifier(n_estimators = 16, random_state = 0, learning_rate = 0.8)

pipe = Pipeline([
    ("scaler", step1),
    ("model", model)
])

pipe.fit(X_train, y_train)
pred_test = pipe.predict(X_test)
print("Training Accuracy :", pipe.score(X_train, y_train))
print("Testing Accuracy :", pipe.score(X_test, y_test))
print("CV Score :", cross_val_score(pipe, X_train, y_train, cv = 5).mean())
print("MAE :", mean_absolute_error(y_test, pred_test))

Training Accuracy : 0.9829026363873662
Testing Accuracy : 0.9832985386221295
CV Score : 0.9813363615770712
MAE : 0.016701461377870562


In [40]:
confusion_matrix(y_test, pred_test)

array([[ 324,   26],
       [   6, 1560]], dtype=int64)

In [41]:
print(classification_report(y_test, pred_test))

              precision    recall  f1-score   support

           0       0.98      0.93      0.95       350
           1       0.98      1.00      0.99      1566

    accuracy                           0.98      1916
   macro avg       0.98      0.96      0.97      1916
weighted avg       0.98      0.98      0.98      1916



# ***GradientBoostingClassifier***

In [42]:
step1 = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(sparse = False, drop = "first"), [0]),
    ("scaler", StandardScaler(), [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
], remainder = "passthrough")

model = GradientBoostingClassifier(n_estimators = 15, random_state = 0, learning_rate = 0.5)

pipe = Pipeline([
    ("scaler", step1),
    ("model", model)
])

pipe.fit(X_train, y_train)
pred_test = pipe.predict(X_test)
print("Training Accuracy :", pipe.score(X_train, y_train))
print("Testing Accuracy :", pipe.score(X_test, y_test))
print("CV Score :", cross_val_score(pipe, X_train, y_train, cv = 5).mean())
print("MAE :", mean_absolute_error(y_test, pred_test))

Training Accuracy : 0.9945184025058731
Testing Accuracy : 0.9921711899791231
CV Score : 0.9912554778340393
MAE : 0.007828810020876827


In [43]:
confusion_matrix(y_test, pred_test)

array([[ 339,   11],
       [   4, 1562]], dtype=int64)

In [44]:
print(classification_report(y_test, pred_test))

              precision    recall  f1-score   support

           0       0.99      0.97      0.98       350
           1       0.99      1.00      1.00      1566

    accuracy                           0.99      1916
   macro avg       0.99      0.98      0.99      1916
weighted avg       0.99      0.99      0.99      1916



# ***XGBClassifier***

In [45]:
step1 = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(sparse = False, drop = "first"), [0]),
    ("scaler", StandardScaler(), [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
], remainder = "passthrough")

model = XGBClassifier(n_estimators = 34, random_state = 0, learning_rate = 0.3, gamma = 0.3)

pipe = Pipeline([
    ("scaler", step1),
    ("model", model)
])

pipe.fit(X_train, y_train)
pred_test = pipe.predict(X_test)
print("Training Accuracy :", pipe.score(X_train, y_train))
print("Testing Accuracy :", pipe.score(X_test, y_test))
print("CV Score :", cross_val_score(pipe, X_train, y_train, cv = 5).mean())
print("MAE :", mean_absolute_error(y_test, pred_test))

Training Accuracy : 0.9992169146436961
Testing Accuracy : 0.9911273486430062
CV Score : 0.99125573331017
MAE : 0.008872651356993737


In [46]:
confusion_matrix(y_test, pred_test)

array([[ 336,   14],
       [   3, 1563]], dtype=int64)

In [47]:
print(classification_report(y_test, pred_test))

              precision    recall  f1-score   support

           0       0.99      0.96      0.98       350
           1       0.99      1.00      0.99      1566

    accuracy                           0.99      1916
   macro avg       0.99      0.98      0.98      1916
weighted avg       0.99      0.99      0.99      1916



# ***FINAL MODEL***

In [69]:
step1 = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(sparse = False, drop = "first"), [0]),
    ("scaler", StandardScaler(), [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
], remainder = "passthrough")

model = DecisionTreeClassifier(max_depth = 29, criterion = "entropy", random_state = 1)

dt_pipe = Pipeline([
    ("scaler", step1),
    ("model", model)
])

dt_pipe.fit(X_train, y_train)
pred_test = dt_pipe.predict(X_test)
print("Training Accuracy :", dt_pipe.score(X_train, y_train))
print("Testing Accuracy :", dt_pipe.score(X_test, y_test))
print("CV Score :", cross_val_score(dt_pipe, X_train, y_train, cv = 5).mean())
print("MAE :", mean_absolute_error(y_test, pred_test))

Training Accuracy : 1.0
Testing Accuracy : 0.9916492693110647
CV Score : 0.9899505057575805
MAE : 0.008350730688935281


In [147]:
final = pd.DataFrame({
    "Purpose": "credit_card",
    "Interest Rate": 11.32,
    "Installment": 5600,
    "Annual Income": 890000,
    "DTI": 567,
    "Fico": 500,
    "Days_CR_Line": 110,
    "Revolve Balance": 8000,
    "Revolve Util": 67,
    "Inq_6months": 5,
    "Delinq_2years": 4,
    "Public Records": 8,
    "Not Fully Paid": 1,
}, index = [0])

final["Interest Rate"] = final["Interest Rate"]/100
final["Inq_6months"] = final["Inq_6months"].clip(upper = 5)
final["Delinq_2years"] = final["Delinq_2years"].clip(upper = 3)
final["Public Records"] = final["Public Records"].clip(upper = 1)
loaded.predict(final)

array([0], dtype=int64)

In [104]:
loaded.predict(final)

array([1], dtype=int64)

In [50]:
import pickle
with open("loan_dt_model.pkl", "wb") as file:
    pickle.dump(pipe, file)

In [55]:
with open("loan_dt_model.pkl", "rb") as file:
    loaded = pickle.load(file)