In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import joblib

# Load the dataset
df = pd.read_csv("loan_data.csv")

# Data Exploration
df.info()
df.describe()
df.isnull().sum().sum()

# Preprocessing
df['purpose'] = LabelEncoder().fit_transform(df['purpose'])

# Data Visualization
sns.set_style('darkgrid')
plt.hist(df['fico'].loc[df['credit.policy'] == 1], bins=30, label='Credit.Policy=1')
plt.hist(df['fico'].loc[df['credit.policy'] == 0], bins=30, label='Credit.Policy=0')
plt.legend()
plt.xlabel('FICO')
plt.show()

plt.figure(figsize=(10,6))
df[df['not.fully.paid'] == 1]['fico'].hist(bins=30, alpha=0.5, color='blue', label='not.fully.paid=1')
df[df['not.fully.paid'] == 0]['fico'].hist(bins=30, alpha=0.5, color='green', label='not.fully.paid=0')
plt.legend()
plt.xlabel('FICO')
plt.show()

plt.figure(figsize=(12,6))
sns.countplot(data=df, x='purpose', hue='not.fully.paid')
plt.show()

plt.figure(figsize=(10,6))
sns.jointplot(x='fico', y='int.rate', data=df)
plt.show()

sns.lmplot(data=df, x='fico', y='int.rate', hue='credit.policy', col='not.fully.paid', palette='Set2')
plt.show()

plt.figure(figsize=(20, 15)) 
sns.heatmap(df.corr(), cmap='BuPu', annot=True)
plt.show()

# Feature and target variables
X = df.drop('not.fully.paid', axis=1)
y = df['not.fully.paid']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

# Model training and evaluation
# Decision Tree
dt_clf = DecisionTreeClassifier(max_depth=2)
dt_clf.fit(X_train, y_train)
y_pred_test = dt_clf.predict(X_test)

print("Decision Tree")
print("Confusion Matrix \n", confusion_matrix(y_test, y_pred_test))
print("\nClassification Report\n", classification_report(y_test, y_pred_test))
print('Test Accuracy score:', accuracy_score(y_test, y_pred_test))

# Random Forest
rf_clf = RandomForestClassifier(n_estimators=600)
rf_clf.fit(X_train, y_train)
y_pred_test = rf_clf.predict(X_test)

print("Random Forest")
print("Confusion Matrix \n", confusion_matrix(y_test, y_pred_test))
print("\nClassification Report\n", classification_report(y_test, y_pred_test))
print('Test Accuracy score:', accuracy_score(y_test, y_pred_test))

# Gradient Boosting
gb_clf = GradientBoostingClassifier(learning_rate=0.05)
gb_clf.fit(X_train, y_train)
y_pred_test = gb_clf.predict(X_test)

print("Gradient Boosting")
print("Confusion Matrix \n", confusion_matrix(y_test, y_pred_test))
print("\nClassification Report\n", classification_report(y_test, y_pred_test))
print('Test Accuracy score:', accuracy_score(y_test, y_pred_test))

# Save models
joblib.dump(dt_clf, 'models/decision_tree.pkl')
joblib.dump(rf_clf, 'models/random_forest.pkl')
joblib.dump(gb_clf, 'models/gradient_boosting.pkl')


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: decision_tree
Test Accuracy: 0.8458594293667363
Confusion Matrix:
 [[2431    0]
 [ 443    0]]
Classification Report:
 {'0': {'precision': 0.8458594293667363, 'recall': 1.0, 'f1-score': 0.9164938737040528, 'support': 2431.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 443.0}, 'accuracy': 0.8458594293667363, 'macro avg': {'precision': 0.42292971468336815, 'recall': 0.5, 'f1-score': 0.4582469368520264, 'support': 2874.0}, 'weighted avg': {'precision': 0.7154781742486207, 'recall': 0.8458594293667363, 'f1-score': 0.7752249850294197, 'support': 2874.0}}
Model: random_forest
Test Accuracy: 0.8465553235908142
Confusion Matrix:
 [[2424    7]
 [ 434    9]]
Classification Report:
 {'0': {'precision': 0.8481455563331001, 'recall': 0.9971205265322912, 'f1-score': 0.9166193987521271, 'support': 2431.0}, '1': {'precision': 0.5625, 'recall': 0.020316027088036117, 'f1-score': 0.0392156862745098, 'support': 443.0}, 'accuracy': 0.8465553235908142, 'macro avg': {'precision'