In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

In [None]:
# Read the exported loan records (path is relative to the notebook's working
# directory) into `df`, then show the first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer="loan_data_export.csv")
df.head(n=5)

In [None]:
# Ratio of `loanamount` to `requestedamount`, computed column-wise rather than
# with a row-wise `apply` (one Python call per row, and it misbehaves on an
# empty frame). Rows whose requested amount is 0 get a ratio of 0 instead of
# the inf/NaN the raw division produces; NaN inputs still propagate as NaN.
df['loan_to_request_ratio'] = (
    (df['loanamount'] / df['requestedamount'])
    .where(df['requestedamount'] != 0, 0)
)

# 1 when the interest rate is strictly above 6, else 0 (NaN rates compare
# False and map to 0). The threshold is presumably in percent — TODO confirm.
df['is_high_interest'] = (df['interestrate'] > 6).astype(int)

def term_category(term):
    """Bucket a loan term into one of three ordinal categories.

    Args:
        term: Loan term as a number. The unit is not visible here
            (presumably months — TODO confirm).

    Returns:
        0 when ``term <= 60``, 1 when ``60 < term <= 180``, otherwise 2.
        A NaN term fails both comparisons and therefore also yields 2.
    """
    # Upper bounds are inclusive; the first bound the term fits under wins.
    for bucket, upper_bound in enumerate((60, 180)):
        if term <= upper_bound:
            return bucket
    return 2

# Ordinal term bucket (0/1/2) produced element-wise by term_category.
df['loan_term_category'] = df['term'].map(term_category)

# log(1 + x) transforms of the two amount columns.
df['log_requestedamount'] = df['requestedamount'].pipe(np.log1p)
df['log_loanamount'] = df['loanamount'].pipe(np.log1p)

# Columns fed to the models. 'loantype_encoded' is not created in this
# notebook section, so it is expected to already exist in the loaded data.
features = [
    'log_requestedamount',
    'log_loanamount',
    'interestrate',
    'loan_to_request_ratio',
    'is_high_interest',
    'loan_term_category',
    'loantype_encoded',
]

# Design matrix and target.
X, y = df[features], df['high_risk']

In [None]:
plt.figure(figsize=(10, 6))

# Pairwise correlations between every model feature and the target column,
# drawn as an annotated heatmap on the figure created above.
corr_matrix = df[[*features, 'high_risk']].corr()
sns.heatmap(data=corr_matrix, annot=True, cmap='coolwarm')

plt.title('Correlation Heatmap with Target')
plt.show()

In [None]:
# Hold out 30% of the rows for evaluation. `stratify=y` keeps the class
# proportions of the target identical in both partitions; a purely random
# split of an imbalanced target can leave the test set with a skewed (or
# nearly empty) minority class, which distorts F1 and ROC AUC.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Oversample the minority class on the training partition only, so the test
# set keeps the original class distribution and contains no synthetic rows.
# NOTE(review): SMOTE interpolates between neighbouring samples, so synthetic
# rows can carry fractional values in the 0/1 flag and category-code feature
# columns — consider SMOTENC if that matters for the models.
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

In [None]:
# Candidate classifiers, all trained on the SMOTE-balanced training set.
# Every seeded model uses the same random_state so that re-running the
# notebook reproduces the reported numbers.
models = {
    'RandomForest': RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42),
    'XGBoost': XGBClassifier(learning_rate=0.05, max_depth=4, n_estimators=200, use_label_encoder=False, eval_metric='logloss', random_state=42),
    'NaiveBayes': GaussianNB(),
    'DecisionTree': DecisionTreeClassifier(max_depth=10, random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5)
}

# Per-model metrics on the untouched test set, keyed by model name.
results = {}

for name, model in models.items():
    model.fit(X_train_bal, y_train_bal)
    y_pred = model.predict(X_test)

    # ROC AUC needs a continuous score for the positive class. If a model
    # cannot provide one, record NaN instead of scoring a made-up constant
    # vector: constant scores give an AUC of 0.5, which would be
    # indistinguishable from a genuine chance-level result.
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test, y_proba)
    else:
        y_proba = None
        roc_auc = float("nan")

    results[name] = {
        "accuracy": accuracy_score(y_test, y_pred),
        "f1_score": f1_score(y_test, y_pred),
        "roc_auc": roc_auc,
        "report": classification_report(y_test, y_pred),
        "conf_matrix": confusion_matrix(y_test, y_pred)
    }

results

In [None]:
# One accuracy per model, in the insertion order of `results`.
model_names = list(results)
accuracies = [results[model_name]["accuracy"] for model_name in model_names]

plt.figure(figsize=(10, 6))
bars = plt.bar(model_names, accuracies, color='skyblue')
plt.title('Model Accuracy Comparison')
plt.ylabel('Accuracy')
plt.ylim(0, 1)

# Print each accuracy (two decimals) centred just above the top of its bar.
for rect, score in zip(bars, accuracies):
    label_x = rect.get_x() + rect.get_width() / 2
    label_y = rect.get_height() + 0.01
    plt.text(label_x, label_y, f'{score:.2f}', ha='center', va='bottom')

plt.show()