#  Credit Risk Prediction using Machine Learning and Resampling Techniques
---
**Objective:** Predict whether a loan applicant is a good or bad credit risk using multiple ML algorithms, comparing three ways of handling class imbalance: no resampling (baseline), Random Oversampling, and SMOTE.

**Dataset:** [UCI Statlog German Credit Data](https://archive.ics.uci.edu/dataset/144/statlog+german+credit+data)


In [None]:
# Install dependencies
!pip install imbalanced-learn xgboost tqdm seaborn matplotlib
# Install EBM model (Explainable Boosting Machine)
!pip install interpret


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
drive.mount('/content/drive')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from interpret.glassbox import ExplainableBoostingClassifier


from imblearn.over_sampling import RandomOverSampler, SMOTE
from tqdm import tqdm

### Load Dataset

In [None]:
# Location of the raw German Credit file on the mounted Drive.
url = '/content/drive/MyDrive/statlog/german.data'  # update path if necessary

# Column names are applied positionally; the final column is the label.
column_names = [
    'status',
    'duration',
    'credit_history',
    'purpose',
    'credit_amount',
    'savings',
    'employment',
    'installment_rate',
    'personal_status',
    'guarantors',
    'residence_since',
    'property',
    'age',
    'other_installment_plans',
    'housing',
    'existing_credits',
    'job',
    'liable_people',
    'telephone',
    'foreign_worker',
    'target',
]

# Read the space-separated file, recode the label (1 -> 0 = Good, 2 -> 1 = Bad),
# then one-hot encode the categorical columns, dropping the first level of each.
df = pd.get_dummies(
    pd.read_csv(url, names=column_names, sep=' ').assign(
        target=lambda frame: frame['target'].map({1: 0, 2: 1})
    ),
    drop_first=True,
)

y = df['target']
X = df.drop(columns='target')

# 70/30 split, stratified on the label so both splits keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=42
)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

print('Data loaded and preprocessed successfully.')

###  Define Models

In [None]:
# Candidate classifiers, keyed by the display name that appears in the results
# table and plots. Estimators that accept a seed are given random_state=42 so
# repeated runs are comparable.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'KNN': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB(),
    # probability=True so the SVM exposes predict_proba, which evaluate_model
    # uses for ROC-AUC scoring.
    'SVM (RBF)': SVC(kernel='rbf', probability=True, random_state=42),
    # NOTE(review): use_label_encoder is reported as deprecated by recent
    # XGBoost releases -- confirm against the installed version before removing.
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    'Explainable Boosting (EBM)': ExplainableBoostingClassifier(random_state=42),
}

###  Evaluation Function

In [None]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and score it on the test data.

    Positive-class scores come from ``predict_proba`` when the estimator has
    that method; otherwise the output of ``predict`` is used after NaN/inf
    values are replaced and the result is clipped to [0, 1]. Scores are
    thresholded at 0.5 to obtain hard labels.

    Returns a dict with the keys 'Accuracy', 'Precision', 'Recall', 'F1' and
    'ROC-AUC'. If anything raises along the way, the failure is printed and
    every metric is returned as NaN instead of propagating the error.
    """
    try:
        model.fit(X_train, y_train)

        if not hasattr(model, 'predict_proba'):
            # No probability output: sanitise whatever predict() returns so it
            # can be thresholded and passed to roc_auc_score.
            raw_output = model.predict(X_test)
            raw_output = np.nan_to_num(raw_output, nan=0.5, posinf=1.0, neginf=0.0)
            scores = np.clip(raw_output, 0, 1)
        else:
            scores = model.predict_proba(X_test)[:, 1]

        hard_labels = (scores >= 0.5).astype(int)

        summary = {}
        summary['Accuracy'] = accuracy_score(y_test, hard_labels)
        summary['Precision'] = precision_score(y_test, hard_labels)
        summary['Recall'] = recall_score(y_test, hard_labels)
        summary['F1'] = f1_score(y_test, hard_labels)
        # ROC-AUC is computed from the continuous scores, not the hard labels.
        summary['ROC-AUC'] = roc_auc_score(y_test, scores)
        return summary

    except Exception as err:
        print(f"⚠️ {type(model).__name__} failed: {err}")
        metric_names = ('Accuracy', 'Precision', 'Recall', 'F1', 'ROC-AUC')
        return dict.fromkeys(metric_names, np.nan)


###  Sampling Methods and Model Evaluation

In [None]:
# Resampling strategies to compare; None means "train on the data as-is".
samplers = {
    'Original': None,
    'RandomOverSampler': RandomOverSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42),
}

results = []
for sampler_name, sampler in samplers.items():
    # Only the training split is resampled; the test split is scored untouched.
    X_res, y_res = (
        sampler.fit_resample(X_train, y_train) if sampler else (X_train, y_train)
    )

    print(f'\n=== Sampling: {sampler_name} ===')
    for name, model in tqdm(models.items()):
        # One row per (model, sampling) pair: the metrics plus identifying labels.
        metrics = {
            **evaluate_model(model, X_res, y_res, X_test, y_test),
            'Model': name,
            'Sampling': sampler_name,
        }
        results.append(metrics)

# Put the identifying columns first, followed by the metrics.
column_order = ['Model', 'Sampling', 'Accuracy', 'Precision', 'Recall', 'F1', 'ROC-AUC']
results_df = pd.DataFrame(results)[column_order]

# Show the table grouped by sampling method, best ROC-AUC first within each group.
display(results_df.sort_values(by=['Sampling', 'ROC-AUC'], ascending=[True, False]))

###  Visualization

In [None]:
# Grouped bar chart of test ROC-AUC: one cluster per model, one bar per
# sampling method.
plt.figure(figsize=(12, 6))
ax = sns.barplot(x='Model', y='ROC-AUC', hue='Sampling', data=results_df)
ax.set_title('Model Performance Comparison by Sampling Method')
# Tilt the model names so they do not overlap.
plt.xticks(rotation=45)
plt.show()

###  Confusion Matrix for Best Model

In [None]:
# Pick the (model, sampling) pair with the highest test ROC-AUC. sort_values
# places NaN scores last by default, so a failed run is only selected if every
# run failed.
best = results_df.sort_values('ROC-AUC', ascending=False).iloc[0]
print(f"Best Model: {best['Model']} | Sampling: {best['Sampling']}")

# Rebuild the training set the winning run used ('Original' maps to None).
sampler = samplers[best['Sampling']]
if sampler is not None:
    X_res, y_res = sampler.fit_resample(X_train, y_train)
else:
    X_res, y_res = X_train, y_train

# The estimator in `models` was last fitted on whichever sampling ran last,
# so refit it on the winning training set before predicting.
model = models[best['Model']]
model.fit(X_res, y_res)

# Derive hard labels exactly as evaluate_model does, so the matrix matches the
# metrics reported in the results table.
if hasattr(model, 'predict_proba'):
    y_prob = model.predict_proba(X_test)[:, 1]
else:
    y_prob = model.predict(X_test)
    y_prob = np.nan_to_num(y_prob, nan=0.5, posinf=1.0, neginf=0.0)
    y_prob = np.clip(y_prob, 0, 1)
y_pred = (y_prob >= 0.5).astype(int)

# Pin the label order so the matrix is always 2x2 (rows = actual, columns =
# predicted), even if the model predicts a single class.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
class_names = ['Good (0)', 'Bad (1)']
ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Purples',
    xticklabels=class_names,
    yticklabels=class_names,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.title(f"Confusion Matrix - {best['Model']} ({best['Sampling']})")
plt.show()