### Random forest

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, learning_curve
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.under_sampling import RandomUnderSampler
import pickle

In [None]:
# Load the dataset
# Reads the CSV file "TobeBalanced1.csv" (path relative to the working
# directory) into the DataFrame `df_3`, which the following cells use.
df_3 = pd.read_csv("TobeBalanced1.csv")

In [None]:
# Separate the label column from the predictors:
#   y -> the 'Default' column (target)
#   X -> every remaining column (features)
y = df_3['Default']
X = df_3.drop(columns='Default')

In [None]:
# Apply RandomUnderSampler to the entire dataset to handle class imbalance
under_sampler = RandomUnderSampler(random_state=42)
X_under, y_under = under_sampler.fit_resample(X, y)

In [None]:
# Check the class distribution after undersampling
print("Class distribution after undersampling:")
print(pd.Series(y_under).value_counts())

In [None]:
# Train-test split on the undersampled dataset
X_train, X_test, y_train, y_test = train_test_split(X_under, y_under, test_size=0.2, random_state=42, stratify=y_under)

In [None]:
# Baseline model: a random forest with default hyperparameters, a fixed seed
# for reproducibility and class_weight='balanced', fitted on the training split.
rf_clf = RandomForestClassifier(
    class_weight='balanced',
    random_state=42,
)
rf_clf.fit(X=X_train, y=y_train)

In [None]:
# Predict labels with the baseline forest for the training split and the test split
y_train_pred, y_test_pred = (
    rf_clf.predict(features) for features in (X_train, X_test)
)

In [None]:
# Fraction of correctly classified samples on each split
train_accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)

In [None]:
# Confusion matrices (actual vs. predicted labels) for the train and test splits
conf_matrix_train, conf_matrix_test = (
    confusion_matrix(actual, predicted)
    for actual, predicted in ((y_train, y_train_pred), (y_test, y_test_pred))
)

In [None]:
# Plot heatmaps for confusion matrix
def plot_confusion_matrix_heatmap(conf_matrix, title):
    """Draw a confusion matrix as an annotated heatmap and display it.

    Parameters
    ----------
    conf_matrix : the matrix of counts to draw (passed straight to seaborn).
    title : text appended to the "Confusion Matrix - " figure title.

    The plot uses an 8x6 inch figure, the 'Blues' colormap, cell annotations
    in general number format and no colorbar. The figure is shown immediately
    via ``plt.show()``; nothing is returned.
    """
    _, axes = plt.subplots(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='g', cmap='Blues', cbar=False, ax=axes)
    axes.set_title(f'Confusion Matrix - {title}')
    axes.set_xlabel('Predicted')
    axes.set_ylabel('Actual')
    plt.show()

In [None]:
# Show the baseline model's confusion matrices, training split first
plot_confusion_matrix_heatmap(conf_matrix=conf_matrix_train, title="Training Set")
plot_confusion_matrix_heatmap(conf_matrix=conf_matrix_test, title="Testing Set")

In [None]:
# Text reports (per-class precision / recall / F1) for both splits
classification_rep_train = classification_report(y_true=y_train, y_pred=y_train_pred)
classification_rep_test = classification_report(y_true=y_test, y_pred=y_test_pred)

In [None]:
# Baseline model summary: accuracies first, then the per-class reports
print("\nTrain Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

# sep="\n" places each report on the line after its heading
print("\nTrain Classification Report:", classification_rep_train, sep="\n")
print("\nTest Classification Report:", classification_rep_test, sep="\n")

In [None]:
# Candidate values that the randomized hyperparameter search samples from
param_distributions = dict(
    n_estimators=[50, 100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
    class_weight=['balanced', 'balanced_subsample'],
)

In [None]:
# Fresh, unfitted forest that serves as the base estimator for the hyperparameter search.
# NOTE: this rebinds `rf_clf`, so the baseline model fitted earlier is no longer
# reachable through this name.
rf_clf = RandomForestClassifier(random_state=42)

In [None]:
# Randomized search over `param_distributions`:
# 20 sampled candidates, each scored with 5-fold cross-validation,
# run on all available cores with progress messages enabled.
random_search = RandomizedSearchCV(
    estimator=rf_clf,
    param_distributions=param_distributions,
    n_iter=20,
    cv=5,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Run the search on the training split and keep the winning estimator
# (fit() returns the search object itself, so the attribute can be read directly)
best_rf_clf = random_search.fit(X_train, y_train).best_estimator_

In [None]:
# Print the best parameters
# (`best_params_` is read from the fitted search object `random_search`)
print("Best parameters found: ", random_search.best_params_)

In [None]:
# No additional training is needed here: RandomizedSearchCV uses refit=True by
# default, so `best_estimator_` has already been refitted on the whole of
# X_train / y_train with the winning parameters. Fitting it again would only
# repeat the same (seeded, hence identical) training run.
best_rf_clf = random_search.best_estimator_

In [None]:
# Predict labels with the tuned forest for the training split and the test split
y_train_pred, y_test_pred = map(best_rf_clf.predict, (X_train, X_test))

In [None]:
# Accuracy of the tuned model on each split
train_accuracy, test_accuracy = (
    accuracy_score(actual, predicted)
    for actual, predicted in ((y_train, y_train_pred), (y_test, y_test_pred))
)

In [None]:
# Confusion matrices of the tuned model for the train and test splits
conf_matrix_train = confusion_matrix(y_true=y_train, y_pred=y_train_pred)
conf_matrix_test = confusion_matrix(y_true=y_test, y_pred=y_test_pred)


In [None]:
# Show the tuned model's confusion matrices, training split first
plot_confusion_matrix_heatmap(conf_matrix=conf_matrix_train, title="Best Model - Training Set")
plot_confusion_matrix_heatmap(conf_matrix=conf_matrix_test, title="Best Model - Testing Set")

In [None]:
# Text reports (per-class precision / recall / F1) for the tuned model
classification_rep_train, classification_rep_test = (
    classification_report(y_train, y_train_pred),
    classification_report(y_test, y_test_pred),
)

In [None]:
# Tuned model summary: accuracies first, then the per-class reports
print("\nBest Model Train Accuracy:", train_accuracy)
print("Best Model Test Accuracy:", test_accuracy)
# sep="\n" places each report on the line after its heading
print("\nBest Model Train Classification Report:", classification_rep_train, sep="\n")
print("\nBest Model Test Classification Report:", classification_rep_test, sep="\n")


In [None]:
# Plot Learning Curve to detect overfitting/underfitting
def plot_learning_curve(estimator, X, y, title="Learning Curve", cv=None, n_jobs=None, train_sizes=np.linspace(0.1, 1.0, 5)):
    """Plot training and cross-validation scores against training-set size.

    Parameters
    ----------
    estimator : the model handed to ``learning_curve``.
    X, y : features and labels the curve is computed on.
    title : figure title.
    cv, n_jobs, train_sizes : forwarded unchanged to ``learning_curve``.

    Returns
    -------
    The ``matplotlib.pyplot`` module, so the caller decides when to call
    ``show()`` (this function does not display the figure itself).
    """
    plt.figure(figsize=(10, 6))
    plt.title(title)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    # One row of scores per training size, one column per CV fold
    sizes_used, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes
    )

    # (mean across folds, std across folds, colour, legend label) per curve
    curves = (
        (fit_scores.mean(axis=1), fit_scores.std(axis=1), "r", "Training score"),
        (cv_scores.mean(axis=1), cv_scores.std(axis=1), "g", "Cross-validation score"),
    )

    plt.grid()
    # Shaded +/- one standard deviation bands first ...
    for centre, spread, colour, _ in curves:
        plt.fill_between(sizes_used, centre - spread, centre + spread, alpha=0.1, color=colour)
    # ... then the mean-score lines on top
    for centre, _, colour, legend_label in curves:
        plt.plot(sizes_used, centre, 'o-', color=colour, label=legend_label)

    plt.legend(loc="best")
    return plt

In [None]:
# Learning curve of the tuned forest on the training data (5-fold CV).
# plot_learning_curve returns pyplot, so the figure can be shown directly.
plot_learning_curve(
    estimator=best_rf_clf,
    X=X_train,
    y=y_train,
    title="Random Forest Learning Curve",
    cv=5,
).show()

In [None]:
# Save the trained model
# The tuned estimator `best_rf_clf` is serialised with pickle into
# 'loan_underSample3(RF).pkl' in the current working directory ('wb' = binary write).
# NOTE: pickle files can execute arbitrary code when loaded; only unpickle this
# file from a trusted location.
with open('loan_underSample3(RF).pkl', 'wb') as file:
    pickle.dump(best_rf_clf, file)

# Confirmation message for the notebook output
print("Model saved as 'loan_underSample3(RF).pkl'")