# Credit Risk Model

This notebook implements a credit risk model with three classification algorithms: Logistic Regression, Random Forest, and XGBoost. For each algorithm it reports the accuracy and AUC score on a held-out test set, and it plots a confusion matrix and per-class ROC curves to assess performance.

### Import libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, roc_curve
from xgboost import XGBClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder

### Class for the Credit Risk Model

In [None]:
class CreditRiskModel:
    """Train and evaluate credit-score classifiers on a tabular dataset.

    The constructor splits ``data`` 80/20 into a training and a test
    partition with a fixed ``random_state``. ``train_classifier`` fits the
    scaler and the label encoder on the training partition only and then
    applies them unchanged to the test partition, so held-out rows never
    influence preprocessing and both partitions share one label mapping.
    """

    def __init__(self, data):
        """Store ``data``, split it, and create the (unfitted) preprocessors.

        Args:
            data: Table indexable by column name that contains every column
                in ``predictor_columns`` plus ``response_column``.
        """
        self.data = data
        self.predictor_columns = ['Age', 'Annual_Income', 'Num_Bank_Accounts', 'Num_Credit_Card',
                                  'Num_of_Loan', 'Outstanding_Debt', 'Monthly_Balance', 'Credit_Utilization_Ratio']
        self.response_column = 'Score_Category'
        self.model = None
        self.scaler = StandardScaler()
        self.data_train, self.data_test = train_test_split(self.data, test_size=0.2, random_state=42)

        # Initialize LabelEncoder for converting categorical target labels to numerical values
        self.label_encoder = LabelEncoder()

    def preprocess_data(self, data, fit: bool = True):
        """Scale the predictors and encode the response of ``data``.

        Args:
            data: Table containing the predictor and response columns.
            fit: When True (the default, matching the historical behaviour)
                the scaler and label encoder are re-fitted on ``data`` before
                transforming it. Pass False for held-out data so that the
                statistics and label mapping learned from the training
                partition are reused instead of being overwritten.

        Returns:
            Tuple ``(X_scaled, y_encoded)``.
        """
        X = data[self.predictor_columns]
        y = data[self.response_column]

        if fit:
            # Learn the label mapping and scaling statistics from this data.
            y_encoded = self.label_encoder.fit_transform(y)
            X_scaled = self.scaler.fit_transform(X)
        else:
            # Reuse what was learned earlier; nothing is re-fitted here.
            y_encoded = self.label_encoder.transform(y)
            X_scaled = self.scaler.transform(X)

        return X_scaled, y_encoded

    def _build_model(self, model_type: str):
        """Return a new, unfitted estimator for ``model_type``.

        Raises:
            ValueError: If ``model_type`` is not one of the supported names.
        """
        if model_type == 'logistic':
            # NOTE(review): ``multi_class`` is reported as deprecated in
            # recent scikit-learn releases - confirm against the pinned version.
            return LogisticRegression(max_iter=1000, multi_class='ovr')
        if model_type == 'random_forest':
            return RandomForestClassifier(n_estimators=100, random_state=42)
        if model_type == 'xgboost':
            return XGBClassifier(n_estimators=100, random_state=42)
        raise ValueError("Model type not supported. Choose 'logistic', 'random_forest' or 'xgboost'.")

    def _binarize_labels(self, y):
        """One-hot encode ``y`` with one column per class of the fitted model.

        The binarizer is fitted on ``self.model.classes_`` rather than on
        ``y`` so that the columns always line up with ``predict_proba``,
        even when ``y`` does not contain every class.
        """
        binarizer = LabelBinarizer().fit(self.model.classes_)
        y_bin = binarizer.transform(y)
        if y_bin.shape[1] == 1:
            # Two-class problems come back as a single column; expand it to
            # two so it matches the two probability columns.
            y_bin = np.hstack([1 - y_bin, y_bin])
        return y_bin

    def _tick_labels(self, n_classes):
        """Return axis labels for an ``n_classes``-sized confusion matrix.

        Uses the original label values stored by the fitted encoder, in the
        encoder's own order. Falls back to seaborn's automatic labels when
        the encoder is unfitted or its class count does not match.
        """
        names = getattr(self.label_encoder, 'classes_', None)
        if names is None or len(names) != n_classes:
            return 'auto'
        return list(names)

    def train_classifier(self, model_type: str):
        """Fit the requested model on the training split and score it on the test split.

        Args:
            model_type: ``'logistic'``, ``'random_forest'`` or ``'xgboost'``.

        Returns:
            Tuple ``(validation_accuracy, auc_score, conf_matrix, X_test, y_test)``
            where ``X_test`` / ``y_test`` are the preprocessed test partition.

        Raises:
            ValueError: If ``model_type`` is not supported (raised before any
                data is preprocessed).
        """
        # Build first so an unsupported name fails before any preprocessing.
        estimator = self._build_model(model_type)

        # Fit preprocessing on the training split only, then reuse it as-is
        # for the test split.
        X_train, y_train = self.preprocess_data(self.data_train, fit=True)
        X_test, y_test = self.preprocess_data(self.data_test, fit=False)

        # Fit the model
        estimator.fit(X_train, y_train)
        self.model = estimator

        # Predictions and probabilities
        y_pred = self.model.predict(X_test)
        y_prob = self.model.predict_proba(X_test)  # This gives probabilities for each class

        # Calculate metrics
        validation_accuracy = accuracy_score(y_test, y_pred)

        # AUC over the one-vs-rest indicator columns of the true labels
        y_test_bin = self._binarize_labels(y_test)
        auc_score = roc_auc_score(y_test_bin, y_prob, multi_class='ovr')

        # Confusion matrix with one row/column per class known to the model,
        # including classes that happen to be absent from the test split.
        conf_matrix = confusion_matrix(y_test, y_pred, labels=self.model.classes_)

        return validation_accuracy, auc_score, conf_matrix, X_test, y_test

    def plot_roc_curve(self, X_test, y_test):
        """Plot one ROC curve per class for the currently fitted model.

        Args:
            X_test: Preprocessed predictors, as returned by ``train_classifier``.
            y_test: Encoded true labels for ``X_test``.
        """
        y_prob = self.model.predict_proba(X_test)
        y_test_bin = self._binarize_labels(y_test)

        # Set up the plot
        plt.figure(figsize=(10, 8))
        n_classes = y_test_bin.shape[1]

        for i in range(n_classes):
            fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_prob[:, i])
            roc_auc = auc(fpr, tpr)
            plt.plot(fpr, tpr, lw=2, label=f'Class {i} (AUC = {roc_auc:.2f})')

        # Diagonal reference line
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic (ROC) Curve')
        plt.legend(loc="lower right")
        plt.show()

    def plot_confusion_matrix(self, conf_matrix):
        """Draw ``conf_matrix`` as an annotated heatmap.

        Tick labels are taken from the fitted label encoder so each row and
        column is named after the class it actually represents; the encoder
        orders classes by sorted label value, which need not match any
        hand-written ordering.
        """
        tick_labels = self._tick_labels(len(conf_matrix))

        plt.figure(figsize=(7, 5))
        sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False,
                    xticklabels=tick_labels, yticklabels=tick_labels)
        plt.title('Confusion Matrix')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.show()

    def evaluate_model(self, model_type: str):
        """Train ``model_type``, print its accuracy and AUC, and show both plots.

        Raises:
            ValueError: If ``model_type`` is not supported.
        """
        validation_accuracy, auc_score, conf_matrix, X_test, y_test = self.train_classifier(model_type=model_type)
        print(f"Model: {model_type}")
        print(f"Accuracy: {validation_accuracy:.4f}")
        print(f"AUC Score: {auc_score:.4f}")

        self.plot_roc_curve(X_test, y_test)
        self.plot_confusion_matrix(conf_matrix)


### Load data and evaluate

In [None]:
# Load the data
# Reads 'train_cleaned.csv' (path relative to the working directory) into a
# DataFrame bound to the notebook-level name `data`.
data = pd.read_csv('train_cleaned.csv')

# Bare expression as the last statement of the cell, so the notebook renders
# the loaded table for a quick visual check.
data

In [None]:
# Build a single CreditRiskModel over the loaded data, then run the full
# evaluation (printed metrics plus plots) for each classifier in turn.
model = CreditRiskModel(data)

for model_type in ('logistic', 'random_forest', 'xgboost'):
    model.evaluate_model(model_type)