# Credit Risk Model

This notebook implements a credit risk model using three classification algorithms: Logistic Regression, Random Forest, and XGBoost. For each algorithm, it reports the accuracy and AUC score on the test set and plots a confusion matrix and a ROC curve to assess the model's performance.

### Import libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, roc_curve
from xgboost import XGBClassifier

### Class for the Credit Risk Model

In [None]:
class CreditRiskModel:
    """Train and evaluate credit-score classifiers on a fixed train/test split.

    The class selects a fixed list of numeric predictor columns, standardizes
    them with a ``StandardScaler``, fits one of three classifiers (logistic
    regression, random forest or XGBoost) and reports accuracy, AUC, a
    confusion matrix and ROC curves on the test set.
    """

    def __init__(self, data_train, data_test):
        """Store the datasets and set up column names, scaler and model slot.

        Args:
            data_train: DataFrame used to fit the scaler and the classifier.
            data_test: DataFrame used only for evaluation.
        """
        self.data_train = data_train
        self.data_test = data_test
        # Define predictor variables
        self.predictor_columns = ['Age', 'Annual_Income', 'Num_Bank_Accounts', 'Num_Credit_Card',
                                  'Num_of_Loan', 'Outstanding_Debt', 'Monthly_Balance', 'Credit_Utilization_Ratio']
        # Response variable
        self.response_column = 'Score_Category'
        # Initialize the model and scaler
        self.model = None
        self.scaler = StandardScaler()

    def preprocess_data(self, data, *, fit=True):
        """Split ``data`` into scaled predictors and the response column.

        Args:
            data: DataFrame containing the predictor and response columns.
            fit: When True (the default, matching the historical behavior),
                the scaler is (re-)fitted on ``data`` before transforming it.
                Pass False for evaluation data so that it is scaled with the
                statistics learned from the training data.

        Returns:
            A ``(X_scaled, y)`` tuple: the scaled predictor matrix and the
            response column of ``data``.
        """
        X = data[self.predictor_columns]
        y = data[self.response_column]

        # Normalize the predictor variables
        if fit:
            X_scaled = self.scaler.fit_transform(X)
        else:
            X_scaled = self.scaler.transform(X)

        return X_scaled, y

    def _auc_score(self, y_true, proba):
        """Return the ROC AUC for binary or multiclass probability outputs."""
        if proba.shape[1] == 2:
            # Binary target: score the probability of the second class
            return roc_auc_score(y_true, proba[:, 1])
        # More than two classes: roc_auc_score needs the full probability
        # matrix and an explicit multiclass strategy (one-vs-rest here).
        return roc_auc_score(y_true, proba, multi_class='ovr', labels=self.model.classes_)

    def train_classifier(self, model_type: str):
        """Fit the requested classifier and compute test-set metrics.

        Args:
            model_type: One of ``'logistic'``, ``'random_forest'`` or
                ``'xgboost'``.

        Returns:
            A tuple ``(validation_accuracy, auc_score, conf_matrix, X_test,
            y_test)`` where ``X_test`` is the scaled test predictor matrix.

        Raises:
            ValueError: If ``model_type`` is not one of the supported names.
        """
        # Build the estimator first so an unsupported type fails before any work
        if model_type == 'logistic':
            model = LogisticRegression(max_iter=1000)
        elif model_type == 'random_forest':
            model = RandomForestClassifier(n_estimators=100, random_state=42)
        elif model_type == 'xgboost':
            model = XGBClassifier(n_estimators=100, random_state=42)
        else:
            raise ValueError("Model type not supported. Choose 'logistic', 'random_forest' or 'xgboost'.")
        self.model = model

        # Fit the scaler on the training data only, then reuse those
        # statistics for the test data so both sets share one scale.
        X_train, y_train = self.preprocess_data(self.data_train, fit=True)
        X_test, y_test = self.preprocess_data(self.data_test, fit=False)

        # Fit the model
        self.model.fit(X_train, y_train)

        # Predictions and probabilities
        y_pred = self.model.predict(X_test)
        proba = self.model.predict_proba(X_test)

        # Calculate metrics
        validation_accuracy = accuracy_score(y_test, y_pred)
        auc_score = self._auc_score(y_test, proba)

        # Confusion matrix
        conf_matrix = confusion_matrix(y_test, y_pred)

        return validation_accuracy, auc_score, conf_matrix, X_test, y_test

    def plot_roc_curve(self, X_test, y_test):
        """Plot the ROC curve(s) of the fitted model on the given test data.

        A binary target produces a single curve; a target with more than two
        classes produces one one-vs-rest curve per class.

        Args:
            X_test: Scaled test predictor matrix.
            y_test: True labels for ``X_test``.
        """
        proba = self.model.predict_proba(X_test)
        plt.figure()
        if proba.shape[1] == 2:
            y_prob = proba[:, 1]
            fpr, tpr, _ = roc_curve(y_test, y_prob)
            plt.plot(fpr, tpr, color='blue', label='ROC curve (area = %0.2f)' % roc_auc_score(y_test, y_prob))
        else:
            y_true = np.asarray(y_test)
            for idx, cls in enumerate(self.model.classes_):
                is_cls = (y_true == cls).astype(int)
                # A one-vs-rest curve is undefined when the class is absent
                # from (or is the only class in) the test labels.
                if is_cls.min() == is_cls.max():
                    continue
                fpr, tpr, _ = roc_curve(is_cls, proba[:, idx])
                area = roc_auc_score(is_cls, proba[:, idx])
                plt.plot(fpr, tpr, label='Class %s vs rest (area = %0.2f)' % (cls, area))
        plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic')
        plt.legend(loc='lower right')
        plt.show()

    def plot_confusion_matrix(self, conf_matrix):
        """Plot ``conf_matrix`` as an annotated heatmap.

        Args:
            conf_matrix: Square matrix of counts as returned by
                ``confusion_matrix``.
        """
        # NOTE(review): confusion_matrix orders rows/columns by sorted label
        # value; these names assume that order is Poor, Standard, Good --
        # confirm against how 'Score_Category' is encoded.
        class_names = ['Poor', 'Standard', 'Good']
        # Only use the fixed names when they match the matrix size; otherwise
        # let seaborn pick tick labels instead of failing on a mismatch.
        tick_labels = class_names if len(conf_matrix) == len(class_names) else 'auto'

        plt.figure(figsize=(7, 5))
        sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False,
                    xticklabels=tick_labels, yticklabels=tick_labels)
        plt.title('Confusion Matrix')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.show()

    def evaluate_model(self, model_type: str):
        """Train ``model_type``, print its metrics and show its plots.

        Args:
            model_type: One of ``'logistic'``, ``'random_forest'`` or
                ``'xgboost'``.

        Raises:
            ValueError: If ``model_type`` is not supported.
        """
        # Evaluate the model and get the metrics
        validation_accuracy, auc_score, conf_matrix, X_test, y_test = self.train_classifier(model_type=model_type)
        print(f"Model: {model_type}")
        print(f"Accuracy: {validation_accuracy:.4f}")
        print(f"AUC Score: {auc_score:.4f}")

        self.plot_roc_curve(X_test, y_test)

        self.plot_confusion_matrix(conf_matrix)

### Load data and evaluate

In [None]:
# Read the prepared training and test sets from CSV files in the working directory
data_train = pd.read_csv('train_cleaned.csv')
data_test = pd.read_csv('data_test.csv')

# Wrap both datasets in a single CreditRiskModel
classifier = CreditRiskModel(data_train=data_train, data_test=data_test)

# Train each supported algorithm in turn, printing its metrics and showing its plots
for model in ('logistic', 'random_forest', 'xgboost'):
    classifier.evaluate_model(model)