In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [5]:
class CreditRiskModel:
    """Load a credit-risk CSV, filter it, and compare several classifiers.

    The pipeline is: read the CSV, fill missing values in two columns,
    keep only the rows inside the configured age / income / loan-amount
    window (and, optionally, a single home-ownership category), split into
    train and test sets, standardise the numeric features, one-hot encode the
    categorical features, then fit and score each model returned by
    ``get_models``.

    Parameters
    ----------
    file_path : path of the CSV file passed to ``pandas.read_csv``.
    min_age, max_age : inclusive bounds applied to ``person_age``.
    min_income, max_income : inclusive bounds applied to ``person_income``.
    min_loan_amount : inclusive lower bound applied to ``loan_amnt``.
    random_state : seed used for the train/test split and for the tree-based
        models.
    train_size : fraction (or count) of rows used for training, forwarded to
        ``train_test_split``.
    home_ownership : value that ``person_home_ownership`` must equal for a row
        to be kept; ``None`` disables this filter. Defaults to ``'OWN'``.
    """

    # Constants used to fill missing values.
    # NOTE(review): presumably column medians of the full dataset - confirm.
    EMP_LENGTH_FILL = 4
    INT_RATE_FILL = 10.99

    # dtypes treated as numeric (scaled) and categorical (one-hot encoded).
    NUMERIC_DTYPES = ['int64', 'float64']
    CATEGORICAL_DTYPES = ['object']

    # Columns used as model inputs and the column used as the label.
    FEATURE_COLUMNS = ['person_age', 'person_income', 'person_home_ownership', 'person_emp_length', 'loan_intent',
                       'loan_grade', 'loan_amnt', 'loan_int_rate', 'loan_percent_income',
                       'cb_person_default_on_file', 'cb_person_cred_hist_length']
    TARGET_COLUMN = 'loan_status'

    def __init__(self, file_path, min_age, max_age, min_income, max_income, min_loan_amount, random_state=4,
                 train_size=0.75, home_ownership='OWN'):
        self.file_path = file_path
        self.min_age = min_age
        self.max_age = max_age
        self.min_income = min_income
        self.max_income = max_income
        self.min_loan_amount = min_loan_amount
        self.random_state = random_state
        self.train_size = train_size
        self.home_ownership = home_ownership
        # Every attribute populated later in the pipeline is declared here so
        # that the object's state is discoverable from the constructor.
        self.df = None
        self.x = None
        self.y = None
        self.x_train = None
        self.x_test = None
        self.y_train = None
        self.y_test = None
        self.x_train_transformed = None
        self.x_test_transformed = None
        self.scaler = StandardScaler()
        # handle_unknown='ignore' keeps transform() from raising when the test
        # split contains a category that never appeared in the training split;
        # such values are encoded as all zeros (the same code as the dropped
        # first category).
        self.encoder = OneHotEncoder(drop='first', handle_unknown='ignore')

    def load_dataset(self):
        """Read the CSV at ``self.file_path`` into ``self.df``."""
        self.df = pd.read_csv(self.file_path)

    def handle_missing_values(self):
        """Fill missing ``person_emp_length`` and ``loan_int_rate`` values with fixed constants."""
        self.df['person_emp_length'] = self.df['person_emp_length'].fillna(value=self.EMP_LENGTH_FILL)
        self.df['loan_int_rate'] = self.df['loan_int_rate'].fillna(value=self.INT_RATE_FILL)

    def filter_dataframe(self):
        """Keep only the rows that fall inside the configured filters.

        All numeric bounds are inclusive. Rows whose compared value is missing
        are dropped because the comparisons evaluate to False for them.
        """
        df = self.df
        mask = (df['person_age'].between(self.min_age, self.max_age)
                & df['person_income'].between(self.min_income, self.max_income)
                & (df['loan_amnt'] >= self.min_loan_amount))
        if self.home_ownership is not None:
            mask = mask & (df['person_home_ownership'] == self.home_ownership)
        # copy() detaches the result from the unfiltered frame so that later
        # column assignments do not act on a view.
        self.df = df[mask].copy()

    def extract_features_and_target(self):
        """Store the feature columns in ``self.x`` and the label column in ``self.y``."""
        self.x = self.df[self.FEATURE_COLUMNS]
        self.y = self.df[self.TARGET_COLUMN]

    def split_train_test_data(self):
        """Split ``self.x`` / ``self.y`` into train and test parts using the configured seed and size."""
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(self.x, self.y,
                                                                                random_state=self.random_state,
                                                                                train_size=self.train_size)

    def preprocess_data(self, x):
        """Return ``x`` with numeric columns scaled and categorical columns one-hot encoded.

        ``self.scaler`` and ``self.encoder`` must already be fitted. The
        returned frame has the scaled numeric columns first, followed by the
        encoded categorical columns, and keeps the row index of ``x``.
        """
        x_num = x.select_dtypes(include=self.NUMERIC_DTYPES)
        x_num_pre = pd.DataFrame(self.scaler.transform(x_num), columns=x_num.columns, index=x.index)

        x_cat = x.select_dtypes(include=self.CATEGORICAL_DTYPES)
        x_cat_encoded = pd.DataFrame(self.encoder.transform(x_cat).toarray(),
                                     columns=self.encoder.get_feature_names_out(x_cat.columns),
                                     index=x.index)

        # Both parts share x.index, so the column-wise concat lines rows up
        # one-to-one without any re-indexing.
        return pd.concat([x_num_pre, x_cat_encoded], axis=1)

    def train_model(self, model):
        """Fit ``model`` on the transformed training data and return it."""
        model.fit(self.x_train_transformed, self.y_train)
        return model

    def evaluate_model(self, model):
        """Return ``(accuracy, confusion_matrix)`` of ``model`` on the transformed test data."""
        y_test_pred = model.predict(self.x_test_transformed)
        accuracy = accuracy_score(self.y_test, y_test_pred)
        confusion = confusion_matrix(self.y_test, y_test_pred)
        return accuracy, confusion

    def train_and_evaluate_models(self):
        """Run the whole pipeline, print each model's metrics and return them.

        Returns
        -------
        dict mapping model name to an ``(accuracy, confusion_matrix)`` tuple.

        Raises
        ------
        ValueError
            If no rows survive the filtering step.
        """
        self.load_dataset()
        self.handle_missing_values()
        self.filter_dataframe()
        if self.df.empty:
            raise ValueError('No rows left after filtering; relax the age, income, loan amount '
                             'or home ownership filters.')
        self.extract_features_and_target()
        self.split_train_test_data()

        # Fit the transformers on the training split only so that no
        # information from the test split reaches them.
        self.scaler.fit(self.x_train.select_dtypes(include=self.NUMERIC_DTYPES))
        self.encoder.fit(self.x_train.select_dtypes(include=self.CATEGORICAL_DTYPES))

        # The transformed matrices do not depend on the model, so they are
        # computed once instead of once per model.
        self.x_train_transformed = self.preprocess_data(self.x_train)
        self.x_test_transformed = self.preprocess_data(self.x_test)

        results = {}
        for model_name, model in self.get_models().items():
            print(f"Evaluating {model_name}:")
            model = self.train_model(model)
            accuracy, confusion = self.evaluate_model(model)
            print('Accuracy:', accuracy)
            print('Confusion Matrix:', confusion)
            results[model_name] = (accuracy, confusion)
        return results

    def get_models(self):
        """Return a dict of unfitted classifiers keyed by display name.

        The tree-based models receive ``self.random_state`` so that repeated
        runs give the same scores.
        """
        return {
            'KNN': KNeighborsClassifier(),
            'Logistic Regression': LogisticRegression(),
            'SVM': SVC(),
            'Decision Tree': DecisionTreeClassifier(random_state=self.random_state),
            'Random Forest': RandomForestClassifier(random_state=self.random_state)
        }

In [6]:
def main(file_path=None):
    """Build a CreditRiskModel with the notebook's filter settings and run it.

    Parameters
    ----------
    file_path : optional path of the credit-risk CSV. When omitted, the
        notebook's original local path is used, so a bare ``main()`` behaves
        as before.
    """
    if file_path is None:
        # NOTE(review): machine-specific absolute path; pass file_path to run
        # this anywhere else.
        file_path = '/Users/tarunreddy/Downloads/credit_risk_dataset.csv'
    # Thresholds are passed by keyword so each number's meaning is visible.
    credit_model = CreditRiskModel(file_path, min_age=23, max_age=60, min_income=66074,
                                   max_income=6000000, min_loan_amount=5000)
    credit_model.train_and_evaluate_models()

if __name__ == "__main__":
    main()

Evaluating KNN:
Accuracy: 0.9722222222222222
Confusion Matrix: [[140   0]
 [  4   0]]
Evaluating Logistic Regression:
Accuracy: 0.9861111111111112
Confusion Matrix: [[140   0]
 [  2   2]]
Evaluating SVM:
Accuracy: 0.9722222222222222
Confusion Matrix: [[140   0]
 [  4   0]]
Evaluating Decision Tree:
Accuracy: 0.9722222222222222
Confusion Matrix: [[139   1]
 [  3   1]]
Evaluating Random Forest:
Accuracy: 0.9791666666666666
Confusion Matrix: [[140   0]
 [  3   1]]
