In [19]:
import pandas as pd  # Import pandas for data manipulation and analysis
import numpy as np  # Import numpy for numerical operations
from sklearn.model_selection import train_test_split  # Import train_test_split for splitting data into training and testing sets
from sklearn.preprocessing import StandardScaler, LabelEncoder  # Import StandardScaler for feature scaling and LabelEncoder for encoding target labels
from sklearn.linear_model import LogisticRegression  # Import LogisticRegression for logistic regression model
from sklearn.ensemble import RandomForestClassifier, StackingClassifier  # Import RandomForestClassifier for random forest model and StackingClassifier for model stacking
from sklearn.svm import SVC  # Import SVC for support vector classifier
from xgboost import XGBClassifier  # Import XGBClassifier for XGBoost model
from lightgbm import LGBMClassifier  # Import LGBMClassifier for LightGBM model
from catboost import CatBoostClassifier  # Import CatBoostClassifier for CatBoost model
from sklearn import metrics  # Import metrics from sklearn for model evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score  # Import specific metrics for model evaluation
import warnings  # Import warnings to manage warnings
import category_encoders as ce  # Import category_encoders for encoding categorical features
import optuna
from sklearn.preprocessing import LabelEncoder
# Ignore warnings
warnings.filterwarnings('ignore')  # Suppress warnings for cleaner output

In [20]:
path = r'C:\Users\User\Desktop\Rashad\DATA\Credit Score Classification Dataset.csv'

# Load dataset from Excel file
df = pd.read_csv(path)

# Set option to display all columns
pd.set_option('display.max_columns', None)

#Display the loaded dataset
df

Unnamed: 0,Age,Gender,Income,Education,Marital Status,Number of Children,Home Ownership,Credit Score
0,25,Female,50000,Bachelor's Degree,Single,0,Rented,High
1,30,Male,100000,Master's Degree,Married,2,Owned,High
2,35,Female,75000,Doctorate,Married,1,Owned,High
3,40,Male,125000,High School Diploma,Single,0,Owned,High
4,45,Female,100000,Bachelor's Degree,Married,3,Owned,High
...,...,...,...,...,...,...,...,...
159,29,Female,27500,High School Diploma,Single,0,Rented,Low
160,34,Male,47500,Associate's Degree,Single,0,Rented,Average
161,39,Female,62500,Bachelor's Degree,Married,2,Owned,High
162,44,Male,87500,Master's Degree,Single,0,Owned,High


In [21]:
# Counting the occurrences of each value in the 'Credit Score' column
df['Credit Score'].value_counts()

Credit Score
High       113
Average     36
Low         15
Name: count, dtype: int64

In [22]:
# Instantiating a LabelEncoder object
label_encoder = LabelEncoder()

In [23]:
# Converting 'Credit Score' values to numeric values
# Assuming 'High' is considered as 1 and other values as 0
df['Credit Score'] = df['Credit Score'].apply(lambda x: 1 if x == 'High' else 0)

In [24]:
# Counting the occurrences of each value in the 'Credit Score' column after transformation
df['Credit Score'].value_counts()

Credit Score
1    113
0     51
Name: count, dtype: int64

In [25]:
# Printing the columns of object data type in the DataFrame
print(df.select_dtypes('object').columns)

Index(['Gender', 'Education', 'Marital Status', 'Home Ownership'], dtype='object')


In [26]:
models = ["logistic_regression", "random_forest", "svc", "xgboost", "lightgbm", "catboost", "catboost_custom"]

# Create copies of the DataFrame for different preprocessing approaches
df_copy = df.copy()
df_boost = df.copy()
df_cat = df.copy()

# Iterate through each model in the list
for model in models:
    if model == "logistic_regression" or model == "svc":
        # Preprocessing for logistic regression and SVC models
        
        # Fill missing values with mean for numerical columns and mode for categorical columns
        for col in df_copy.columns:
            if df_copy[col].dtype in ['int64', 'float64']:
                df_copy[col] = df_copy[col].fillna(df_copy[col].mean())
            elif df_copy[col].dtype == 'object':
                df_copy[col] = df_copy[col].fillna(df_copy[col].mode().iloc[0])
        
        # Perform Weight of Evidence Encoding on all columns except the target column 'default'
        all_columns = df_copy.columns.tolist()
        all_columns.remove('Credit Score')
        encoder = ce.WOEEncoder(cols=all_columns)
        df_copy[all_columns] = encoder.fit_transform(df_copy[all_columns], df_copy['Credit Score'])
        
        # Remove duplicate rows and 'ID' column if exists
        df_copy.drop_duplicates(inplace=True)
        if 'ID' in df_copy.columns:
            df_copy.drop(columns=['ID'], inplace=True)
                
    elif model in ["random_forest", "xgboost", "lightgbm", "catboost"]:
        # Preprocessing for tree-based models
        
        # Perform Weight of Evidence Encoding on all columns except the target column 'default'
        all_columns = df_boost.columns.tolist()
        all_columns.remove('Credit Score')
        encoder = ce.WOEEncoder(cols=all_columns)
        df_boost[all_columns] = encoder.fit_transform(df_boost[all_columns], df_boost['Credit Score'])
             
    elif model ==  "catboost_custom":
        # Preprocessing for CatBoost with custom handling of missing values
        
        # Fill missing values in specified categorical columns with 'Missing Value'
        numerical_cols = df_cat.select_dtypes(include=['int64', 'float64']).columns.tolist()
        numerical_cols.remove('Credit Score')
        columns_to_fill = cat_features=['Gender', 'Education', 'Marital Status', 'Home Ownership']
        df_cat[columns_to_fill] = df_cat[columns_to_fill].fillna('Missing Value')
        
        # Perform Weight of Evidence Encoding on numerical columns
        encoder = ce.WOEEncoder(cols=numerical_cols)
        df_cat[numerical_cols] = encoder.fit_transform(df_cat[numerical_cols], df_cat['Credit Score'])

# Split data into features (X) and target (y) for each preprocessing approach
x_log = df_copy.drop(columns=['Credit Score'])
y_log = df_copy['Credit Score']
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(x_log, y_log, test_size=0.3, random_state=42)

x_boost = df_boost.drop(columns=['Credit Score'])
y_boost = df_boost['Credit Score']
X_train_boost, X_test_boost, y_train_boost, y_test_boost = train_test_split(x_boost, y_boost, test_size=0.3, random_state=42)

x_cat = df_cat.drop(columns=['Credit Score'])
y_cat = df_cat['Credit Score']
X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(x_cat, y_cat, test_size=0.3, random_state=42)

In [28]:
models = []  # Initialize an empty list to store model definitions

# Define default models for each algorithm
xgb_model_def = XGBClassifier()
lgb_model_def = LGBMClassifier()
catboost_model_def = CatBoostClassifier()
catboost_model_custom = CatBoostClassifier(cat_features=['Gender', 'Education', 'Marital Status', 'Home Ownership'])
lg = LogisticRegression()
rf = RandomForestClassifier()
svc_model_def = SVC(probability=True)  # Add SVC model definition

# Define models for stacking
stacking_models = [('XGBoost', xgb_model_def),
                   ('LightGBM', lgb_model_def),
                   ('CatBoost', catboost_model_def),
                   ('CatBoost_Custom', catboost_model_custom),
                   ('LogisticRegression', lg),
                   ('RandomForest', rf),
                   ('SVC', svc_model_def)]  # Add SVC to stacking_models

# Extend models list with default models
models.extend([
    ('XGBoost', xgb_model_def),
    ('LightGBM', lgb_model_def),
    ('CatBoost', catboost_model_def),
    ('CatBoost_Custom', catboost_model_custom),
    ('LogisticRegression', lg),
    ('RandomForest', rf),
    ('SVC', svc_model_def)  # Add SVC to models list
])

In [29]:
def train_and_evaluate_model(model_name, model, X_train, y_train, X_test, y_test):
    """
    Train and evaluate the given model on the training and testing data.

    Parameters:
    model_name (str): Name of the model for display purposes.
    model : Machine learning model object.
    X_train : Features of the training data.
    y_train : Target labels of the training data.
    X_test : Features of the testing data.
    y_test : Target labels of the testing data.

    Returns:
    float: Gini coefficient calculated from the model's predictions.
    """

    # Fit the model on the training data
    model.fit(X_train, y_train)

    # Predict labels and probabilities on the testing data
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    # Calculate ROC AUC and Gini coefficient
    roc_prob = roc_auc_score(y_test, y_prob)
    gini_prob = roc_prob * 2 - 1

    # Calculate confusion matrix and classification report
    confusion_matrix_result = confusion_matrix(y_test, y_pred)
    classification_report_result = classification_report(y_test, y_pred)

    # Print model performance metrics
    print(f'Model Performance for {model_name}')
    print('Gini prob is', gini_prob * 100)
    print(classification_report_result)
    print(confusion_matrix_result)

    return gini_prob

In [30]:
gini_df = pd.DataFrame(columns=['Model', 'Gini_prob'])

# Iterate through each model in the list of models
for model_name, model in models:
    # Train and evaluate the model, and calculate the Gini coefficient
    if model_name == 'CatBoost_Custom':
        gini_prob = train_and_evaluate_model(model_name, model, X_train_cat, y_train_cat, X_test_cat, y_test_cat)
    elif model_name in ['LogisticRegression', 'SVC']:
        gini_prob = train_and_evaluate_model(model_name, model, X_train_log, y_train_log, X_test_log, y_test_log)
    else:
        gini_prob = train_and_evaluate_model(model_name, model, X_train_boost, y_train_boost, X_test_boost, y_test_boost)
        
    # Add model name and Gini coefficient to the DataFrame
    if gini_prob is not None:
        gini_df = pd.concat([gini_df, pd.DataFrame({'Model': [model_name], 'Gini_prob': [gini_prob]})], ignore_index=True)

# Sort the DataFrame by Gini coefficient in descending order
gini_df_sorted = gini_df.sort_values(by='Gini_prob', ascending=False)

Model Performance for XGBoost
Gini prob is 97.56944444444446
              precision    recall  f1-score   support

           0       1.00      0.83      0.91        18
           1       0.91      1.00      0.96        32

    accuracy                           0.94        50
   macro avg       0.96      0.92      0.93        50
weighted avg       0.95      0.94      0.94        50

[[15  3]
 [ 0 32]]
[LightGBM] [Info] Number of positive: 81, number of negative: 33
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001298 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 42
[LightGBM] [Info] Number of data points in the train set: 114, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.710526 -> initscore=0.897942
[LightGBM] [Info] Start training from score 0.897942
Model Performance for LightGBM
Gini prob is 96.5

In [31]:
gini_df_sorted

Unnamed: 0,Model,Gini_prob
2,CatBoost,0.989583
3,CatBoost_Custom,0.989583
5,RandomForest,0.984375
0,XGBoost,0.975694
1,LightGBM,0.965278
4,LogisticRegression,0.955357
6,SVC,0.9375
