In [1]:
# %pip install catboost  # uncomment on a fresh environment; %pip installs into this kernel's environment

In [2]:
# Basic library
import pandas as pd
import numpy as np

# Model library
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Read the three training variants and the test split, in the same order as
# the names on the left-hand side.
df_down, df_up, df_original, df_test = map(pd.read_csv, (
    'clean data/train_transformed_downsampled.csv',
    'clean data/train_transformed_upsampled.csv',
    'clean data/train_transformed_original.csv',
    'clean data/test_transformed.csv',
))

# One line per frame: its (rows, columns) shape.
print(
    f"Downsampled: {df_down.shape}",
    f"Upsampled: {df_up.shape}",
    f"Original: {df_original.shape}",
    f"Test: {df_test.shape}",
    sep="\n",
)

Downsampled: (42903, 35)
Upsampled: (127725, 35)
Original: (80000, 35)
Test: (20000, 35)


In [4]:
def prepare_data(df, target='Credit_Score'):
    """Split a frame into a feature matrix and a label vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns together with the label column.
        It is not modified.
    target : str, default 'Credit_Score'
        Name of the label column to separate from the features.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``df`` without the ``target`` column and
        ``y`` is ``df[target]``.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``df``.
    """
    # Fail early with a message that names the missing column and what is
    # actually available, instead of the generic "not found in axis" error.
    if target not in df.columns:
        raise KeyError(
            f"target column {target!r} not found; available columns: {list(df.columns)}"
        )
    X = df.drop(columns=target)
    y = df[target]
    return X, y

# Features and labels of the test split; every model below is scored on these.
X_test, y_test = prepare_data(df=df_test)

In [5]:
# Training variants to compare, keyed by the label used in the printed reports.
datasets = dict(original=df_original, downsampled=df_down, upsampled=df_up)

# Per-dataset Random Forest metrics and fitted model, keyed by dataset label.
rf_results = dict()

# Dataset Testing

In [6]:
# Fit an identically configured Random Forest on each training variant and
# score it on the training data and on the shared test split.
# NOTE(review): the test-split macro-F1 stored here is what a later cell uses
# to choose the training variant, so the test split also acts as a selection
# set -- confirm that this is intended.
for dataset_name, df in datasets.items():
    print(f"\n--- Random Forest on {dataset_name.upper()} dataset ---")

    X_train, y_train = prepare_data(df)

    # fit() returns the fitted estimator, so `rf` is the trained forest.
    rf = RandomForestClassifier(
        n_estimators=100,
        random_state=42,
        n_jobs=-1,
    ).fit(X_train, y_train)

    y_train_pred, y_test_pred = rf.predict(X_train), rf.predict(X_test)

    # Accuracy and macro-averaged F1 on both splits.
    train_acc, test_acc = (
        accuracy_score(y_train, y_train_pred),
        accuracy_score(y_test, y_test_pred),
    )
    train_f1, test_f1 = (
        f1_score(y_train, y_train_pred, average='macro'),
        f1_score(y_test, y_test_pred, average='macro'),
    )

    # Keep the metrics together with the fitted model under the dataset label.
    rf_results[dataset_name] = dict(
        train_acc=train_acc,
        test_acc=test_acc,
        train_f1=train_f1,
        test_f1=test_f1,
        model=rf,
    )

    print(
        f"Train Accuracy: {train_acc:.4f}, Train F1-macro: {train_f1:.4f}",
        f"Test Accuracy: {test_acc:.4f}, Test F1-macro: {test_f1:.4f}",
        sep="\n",
    )

    # Per-class precision / recall / F1 tables, four decimals.
    print(
        "\nTrain Classification Report:",
        classification_report(y_train, y_train_pred, digits=4),
        sep="\n",
    )
    print(
        "Test Classification Report:",
        classification_report(y_test, y_test_pred, digits=4),
        sep="\n",
    )


--- Random Forest on ORIGINAL dataset ---
Train Accuracy: 1.0000, Train F1-macro: 1.0000
Test Accuracy: 0.8159, Test F1-macro: 0.8078

Train Classification Report:
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000     23124
           1     1.0000    1.0000    1.0000     42575
           2     1.0000    1.0000    1.0000     14301

    accuracy                         1.0000     80000
   macro avg     1.0000    1.0000    1.0000     80000
weighted avg     1.0000    1.0000    1.0000     80000

Test Classification Report:
              precision    recall  f1-score   support

           0     0.8046    0.8425    0.8231      5874
           1     0.8318    0.8188    0.8252     10599
           2     0.7878    0.7630    0.7752      3527

    accuracy                         0.8159     20000
   macro avg     0.8080    0.8081    0.8078     20000
weighted avg     0.8160    0.8159    0.8158     20000


--- Random Forest on DOWNSAMPLED dataset ---

In [7]:
# Label of the training variant whose Random Forest has the highest test-split
# macro-F1 (on a tie, the entry inserted first wins).
best_dataset = max(rf_results.items(), key=lambda entry: entry[1]['test_f1'])[0]
print(f"Best Dataset: {best_dataset.upper()} (Test F1-macro: {rf_results[best_dataset]['test_f1']:.4f})")

Best Dataset: UPSAMPLED (Test F1-macro: 0.8153)


# Baseline Model

In [8]:
# Features and labels of the selected training variant, used for every baseline model.
X_train_best, y_train_best = prepare_data(df=datasets[best_dataset])

In [9]:
# Baseline candidates keyed by display name; every estimator is seeded with 42.
models = dict([
    (
        'Random Forest',
        RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    ),
    (
        'XGBoost',
        XGBClassifier(random_state=42, eval_metric='mlogloss', verbosity=0),
    ),
    (
        'CatBoost',
        CatBoostClassifier(random_state=42, verbose=False),
    ),
    (
        'Logistic Regression',
        LogisticRegression(random_state=42, max_iter=1000),
    ),
    (
        'LightGBM',
        LGBMClassifier(random_state=42),
    ),
])

In [10]:
# Per-model metrics and fitted estimator, keyed by display name.
model_results = dict()

# Train every baseline candidate on the selected training variant and score it
# on that training data and on the shared test split.
for model_name, model in models.items():
    print(f"\n--- {model_name} ---")

    model.fit(X_train_best, y_train_best)

    y_train_pred, y_test_pred = model.predict(X_train_best), model.predict(X_test)

    # Accuracy and macro-averaged F1 on both splits.
    train_acc, test_acc = (
        accuracy_score(y_train_best, y_train_pred),
        accuracy_score(y_test, y_test_pred),
    )
    train_f1, test_f1 = (
        f1_score(y_train_best, y_train_pred, average='macro'),
        f1_score(y_test, y_test_pred, average='macro'),
    )

    # Keep the metrics together with the fitted estimator under its display name.
    model_results[model_name] = dict(
        train_acc=train_acc,
        test_acc=test_acc,
        train_f1=train_f1,
        test_f1=test_f1,
        model=model,
    )

    print(
        f"Train Accuracy: {train_acc:.4f}, Train F1-macro: {train_f1:.4f}",
        f"Test Accuracy: {test_acc:.4f}, Test F1-macro: {test_f1:.4f}",
        sep="\n",
    )

    # Per-class precision / recall / F1 tables, four decimals.
    print(
        "\nTrain Classification Report:",
        classification_report(y_train_best, y_train_pred, digits=4),
        sep="\n",
    )
    print(
        "Test Classification Report:",
        classification_report(y_test, y_test_pred, digits=4),
        sep="\n",
    )


--- Random Forest ---
Train Accuracy: 1.0000, Train F1-macro: 1.0000
Test Accuracy: 0.8206, Test F1-macro: 0.8153

Train Classification Report:
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000     42575
           1     1.0000    1.0000    1.0000     42575
           2     1.0000    1.0000    1.0000     42575

    accuracy                         1.0000    127725
   macro avg     1.0000    1.0000    1.0000    127725
weighted avg     1.0000    1.0000    1.0000    127725

Test Classification Report:
              precision    recall  f1-score   support

           0     0.7957    0.8655    0.8292      5874
           1     0.8533    0.7999    0.8258     10599
           2     0.7750    0.8078    0.7911      3527

    accuracy                         0.8206     20000
   macro avg     0.8080    0.8244    0.8153     20000
weighted avg     0.8226    0.8206    0.8206     20000


--- XGBoost ---
Train Accuracy: 0.8624, Train F1-macro: 0.8612
T

In [11]:
# Baseline with the highest test-split macro-F1 (on a tie, the entry inserted
# first wins), plus its two test-split scores for the summary line.
# NOTE(review): the winner is chosen on the same test split whose scores are
# reported -- confirm that this is intended.
best_model_name = max(model_results.items(), key=lambda entry: entry[1]['test_f1'])[0]
best_model_f1, best_model_acc = (
    model_results[best_model_name][metric] for metric in ('test_f1', 'test_acc')
)

print(f"\nBest Baseline Model: {best_model_name} with Test F1-macro: {best_model_f1:.4f}, Test Accuracy: {best_model_acc:.4f}")


Best Baseline Model: Random Forest with Test F1-macro: 0.8153, Test Accuracy: 0.8206
