In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, f1_score, make_scorer
import optuna
from optuna.samplers import TPESampler

import warnings
warnings.filterwarnings('ignore')

In [2]:
df_up = pd.read_csv('/kaggle/input/credit-score-data/clean data/train_transformed_upsampled.csv')
df_test = pd.read_csv('/kaggle/input/credit-score-data/clean data/test_transformed.csv')

print(f"Upsampled training: {df_up.shape}")
print(f"Test: {df_test.shape}")

Upsampled training: (127725, 35)
Test: (20000, 35)


In [3]:
def prepare_data(df):
    X = df.drop('Credit_Score', axis=1)
    y = df['Credit_Score']
    return X, y

X_train, y_train = prepare_data(df_up)
X_test, y_test = prepare_data(df_test)

# Baseline

In [4]:
rf_baseline = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_baseline.fit(X_train, y_train)

y_train_pred_baseline = rf_baseline.predict(X_train)
y_test_pred_baseline = rf_baseline.predict(X_test)

baseline_train_f1 = f1_score(y_train, y_train_pred_baseline, average='macro')
baseline_test_f1 = f1_score(y_test, y_test_pred_baseline, average='macro')
baseline_train_acc = accuracy_score(y_train, y_train_pred_baseline)
baseline_test_acc = accuracy_score(y_test, y_test_pred_baseline)

print(f"Baseline Train F1-macro: {baseline_train_f1:.4f}, Train Accuracy: {baseline_train_acc:.4f}")
print(f"Baseline Test F1-macro: {baseline_test_f1:.4f}, Test Accuracy: {baseline_test_acc:.4f}")

Baseline Train F1-macro: 1.0000, Train Accuracy: 1.0000
Baseline Test F1-macro: 0.8153, Test Accuracy: 0.8206


# Fine-Tuning

## 1) Grid-Search

In [5]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [4, 6, 8],
    'max_features': ['sqrt', 'log2', None]
}

In [6]:
f1_scorer = make_scorer(f1_score, average='macro')

grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42, n_jobs=-1),
    param_grid,
    cv=3,
    scoring=f1_scorer,
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


In [7]:
print(f"\nBest GridSearch F1-macro (CV): {grid_search.best_score_:.4f}")
print(f"Best GridSearch parameters:")
for param, value in grid_search.best_params_.items():
    print(f"  {param}: {value}")

rf_grid = grid_search.best_estimator_
y_train_pred_grid = rf_grid.predict(X_train)
y_test_pred_grid = rf_grid.predict(X_test)


Best GridSearch F1-macro (CV): 0.7507
Best GridSearch parameters:
  max_depth: 8
  max_features: None
  n_estimators: 300


In [8]:
grid_train_f1 = f1_score(y_train, y_train_pred_grid, average='macro')
grid_test_f1 = f1_score(y_test, y_test_pred_grid, average='macro')
grid_train_acc = accuracy_score(y_train, y_train_pred_grid)
grid_test_acc = accuracy_score(y_test, y_test_pred_grid)

print(f"\nGridSearch Train F1-macro: {grid_train_f1:.4f}, Train Accuracy: {grid_train_acc:.4f}")
print(f"GridSearch Test F1-macro: {grid_test_f1:.4f}, Test Accuracy: {grid_test_acc:.4f}")

print("\nGridSearch Train Classification Report:")
print(classification_report(y_train, y_train_pred_grid, digits=4))

print("GridSearch Test Classification Report:")
print(classification_report(y_test, y_test_pred_grid, digits=4))


GridSearch Train F1-macro: 0.7575, Train Accuracy: 0.7628
GridSearch Test F1-macro: 0.6884, Test Accuracy: 0.6926

GridSearch Train Classification Report:
              precision    recall  f1-score   support

           0     0.7612    0.8284    0.7934     42575
           1     0.7755    0.5898    0.6700     42575
           2     0.7559    0.8702    0.8090     42575

    accuracy                         0.7628    127725
   macro avg     0.7642    0.7628    0.7575    127725
weighted avg     0.7642    0.7628    0.7575    127725

GridSearch Test Classification Report:
              precision    recall  f1-score   support

           0     0.6437    0.8107    0.7176      5874
           1     0.8551    0.5823    0.6928     10599
           2     0.5420    0.8273    0.6549      3527

    accuracy                         0.6926     20000
   macro avg     0.6802    0.7401    0.6884     20000
weighted avg     0.7378    0.6926    0.6934     20000



## 2) Optuna

In [11]:
best_f1_score = 0
best_params = {}

def objective(trial):
    global best_f1_score, best_params
    
    n_estimators = trial.suggest_int('n_estimators', 50, 500, step=50)
    max_depth = trial.suggest_int('max_depth', 3, 15, step=3)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 5)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
    
    rf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42,
        n_jobs=-1
    )

    rf.fit(X_train, y_train)
    
    y_val_pred = rf.predict(X_test)
    f1 = f1_score(y_test, y_val_pred, average='macro')
    
    return f1

In [12]:
study = optuna.create_study(
    direction='maximize',
    sampler=TPESampler(seed=42)
)

study.optimize(objective, n_trials=30, show_progress_bar=True)

[I 2025-08-22 08:21:25,082] A new study created in memory with name: no-name-8191267d-8994-4ca9-b5cc-113f19c19540


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2025-08-22 08:21:53,058] Trial 0 finished with value: 0.7256832241732054 and parameters: {'n_estimators': 200, 'max_depth': 15, 'min_samples_split': 8, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.7256832241732054.
[I 2025-08-22 08:22:44,261] Trial 1 finished with value: 0.6934006328910756 and parameters: {'n_estimators': 450, 'max_depth': 12, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.7256832241732054.
[I 2025-08-22 08:23:03,731] Trial 2 finished with value: 0.6620218064858948 and parameters: {'n_estimators': 100, 'max_depth': 3, 'min_samples_split': 4, 'min_samples_leaf': 3, 'max_features': None}. Best is trial 0 with value: 0.7256832241732054.
[I 2025-08-22 08:23:10,503] Trial 3 finished with value: 0.6660884741990993 and parameters: {'n_estimators': 100, 'max_depth': 6, 'min_samples_split': 5, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.7256832241732054.


In [13]:
print(f"\nBest Optuna F1-macro: {study.best_value:.4f}")
print(f"Best Optuna parameters:")
for param, value in study.best_params.items():
    print(f"  {param}: {value}")


Best Optuna F1-macro: 0.7697
Best Optuna parameters:
  n_estimators: 100
  max_depth: 15
  min_samples_split: 7
  min_samples_leaf: 3
  max_features: None


In [15]:
rf_optuna = RandomForestClassifier(
    **study.best_params,
    random_state=42,
    n_jobs=-1
)

rf_optuna.fit(X_train, y_train)

y_train_pred_optuna = rf_optuna.predict(X_train)
y_test_pred_optuna = rf_optuna.predict(X_test)

optuna_train_f1 = f1_score(y_train, y_train_pred_optuna, average='macro')
optuna_test_f1 = f1_score(y_test, y_test_pred_optuna, average='macro')
optuna_train_acc = accuracy_score(y_train, y_train_pred_optuna)
optuna_test_acc = accuracy_score(y_test, y_test_pred_optuna)

print(f"\nOptuna Train F1-macro: {optuna_train_f1:.4f}, Train Accuracy: {optuna_train_acc:.4f}")
print(f"Optuna Test F1-macro: {optuna_test_f1:.4f}, Test Accuracy: {optuna_test_acc:.4f}")

print("\nOptuna Train Classification Report:")
print(classification_report(y_train, y_train_pred_optuna, digits=4))

print("Optuna Test Classification Report:")
print(classification_report(y_test, y_test_pred_optuna, digits=4))


Optuna Train F1-macro: 0.8581, Train Accuracy: 0.8595
Optuna Test F1-macro: 0.7697, Test Accuracy: 0.7774

Optuna Train Classification Report:
              precision    recall  f1-score   support

           0     0.8702    0.8993    0.8845     42575
           1     0.8515    0.7630    0.8048     42575
           2     0.8559    0.9163    0.8850     42575

    accuracy                         0.8595    127725
   macro avg     0.8592    0.8595    0.8581    127725
weighted avg     0.8592    0.8595    0.8581    127725

Optuna Test Classification Report:
              precision    recall  f1-score   support

           0     0.7547    0.8408    0.7955      5874
           1     0.8613    0.7242    0.7868     10599
           2     0.6455    0.8316    0.7268      3527

    accuracy                         0.7774     20000
   macro avg     0.7538    0.7989    0.7697     20000
weighted avg     0.7919    0.7774    0.7788     20000



# Model Saving

In [16]:
import joblib

model_filename = 'rf_optuna_model.joblib'
joblib.dump(rf_optuna, model_filename)

['rf_optuna_model.joblib']

Save model RF optuna because of better result in both supressing overfitting & F1-Macro test