# Step 0: Imports

In [1]:
# === Imports ===
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

from sklearn.model_selection import train_test_split

# Resolve the project root (the parent of the current working directory,
# i.e. .../Scrabble when the notebook is launched from its own folder) so that
# the `src` package can be imported.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
# Only register the root once: re-running this cell must not keep stacking
# duplicate entries at the front of sys.path.
if project_root not in sys.path:
    sys.path.insert(0, project_root)

from src.utils import create_dataset, find_best_model

# Step 1: Create Training+Validation/Testing sets

In [4]:
# Build the engineered-feature frame; it holds the labelled rows
# (training + validation) together with the unlabelled test rows.
dataset = create_dataset()

# Labelled rows are those whose user_rating is non-zero
# (a user_rating of 0 marks a test row).
training_examples = dataset.loc[dataset['user_rating'].ne(0)]

# Target vector and feature frame for training + validation
y_train_val = training_examples['user_rating']
X_train_val = training_examples.drop(columns=['user_rating'])

# Test rows: keep the features only, the placeholder rating column is dropped
testing_examples = dataset.loc[dataset['user_rating'].eq(0)]
X_test = testing_examples.drop(columns=['user_rating'])

# Step 2: Find the best model after tuning hyperparameters

In [5]:
# Run the hyperparameter search over the candidate models and keep the winner.
# find_best_model returns the selected model together with its training and
# validation RMSE (named avg_* — presumably averaged over the CV folds; confirm
# in src.utils).
best_model, avg_train_rmse, avg_val_rmse = find_best_model(X_train_val, y_train_val)

print(best_model)

# Report each metric exactly once.
print(f"Train RMSE: {avg_train_rmse:.3f}")
print(f"Validation RMSE: {avg_val_rmse:.3f}")

# Relative generalization gap: by how many percent the validation RMSE
# exceeds the training RMSE.
gap = ((avg_val_rmse / avg_train_rmse) - 1) * 100
print(f"Generalization Gap: {gap:.3f}%")

[I 2025-05-03 11:36:09,838] A new study created in memory with name: no-name-64931b5a-d355-4f9e-8eae-ae82c1761973



Optimizing: Random Forest


[I 2025-05-03 11:37:13,982] Trial 0 finished with value: 135.63121091356007 and parameters: {'n_estimators': 250, 'max_depth': 48, 'min_samples_split': 15}. Best is trial 0 with value: 135.63121091356007.
[I 2025-05-03 11:38:09,277] Trial 1 finished with value: 134.67584752763474 and parameters: {'n_estimators': 340, 'max_depth': 12, 'min_samples_split': 4}. Best is trial 1 with value: 134.67584752763474.
[I 2025-05-03 11:38:40,635] Trial 2 finished with value: 135.88986567769865 and parameters: {'n_estimators': 123, 'max_depth': 44, 'min_samples_split': 13}. Best is trial 1 with value: 134.67584752763474.
[I 2025-05-03 11:39:07,510] Trial 3 finished with value: 140.7234494851781 and parameters: {'n_estimators': 383, 'max_depth': 5, 'min_samples_split': 20}. Best is trial 1 with value: 134.67584752763474.
[I 2025-05-03 11:40:26,166] Trial 4 finished with value: 134.777724966935 and parameters: {'n_estimators': 433, 'max_depth': 14, 'min_samples_split': 5}. Best is trial 1 with value: 1

Best score for Random Forest: 134.6758

Best params: {'n_estimators': 340, 'max_depth': 12, 'min_samples_split': 4}

Optimizing: XGBoost


[I 2025-05-03 11:45:51,113] Trial 0 finished with value: 142.13113403320312 and parameters: {'n_estimators': 250, 'max_depth': 12, 'learning_rate': 0.1205712628744377, 'subsample': 0.7993292420985183}. Best is trial 0 with value: 142.13113403320312.
[I 2025-05-03 11:45:54,527] Trial 1 finished with value: 141.61570739746094 and parameters: {'n_estimators': 162, 'max_depth': 4, 'learning_rate': 0.012184186502221764, 'subsample': 0.9330880728874675}. Best is trial 1 with value: 141.61570739746094.
[I 2025-05-03 11:46:16,004] Trial 2 finished with value: 134.92880249023438 and parameters: {'n_estimators': 341, 'max_depth': 10, 'learning_rate': 0.010725209743171996, 'subsample': 0.9849549260809971}. Best is trial 2 with value: 134.92880249023438.
[I 2025-05-03 11:46:22,723] Trial 3 finished with value: 134.5030975341797 and parameters: {'n_estimators': 433, 'max_depth': 5, 'learning_rate': 0.01855998084649059, 'subsample': 0.5917022549267169}. Best is trial 3 with value: 134.5030975341797.

Best score for XGBoost: 134.3904

Best params: {'n_estimators': 222, 'max_depth': 8, 'learning_rate': 0.04345454109729477, 'subsample': 0.645614570099021}

Optimizing: LightGBM


[I 2025-05-03 11:47:27,283] Trial 0 finished with value: 137.05493571117086 and parameters: {'n_estimators': 250, 'max_depth': 12, 'learning_rate': 0.1205712628744377, 'num_leaves': 98}. Best is trial 0 with value: 137.05493571117086.
[I 2025-05-03 11:47:30,322] Trial 1 finished with value: 141.64432684343515 and parameters: {'n_estimators': 162, 'max_depth': 4, 'learning_rate': 0.012184186502221764, 'num_leaves': 133}. Best is trial 0 with value: 137.05493571117086.
[I 2025-05-03 11:48:01,975] Trial 2 finished with value: 134.426827202161 and parameters: {'n_estimators': 341, 'max_depth': 10, 'learning_rate': 0.010725209743171996, 'num_leaves': 147}. Best is trial 2 with value: 134.426827202161.
[I 2025-05-03 11:48:09,438] Trial 3 finished with value: 134.74236740646762 and parameters: {'n_estimators': 433, 'max_depth': 5, 'learning_rate': 0.01855998084649059, 'num_leaves': 44}. Best is trial 2 with value: 134.426827202161.
[I 2025-05-03 11:48:16,888] Trial 4 finished with value: 134.

Best score for LightGBM: 134.1542

Best params: {'n_estimators': 282, 'max_depth': 10, 'learning_rate': 0.019721610970574007, 'num_leaves': 87}
           model  Mean_CV_RMSE
0       LightGBM    134.154189
1        XGBoost    134.390436
2  Random Forest    134.675848
{'Random Forest': Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num', StandardScaler(),
                                                  ['avg_word_length',
                                                   'bingo_count',
                                                   'hard_letter_plays',
                                                   'negative_turns_count',
                                                   'pass_count',
                                                   'exchange_count',
                                                   'user_score',
                                                   'avg_extra_po