## Step 0: Imports

In [1]:
# === Imports ===
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

from sklearn.model_selection import train_test_split

# Make the project root (the parent of the notebook's working directory,
# i.e. .../Scrabble) importable so the `src` package imported below resolves.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
# Add the root, not src. Insert only when the root is not already the first
# entry, so re-running this cell does not pile up duplicate sys.path entries
# while the root still keeps top import priority.
if sys.path[:1] != [project_root]:
    sys.path.insert(0, project_root)

from src.constants import GAMES_FILE_PATH, TURNS_FILE_PATH, TRAIN_FILE_PATH
from src.utils import create_dataset, evaluate_multiple_models, tune_all_models

## Step 1: Create Training+Validation/Test sets

In [2]:
# Build the engineered features into a single frame that holds the
# training+validation rows as well as the test rows.
dataset = create_dataset()

# Test rows are the ones with user_rating == 0; compute that mask once
# and derive both subsets from it.
is_test_row = dataset['user_rating'] == 0
training_examples = dataset[~is_test_row]
testing_examples = dataset[is_test_row]

# Training + validation: feature frame and target vector
y_train_val = training_examples['user_rating']
X_train_val = training_examples.drop(columns=['user_rating'])

# Test: feature frame only
X_test = testing_examples.drop(columns=['user_rating'])

## Step 2: Tune all models and select the best one

In [3]:
# Tune every candidate model on the training+validation data
cross_val_scores_df, tuned_models = tune_all_models(X_train_val, y_train_val)

# Show the cross-validation score table and the tuned estimators
print(cross_val_scores_df)
print(tuned_models)

# Pick the model named in the first row of the score table.
# NOTE(review): this presumes tune_all_models returns the table sorted
# best-first — confirm against src.utils.
best_model_name = cross_val_scores_df.iloc[0]['model']
best_model = tuned_models[best_model_name]

[I 2025-04-29 14:47:26,346] A new study created in memory with name: no-name-2beef31a-d1a7-44fa-8ac0-75518daea67b



Optimizing: Random Forest


[I 2025-04-29 14:48:11,066] Trial 0 finished with value: 135.63121091356007 and parameters: {'n_estimators': 250, 'max_depth': 48, 'min_samples_split': 15}. Best is trial 0 with value: 135.63121091356007.
[I 2025-04-29 14:49:14,264] Trial 1 finished with value: 134.67584752763474 and parameters: {'n_estimators': 340, 'max_depth': 12, 'min_samples_split': 4}. Best is trial 1 with value: 134.67584752763474.
[I 2025-04-29 14:49:55,489] Trial 2 finished with value: 135.88986567769865 and parameters: {'n_estimators': 123, 'max_depth': 44, 'min_samples_split': 13}. Best is trial 1 with value: 134.67584752763474.
[I 2025-04-29 14:50:38,755] Trial 3 finished with value: 140.7234494851781 and parameters: {'n_estimators': 383, 'max_depth': 5, 'min_samples_split': 20}. Best is trial 1 with value: 134.67584752763474.
[I 2025-04-29 14:52:02,548] Trial 4 finished with value: 134.777724966935 and parameters: {'n_estimators': 433, 'max_depth': 14, 'min_samples_split': 5}. Best is trial 1 with value: 1

Best score for Random Forest: 134.6758

Best params: {'n_estimators': 340, 'max_depth': 12, 'min_samples_split': 4}

Optimizing: XGBoost


[I 2025-04-29 14:59:56,977] Trial 0 finished with value: 142.13113403320312 and parameters: {'n_estimators': 250, 'max_depth': 12, 'learning_rate': 0.1205712628744377, 'subsample': 0.7993292420985183}. Best is trial 0 with value: 142.13113403320312.
[I 2025-04-29 14:59:59,996] Trial 1 finished with value: 141.61570739746094 and parameters: {'n_estimators': 162, 'max_depth': 4, 'learning_rate': 0.012184186502221764, 'subsample': 0.9330880728874675}. Best is trial 1 with value: 141.61570739746094.
[I 2025-04-29 15:00:19,458] Trial 2 finished with value: 134.92880249023438 and parameters: {'n_estimators': 341, 'max_depth': 10, 'learning_rate': 0.010725209743171996, 'subsample': 0.9849549260809971}. Best is trial 2 with value: 134.92880249023438.
[I 2025-04-29 15:00:26,660] Trial 3 finished with value: 134.5030975341797 and parameters: {'n_estimators': 433, 'max_depth': 5, 'learning_rate': 0.01855998084649059, 'subsample': 0.5917022549267169}. Best is trial 3 with value: 134.5030975341797.

Best score for XGBoost: 134.3904

Best params: {'n_estimators': 222, 'max_depth': 8, 'learning_rate': 0.04345454109729477, 'subsample': 0.645614570099021}

Optimizing: LightGBM


[I 2025-04-29 15:01:37,678] Trial 0 finished with value: 137.05493571117086 and parameters: {'n_estimators': 250, 'max_depth': 12, 'learning_rate': 0.1205712628744377, 'num_leaves': 98}. Best is trial 0 with value: 137.05493571117086.
[I 2025-04-29 15:01:42,809] Trial 1 finished with value: 141.64432684343515 and parameters: {'n_estimators': 162, 'max_depth': 4, 'learning_rate': 0.012184186502221764, 'num_leaves': 133}. Best is trial 0 with value: 137.05493571117086.
[I 2025-04-29 15:02:36,544] Trial 2 finished with value: 134.426827202161 and parameters: {'n_estimators': 341, 'max_depth': 10, 'learning_rate': 0.010725209743171996, 'num_leaves': 147}. Best is trial 2 with value: 134.426827202161.
[I 2025-04-29 15:02:52,243] Trial 3 finished with value: 134.74236740646762 and parameters: {'n_estimators': 433, 'max_depth': 5, 'learning_rate': 0.01855998084649059, 'num_leaves': 44}. Best is trial 2 with value: 134.426827202161.
[I 2025-04-29 15:03:06,728] Trial 4 finished with value: 134.

Best score for LightGBM: 134.1542

Best params: {'n_estimators': 282, 'max_depth': 10, 'learning_rate': 0.019721610970574007, 'num_leaves': 87}
{'Random Forest': Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num', StandardScaler(),
                                                  ['avg_word_length',
                                                   'bingo_count',
                                                   'hard_letter_plays',
                                                   'negative_turns_count',
                                                   'pass_count',
                                                   'exchange_count',
                                                   'user_score',
                                                   'avg_extra_points_per_turn',
                                                   'bot_score', 'bot_rating',
                              