In [None]:
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
import xgboost as xgb
import sys
import os
# `scripts` is imported from the directory one level above the current working
# directory, so that directory must be on the import path before the import runs.
# NOTE(review): assumes the kernel's working directory sits directly below the
# project root - confirm if the notebook is launched from elsewhere.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_root not in sys.path:
    # Guarded so that re-running the cell does not append duplicate entries.
    sys.path.append(project_root)
from scripts.datasets import create_folds
from joblib import dump

Add every model to be evaluated to the `models` list below, as an `(estimator, name)` pair.

In [10]:
# Candidate regressors, each paired with the label used when reporting results.
# All candidates share the same ensemble size and seed.
shared_params = {"n_estimators": 100, "random_state": 42}
models = [
    (RandomForestRegressor(n_jobs=-1, **shared_params), "Forest"),
    (xgb.XGBRegressor(n_jobs=-1, **shared_params), "XGBoost"),
    (AdaBoostRegressor(**shared_params), "AdaBoost"),
]

In [11]:
# Model selection by grouped cross-validation.
#
# For every candidate in `models`, each fold is scored as a hit when the player
# with the highest predicted target is also the player with the highest actual
# target in that fold's held-out rows. A candidate's accuracy is the fraction of
# folds that are hits. The candidate with the highest accuracy is kept in
# `best_model` (ties keep the earlier candidate) and its score in `best_accuracy`.
X, y, groups, group_kfold = create_folds()

# Identifier columns stay in X so the top-ranked player can be named, but they
# are never passed to the estimators. Dropping them once here replaces the two
# per-fold drops, which were loop-invariant.
id_columns = ["Year", "Player", "Team"]
features = X.drop(columns=id_columns)

best_model = None
best_accuracy = 0
for model, name in models:
    hits = 0
    total = 0
    for train_idx, test_idx in group_kfold.split(X, y, groups):
        model.fit(features.iloc[train_idx], y.iloc[train_idx])
        y_preds = model.predict(features.iloc[test_idx])

        # One row per held-out sample: who it is, the predicted score and the true score.
        # `Actual` is aligned on the index, which matches because both come from `test_idx`.
        ranking = X.iloc[test_idx][["Player"]].assign(
            Predicted=y_preds,
            Actual=y.iloc[test_idx],
        )
        predicted_top = ranking.sort_values("Predicted", ascending=False)["Player"].iloc[0]
        actual_top = ranking.sort_values("Actual", ascending=False)["Player"].iloc[0]

        total += 1
        if predicted_top == actual_top:
            hits += 1

    accuracy = hits / total
    # `best_model is None` ensures a model is still selected when every candidate
    # scores 0; a strict `>` against the initial 0 would otherwise leave it as None.
    if best_model is None or accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model

    print(f"Accuracy for {name}: {accuracy}")

# After the loop the winner is fitted only on the last fold's training split.
# Refit it on all data so the model saved later has seen every row.
if best_model is not None:
    best_model.fit(features, y)

print(f"Best model: {best_model}, accuracy: {best_accuracy}")

Accuracy for Forest: 0.6818181818181818
Accuracy for XGBoost: 0.6818181818181818
Accuracy for AdaBoost: 0.6818181818181818
Best model: RandomForestRegressor(n_jobs=-1, random_state=42), accuracy: 0.6818181818181818


Save the best model

In [12]:
# Persist the selected model. The path is relative to the kernel's working directory.
model_path = "../models/model.joblib"
if best_model is None:
    # Refuse to write a file that would unpickle to None.
    raise ValueError("No model was selected; run the evaluation cell before saving.")
# joblib.dump does not create missing directories; make sure the target exists
# so the notebook also runs on a fresh checkout.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
dump(best_model, model_path)

['../models/model.joblib']