In [1]:
# --- Step 1: Import Libraries and Load Data --- #
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Load the dataset
# Make sure the path is correct for your project structure
file_path = '../datasets/spreadspoke_scores.csv'
df_nfl = pd.read_csv(file_path)

print("Data loaded successfully.")

Data loaded successfully.


In [3]:
# --- Step 2: Initial Cleaning and Target Variable Creation --- #
df_clean = df_nfl.copy()
df_clean['result'] = (df_clean['score_home'] > df_clean['score_away']).astype(int)
columns_to_drop = [
    'team_favorite_id', 'spread_favorite', 'over_under_line',
    'weather_humidity', 'weather_detail'
]
df_clean = df_clean.drop(columns=columns_to_drop)
df_clean = df_clean.dropna()
print("Initial cleaning complete.")

Initial cleaning complete.


In [4]:
# --- Step 3: Advanced Feature Engineering --- #
df_clean['schedule_date'] = pd.to_datetime(df_clean['schedule_date'])
df_clean = df_clean.sort_values(by='schedule_date').reset_index(drop=True)

def calculate_team_form(df, window_size=5):
    """Calculates dynamic features like win streaks and recent performance."""
    team_stats = defaultdict(lambda: {'streak': 0, 'recent_scores_for': [], 'recent_scores_against': []})

    home_streaks, away_streaks = [], []
    home_avg_pts_for, home_avg_pts_against = [], []
    away_avg_pts_for, away_avg_pts_against = [], []

    print("Calculating dynamic features...")

    for index, row in df.iterrows():
        home_team, away_team = row['team_home'], row['team_away']
        score_home, score_away = row['score_home'], row['score_away']
        home_win = row['result'] == 1

        home_streaks.append(team_stats[home_team]['streak'])
        home_avg_pts_for.append(np.mean(team_stats[home_team]['recent_scores_for']) if team_stats[home_team]['recent_scores_for'] else 0)
        home_avg_pts_against.append(np.mean(team_stats[home_team]['recent_scores_against']) if team_stats[home_team]['recent_scores_against'] else 0)

        away_streaks.append(team_stats[away_team]['streak'])
        away_avg_pts_for.append(np.mean(team_stats[away_team]['recent_scores_for']) if team_stats[away_team]['recent_scores_for'] else 0)
        away_avg_pts_against.append(np.mean(team_stats[away_team]['recent_scores_against']) if team_stats[away_team]['recent_scores_against'] else 0)

        team_stats[home_team]['recent_scores_for'].append(score_home)
        team_stats[home_team]['recent_scores_against'].append(score_away)
        team_stats[home_team]['streak'] = team_stats[home_team]['streak'] + 1 if home_win else -1

        team_stats[away_team]['recent_scores_for'].append(score_away)
        team_stats[away_team]['recent_scores_against'].append(score_home)
        team_stats[away_team]['streak'] = team_stats[away_team]['streak'] + 1 if not home_win else -1

        if len(team_stats[home_team]['recent_scores_for']) > window_size:
            team_stats[home_team]['recent_scores_for'].pop(0)
            team_stats[home_team]['recent_scores_against'].pop(0)
        if len(team_stats[away_team]['recent_scores_for']) > window_size:
            team_stats[away_team]['recent_scores_for'].pop(0)
            team_stats[away_team]['recent_scores_against'].pop(0)

    df['home_streak'] = home_streaks
    df['away_streak'] = away_streaks
    df['home_avg_pts_for'] = home_avg_pts_for
    df['home_avg_pts_against'] = home_avg_pts_against
    df['away_avg_pts_for'] = away_avg_pts_for
    df['away_avg_pts_against'] = away_avg_pts_against

    return df

df_dynamic = calculate_team_form(df_clean.copy())
print("Dynamic features created.")

Calculating dynamic features...
Dynamic features created.


In [5]:
# --- Step 4: Prepare Final Data for Modeling --- #
y = df_dynamic['result']
X = df_dynamic.drop(columns=['result', 'score_home', 'score_away', 'schedule_date', 'stadium'])
X_encoded = pd.get_dummies(X)

X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Align columns to prevent errors
train_cols = X_train.columns
X_test = X_test.reindex(columns=train_cols, fill_value=0)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Final data preparation complete.")

Final data preparation complete.


In [6]:
# --- Step 5: Hyperparameter Tuning with GridSearchCV --- #
print("\n--- Starting Hyperparameter Tuning (This will take a few minutes)... ---")

# Define the "grid" of settings to test
param_grid = {
    'n_estimators': [100, 200],         # Number of trees in the forest
    'max_depth': [10, 20, None],       # Maximum depth of the trees
    'min_samples_leaf': [1, 2, 4],     # Minimum samples required at a leaf node
    'min_samples_split': [2, 5]        # Minimum samples required to split a node
}

# Create a Random Forest model
rf = RandomForestClassifier(random_state=42)

# Set up GridSearchCV
# cv=3 means 3-fold cross-validation. n_jobs=-1 uses all CPU cores.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Run the grid search on the training data
grid_search.fit(X_train_scaled, y_train)

print("--- Tuning Complete ---")


--- Starting Hyperparameter Tuning (This will take a few minutes)... ---
Fitting 3 folds for each of 36 candidates, totalling 108 fits


  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.4s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.5s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.5s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.5s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.5s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.6s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.9s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.0s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.0s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total tim

In [7]:
# --- Step 6: Evaluate the Best Model --- #
# Get the best parameters found by the search
print("\nBest Hyperparameters Found:")
print(grid_search.best_params_)

# Use the best model found to make predictions
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test_scaled)
final_accuracy = accuracy_score(y_test, predictions)

print("\n--- Final Model Evaluation ---")
print(f"Final Tuned Model Accuracy: {final_accuracy * 100:.2f}%")


Best Hyperparameters Found:
{'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}

--- Final Model Evaluation ---
Final Tuned Model Accuracy: 62.31%
