In [2]:
# --- Step 1: Import Libraries and Load Data ---
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the dataset
# Make sure the path is correct for your project structure
file_path = '../datasets/spreadspoke_scores.csv'
df_nfl = pd.read_csv(file_path)

print("Data loaded successfully.")

# --- Step 2: Initial Cleaning and Target Variable Creation ---
df_clean = df_nfl.copy()

# Create the target variable: 1 for home win, 0 otherwise
df_clean['result'] = (df_clean['score_home'] > df_clean['score_away']).astype(int)

# Drop columns with too many missing values or that are not useful for a simple model
columns_to_drop = [
    'team_favorite_id', 'spread_favorite', 'over_under_line',
    'weather_humidity', 'weather_detail'
]
df_clean = df_clean.drop(columns=columns_to_drop)
df_clean = df_clean.dropna()

print("Initial cleaning complete and target variable created.")

# --- Step 3: Advanced Feature Engineering ---
# First, sort the DataFrame by date to process games chronologically
df_clean['schedule_date'] = pd.to_datetime(df_clean['schedule_date'])
df_clean = df_clean.sort_values(by='schedule_date').reset_index(drop=True)

def calculate_team_form(df, window_size=5):
    """Calculates dynamic features like win streaks and recent performance."""
    team_stats = defaultdict(lambda: {'streak': 0, 'recent_scores_for': [], 'recent_scores_against': []})

    home_streaks, away_streaks = [], []
    home_avg_pts_for, home_avg_pts_against = [], []
    away_avg_pts_for, away_avg_pts_against = [], []

    print("Calculating dynamic features for each game...")

    for index, row in df.iterrows():
        home_team, away_team = row['team_home'], row['team_away']
        score_home, score_away = row['score_home'], row['score_away']
        home_win = row['result'] == 1

        # Get stats for teams *before* this game
        home_streaks.append(team_stats[home_team]['streak'])
        home_avg_pts_for.append(np.mean(team_stats[home_team]['recent_scores_for']) if team_stats[home_team]['recent_scores_for'] else 0)
        home_avg_pts_against.append(np.mean(team_stats[home_team]['recent_scores_against']) if team_stats[home_team]['recent_scores_against'] else 0)

        away_streaks.append(team_stats[away_team]['streak'])
        away_avg_pts_for.append(np.mean(team_stats[away_team]['recent_scores_for']) if team_stats[away_team]['recent_scores_for'] else 0)
        away_avg_pts_against.append(np.mean(team_stats[away_team]['recent_scores_against']) if team_stats[away_team]['recent_scores_against'] else 0)

        # Update stats *after* this game
        team_stats[home_team]['recent_scores_for'].append(score_home)
        team_stats[home_team]['recent_scores_against'].append(score_away)
        team_stats[home_team]['streak'] = team_stats[home_team]['streak'] + 1 if home_win else -1

        team_stats[away_team]['recent_scores_for'].append(score_away)
        team_stats[away_team]['recent_scores_against'].append(score_home)
        team_stats[away_team]['streak'] = team_stats[away_team]['streak'] + 1 if not home_win else -1

        # Keep the "recent" lists from getting too long
        if len(team_stats[home_team]['recent_scores_for']) > window_size:
            team_stats[home_team]['recent_scores_for'].pop(0)
            team_stats[home_team]['recent_scores_against'].pop(0)
        if len(team_stats[away_team]['recent_scores_for']) > window_size:
            team_stats[away_team]['recent_scores_for'].pop(0)
            team_stats[away_team]['recent_scores_against'].pop(0)

    df['home_streak'] = home_streaks
    df['away_streak'] = away_streaks
    df['home_avg_pts_for'] = home_avg_pts_for
    df['home_avg_pts_against'] = home_avg_pts_against
    df['away_avg_pts_for'] = away_avg_pts_for
    df['away_avg_pts_against'] = away_avg_pts_against

    return df

df_dynamic = calculate_team_form(df_clean.copy())
print("Dynamic features created.")

# --- Step 4: Prepare Data for Modeling (for both scenarios) ---
y = df_dynamic['result']

# Define Original Features (from the first model)
original_features_df = df_dynamic.drop(columns=[
    'result', 'score_home', 'score_away', 'schedule_date', 'stadium',
    'home_streak', 'away_streak', 'home_avg_pts_for', 'home_avg_pts_against',
    'away_avg_pts_for', 'away_avg_pts_against'
])
X_orig_encoded = pd.get_dummies(original_features_df)

# Define Advanced Features (with our new dynamic stats)
advanced_features_df = df_dynamic.drop(columns=[
    'result', 'score_home', 'score_away', 'schedule_date', 'stadium'
])
X_adv_encoded = pd.get_dummies(advanced_features_df)

# Align columns after one-hot encoding to ensure they match perfectly
X_orig_train, X_orig_test, y_train, y_test = train_test_split(X_orig_encoded, y, test_size=0.2, random_state=42)
X_adv_train, X_adv_test, _, _ = train_test_split(X_adv_encoded, y, test_size=0.2, random_state=42)

# Align the columns of the test sets with the training sets
orig_train_cols = X_orig_train.columns
adv_train_cols = X_adv_train.columns

X_orig_test = X_orig_test.reindex(columns=orig_train_cols, fill_value=0)
X_adv_test = X_adv_test.reindex(columns=adv_train_cols, fill_value=0)

# Scale both datasets
scaler = StandardScaler()
X_orig_train_scaled = scaler.fit_transform(X_orig_train)
X_orig_test_scaled = scaler.transform(X_orig_test)

X_adv_train_scaled = scaler.fit_transform(X_adv_train)
X_adv_test_scaled = scaler.transform(X_adv_test)

print("Data preparation for all scenarios is complete.")

# --- Step 5: The Showdown - Train and Evaluate All Models ---
print("\n--- Starting Model Showdown ---")

# Models
lr_model = LogisticRegression(max_iter=1000, random_state=42)
rf_model = RandomForestClassifier(random_state=42)

# Scenario 1: Logistic Regression (Original)
lr_model.fit(X_orig_train_scaled, y_train)
lr_orig_accuracy = accuracy_score(y_test, lr_model.predict(X_orig_test_scaled))

# Scenario 2: Logistic Regression (Advanced)
lr_model.fit(X_adv_train_scaled, y_train)
lr_adv_accuracy = accuracy_score(y_test, lr_model.predict(X_adv_test_scaled))

# Scenario 3: Random Forest (Original)
rf_model.fit(X_orig_train_scaled, y_train)
rf_orig_accuracy = accuracy_score(y_test, rf_model.predict(X_orig_test_scaled))

# Scenario 4: Random Forest (Advanced)
rf_model.fit(X_adv_train_scaled, y_train)
rf_adv_accuracy = accuracy_score(y_test, rf_model.predict(X_adv_test_scaled))

print("--- Model Showdown Complete ---")

# --- Step 6: Display Final Results ---
results = {
    "Model": ["Logistic Regression", "Logistic Regression", "Random Forest", "Random Forest"],
    "Features": ["Original", "Advanced", "Original", "Advanced"],
    "Accuracy": [lr_orig_accuracy, lr_adv_accuracy, rf_orig_accuracy, rf_adv_accuracy]
}
results_df = pd.DataFrame(results)
results_df['Accuracy'] = (results_df['Accuracy'] * 100).map('{:.2f}%'.format)

print("\n--- Final Results ---")
print(results_df)


Data loaded successfully.
Initial cleaning complete and target variable created.
Calculating dynamic features for each game...
Dynamic features created.
Data preparation for all scenarios is complete.

--- Starting Model Showdown ---


  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  ret = a @ b
  ret = a @ b
  ret = a @ b
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  ret = a @ b
  ret = a @ b
  ret = a @ b


--- Model Showdown Complete ---

--- Final Results ---
                 Model  Features Accuracy
0  Logistic Regression  Original   58.20%
1  Logistic Regression  Advanced   62.23%
2        Random Forest  Original   56.80%
3        Random Forest  Advanced   62.07%
