In [45]:
# StuzeBets
# Niko Tsiolas
# June 24th, 2024

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.linear_model import LogisticRegression

# Load datasets
stadiums = pd.read_csv('nfl_stadiums.csv', encoding='latin1')
teams = pd.read_csv('nfl_teams.csv', encoding='latin1')
scores = pd.read_csv('spreadspoke_scores.csv', encoding='latin1')

# Fill missing values using forward fill method
stadiums.ffill(inplace=True)
teams.ffill(inplace=True)
scores.ffill(inplace=True)

# Verify the data cleaning
print(stadiums.info())
print(scores.info())
print(teams.info())

print(scores.columns)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 16 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   stadium_name                     120 non-null    object 
 1   stadium_location                 120 non-null    object 
 2   stadium_open                     120 non-null    float64
 3   stadium_close                    114 non-null    float64
 4   stadium_type                     120 non-null    object 
 5   stadium_address                  120 non-null    object 
 6   stadium_weather_station_zipcode  120 non-null    object 
 7   stadium_weather_type             120 non-null    object 
 8   stadium_capacity                 120 non-null    object 
 9   stadium_surface                  120 non-null    object 
 10  stadium_weather_station          120 non-null    object 
 11  stadium_weather_station_name     120 non-null    object 
 12  stadium_latitude      

In [46]:
# Define win columns
scores['home_win'] = (scores['score_home'] > scores['score_away']).astype(int)
scores['away_win'] = (scores['score_away'] > scores['score_home']).astype(int)

# Calculate average points scored by home and away teams
scores['team_points_avg_home'] = scores.groupby('team_home')['score_home'].transform('mean')
scores['team_points_avg_away'] = scores.groupby('team_away')['score_away'].transform('mean')

# Adding rolling average for the last 5 games for home and away teams
scores['home_team_avg_last_5'] = scores.groupby('team_home')['score_home'].transform(lambda x: x.rolling(window=5, min_periods=1).mean())
scores['away_team_avg_last_5'] = scores.groupby('team_away')['score_away'].transform(lambda x: x.rolling(window=5, min_periods=1).mean())

# Calculate performance metrics based on weather conditions using available columns
# Group by home team and weather conditions
home_performance = scores.groupby(['team_home', 'weather_temperature', 'weather_wind_mph', 'weather_humidity'])['home_win'].mean().reset_index()
home_performance.rename(columns={'team_home': 'team', 'home_win': 'win_rate'}, inplace=True)

# Group by away team and weather conditions
away_performance = scores.groupby(['team_away', 'weather_temperature', 'weather_wind_mph', 'weather_humidity'])['away_win'].mean().reset_index()
away_performance.rename(columns={'team_away': 'team', 'away_win': 'win_rate'}, inplace=True)

# Combine home and away performance data
performance = pd.concat([home_performance, away_performance])
performance = performance.groupby(['team', 'weather_temperature', 'weather_wind_mph', 'weather_humidity'])['win_rate'].mean().reset_index()


# Verify the new features
print(scores[['team_home', 'team_points_avg_home', 'team_away', 'team_points_avg_away']].head())



            team_home  team_points_avg_home        team_away  \
0      Miami Dolphins             22.871102  Oakland Raiders   
1      Houston Oilers             21.135593   Denver Broncos   
2  San Diego Chargers             23.104218    Buffalo Bills   
3      Miami Dolphins             22.871102    New York Jets   
4   Green Bay Packers             23.716942  Baltimore Colts   

   team_points_avg_away  
0             20.024316  
1             19.841102  
2             19.232558  
3             18.863445  
4             19.263158  


In [47]:
def get_weather_performance(team, temp, wind, humidity, performance):
    perf = performance[(performance['team'] == team) &
                       (performance['weather_temperature'] == temp) &
                       (performance['weather_wind_mph'] == wind) &
                       (performance['weather_humidity'] == humidity)]
    return perf['win_rate'].values[0] if not perf.empty else 0.5  # Default to 0.5 if no data available


# Apply to home and away teams
scores['home_weather_performance'] = scores.apply(lambda row: get_weather_performance(row['team_home'], row['weather_temperature'], row['weather_wind_mph'], row['weather_humidity'], performance), axis=1)
scores['away_weather_performance'] = scores.apply(lambda row: get_weather_performance(row['team_away'], row['weather_temperature'], row['weather_wind_mph'], row['weather_humidity'], performance), axis=1)


In [48]:
# Define the target variable
scores['home_win'] = (scores['score_home'] > scores['score_away']).astype(int)

# Define the features and target variables
features = ['team_points_avg_home', 'team_points_avg_away', 'spread_favorite', 'over_under_line',
            'weather_temperature', 'weather_wind_mph', 'weather_humidity', 'home_weather_performance', 'away_weather_performance']

for feature in features:
    scores[feature] = pd.to_numeric(scores[feature], errors='coerce')

X = scores[features].copy()
y = scores['home_win']

# Handling missing values
X.fillna(0, inplace=True)



# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f'X_train shape: {X_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'y_test shape: {y_test.shape}')



X_train shape: (11258, 9)
X_test shape: (2815, 9)
y_train shape: (11258,)
y_test shape: (2815,)


In [49]:
print ('First few entries of y_train:') 
print(y_train.head())

First few entries of y_train:
6601     1
7053     1
1897     1
12628    1
13912    1
Name: home_win, dtype: int64


In [50]:
# Train and evaluate the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

print(f'Accuracy: {accuracy}')
print(f'Classification Report:\n{classification_rep}')
print(f'AUC-ROC: {roc_auc}')


Accuracy: 0.9282415630550621
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.90      0.91      1174
           1       0.93      0.95      0.94      1641

    accuracy                           0.93      2815
   macro avg       0.93      0.92      0.93      2815
weighted avg       0.93      0.93      0.93      2815

AUC-ROC: 0.98686215763646
