In [16]:
#StuzeBets
#Niko Tsiolas
#June 24th, 2024

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# Load the three raw CSV files. They are decoded as latin1 rather than the
# pandas default of UTF-8.
stadiums = pd.read_csv('nfl_stadiums.csv', encoding='latin1')
teams = pd.read_csv('nfl_teams.csv', encoding='latin1')
scores = pd.read_csv('spreadspoke_scores.csv', encoding='latin1')

# Fill missing values by carrying the previous row's value forward.
# Reassigning (instead of inplace=True) gives the same frames while keeping
# the cell free of in-place mutation.
# NOTE(review): forward fill copies a value from whatever row happens to sit
# above, and it cannot fill gaps at the very top of a column (the info()
# summary below still reports nulls) -- confirm this is the intended
# imputation for every column.
stadiums = stadiums.ffill()
teams = teams.ffill()
scores = scores.ffill()

# Verify the data cleaning.
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" after each table.
stadiums.info()
scores.info()
teams.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 16 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   stadium_name                     120 non-null    object 
 1   stadium_location                 120 non-null    object 
 2   stadium_open                     120 non-null    float64
 3   stadium_close                    114 non-null    float64
 4   stadium_type                     120 non-null    object 
 5   stadium_address                  120 non-null    object 
 6   stadium_weather_station_zipcode  120 non-null    object 
 7   stadium_weather_type             120 non-null    object 
 8   stadium_capacity                 120 non-null    object 
 9   stadium_surface                  120 non-null    object 
 10  stadium_weather_station          120 non-null    object 
 11  stadium_weather_station_name     120 non-null    object 
 12  stadium_latitude      

In [17]:
# For each side of the matchup, attach the mean score of that row's team when
# playing on that side: team_points_avg_home is the mean of score_home per
# team_home, team_points_avg_away the mean of score_away per team_away.
# NOTE(review): the mean is taken over every row for the team, including the
# row it is written back to -- confirm that is acceptable for features used to
# predict the outcome of that same game.
for side in ('home', 'away'):
    scores[f'team_points_avg_{side}'] = (
        scores.groupby(f'team_{side}')[f'score_{side}'].transform('mean')
    )

# Verify the new features
preview_columns = ['team_home', 'team_points_avg_home', 'team_away', 'team_points_avg_away']
print(scores[preview_columns].head())



            team_home  team_points_avg_home        team_away  \
0      Miami Dolphins             22.871102  Oakland Raiders   
1      Houston Oilers             21.135593   Denver Broncos   
2  San Diego Chargers             23.104218    Buffalo Bills   
3      Miami Dolphins             22.871102    New York Jets   
4   Green Bay Packers             23.716942  Baltimore Colts   

   team_points_avg_away  
0             20.024316  
1             19.841102  
2             19.232558  
3             18.863445  
4             19.263158  


In [21]:
# Target: 1 when the home team scored strictly more than the away team,
# otherwise 0 (so a tie is labelled 0).
scores['home_win'] = (scores['score_home'] > scores['score_away']).astype(int)

# Define the feature columns and the target vector.
features = ['team_points_avg_home', 'team_points_avg_away', 'spread_favorite', 'over_under_line', 'weather_temperature', 'weather_wind_mph', 'weather_humidity']
y = scores['home_win']

# Work on an explicit copy so the edits below never write back into `scores`.
X = scores[features].copy()

# Coerce over_under_line to numeric once, on the full frame, before splitting.
# Doing this separately on X_train / X_test afterwards assigns into the frames
# returned by train_test_split (a SettingWithCopyWarning trigger) and leaves X
# itself holding the unconverted column. Entries that cannot be parsed become
# NaN here and are zero-filled by the next step, matching the previous result.
X['over_under_line'] = pd.to_numeric(X['over_under_line'], errors='coerce')

# Handle missing values: every remaining NaN in the features becomes 0.
X = X.fillna(0)

# 80/20 train/test split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f'X_train shape: {X_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'y_test shape: {y_test.shape}')


X_train shape: (11258, 7)
X_test shape: (2815, 7)
y_train shape: (11258,)
y_test shape: (2815,)


In [22]:
# Show a label line followed by the first rows of the training target.
print('First few entries of y_train:', y_train.head(), sep='\n')

First few entries of y_train:
6601     1
7053     1
1897     1
12628    1
13912    1
Name: home_win, dtype: int64


In [24]:
# Fit a 100-tree random forest on the training split; the fixed seed keeps the
# fitted model reproducible.
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# Hard class predictions for the held-out split, plus the predicted
# probability of the second class (column 1), which is what ROC AUC scores.
y_pred = model.predict(X_test)
home_win_proba = model.predict_proba(X_test)[:, 1]

# Evaluation metrics on the test split.
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
roc_auc = roc_auc_score(y_test, home_win_proba)

# Report all three metrics, one per line.
print(
    f'Accuracy: {accuracy}',
    f'Classification Report:\n{classification_rep}',
    f'AUC-ROC: {roc_auc}',
    sep='\n',
)



Accuracy: 0.5854351687388988
Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.41      0.45      1174
           1       0.63      0.71      0.67      1641

    accuracy                           0.59      2815
   macro avg       0.57      0.56      0.56      2815
weighted avg       0.58      0.59      0.58      2815

AUC-ROC: 0.5997454496001628
