<a href="https://colab.research.google.com/github/Pranav-Karra/NFLPredictor/blob/main/NFLPredictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NFL Predictor Model
Author: Pranav Karra

In [318]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Data Cleaning and Prep

In [319]:
old_play_data_2023 = pd.read_csv("/content/nfl-pbp-2023.csv")
old_play_data_2024 = pd.read_csv("/content/nfl-pbp-2024.csv")
old_score_data_2023 = pd.read_csv("/content/nfl-scores-2023.csv")
old_score_data_2024 = pd.read_csv("/content/nfl-scores-2024.csv")

In [320]:
old_play_data_2023.head()

Unnamed: 0,GameId,GameDate,Quarter,Minute,Second,OffenseTeam,DefenseTeam,Down,ToGo,YardLine,...,IsTwoPointConversion,IsTwoPointConversionSuccessful,RushDirection,YardLineFixed,YardLineDirection,IsPenaltyAccepted,PenaltyTeam,IsNoPlay,PenaltyType,PenaltyYards
0,2023110504,2023-11-05,2,3,15,LA,GB,2,2,63,...,0,0,CENTER,37,OPP,0,,0,,0
1,2023110504,2023-11-05,2,0,5,GB,LA,2,4,70,...,0,0,,30,OPP,0,,0,,0
2,2023110504,2023-11-05,2,0,4,GB,LA,3,4,70,...,0,0,,30,OPP,0,,0,,0
3,2023110504,2023-11-05,2,0,0,GB,LA,0,0,100,...,0,0,,0,OPP,0,,0,,0
4,2023110504,2023-11-05,3,2,13,GB,LA,4,6,92,...,0,0,,8,OPP,0,,0,,0


In [321]:
old_score_data_2023.head()

Unnamed: 0,Week,Date,Visitor,VisitorScore,Home,HomeScore,OT
0,Week 1,09/07/2023,Detroit Lions,21,Kansas City Chiefs,20,
1,Week 1,09/10/2023,Tampa Bay Buccaneers,20,Minnesota Vikings,17,
2,Week 1,09/10/2023,Tennessee Titans,15,New Orleans Saints,16,
3,Week 1,09/10/2023,Carolina Panthers,10,Atlanta Falcons,24,
4,Week 1,09/10/2023,Houston Texans,9,Baltimore Ravens,25,


In [322]:
print(old_play_data_2023.columns)

Index(['GameId', 'GameDate', 'Quarter', 'Minute', 'Second', 'OffenseTeam',
       'DefenseTeam', 'Down', 'ToGo', 'YardLine', 'Unnamed: 10',
       'SeriesFirstDown', 'Unnamed: 12', 'NextScore', 'Description', 'TeamWin',
       'Unnamed: 16', 'Unnamed: 17', 'SeasonYear', 'Yards', 'Formation',
       'PlayType', 'IsRush', 'IsPass', 'IsIncomplete', 'IsTouchdown',
       'PassType', 'IsSack', 'IsChallenge', 'IsChallengeReversed',
       'Challenger', 'IsMeasurement', 'IsInterception', 'IsFumble',
       'IsPenalty', 'IsTwoPointConversion', 'IsTwoPointConversionSuccessful',
       'RushDirection', 'YardLineFixed', 'YardLineDirection',
       'IsPenaltyAccepted', 'PenaltyTeam', 'IsNoPlay', 'PenaltyType',
       'PenaltyYards'],
      dtype='object')


In [323]:
print(old_score_data_2023.columns)

Index(['Week', 'Date', 'Visitor', 'VisitorScore', 'Home', 'HomeScore', 'OT'], dtype='object')


In [324]:
old_play_data_2023['GameDate'] = pd.to_datetime(old_play_data_2023['GameDate']).dt.strftime('%m/%d/%Y')
old_play_data_2023.rename(columns={'GameDate':'Date'}, inplace=True)
old_play_data_2024['GameDate'] = pd.to_datetime(old_play_data_2024['GameDate']).dt.strftime('%m/%d/%Y')
old_play_data_2024.rename(columns={'GameDate':'Date'}, inplace=True)

old_play_data_2023.head(), old_play_data_2024.head()

(       GameId        Date  Quarter  Minute  Second OffenseTeam DefenseTeam  \
 0  2023110504  11/05/2023        2       3      15          LA          GB   
 1  2023110504  11/05/2023        2       0       5          GB          LA   
 2  2023110504  11/05/2023        2       0       4          GB          LA   
 3  2023110504  11/05/2023        2       0       0          GB          LA   
 4  2023110504  11/05/2023        3       2      13          GB          LA   
 
    Down  ToGo  YardLine  ...  IsTwoPointConversion  \
 0     2     2        63  ...                     0   
 1     2     4        70  ...                     0   
 2     3     4        70  ...                     0   
 3     0     0       100  ...                     0   
 4     4     6        92  ...                     0   
 
    IsTwoPointConversionSuccessful  RushDirection  YardLineFixed  \
 0                               0         CENTER             37   
 1                               0            NaN       

In [325]:
unique_home_teams_2023 = old_score_data_2023['Home'].unique()
unique_visitor_teams_2023 = old_score_data_2023['Visitor'].unique()
unique_home_teams_2024 = old_score_data_2024['Home'].unique()
unique_visitor_teams_2024 = old_score_data_2023['Visitor'].unique()

all_unique_teams = set(unique_home_teams_2023).union(set(unique_visitor_teams_2023)).union(set(unique_home_teams_2024)).union(set(unique_visitor_teams_2024))

unique_teams_abbrev_2023 = set(old_play_data_2023['OffenseTeam'].dropna().unique()).union(set(old_play_data_2023['DefenseTeam'].dropna().unique()))
unique_teams_abbrev_2024 = set(old_play_data_2024['OffenseTeam'].dropna().unique()).union(set(old_play_data_2024['DefenseTeam'].dropna().unique()))

all_unique_abbrev = unique_teams_abbrev_2023.union(unique_teams_abbrev_2024)

all_unique_teams, all_unique_abbrev

({'Arizona Cardinals',
  'Atlanta Falcons',
  'Baltimore Ravens',
  'Buffalo Bills',
  'Carolina Panthers',
  'Chicago Bears',
  'Cincinnati Bengals',
  'Cleveland Browns',
  'Dallas Cowboys',
  'Denver Broncos',
  'Detroit Lions',
  'Green Bay Packers',
  'Houston Texans',
  'Indianapolis Colts',
  'Jacksonville Jaguars',
  'Kansas City Chiefs',
  'Las Vegas Raiders',
  'Los Angeles Chargers',
  'Los Angeles Rams',
  'Miami Dolphins',
  'Minnesota Vikings',
  'New England Patriots',
  'New Orleans Saints',
  'New York Giants',
  'New York Jets',
  'Philadelphia Eagles',
  'Pittsburgh Steelers',
  'San Francisco 49ers',
  'Seattle Seahawks',
  'Tampa Bay Buccaneers',
  'Tennessee Titans',
  'Washington Commanders'},
 {'ARI',
  'ATL',
  'BAL',
  'BUF',
  'CAR',
  'CHI',
  'CIN',
  'CLE',
  'DAL',
  'DEN',
  'DET',
  'GB',
  'HOU',
  'IND',
  'JAX',
  'KC',
  'LA',
  'LAC',
  'LV',
  'MIA',
  'MIN',
  'NE',
  'NO',
  'NYG',
  'NYJ',
  'PHI',
  'PIT',
  'SEA',
  'SF',
  'TB',
  'TEN',
  '

In [326]:
team_mapping = {
    'Arizona Cardinals': 'ARI',
    'Atlanta Falcons': 'ATL',
    'Baltimore Ravens': 'BAL',
    'Buffalo Bills': 'BUF',
    'Carolina Panthers': 'CAR',
    'Chicago Bears': 'CHI',
    'Cincinnati Bengals': 'CIN',
    'Cleveland Browns': 'CLE',
    'Dallas Cowboys': 'DAL',
    'Denver Broncos': 'DEN',
    'Detroit Lions': 'DET',
    'Green Bay Packers': 'GB',
    'Houston Texans': 'HOU',
    'Indianapolis Colts': 'IND',
    'Jacksonville Jaguars': 'JAX',
    'Kansas City Chiefs': 'KC',
    'Las Vegas Raiders': 'LV',
    'Los Angeles Chargers': 'LAC',
    'Los Angeles Rams': 'LA',
    'Miami Dolphins': 'MIA',
    'Minnesota Vikings': 'MIN',
    'New England Patriots': 'NE',
    'New Orleans Saints': 'NO',
    'New York Giants': 'NYG',
    'New York Jets': 'NYJ',
    'Philadelphia Eagles': 'PHI',
    'Pittsburgh Steelers': 'PIT',
    'San Francisco 49ers': 'SF',
    'Seattle Seahawks': 'SEA',
    'Tampa Bay Buccaneers': 'TB',
    'Tennessee Titans': 'TEN',
    'Washington Commanders': 'WAS'
}

old_score_data_2023['Home'] = old_score_data_2023['Home'].map(team_mapping)
old_score_data_2023['Visitor'] = old_score_data_2023['Visitor'].map(team_mapping)
old_score_data_2024['Home'] = old_score_data_2024['Home'].map(team_mapping)
old_score_data_2024['Visitor'] = old_score_data_2024['Visitor'].map(team_mapping)

old_score_data_2023.head(), old_score_data_2024.head()

(     Week        Date Visitor  VisitorScore Home  HomeScore   OT
 0  Week 1  09/07/2023     DET            21   KC         20  NaN
 1  Week 1  09/10/2023      TB            20  MIN         17  NaN
 2  Week 1  09/10/2023     TEN            15   NO         16  NaN
 3  Week 1  09/10/2023     CAR            10  ATL         24  NaN
 4  Week 1  09/10/2023     HOU             9  BAL         25  NaN,
      Week        Date Visitor  VisitorScore Home  HomeScore   OT
 0  Week 1  09/05/2024     BAL            20   KC         27  NaN
 1  Week 1  09/06/2024      GB            29  PHI         34  NaN
 2  Week 1  09/08/2024     CAR            10   NO         47  NaN
 3  Week 1  09/08/2024     TEN            17  CHI         24  NaN
 4  Week 1  09/08/2024      NE            16  CIN         10  NaN)

In [327]:
merged_data_2023 = old_play_data_2023.merge(old_score_data_2023, left_on=['Date', 'OffenseTeam', 'DefenseTeam'], right_on=['Date', 'Visitor', 'Home'], how='left')
merged_data_2023 = merged_data_2023.merge(old_score_data_2023, left_on=['Date', 'OffenseTeam', 'DefenseTeam'], right_on=['Date', 'Home', 'Visitor'], how='left', suffixes=('', '_reverse'))

for col in ['Week', 'Visitor', 'VisitorScore', 'Home', 'HomeScore', 'OT']:
  merged_data_2023[col] = merged_data_2023[col].combine_first(merged_data_2023[col + '_reverse'])

cols_drop = [col + '_reverse' for col in ['Week', 'Visitor', 'VisitorScore', 'Home', 'HomeScore', 'OT']]
merged_data_2023 = merged_data_2023.drop(columns=cols_drop)

merged_data_2023[['Date', 'OffenseTeam', 'DefenseTeam', 'Week', 'Visitor', 'VisitorScore', 'Home', 'HomeScore', 'OT']].head()

Unnamed: 0,Date,OffenseTeam,DefenseTeam,Week,Visitor,VisitorScore,Home,HomeScore,OT
0,11/05/2023,LA,GB,Week 9,LA,3.0,GB,20.0,
1,11/05/2023,GB,LA,Week 9,LA,3.0,GB,20.0,
2,11/05/2023,GB,LA,Week 9,LA,3.0,GB,20.0,
3,11/05/2023,GB,LA,Week 9,LA,3.0,GB,20.0,
4,11/05/2023,GB,LA,Week 9,LA,3.0,GB,20.0,


In [328]:
merged_data_2024 = old_play_data_2024.merge(old_score_data_2024, left_on=['Date', 'OffenseTeam', 'DefenseTeam'], right_on=['Date', 'Visitor', 'Home'], how='left')
merged_data_2024 = merged_data_2024.merge(old_score_data_2024, left_on=['Date', 'OffenseTeam', 'DefenseTeam'], right_on=['Date', 'Home', 'Visitor'], how='left', suffixes=('', '_reverse'))

for col in ['Week', 'Visitor', 'VisitorScore', 'Home', 'HomeScore', 'OT']:
  merged_data_2024[col] = merged_data_2024[col].combine_first(merged_data_2024[col + '_reverse'])

cols_drop = [col + '_reverse' for col in ['Week', 'Visitor', 'VisitorScore', 'Home', 'HomeScore', 'OT']]
merged_data_2024 = merged_data_2024.drop(columns=cols_drop)

merged_data_2024[['Date', 'OffenseTeam', 'DefenseTeam', 'Week', 'Visitor', 'VisitorScore', 'Home', 'HomeScore', 'OT']].head()

Unnamed: 0,Date,OffenseTeam,DefenseTeam,Week,Visitor,VisitorScore,Home,HomeScore,OT
0,09/05/2024,BAL,KC,Week 1,BAL,20.0,KC,27.0,
1,09/05/2024,BAL,KC,Week 1,BAL,20.0,KC,27.0,
2,09/06/2024,PHI,GB,Week 1,GB,29.0,PHI,34.0,
3,09/06/2024,PHI,GB,Week 1,GB,29.0,PHI,34.0,
4,09/05/2024,BAL,KC,Week 1,BAL,20.0,KC,27.0,


In [329]:
merged_data_2023['HomeWon'] = merged_data_2023['HomeScore'] > merged_data_2023['VisitorScore']
merged_data_2024['HomeWon'] = merged_data_2024['HomeScore'] > merged_data_2024['VisitorScore']

merged_data_2023[['Date', 'Home', 'Visitor', 'HomeScore', 'VisitorScore', 'HomeWon']].head(), merged_data_2024[['Date', 'Home', 'Visitor', 'HomeScore', 'VisitorScore', 'HomeWon']].head()

(         Date Home Visitor  HomeScore  VisitorScore  HomeWon
 0  11/05/2023   GB      LA       20.0           3.0     True
 1  11/05/2023   GB      LA       20.0           3.0     True
 2  11/05/2023   GB      LA       20.0           3.0     True
 3  11/05/2023   GB      LA       20.0           3.0     True
 4  11/05/2023   GB      LA       20.0           3.0     True,
          Date Home Visitor  HomeScore  VisitorScore  HomeWon
 0  09/05/2024   KC     BAL       27.0          20.0     True
 1  09/05/2024   KC     BAL       27.0          20.0     True
 2  09/06/2024  PHI      GB       34.0          29.0     True
 3  09/06/2024  PHI      GB       34.0          29.0     True
 4  09/05/2024   KC     BAL       27.0          20.0     True)

# Analyzing Team Features

In [330]:
data_2023 = merged_data_2023
data_2024 = merged_data_2024

upcoming_games = pd.read_csv("/content/Upcoming_Schedule_Week 6.csv")

combined_data = pd.concat([data_2023, data_2024])

avg_points_scored_home = combined_data.groupby('Home')['HomeScore'].mean()
avg_points_scored_visitor = combined_data.groupby('Visitor')['VisitorScore'].mean()
avg_points_allowed_home = combined_data.groupby('Home')['VisitorScore'].mean()
avg_points_allowed_visitor = combined_data.groupby('Visitor')['HomeScore'].mean()
overall_avg_points_scored = (avg_points_scored_home + avg_points_scored_visitor) / 2
overall_avg_points_allowed = (avg_points_allowed_home + avg_points_allowed_visitor) / 2
home_wins = combined_data.groupby('Home')['HomeWon'].sum()
visitor_wins = combined_data.groupby('Visitor').apply(lambda x: len(x) - x['HomeWon'].sum())
total_games_home = combined_data['Home'].value_counts()
total_games_visitor = combined_data['Visitor'].value_counts()
total_wins = home_wins + visitor_wins
total_games = total_games_home + total_games_visitor
win_rate = total_wins / total_games

offensive_features = pd.DataFrame({
    'AvgPointsScored': overall_avg_points_scored,
    'AvgPointsAllowed': overall_avg_points_allowed,
    'WinRate': win_rate
})

offensive_features.reset_index(inplace=True)
offensive_features.rename(columns={'Home': 'Team'}, inplace=True)

offensive_features.head()

  visitor_wins = combined_data.groupby('Visitor').apply(lambda x: len(x) - x['HomeWon'].sum())


Unnamed: 0,Team,AvgPointsScored,AvgPointsAllowed,WinRate
0,ARI,20.043893,26.676782,0.236653
1,ATL,18.830107,22.0572,0.427999
2,BAL,27.822132,17.815528,0.707196
3,BUF,27.085107,19.134047,0.654438
4,CAR,15.144543,26.036992,0.141088


In [331]:
upcoming_games.head()

Unnamed: 0,Home,Visitor
0,SEA,SF
1,CHI,JAX
2,GB,ARI
3,TEN,IND
4,NO,TB


In [332]:
combined_data['SuccessfulPlay'] = combined_data['IsTouchdown'] | (~combined_data['IsInterception'] & ~combined_data['IsFumble'])

avg_conceded_plays_home = combined_data.groupby('Home')['SuccessfulPlay'].mean()
avg_conceded_plays_visitor = combined_data.groupby('Visitor')['SuccessfulPlay'].mean()
overall_avg_conceded_plays = (avg_conceded_plays_home + avg_conceded_plays_visitor) / 2

combined_data['Turnovers'] = combined_data['IsInterception'] | combined_data['IsFumble']
avg_forced_turnovers_home = combined_data.groupby('Home')['Turnovers'].mean()
avg_forced_turnovers_visitor = combined_data.groupby('Visitor')['Turnovers'].mean()
overall_avg_forced_turnovers = (avg_forced_turnovers_home + avg_forced_turnovers_visitor) / 2

defensive_features = pd.DataFrame({
    'Team': offensive_features['Team'].values,
    'AvgPointsDefended': offensive_features['AvgPointsAllowed'].values,
    'AvgConcededPlays': overall_avg_conceded_plays,
    'AvgForcedTurnovers': overall_avg_forced_turnovers.values
})

team_features_combined = offensive_features.merge(defensive_features, on='Team')

team_features_combined.head()

Unnamed: 0,Team,AvgPointsScored,AvgPointsAllowed,WinRate,AvgPointsDefended,AvgConcededPlays,AvgForcedTurnovers
0,ARI,20.043893,26.676782,0.236653,26.676782,-1.016297,0.018085
1,ATL,18.830107,22.0572,0.427999,22.0572,-1.021492,0.022497
2,BAL,27.822132,17.815528,0.707196,17.815528,-1.019121,0.019828
3,BUF,27.085107,19.134047,0.654438,19.134047,-1.022799,0.023853
4,CAR,15.144543,26.036992,0.141088,26.036992,-1.014536,0.016916


In [333]:
avg_yards_per_play_home = combined_data.groupby('Home')['Yards'].mean()
avg_yards_per_play_visitor = combined_data.groupby('Visitor')['Yards'].mean()
overall_avg_yards_per_play = (avg_yards_per_play_home + avg_yards_per_play_visitor) / 2
total_yards_per_game_home = combined_data.groupby(['SeasonYear', 'Home'])['Yards'].sum() / combined_data.groupby(['SeasonYear', 'Home']).size()
total_yards_per_game_visitor = combined_data.groupby(['SeasonYear', 'Visitor'])['Yards'].sum() / combined_data.groupby(['SeasonYear', 'Visitor']).size()
overall_avg_yards_per_game = (total_yards_per_game_home + total_yards_per_game_visitor).groupby(level=1).mean()
avg_pass_completion_rate_home = combined_data.groupby('Home').apply(lambda x: 1 - x['IsIncomplete'].mean())
avg_pass_completion_rate_visitor = combined_data.groupby('Visitor').apply(lambda x: 1 - x['IsIncomplete'].mean())
overall_avg_pass_completion_rate = (avg_pass_completion_rate_home + avg_pass_completion_rate_visitor) / 2
avg_touchdowns_per_game_home = combined_data.groupby(['SeasonYear', 'Home'])['IsTouchdown'].sum() / combined_data.groupby(['SeasonYear', 'Home']).size()
avg_touchdowns_per_game_visitor = combined_data.groupby(['SeasonYear', 'Visitor'])['IsTouchdown'].sum() / combined_data.groupby(['SeasonYear', 'Visitor']).size()
overall_avg_touchdowns_per_game = (avg_touchdowns_per_game_home + avg_touchdowns_per_game_visitor).groupby(level=1).mean()
avg_rush_success_rate_home = combined_data.groupby('Home').apply(lambda x: x['Yards'][x['IsRush'] == 1].mean())
avg_rush_success_rate_visitor = combined_data.groupby('Visitor').apply(lambda x: x['Yards'][x['IsRush'] == 1].mean())
overall_avg_rush_success_rate = (avg_rush_success_rate_home + avg_rush_success_rate_visitor) / 2

more_offensive_features = pd.DataFrame({
    'Team': team_features_combined['Team'],
    'AvgYardsPerPlay': overall_avg_yards_per_play.values,
    'AvgYardsPerGame': overall_avg_yards_per_game.values,
    'AvgPassCompletionRate': overall_avg_pass_completion_rate.values,
    'AvgTouchdownsPerGame': overall_avg_touchdowns_per_game.values,
    'AvgRushSuccessRate': overall_avg_rush_success_rate.values
})

more_team_features = team_features_combined.merge(more_offensive_features, on='Team')

more_team_features.head()

  avg_pass_completion_rate_home = combined_data.groupby('Home').apply(lambda x: 1 - x['IsIncomplete'].mean())
  avg_pass_completion_rate_visitor = combined_data.groupby('Visitor').apply(lambda x: 1 - x['IsIncomplete'].mean())
  avg_rush_success_rate_home = combined_data.groupby('Home').apply(lambda x: x['Yards'][x['IsRush'] == 1].mean())
  avg_rush_success_rate_visitor = combined_data.groupby('Visitor').apply(lambda x: x['Yards'][x['IsRush'] == 1].mean())


Unnamed: 0,Team,AvgPointsScored,AvgPointsAllowed,WinRate,AvgPointsDefended,AvgConcededPlays,AvgForcedTurnovers,AvgYardsPerPlay,AvgYardsPerGame,AvgPassCompletionRate,AvgTouchdownsPerGame,AvgRushSuccessRate
0,ARI,20.043893,26.676782,0.236653,26.676782,-1.016297,0.018085,3.858547,7.700217,0.889645,0.068933,4.988382
1,ATL,18.830107,22.0572,0.427999,22.0572,-1.021492,0.022497,3.708785,7.679778,0.875529,0.04523,4.439918
2,BAL,27.822132,17.815528,0.707196,17.815528,-1.019121,0.019828,3.767911,7.74654,0.869581,0.055342,5.046116
3,BUF,27.085107,19.134047,0.654438,19.134047,-1.022799,0.023853,3.833218,7.565109,0.880223,0.065497,4.773855
4,CAR,15.144543,26.036992,0.141088,26.036992,-1.014536,0.016916,3.381419,7.010071,0.867508,0.056112,4.497754


In [334]:
avg_yards_allowed_per_play_home = combined_data.groupby('Home')['Yards'].mean()
avg_yards_allowed_per_play_visitor = combined_data.groupby('Visitor')['Yards'].mean()
overall_avg_yards_allowed_per_play = (avg_yards_allowed_per_play_home + avg_yards_allowed_per_play_visitor) / 2
total_yards_allowed_per_game_home = combined_data.groupby(['SeasonYear', 'Home'])['Yards'].sum() / combined_data.groupby(['SeasonYear', 'Home']).size()
total_yards_allowed_per_game_visitor = combined_data.groupby(['SeasonYear', 'Visitor'])['Yards'].sum() / combined_data.groupby(['SeasonYear', 'Visitor']).size()
overall_total_yards_allowed_per_game = (total_yards_allowed_per_game_home + total_yards_allowed_per_game_visitor).groupby(level=1).mean()
avg_pass_completion_allowed_rate_home = combined_data.groupby('Home').apply(lambda x: 1 - x['IsIncomplete'].mean())
avg_pass_completion_allowed_rate_visitor = combined_data.groupby('Visitor').apply(lambda x: 1 - x['IsIncomplete'].mean())
overall_avg_pass_completion_allowed_rate = (avg_pass_completion_allowed_rate_home + avg_pass_completion_allowed_rate_visitor) / 2
avg_touchdowns_allowed_per_game_home = combined_data.groupby(['SeasonYear', 'Home'])['IsTouchdown'].sum() / combined_data.groupby(['SeasonYear', 'Home']).size()
avg_touchdowns_allowed_per_game_visitor = combined_data.groupby(['SeasonYear', 'Visitor'])['IsTouchdown'].sum() / combined_data.groupby(['SeasonYear', 'Visitor']).size()
overall_avg_touchdowns_allowed_per_game = (avg_touchdowns_allowed_per_game_home + avg_touchdowns_allowed_per_game_visitor).groupby(level=1).mean()
avg_rush_success_allowed_rate_home = combined_data.groupby('Home').apply(lambda x: x['Yards'][x['IsRush'] == 1].mean())
avg_rush_success_allowed_rate_visitor = combined_data.groupby('Visitor').apply(lambda x: x['Yards'][x['IsRush'] == 1].mean())
overall_avg_rush_success_allowed_rate = (avg_rush_success_allowed_rate_home + avg_rush_success_allowed_rate_visitor) / 2

more_defensive_features = pd.DataFrame({
    'Team': more_team_features['Team'],
    'AvgYardsAllowedPerPlay': overall_avg_yards_allowed_per_play.values,
    'AvgYardsAllowedPerGame': overall_total_yards_allowed_per_game.values,
    'AvgPassCompletionAllowedRate': overall_avg_pass_completion_allowed_rate.values,
    'AvgTouchdownsAllowedPerGame': overall_avg_touchdowns_allowed_per_game.values,
    'AvgRushSuccessAllowedRate': overall_avg_rush_success_allowed_rate.values
})

all_team_features = more_team_features.merge(more_defensive_features, on='Team')

all_team_features

  avg_pass_completion_allowed_rate_home = combined_data.groupby('Home').apply(lambda x: 1 - x['IsIncomplete'].mean())
  avg_pass_completion_allowed_rate_visitor = combined_data.groupby('Visitor').apply(lambda x: 1 - x['IsIncomplete'].mean())
  avg_rush_success_allowed_rate_home = combined_data.groupby('Home').apply(lambda x: x['Yards'][x['IsRush'] == 1].mean())
  avg_rush_success_allowed_rate_visitor = combined_data.groupby('Visitor').apply(lambda x: x['Yards'][x['IsRush'] == 1].mean())


Unnamed: 0,Team,AvgPointsScored,AvgPointsAllowed,WinRate,AvgPointsDefended,AvgConcededPlays,AvgForcedTurnovers,AvgYardsPerPlay,AvgYardsPerGame,AvgPassCompletionRate,AvgTouchdownsPerGame,AvgRushSuccessRate,AvgYardsAllowedPerPlay,AvgYardsAllowedPerGame,AvgPassCompletionAllowedRate,AvgTouchdownsAllowedPerGame,AvgRushSuccessAllowedRate
0,ARI,20.043893,26.676782,0.236653,26.676782,-1.016297,0.018085,3.858547,7.700217,0.889645,0.068933,4.988382,3.858547,7.700217,0.889645,0.068933,4.988382
1,ATL,18.830107,22.0572,0.427999,22.0572,-1.021492,0.022497,3.708785,7.679778,0.875529,0.04523,4.439918,3.708785,7.679778,0.875529,0.04523,4.439918
2,BAL,27.822132,17.815528,0.707196,17.815528,-1.019121,0.019828,3.767911,7.74654,0.869581,0.055342,5.046116,3.767911,7.74654,0.869581,0.055342,5.046116
3,BUF,27.085107,19.134047,0.654438,19.134047,-1.022799,0.023853,3.833218,7.565109,0.880223,0.065497,4.773855,3.833218,7.565109,0.880223,0.065497,4.773855
4,CAR,15.144543,26.036992,0.141088,26.036992,-1.014536,0.016916,3.381419,7.010071,0.867508,0.056112,4.497754,3.381419,7.010071,0.867508,0.056112,4.497754
5,CHI,21.152042,21.566829,0.427805,21.566829,-1.022251,0.02408,3.580899,6.773192,0.874842,0.047307,4.446631,3.580899,6.773192,0.874842,0.047307,4.446631
6,CIN,22.617413,23.436219,0.49007,23.436219,-1.018905,0.019938,3.9613,7.898447,0.87914,0.061059,4.587665,3.9613,7.898447,0.87914,0.061059,4.587665
7,CLE,22.367918,21.998754,0.582666,21.998754,-1.023061,0.025288,3.342192,6.602654,0.853491,0.050658,4.430184,3.342192,6.602654,0.853491,0.050658,4.430184
8,DAL,29.183292,20.093965,0.667003,20.093965,-1.016859,0.018914,3.822842,7.657624,0.869448,0.060465,4.472908,3.822842,7.657624,0.869448,0.060465,4.472908
9,DEN,20.093305,22.408735,0.473911,22.408735,-1.021559,0.022574,3.577199,6.892332,0.876846,0.042292,4.633767,3.577199,6.892332,0.876846,0.042292,4.633767


In [335]:
upcoming_encoded_home = upcoming_games.merge(all_team_features, left_on='Home', right_on='Team', how='left')
upcoming_encoded_both = upcoming_encoded_home.merge(all_team_features, left_on='Visitor', right_on='Team', suffixes=('_Home', '_Visitor'), how='left')

upcoming_encoded_both

Unnamed: 0,Home,Visitor,Team_Home,AvgPointsScored_Home,AvgPointsAllowed_Home,WinRate_Home,AvgPointsDefended_Home,AvgConcededPlays_Home,AvgForcedTurnovers_Home,AvgYardsPerPlay_Home,...,AvgYardsPerPlay_Visitor,AvgYardsPerGame_Visitor,AvgPassCompletionRate_Visitor,AvgTouchdownsPerGame_Visitor,AvgRushSuccessRate_Visitor,AvgYardsAllowedPerPlay_Visitor,AvgYardsAllowedPerGame_Visitor,AvgPassCompletionAllowedRate_Visitor,AvgTouchdownsAllowedPerGame_Visitor,AvgRushSuccessAllowedRate_Visitor
0,SEA,SF,SEA,22.615646,23.400449,0.577938,23.400449,-1.01552,0.016258,3.785555,...,4.133866,8.289819,0.88135,0.061131,4.728345,4.133866,8.289819,0.88135,0.061131,4.728345
1,CHI,JAX,CHI,21.152042,21.566829,0.427805,21.566829,-1.022251,0.02408,3.580899,...,3.804866,7.502013,0.865761,0.048766,4.338487,3.804866,7.502013,0.865761,0.048766,4.338487
2,GB,ARI,GB,23.162211,21.157925,0.51438,21.157925,-1.018225,0.019218,3.943184,...,3.858547,7.700217,0.889645,0.068933,4.988382,3.858547,7.700217,0.889645,0.068933,4.988382
3,TEN,IND,TEN,18.43128,21.896381,0.339613,21.896381,-1.01552,0.016834,3.409721,...,3.748662,8.017865,0.877172,0.051864,4.494286,3.748662,8.017865,0.877172,0.051864,4.494286
4,NO,TB,NO,25.19105,19.013552,0.519013,19.013552,-1.017486,0.018972,3.783593,...,3.725924,7.445483,0.866514,0.049194,4.109102,3.725924,7.445483,0.866514,0.049194,4.109102
5,BAL,WAS,BAL,27.822132,17.815528,0.707196,17.815528,-1.019121,0.019828,3.767911,...,3.906967,8.000261,0.875925,0.064657,4.760739,3.906967,8.000261,0.875925,0.064657,4.760739
6,NE,HOU,NE,13.889417,21.636883,0.240836,21.636883,-1.021356,0.02314,3.458305,...,3.714165,7.351366,0.870351,0.052383,3.924443,3.714165,7.351366,0.870351,0.052383,3.924443
7,PHI,CLE,PHI,25.171363,25.262223,0.633474,25.262223,-1.019601,0.021588,3.972648,...,3.342192,6.602654,0.853491,0.050658,4.430184,3.342192,6.602654,0.853491,0.050658,4.430184
8,LV,PIT,LV,19.585844,20.579419,0.475201,20.579419,-1.018283,0.020667,3.669956,...,3.706018,7.149707,0.873755,0.038557,4.408594,3.706018,7.149707,0.873755,0.038557,4.408594
9,DEN,LAC,DEN,20.093305,22.408735,0.473911,22.408735,-1.021559,0.022574,3.577199,...,3.70295,7.143968,0.868254,0.046232,4.320652,3.70295,7.143968,0.868254,0.046232,4.320652


In [336]:
# Calculate the difference in features as this might be a more predictive representation
for col in ['AvgPointsScored', 'AvgPointsAllowed', 'WinRate', 'AvgPointsDefended', 'AvgConcededPlays', 'AvgForcedTurnovers',
            'AvgYardsPerPlay', 'AvgYardsPerGame', 'AvgPassCompletionRate', 'AvgTouchdownsPerGame', 'AvgRushSuccessRate',
            'AvgYardsAllowedPerPlay', 'AvgYardsAllowedPerGame', 'AvgPassCompletionAllowedRate', 'AvgTouchdownsAllowedPerGame', 'AvgRushSuccessAllowedRate']:
    upcoming_encoded_both[f'Diff_{col}'] = upcoming_encoded_both[f'{col}_Home'] - upcoming_encoded_both[f'{col}_Visitor']

# Selecting only the difference columns and the teams for clarity
upcoming_encoded_final = upcoming_encoded_both[['Home', 'Visitor'] + [col for col in upcoming_encoded_both.columns if 'Diff_' in col]]

upcoming_encoded_final

Unnamed: 0,Home,Visitor,Diff_AvgPointsScored,Diff_AvgPointsAllowed,Diff_WinRate,Diff_AvgPointsDefended,Diff_AvgConcededPlays,Diff_AvgForcedTurnovers,Diff_AvgYardsPerPlay,Diff_AvgYardsPerGame,Diff_AvgPassCompletionRate,Diff_AvgTouchdownsPerGame,Diff_AvgRushSuccessRate,Diff_AvgYardsAllowedPerPlay,Diff_AvgYardsAllowedPerGame,Diff_AvgPassCompletionAllowedRate,Diff_AvgTouchdownsAllowedPerGame,Diff_AvgRushSuccessAllowedRate
0,SEA,SF,-5.633538,5.170409,-0.084801,5.170409,0.007019,-0.007547,-0.348311,-0.880561,-0.009992,-0.005043,-0.03683,-0.348311,-0.880561,-0.009992,-0.005043,-0.03683
1,CHI,JAX,0.182977,-1.47819,0.004076,-1.47819,-0.00362,0.004176,-0.223967,-0.728821,0.00908,-0.001458,0.108144,-0.223967,-0.728821,0.00908,-0.001458,0.108144
2,GB,ARI,3.118318,-5.518858,0.277727,-5.518858,-0.001927,0.001133,0.084637,0.51193,-0.016445,-0.010354,-0.252185,0.084637,0.51193,-0.016445,-0.010354,-0.252185
3,TEN,IND,-4.355579,-2.050342,-0.184673,-2.050342,0.005583,-0.006472,-0.338942,-1.439818,-0.005,-0.007432,-0.2823,-0.338942,-1.439818,-0.005,-0.007432,-0.2823
4,NO,TB,3.737397,-0.368368,-0.051789,-0.368368,0.004671,-0.004205,0.057668,0.419667,0.003294,0.008628,0.374185,0.057668,0.419667,0.003294,0.008628,0.374185
5,BAL,WAS,6.797412,-12.003324,0.375291,-12.003324,-0.002233,0.001282,-0.139056,-0.253721,-0.006344,-0.009315,0.285376,-0.139056,-0.253721,-0.006344,-0.009315,0.285376
6,NE,HOU,-7.731331,-0.022647,-0.370814,-0.022647,-0.004247,0.005295,-0.25586,-0.406973,0.003068,-0.009348,0.074931,-0.25586,-0.406973,0.003068,-0.009348,0.074931
7,PHI,CLE,2.803445,3.263468,0.050808,3.263468,0.003461,-0.0037,0.630456,1.476566,0.016077,0.013668,0.220527,0.630456,1.476566,0.016077,0.013668,0.220527
8,LV,PIT,1.577468,2.489554,-0.137802,2.489554,0.003153,-0.001553,-0.036061,0.159361,0.003969,0.013835,0.007449,-0.036061,0.159361,0.003969,0.013835,0.007449
9,DEN,LAC,0.203266,0.709581,0.14539,0.709581,-0.00212,0.001878,-0.125751,-0.251636,0.008592,-0.00394,0.313115,-0.125751,-0.251636,0.008592,-0.00394,0.313115


In [337]:
training_encoded_home = combined_data.merge(all_team_features, left_on='Home', right_on='Team', how='left')
training_encoded_both = training_encoded_home.merge(all_team_features, left_on='Visitor', right_on='Team', suffixes=('_Home', '_Visitor'), how='left')

for col in ['AvgPointsScored', 'AvgPointsAllowed', 'WinRate', 'AvgPointsDefended', 'AvgConcededPlays', 'AvgForcedTurnovers',
            'AvgYardsPerPlay', 'AvgYardsPerGame', 'AvgPassCompletionRate', 'AvgTouchdownsPerGame', 'AvgRushSuccessRate',
            'AvgYardsAllowedPerPlay', 'AvgYardsAllowedPerGame', 'AvgPassCompletionAllowedRate', 'AvgTouchdownsAllowedPerGame', 'AvgRushSuccessAllowedRate']:
    training_encoded_both[f'Diff_{col}'] = training_encoded_both[f'{col}_Home'] - training_encoded_both[f'{col}_Visitor']

training_data = training_encoded_both[[col for col in training_encoded_both.columns if 'Diff_' in col]]
training_labels = training_encoded_both['HomeWon']

In [338]:
training_data.head()

Unnamed: 0,Diff_AvgPointsScored,Diff_AvgPointsAllowed,Diff_WinRate,Diff_AvgPointsDefended,Diff_AvgConcededPlays,Diff_AvgForcedTurnovers,Diff_AvgYardsPerPlay,Diff_AvgYardsPerGame,Diff_AvgPassCompletionRate,Diff_AvgTouchdownsPerGame,Diff_AvgRushSuccessRate,Diff_AvgYardsAllowedPerPlay,Diff_AvgYardsAllowedPerGame,Diff_AvgPassCompletionAllowedRate,Diff_AvgTouchdownsAllowedPerGame,Diff_AvgRushSuccessAllowedRate
0,-0.025533,-1.902612,-0.011512,-1.902612,-0.00312,0.00338,0.037674,0.357764,0.011908,-0.000368,0.238675,0.037674,0.357764,0.011908,-0.000368,0.238675
1,-0.025533,-1.902612,-0.011512,-1.902612,-0.00312,0.00338,0.037674,0.357764,0.011908,-0.000368,0.238675,0.037674,0.357764,0.011908,-0.000368,0.238675
2,-0.025533,-1.902612,-0.011512,-1.902612,-0.00312,0.00338,0.037674,0.357764,0.011908,-0.000368,0.238675,0.037674,0.357764,0.011908,-0.000368,0.238675
3,-0.025533,-1.902612,-0.011512,-1.902612,-0.00312,0.00338,0.037674,0.357764,0.011908,-0.000368,0.238675,0.037674,0.357764,0.011908,-0.000368,0.238675
4,-0.025533,-1.902612,-0.011512,-1.902612,-0.00312,0.00338,0.037674,0.357764,0.011908,-0.000368,0.238675,0.037674,0.357764,0.011908,-0.000368,0.238675


In [339]:
training_data.shape

(63138, 16)

In [340]:
training_labels.head()

Unnamed: 0,HomeWon
0,True
1,True
2,True
3,True
4,True


# Training the Model

In [341]:
logreg = LogisticRegression(max_iter=1000)
cross_val_scores = cross_val_score(logreg, training_data, training_labels, cv=10)
cross_val_scores_mean = cross_val_scores.mean()

cross_val_scores_mean

0.6721942609437634

In [342]:
logreg.fit(training_data, training_labels)

# Testing the Model

In [343]:
upcoming_games_probabilities = logreg.predict_proba(upcoming_encoded_final[[col for col in upcoming_encoded_final.columns if 'Diff' in col]])

upcoming_games_probabilities

array([[0.55237287, 0.44762713],
       [0.44624411, 0.55375589],
       [0.201951  , 0.798049  ],
       [0.63823119, 0.36176881],
       [0.50797556, 0.49202444],
       [0.13638915, 0.86361085],
       [0.801765  , 0.198235  ],
       [0.38984825, 0.61015175],
       [0.60822448, 0.39177552],
       [0.30814719, 0.69185281],
       [0.51087242, 0.48912758],
       [0.74591017, 0.25408983],
       [0.61423794, 0.38576206],
       [0.69826973, 0.30173027]])

In [344]:
upcoming_game_home_win_probabilities = upcoming_games_probabilities[:, 1]
upcoming_encoded_final['HomeWinProbability'] = upcoming_game_home_win_probabilities
upcoming_predictions = upcoming_encoded_final[['Home', 'Visitor', 'HomeWinProbability']].sort_values(by='HomeWinProbability', ascending=False)

upcoming_predictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  upcoming_encoded_final['HomeWinProbability'] = upcoming_game_home_win_probabilities


Unnamed: 0,Home,Visitor,HomeWinProbability
5,BAL,WAS,0.863611
2,GB,ARI,0.798049
9,DEN,LAC,0.691853
7,PHI,CLE,0.610152
1,CHI,JAX,0.553756
4,NO,TB,0.492024
10,DAL,DET,0.489128
0,SEA,SF,0.447627
8,LV,PIT,0.391776
12,NYG,CIN,0.385762


Using 2023 and 2024 DataSet

*   Model Performance w/ Cross-Validation: 67%
*   Ratio of Predicted vs Actual Results - 13:14 (92.86%)

Using Only 2024 DataSet

*   Model Performance w/ Cross-Validation: 88%
*   Ratio of Predicted vs Actual Results - 11:14 (78.57%)



