In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neural_network import MLPClassifier

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
df = pd.read_csv('NFL_pbp_2009-2019.csv', low_memory=False)

threshold = 100000
df['field_goal_result'].fillna('none', inplace=True)
dfV2 = df.loc[:, df.isnull().sum() < threshold]
missing_values = dfV2.isnull().sum()

statistical_cols = ['play_id', 'game_id', 'home_team', 'away_team', 'posteam', 
                    'defteam', 'side_of_field', 'yardline_100', 'half_seconds_remaining', 
                    'game_seconds_remaining', 'game_half', 'drive', 'qtr', 'down', 'goal_to_go', 'time', 
                    'yrdln', 'ydstogo', 'ydsnet', 'desc', 'play_type', 'yards_gained', 'home_timeouts_remaining', 
                    'away_timeouts_remaining', 'total_home_score',  'total_away_score', 'score_differential', 'home_wp', 'away_wp', 'ep']

game_dynamics_cols = [
    'punt_blocked', 'first_down_rush', 'first_down_pass', 'first_down_penalty', 'third_down_converted',
    'third_down_failed', 'fourth_down_converted', 'fourth_down_failed', 'incomplete_pass', 'interception',
    'fumble_forced', 'fumble_not_forced', 'fumble_out_of_bounds', 'solo_tackle', 'safety', 'penalty',
    'tackled_for_loss', 'fumble_lost', 'own_kickoff_recovery', 'own_kickoff_recovery_td', 'qb_hit',
    'rush_attempt', 'pass_attempt', 'sack', 'touchdown', 'pass_touchdown', 'rush_touchdown', 'field_goal_result',
    'return_touchdown', 'extra_point_attempt', 'two_point_attempt', 'field_goal_attempt', 'kickoff_attempt',
    'punt_attempt', 'fumble', 'complete_pass', 'shotgun', 'no_huddle', 'punt_inside_twenty', 'kickoff_inside_twenty']

columns_to_keep = statistical_cols + game_dynamics_cols
dfV3 = dfV2[columns_to_keep]

dfV4 = dfV3.drop(['play_id', 'game_seconds_remaining', 'fumble_forced'], axis=1)
dfV4 = dfV4.dropna(subset=['down', 'defteam', 'posteam'])
dfV4 = dfV4.reset_index(drop=True)

# Indicators for if within last 2 minutes of the half and the whole game
dfV4['close_to_end_of_half'] = (dfV4['half_seconds_remaining'] <= 120).astype(int)
dfV4['close_to_end_of_game'] = ((dfV4['half_seconds_remaining'] <= 120) & (dfV4['game_half'] == 'Half2')).astype(int)

# Indicator for if the touchdown was for the away or home team
dfV4['home_td'] = ((dfV4['touchdown'] == 1) & (dfV4['posteam'] != dfV4['away_team'])).astype(int)
dfV4['away_td'] = ((dfV4['touchdown'] == 1) & (dfV4['posteam'] != dfV4['home_team'])).astype(int)

# Trackers for the difference in both teams' win probability after each play
dfV4['home_wp_change'] = dfV4['home_wp'].diff().fillna(0)
dfV4['away_wp_change'] = dfV4['away_wp'].diff().fillna(0)

# Indicator for turnover
dfV4['turnover'] = (
    (dfV4['safety'] == 1) |
    (dfV4['interception'] == 1) |
    (dfV4['fumble_lost'] == 1) |
    ((dfV4['fourth_down_converted'] == 0) & (dfV4['down'] == 4))
).astype(int)

# Drive time - Added drive ended indicator to help - Manually resets after end of game, half, and change of possession
dfV4['drive_ended'] = (
    (dfV4['posteam'] != dfV4['posteam'].shift(1)) |  
    (dfV4['game_id'] != dfV4['game_id'].shift(1)) |  
    dfV4['desc'].str.contains('END GAME', na=False) |  
    dfV4['desc'].str.contains('END QUARTER', na=False)  
).astype(int)
dfV4['drive'] = (
    (dfV4['posteam'].ne(dfV4['posteam'].shift())) |
    (dfV4['game_id'].ne(dfV4['game_id'].shift()))
).cumsum()
dfV4['drive_time_seconds'] = (
    dfV4.groupby(['game_id', 'drive'])['half_seconds_remaining']
    .transform('first') - dfV4['half_seconds_remaining']
)
dfV4['drive_time_seconds'] = dfV4.apply(
    lambda row: 0 if row['drive_ended'] == 1 else row['drive_time_seconds'], axis=1
)
dfV4['drive_time_seconds'] = dfV4.groupby(['game_id', 'drive'])['drive_time_seconds'].cumsum()

# Indicator for long touchdowns
dfV4['long_td'] = ((dfV4['touchdown'] == 1) & (dfV4['yards_gained'] >= 50)).astype(int)

# Trackers for score differentials and lead changes
dfV4['home_score_differential'] = dfV4['total_home_score'] - dfV4['total_away_score']
dfV4['away_score_differential'] = -dfV4['home_score_differential']
dfV4['lead_change'] = ((dfV4['home_score_differential'].diff() < 0) &
                       (dfV4['home_score_differential'].shift() * dfV4['home_score_differential'] < 0)).astype(int)

# Combining first down indicators
dfV4['first_down'] = ((dfV4['first_down_pass'] == 1) | (dfV4['first_down_rush'] == 1) | (dfV4['first_down_penalty'] == 1)).astype(int)

# Indicators for scoring drives - Removing
dfV4['home_scoring_drive'] = (
    (dfV4['home_td'] == 1) 
).astype(int)
dfV4['away_scoring_drive'] = (
    (dfV4['away_td'] == 1) 
).astype(int)

# Helper for consecutive scoring events - Remove Later!!!!!!!!!!!!!!
dfV4['home_scoring_events'] = (
    (dfV4['posteam'] != dfV4['away_team']) & 
    ((dfV4['home_td'] == 1) | (dfV4['field_goal_result'] == 'made'))
).astype(int)
dfV4['away_scoring_events'] = (
    (dfV4['posteam'] != dfV4['home_team']) & 
    ((dfV4['away_td'] == 1) | (dfV4['field_goal_result'] == 'made'))
).astype(int)

# Consecutive Scoring Events + Helper function 
def calc_consecutive_cumsum_with_game_reset(series, reset_series, game_ids):
    cumsum = 0
    consecutive = []
    prev_game_id = None  
    
    for i in range(len(series)):
        if game_ids[i] != prev_game_id:
            cumsum = 0 
        if reset_series[i] == 1:  
            cumsum = 0
        if series[i] == 1:  
            cumsum += 1
        consecutive.append(cumsum)
        prev_game_id = game_ids[i]  
    return consecutive

dfV4['home_csum_scores'] = calc_consecutive_cumsum_with_game_reset(
    dfV4['home_scoring_events'], dfV4['away_scoring_events'], dfV4['game_id']
)
dfV4['away_csum_scores'] = calc_consecutive_cumsum_with_game_reset(
    dfV4['away_scoring_events'], dfV4['home_scoring_events'], dfV4['game_id']
)

#Consecutive defensive stops
dfV4['home_def_stop'] = (
    (dfV4['posteam'] != dfV4['home_team']) &  ((dfV4['punt_attempt'] == 1) |  (dfV4['turnover'] == 1)) & 
    ~dfV4['field_goal_result'].isin(['made'])  
).astype(int)
dfV4['away_def_stop'] = (
    (dfV4['posteam'] != dfV4['away_team']) & ((dfV4['punt_attempt'] == 1) |  (dfV4['turnover'] == 1)) & 
    ~dfV4['field_goal_result'].isin(['made'])
).astype(int)

def calc_consecutive_defensive_stops_with_game_reset(series, reset_series, game_ids):
    cumsum = 0
    consecutive = []
    prev_game_id = None  
    for i in range(len(series)):
        if game_ids[i] != prev_game_id:
            cumsum = 0
        if reset_series[i] == 1:
            cumsum = 0
        if series[i] == 1:
            cumsum += 1
        consecutive.append(cumsum)
        prev_game_id = game_ids[i]  
    return consecutive

dfV4['home_csum_def_stops'] = calc_consecutive_defensive_stops_with_game_reset(
    dfV4['home_def_stop'], dfV4['away_scoring_events'], dfV4['game_id']
)
dfV4['away_csum_def_stops'] = calc_consecutive_defensive_stops_with_game_reset(
    dfV4['away_def_stop'], dfV4['home_scoring_events'], dfV4['game_id']
)

# Home/Away Drive Numbers
dfV4['away_drive_number'] = (
    dfV4.loc[dfV4['posteam'] != dfV4['home_team']]
    .groupby('game_id')['drive_ended'].cumsum()
)
dfV4['home_drive_number'] = (
    dfV4.loc[dfV4['posteam'] == dfV4['home_team']]
    .groupby('game_id')['drive_ended'].cumsum()
)

# Offense needs to score
dfV4['off_need_score'] = (
    (dfV4['down'].isin([3, 4])) & 
    (abs(dfV4['score_differential']) <= 8) & 
    (dfV4['qtr'] >= 4) &
    (dfV4['first_down'] == 1)
).astype(int)

# Defense Needs a Stop
dfV4['def_need_stop'] = (
    (dfV4['down'].isin([3, 4])) & 
    (abs(dfV4['score_differential']) <= 8) & 
    (dfV4['qtr'] >= 4) &
    (dfV4['turnover'] == 1)
).astype(int)

# Drought Ending score
dfV4['drought_end_play'] = (
    ((dfV4['away_csum_scores'].shift(1) >= 2) & (dfV4['away_csum_scores'] == 0) & (dfV4['home_scoring_events'] == 1)) |
    ((dfV4['home_csum_scores'].shift(1) >= 2) & (dfV4['home_csum_scores'] == 0) & (dfV4['away_scoring_events'] == 1))
).astype(int)

# Defensive touchdown
dfV4['def_td'] = (
    ((dfV4['fumble'] == 1) & (dfV4['return_touchdown'] == 1)) |
    ((dfV4['interception'] == 1) & (dfV4['return_touchdown'] == 1))
).astype(int)

# Defensive touchdown
dfV4['off_td'] = (
    (dfV4['pass_touchdown'] == 1) | (dfV4['rush_touchdown'] == 1)
).astype(int)

# Special Teams touchdown
dfV4['st_return_td'] = (
    ((dfV4['kickoff_attempt'] == 1) & (dfV4['return_touchdown'] == 1)) | 
    ((dfV4['punt_attempt'] == 1) & (dfV4['return_touchdown'] == 1))  
).astype(int)

# Big special teams play...punt blocked, field goal blocked, return_touchdown, kick recovery, pin team near endzone
dfV4['big_st_play'] = (
    (dfV4['punt_blocked'] == 1) | 
    (dfV4['field_goal_result'] == 'blocked') | 
    (dfV4['own_kickoff_recovery'] == 1) | 
    (dfV4['st_return_td'] == 1) | 
    (dfV4['kickoff_inside_twenty'] == 1) | 
    (dfV4['punt_inside_twenty'] == 1)
).astype(int)

# Scoring type differentiatior, touchdowns should hold more weight than a field goal, other types may hold more weight also
dfV4['scoring_type'] = np.select(
    [
        dfV4['field_goal_result'] == 'made',
        dfV4['off_td'] == 1,
        dfV4['def_td'] == 1,
        dfV4['st_return_td'] == 1,
    ],
    ['fg', 'off_td', 'def_td', 'st_td'],
    default='none'
)

# Indicator for big offensive play
dfV4['big_offensive_play'] = (
        (dfV4['yards_gained'] >= 40) |
        (dfV4['long_td'] == 1) |
        ((dfV4['off_need_score'] == 1) & (dfV4['off_td'] == 1))
).astype(int)

# Indicator for big defensive play
dfV4['big_defensive_play'] = (
    (dfV4['sack'] == 1) |
    (dfV4['tackled_for_loss'] == 1) |
    ((dfV4['def_need_stop'] == 1) & ((dfV4['def_td'] == 'def_td')) | dfV4['turnover'] == 1) |
    (dfV4['scoring_type'] == 'def_td')
).astype(int)

#Quick Score and Quick Stop #### Needs fixing, only want 1 on last play of drive when they score or get stop, right now 1 for whole drive
dfV4['total_drive_time'] = dfV4.groupby('drive')['drive_time_seconds'].transform('last') 
dfV4['cumulative_drive_time'] = dfV4.groupby(['game_id', 'drive'])['drive_time_seconds'].cumsum()
dfV4['long_drive_triggered'] = (
    dfV4.groupby(['game_id', 'drive'])['cumulative_drive_time']
    .transform(lambda x: (x > 360).idxmax() == x.index)  # Flags the first row that exceeds 360s
).astype(int)

dfV4['quick_score'] = (
    (dfV4['drive_time_seconds'] < 180) &
    ((dfV4['touchdown'] == 1) | (dfV4['field_goal_result'] == 'made')) &
    (dfV4.groupby('drive')['drive_time_seconds'].transform('last') == dfV4['drive_time_seconds'])
).astype(int)
dfV4['quick_stop'] = (
    (dfV4['total_drive_time'] < 180) & 
    (dfV4['scoring_type'] == 'none') &
    (dfV4.groupby('drive')['drive_time_seconds'].transform('last') == dfV4['drive_time_seconds'])
).astype(int)

# Consecutive first downs
dfV4['home_csum_first_downs'] = 0
dfV4['away_csum_first_downs'] = 0
dfV4['home_csum_first_downs'] = (
    dfV4.groupby(['home_team', 'away_team', 'home_drive_number'])['first_down']
    .cumsum()
    .where(dfV4['posteam'] != 'away_team', 0)
)
dfV4['away_csum_first_downs'] = (
    dfV4.groupby(['home_team', 'away_team', 'away_drive_number'])['first_down']
    .cumsum()
    .where(dfV4['posteam'] != 'home_team', 0)
)


columns_to_remove = [
    'ep', 'punt_blocked', 'first_down_rush', 'first_down_pass', 
    'third_down_converted', 'third_down_failed', 'fourth_down_converted', 
    'fourth_down_failed', 'incomplete_pass', 'interception', 'fumble_not_forced', 
    'fumble_out_of_bounds', 'solo_tackle', 'safety', 'penalty', 'tackled_for_loss', 
    'fumble_lost', 'own_kickoff_recovery', 'own_kickoff_recovery_td', 'qb_hit', 
    'rush_attempt', 'pass_attempt', 'sack', 'extra_point_attempt', 'two_point_attempt', 
    'field_goal_attempt', 'kickoff_attempt', 'punt_attempt', 'fumble', 'pass_touchdown', 'rush_touchdown'
    'complete_pass', 'shotgun', 'home_scoring_drive', 'away_scoring_drive','home_scoring_events','away_scoring_events',
    'rush_touchdown', 'field_goal_result', 'return_touchdown', 'complete_pass', 'no_huddle', 'punt_inside_twenty', 'kickoff_inside_twenty',
    'time', 'yrdln', 'ydstogo', 'ydsnet', 'desc', 'side_of_field', 'yardline_100', 'desc', 'drive', 'game_half', 'drive_ended', 'drive_time_seconds',
    'touchdown', 'score_differential', 'total_drive_time'
]

dfV5 = dfV4.drop(columns=columns_to_remove, errors='ignore')

dynamics = [
    ('big_offensive_play', dfV5['big_offensive_play'] == 1),
    ('big_defensive_play', dfV5['big_defensive_play'] == 1),
    ('off_td', dfV5['off_td'] == 1),
    ('def_td', dfV5['def_td'] == 1),
    ('big_st_play', dfV5['big_st_play'] == 1),
    ('st_return_td', dfV5['st_return_td'] == 1),
    ('off_need_score', dfV5['off_need_score'] == 1),
    ('def_need_stop', dfV5['def_need_stop'] == 1),
    ('drought_end_play', dfV5['drought_end_play'] == 1),
    ('home_csum_scores', dfV5['home_csum_scores'] >= 2),
    ('away_csum_scores', dfV5['away_csum_scores'] >= 2),
    ('home_csum_def_stops', dfV5['home_csum_def_stops'] >= 2),
    ('away_csum_def_stops', dfV5['away_csum_def_stops'] >= 2),
    ('home_csum_first_downs', dfV5['home_csum_first_downs'] >= 2),
    ('away_csum_first_downs', dfV5['away_csum_first_downs'] >= 2),
    ('long_td', dfV5['long_td'] == 1),
    ('quick_score', dfV5['quick_score'] == 1),
    ('quick_stop', dfV5['quick_stop'] == 1),
    ('home_score_differential', dfV5['home_score_differential'] == 1),
    ('away_score_differential', dfV5['away_score_differential'] == 1),
]


def_wp_change = {
    "big_defensive_play": 0.029471,
    "def_td": 0.016322,
    "big_st_play": 0.034637,
    "st_return_td": 0.040082,
    "def_need_stop": 0.042132,
    "quick_stop": 0.029971
}

off_wp_change = {
    "big_offensive_play": 0.038602,
    "off_td": 0.028432,
    "off_need_score":  0.035536, 
    "drought_end_play": 0.028891,
    "long_td": 0.033325,
    "quick_score": 0.026664
}

streaks_multipliers = {
    "home_csum_scores": 1.118986,
    "away_csum_scores": 1.118986,
    "home_csum_first_downs": 1.1112094,
    "away_csum_first_downs": 1.1112094,
    "home_csum_def_stops": 1.111932,
    "away_csum_def_stops": 1.111932,
}

score_game_multipliers = {
    "tied_or_1_score": 1.06634844,
    "2_score": 1.035777727,
    "3_or_more_score": 1.0274060
}

qtr_multipliers = {
    "first_and_fourth": 1.5522285,
    "second_and_third": 1.3201836
}

home_away_multipliers = {
    "home": 1.07949869,  
    "away": 1.06027507 
}

boost_case_multipliers = {
    "home_and_4th": 1.122276683,  
    "away_and_1st": 1.16675933,  
    "none": 1.0           
}

decay_multipliers = {
    "opponent_scores": 0.68004571,
    "turnover": 0.21742678,
    "opponent_ends_drought": 0.18212307,
    "long_possession":  0.1534018395,
    "none": 0.0  
}



def calculate_multipliers(row, index, category, is_offensive):
    if abs(row['home_score_differential']) <= 8:
        S = score_game_multipliers["tied_or_1_score"]
    elif 9 <= abs(row['home_score_differential']) <= 16:
        S = score_game_multipliers["2_score"]
    else:
        S = score_game_multipliers["3_or_more_score"]

    team = row['posteam'] if is_offensive else row['defteam']
    HA = home_away_multipliers.get(team, 1.0)

    if row['qtr'] == 1 or row['qtr'] == 4:
        Q = qtr_multipliers["first_and_fourth"]
    else:
        Q = qtr_multipliers["second_and_third"]

    if is_offensive:
        if row['posteam'] == 'home' and row['qtr'] == 4:
            B = boost_case_multipliers["home_and_4th"]
        elif row['posteam'] == 'away' and row['qtr'] == 1:
            B = boost_case_multipliers["away_and_1st"]
        else:
            B = 1.0
    else:
        if team == 'home' and row['qtr'] == 4:
            B = boost_case_multipliers["home_and_4th"]
        elif team == 'away' and row['qtr'] == 1:
            B = boost_case_multipliers["away_and_1st"]
        else:
            B = 1.0

    CS = 1.0
   
    if is_offensive:
        if row['posteam'] == row['home_team']:
            if row['home_csum_scores'] >= 2:
                if row['home_csum_scores'] > dfV5.at[index - 1, 'home_csum_scores']: 
                    CS = streaks_multipliers['home_csum_scores']
            elif row['home_csum_first_downs'] >= 4:
                if row['home_csum_first_downs'] > dfV5.at[index - 1, 'home_csum_first_downs']: 
                    CS = streaks_multipliers['home_csum_first_downs']
            else:
                CS = 1.0
        else:
            if row['away_csum_scores'] >= 2:
                if row['away_csum_scores'] > dfV5.at[index - 1, 'away_csum_scores']: 
                    CS = streaks_multipliers['away_csum_scores']
            elif row['away_csum_first_downs'] >= 4:
                if row['away_csum_first_downs'] >  dfV5.at[index - 1, 'away_csum_first_downs']: 
                    CS = streaks_multipliers['away_csum_first_downs']
            else:
                CS = 1.0
    else:
        if row['defteam'] == row['home_team']:
            if row['home_csum_def_stops'] >= 2:
                if row['home_csum_def_stops'] > dfV5.at[index - 1, 'home_csum_def_stops']: 
                    CS = streaks_multipliers['home_csum_def_stops']
            else:
                CS = 1.0
        else:
            if row['away_csum_def_stops'] >= 2:
                if row['away_csum_def_stops'] > dfV5.at[index - 1, 'away_csum_def_stops']:
                    CS = streaks_multipliers['away_csum_def_stops']
            else:
                CS = 1.0

    return S, HA, B, CS, Q



def calculate_momentum_gain(wp_change_value, S, HA, CS, B, Q):
    return wp_change_value * (S * HA * CS * B * Q) * 1000



def calculate_decay(row, category, momentum_gain):
    if category in ['off_td', 'long_td', 'def_td', 'st_return_td']:
        D = decay_multipliers['opponent_scores']
    elif row['turnover'] == 1:
        D = decay_multipliers['turnover']
    elif row['drought_end_play'] == 1:
        D = decay_multipliers["opponent_ends_drought"]
    elif row['long_drive_triggered'] == 1:  
        D = decay_multipliers['long_possession']
    else:
        D = decay_multipliers['none']

    return momentum_gain * D



def update_momentum_scores(dfV5):
    dfV5['Home_Momentum_Score'] = 500
    dfV5['Away_Momentum_Score'] = 500

    dfV5['game_id_diff'] = dfV5['game_id'] != dfV5['game_id'].shift(1)

    for index, row in dfV5.iterrows():
        if index == 0:  
            continue

        if row['game_id_diff']:
            dfV5.at[index, 'Home_Momentum_Score'] = 500
            dfV5.at[index, 'Away_Momentum_Score'] = 500
            continue

        home_momentum_gain = 0
        away_momentum_gain = 0

        for category, wp_change_value in off_wp_change.items():
            if row[category] == 1:
                S, HA, B, CS, Q = calculate_multipliers(row, index, category, True)
                momentum_gain = calculate_momentum_gain(wp_change_value, S, HA, CS, B, Q)
                momentum_loss = calculate_decay(row, category, momentum_gain)

                if row['posteam'] == row['home_team']:
                    home_momentum_gain += momentum_gain
                    away_momentum_gain -= momentum_loss
                else:
                    away_momentum_gain += momentum_gain
                    home_momentum_gain -= momentum_loss

        for category, wp_change_value in def_wp_change.items():
            if row[category] == 1:
                S, HA, B, CS, Q = calculate_multipliers(row, index, category, False)
                momentum_gain = calculate_momentum_gain(wp_change_value, S, HA, CS, B, Q)
                momentum_loss = calculate_decay(row, category, momentum_gain)

                if row['defteam'] == row['home_team']:
                    home_momentum_gain += momentum_gain
                    away_momentum_gain -= momentum_loss
                else:
                    away_momentum_gain += momentum_gain
                    home_momentum_gain -= momentum_loss

        dfV5.at[index, 'Home_Momentum_Score'] = dfV5.at[index - 1, 'Home_Momentum_Score'] + home_momentum_gain
        dfV5.at[index, 'Away_Momentum_Score'] = dfV5.at[index - 1, 'Away_Momentum_Score'] + away_momentum_gain

update_momentum_scores(dfV5)

dfV5['Game_Momentum_Diff'] = 0

historical_max_diff_mean = dfV5.groupby('game_id')['Game_Momentum_Diff'].max().mean()
historical_max_diff_std = dfV5.groupby('game_id')['Game_Momentum_Diff'].max().std()

base_threshold = historical_max_diff_mean + 0.8 * historical_max_diff_std #.7

dfV5['Game_Momentum_Diff'] = abs(dfV5['Home_Momentum_Score'] - dfV5['Away_Momentum_Score'])
dfV5['Dynamic_Threshold'] = None
dfV5['Momentum_Holding_Team'] = None

def detect_momentum_shifts(game_data):
    momentum_holding_team = None
    last_shift_home_momentum = game_data.iloc[0]['Home_Momentum_Score']
    last_shift_away_momentum = game_data.iloc[0]['Away_Momentum_Score']
    max_momentum_diff_so_far = 0

    for i in range(1, len(game_data)): 
        if i < 10:  # Ignore shifts for the first 10 plays
            continue
        home_momentum_diff = game_data.iloc[i]['Home_Momentum_Score'] - last_shift_home_momentum
        away_momentum_diff = game_data.iloc[i]['Away_Momentum_Score'] - last_shift_away_momentum        

        current_momentum_diff = abs(game_data.iloc[i]['Home_Momentum_Score'] - game_data.iloc[i]['Away_Momentum_Score'])
        max_momentum_diff_so_far = max(max_momentum_diff_so_far, current_momentum_diff)
        game_threshold = max(base_threshold, 0.8 * max_momentum_diff_so_far) #.7
        game_data.iloc[i, game_data.columns.get_loc('Dynamic_Threshold')] = game_threshold

        home_momentum_shift = False
        away_momentum_shift = False

        if home_momentum_diff >= game_threshold and away_momentum_diff < game_threshold * 0.8: #.5
            home_momentum_shift = True
        elif away_momentum_diff >= game_threshold and home_momentum_diff < game_threshold * 0.8: #.5
            away_momentum_shift = True

        if home_momentum_shift:
            momentum_holding_team = "Home"
            last_shift_home_momentum = game_data.iloc[i]['Home_Momentum_Score']
            last_shift_away_momentum = game_data.iloc[i]['Away_Momentum_Score']
        elif away_momentum_shift:
            momentum_holding_team = "Away"
            last_shift_home_momentum = game_data.iloc[i]['Home_Momentum_Score']
            last_shift_away_momentum = game_data.iloc[i]['Away_Momentum_Score']

        game_data.iloc[i, game_data.columns.get_loc('Momentum_Holding_Team')] = momentum_holding_team

    return game_data

dfV5 = dfV5.groupby('game_id', group_keys=False).apply(detect_momentum_shifts)

dfV5['Momentum_Shift_Occurred'] = dfV5.groupby('game_id')['Momentum_Holding_Team'].transform(
    lambda x: x.ne(x.shift()) & x.notna()
)

columns_to_fill = ['home_drive_number', 'away_drive_number', 'home_csum_first_downs', 
                    'away_csum_first_downs', 'Dynamic_Threshold', 'yards_gained']
dfV5[columns_to_fill] = dfV5[columns_to_fill].fillna(0)

features = dfV5.drop(['Momentum_Shift_Occurred'], axis=1)  
numeric_df = dfV5.select_dtypes(include=[np.number])
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_df)

pca = PCA(n_components=0.90)
principal_components = pca.fit_transform(scaled_data)

pca_df = pd.DataFrame(principal_components, columns=[f'PC{i+1}' for i in range(pca.n_components_)])
pca_df['Momentum_Shift_Occurred'] = dfV5['Momentum_Shift_Occurred'].values
pca_df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['field_goal_result'].fillna('none', inplace=True)
  dfV5.at[index, 'Home_Momentum_Score'] = dfV5.at[index - 1, 'Home_Momentum_Score'] + home_momentum_gain
  dfV5.at[index, 'Away_Momentum_Score'] = dfV5.at[index - 1, 'Away_Momentum_Score'] + away_momentum_gain
  dfV5 = dfV5.groupby('game_id', group_keys=False).apply(detect_momentum_shifts)
  dfV5[columns_to_fill] = dfV5[columns_to_fill].fillna(0)


Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20,PC21,PC22,PC23,PC24,PC25,PC26,PC27,PC28,PC29,PC30,Momentum_Shift_Occurred
0,-2.502194,-3.218508,-0.044483,-0.692268,0.207728,0.61773,-0.09579,-0.244981,0.253502,-0.905387,-1.129721,0.761692,-0.316661,0.775689,0.529774,0.099472,2.245678,0.057101,-0.596545,0.007475,0.00678,0.106489,0.182943,0.267666,-0.104805,0.543723,-0.244041,-0.033606,0.612715,0.095361,False
1,-2.428102,-3.246133,0.211228,-0.427948,0.206565,0.619298,0.392401,-0.273335,-0.108111,-0.083365,-0.97867,0.892659,-0.029448,-0.045745,0.487501,0.066728,-0.285052,0.317987,-0.058645,-0.122947,0.307805,-0.098938,0.218983,0.119934,-0.352395,0.675005,-0.245362,-0.029322,0.568054,0.084004,False
2,-2.196172,-3.057688,1.820782,0.013693,0.171288,0.602331,0.51045,-0.78514,-0.151083,0.007881,-0.968615,0.976115,-0.060222,-0.286907,0.489112,-0.026622,-0.186035,0.236064,0.102568,-0.559631,1.016895,0.141309,0.30203,0.146483,-0.556321,0.652954,-0.21955,-0.090924,-0.206692,0.003514,False
3,-1.521493,-2.918547,4.443888,1.935242,0.949298,0.537175,2.031028,0.302749,0.057638,0.453378,-0.444415,1.154558,-0.003316,-0.032604,0.331423,-0.483051,-0.276693,-0.191189,-0.417322,-2.645951,-0.81922,0.260691,-0.118316,-0.071153,-0.277915,0.423098,-0.209122,-0.245814,-1.100681,-0.606992,False
4,-2.170653,-3.158175,-0.832833,-0.398895,-0.329927,0.098929,-1.306571,0.874564,-1.304541,3.139035,0.899681,0.905133,-2.938221,0.742369,0.451527,-0.209335,1.523188,-0.790911,3.285815,-1.212305,0.865374,-1.4742,0.939521,-0.462161,-0.1728,1.062859,0.305143,-0.017631,1.443531,-0.090435,False


In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer

df = dfV5.copy()

excluded_cols = ['game_id', 'Momentum_Shift_Occurred', 'Momentum_Holding_Team']
target_col = 'Momentum_Shift_Occurred'
feature_df = df.drop(columns=excluded_cols, errors='ignore')

numeric_cols = feature_df.select_dtypes(include=['number']).columns
categorical_cols = feature_df.select_dtypes(exclude=['number']).columns
if len(categorical_cols) > 0:
    feature_df = pd.get_dummies(feature_df, columns=categorical_cols, drop_first=True)

unique_games = df['game_id'].unique()
unique_games_sorted = sorted(unique_games)
train_size = int(0.8 * len(unique_games_sorted))
train_games = unique_games_sorted[:train_size]
test_games = unique_games_sorted[train_size:]

feature_df['game_id'] = df['game_id']
train_features = feature_df[feature_df['game_id'].isin(train_games)].drop(columns='game_id')
test_features = feature_df[feature_df['game_id'].isin(test_games)].drop(columns='game_id')
X_train = train_features.values
X_test = test_features.values
y_train = df.loc[df['game_id'].isin(train_games), target_col].astype(int).values
y_test = df.loc[df['game_id'].isin(test_games), target_col].astype(int).values

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hyperparameter Tuning for Class 1 Optimization
f1_scorer = make_scorer(f1_score, pos_label=1)

param_grid = {
    'hidden_layer_sizes': [(32, 16), (64, 32, 16), (128, 64, 32)],
    'alpha': [0.0001, 0.001, 0.01],
    'solver': ['adam'],
    'max_iter': [500, 1000]
}

mlp = MLPClassifier(random_state=42)
grid_search = GridSearchCV(mlp, param_grid, scoring=f1_scorer, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train_scaled, y_train)

best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation F1 Score:", grid_search.best_score_)

# Threshold Tuning After Training
y_test_proba = best_model.predict_proba(X_test_scaled)[:, 1]

best_threshold, best_f1 = 0.5, 0
for t in np.linspace(0.05, 0.5, 20):
    y_pred_t = (y_test_proba >= t).astype(int)
    f1 = f1_score(y_test, y_pred_t)
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = t

print(f"Best threshold: {best_threshold}, Best F1: {best_f1:.4f}")

# Final Evaluation
y_train_pred = (best_model.predict_proba(X_train_scaled)[:, 1] >= best_threshold).astype(int)
y_test_pred = (y_test_proba >= best_threshold).astype(int)

print("\n=== Train Set ===")
print(classification_report(y_train, y_train_pred, zero_division=0))

print("=== Test Set ===")
print(classification_report(y_test, y_test_pred, zero_division=0))


Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best Parameters: {'alpha': 0.001, 'hidden_layer_sizes': (32, 16), 'max_iter': 500, 'solver': 'adam'}
Best Cross-Validation F1 Score: 0.26989819607297105
Best threshold: 0.16842105263157897, Best F1: 0.3224

=== Train Set ===
              precision    recall  f1-score   support

           0       1.00      0.98      0.99    331300
           1       0.46      0.77      0.57      6975

    accuracy                           0.98    338275
   macro avg       0.73      0.87      0.78    338275
weighted avg       0.98      0.98      0.98    338275

=== Test Set ===
              precision    recall  f1-score   support

           0       0.99      0.97      0.98     82061
           1       0.26      0.42      0.32      1726

    accuracy                           0.96     83787
   macro avg       0.62      0.70      0.65     83787
weighted avg       0.97      0.96      0.97     83787



In [None]:
from scipy.ndimage import maximum_filter1d

df = dfV5.copy()

excluded_cols = ['game_id', 'Momentum_Shift_Occurred', 'Momentum_Holding_Team']
target_col = 'Momentum_Shift_Occurred'
feature_df = df.drop(columns=excluded_cols, errors='ignore')

numeric_cols = feature_df.select_dtypes(include=['number']).columns
categorical_cols = feature_df.select_dtypes(exclude=['number']).columns
if len(categorical_cols) > 0:
    feature_df = pd.get_dummies(feature_df, columns=categorical_cols, drop_first=True)

unique_games = df['game_id'].unique()
unique_games_sorted = sorted(unique_games)
train_size = int(0.8 * len(unique_games_sorted))
train_games = unique_games_sorted[:train_size]
test_games = unique_games_sorted[train_size:]

feature_df['game_id'] = df['game_id']
train_features = feature_df[feature_df['game_id'].isin(train_games)].drop(columns='game_id')
test_features = feature_df[feature_df['game_id'].isin(test_games)].drop(columns='game_id')
X_train = train_features.values
X_test = test_features.values
y_train = df.loc[df['game_id'].isin(train_games), target_col].astype(int).values
y_test = df.loc[df['game_id'].isin(test_games), target_col].astype(int).values

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

mlp = MLPClassifier(hidden_layer_sizes=(32, 16), alpha=0.001, solver='adam', max_iter=500, random_state=42)
mlp.fit(X_train_scaled, y_train)

# Predict Probabilities
y_train_proba = mlp.predict_proba(X_train_scaled)[:, 1]
y_test_proba = mlp.predict_proba(X_test_scaled)[:, 1]

# Apply Best Threshold for Predictions
y_train_pred_threshold = (y_train_proba >= 0.16842105263157897).astype(int)
y_test_pred_threshold = (y_test_proba >= 0.16842105263157897).astype(int)

# Apply Temporal Tolerance (6-Play Window)
y_train_tolerant = maximum_filter1d(y_train, size=6, mode='constant', cval=0)
y_train_pred_tolerant = maximum_filter1d(y_train_pred_threshold, size=6, mode='constant', cval=0)
y_test_tolerant = maximum_filter1d(y_test, size=6, mode='constant', cval=0)
y_test_pred_tolerant = maximum_filter1d(y_test_pred_threshold, size=6, mode='constant', cval=0)

# Evaluate with Best Threshold
print("\n=== Train Set with 6-Play Tolerance (Threshold Applied) ===")
print(classification_report(y_train_tolerant, y_train_pred_tolerant, zero_division=0))
print("\n=== Test Set with 6-Play Tolerance (Threshold Applied) ===")
print(classification_report(y_test_tolerant, y_test_pred_tolerant, zero_division=0))

# Evaluate Without Applying Threshold
y_train_pred_no_threshold = (y_train_proba >= 0.5).astype(int)
y_test_pred_no_threshold = (y_test_proba >= 0.5).astype(int)

y_train_pred_no_threshold_tolerant = maximum_filter1d(y_train_pred_no_threshold, size=6, mode='constant', cval=0)
y_test_pred_no_threshold_tolerant = maximum_filter1d(y_test_pred_no_threshold, size=6, mode='constant', cval=0)

print("\n=== Train Set with 6-Play Tolerance (No Threshold Applied) ===")
print(classification_report(y_train_tolerant, y_train_pred_no_threshold_tolerant, zero_division=0))
print("\n=== Test Set with 6-Play Tolerance (No Threshold Applied) ===")
print(classification_report(y_test_tolerant, y_test_pred_no_threshold_tolerant, zero_division=0))



=== Train Set with 6-Play Tolerance (Threshold Applied) ===
              precision    recall  f1-score   support

           0       0.97      0.91      0.94    298549
           1       0.55      0.78      0.65     39726

    accuracy                           0.90    338275
   macro avg       0.76      0.85      0.79    338275
weighted avg       0.92      0.90      0.91    338275


=== Test Set with 6-Play Tolerance (Threshold Applied) ===
              precision    recall  f1-score   support

           0       0.93      0.89      0.91     73947
           1       0.38      0.52      0.44      9840

    accuracy                           0.85     83787
   macro avg       0.66      0.71      0.68     83787
weighted avg       0.87      0.85      0.86     83787


=== Train Set with 6-Play Tolerance (No Threshold Applied) ===
              precision    recall  f1-score   support

           0       0.93      0.99      0.96    298549
           1       0.85      0.44      0.58     3972

In [7]:
from imblearn.ensemble import BalancedBaggingClassifier

balanced_mlp = BalancedBaggingClassifier(
    estimator=mlp,
    n_estimators=10,  # Number of models to ensemble
    sampling_strategy=0.8,  # Adjust how much balancing occurs
    random_state=42,
    n_jobs=-1
)

balanced_mlp.fit(X_train_scaled, y_train)

y_train_proba = balanced_mlp.predict_proba(X_train_scaled)[:, 1]
y_test_proba = balanced_mlp.predict_proba(X_test_scaled)[:, 1]

# Apply Best Threshold for Predictions
y_train_pred_threshold = (y_train_proba >= 0.16842105263157897).astype(int)
y_test_pred_threshold = (y_test_proba >= 0.16842105263157897).astype(int)

# Apply Temporal Tolerance (6-Play Window)
y_train_tolerant = maximum_filter1d(y_train, size=6, mode='constant', cval=0)
y_train_pred_tolerant = maximum_filter1d(y_train_pred_threshold, size=6, mode='constant', cval=0)
y_test_tolerant = maximum_filter1d(y_test, size=6, mode='constant', cval=0)
y_test_pred_tolerant = maximum_filter1d(y_test_pred_threshold, size=6, mode='constant', cval=0)

# Evaluate with Best Threshold
print("\n=== Train Set with 6-Play Tolerance (Threshold Applied) ===")
print(classification_report(y_train_tolerant, y_train_pred_tolerant, zero_division=0))
print("\n=== Test Set with 6-Play Tolerance (Threshold Applied) ===")
print(classification_report(y_test_tolerant, y_test_pred_tolerant, zero_division=0))

# Evaluate Without Applying Threshold
y_train_pred_no_threshold = (y_train_proba >= 0.5).astype(int)
y_test_pred_no_threshold = (y_test_proba >= 0.5).astype(int)

y_train_pred_no_threshold_tolerant = maximum_filter1d(y_train_pred_no_threshold, size=6, mode='constant', cval=0)
y_test_pred_no_threshold_tolerant = maximum_filter1d(y_test_pred_no_threshold, size=6, mode='constant', cval=0)

print("\n=== Train Set with 6-Play Tolerance (No Threshold Applied) ===")
print(classification_report(y_train_tolerant, y_train_pred_no_threshold_tolerant, zero_division=0))
print("\n=== Test Set with 6-Play Tolerance (No Threshold Applied) ===")
print(classification_report(y_test_tolerant, y_test_pred_no_threshold_tolerant, zero_division=0))


=== Train Set with 6-Play Tolerance (Threshold Applied) ===
              precision    recall  f1-score   support

           0       1.00      0.28      0.43    298549
           1       0.16      1.00      0.27     39726

    accuracy                           0.36    338275
   macro avg       0.58      0.64      0.35    338275
weighted avg       0.90      0.36      0.41    338275


=== Test Set with 6-Play Tolerance (Threshold Applied) ===
              precision    recall  f1-score   support

           0       1.00      0.27      0.43     73947
           1       0.15      0.99      0.27      9840

    accuracy                           0.36     83787
   macro avg       0.58      0.63      0.35     83787
weighted avg       0.90      0.36      0.41     83787


=== Train Set with 6-Play Tolerance (No Threshold Applied) ===
              precision    recall  f1-score   support

           0       1.00      0.52      0.69    298549
           1       0.22      0.99      0.36     3972

Completed:

Did look at ensemble, was thinking of trying combined ensemble?????
Minor Undersampling -? method that does ?
Hypertuned Parameters with extra parameter that was missing and bigger network
Hypertuned prediction threshold to help raise f1 score if possible
Trained model with parameters + prediction threshold resulting in highest f1 score for class 1, and itroduced 6 play ?fault tolerance?
Trained model with parameters(No Threshold) resulting in highest f1 score for class 1, and itroduced 6 play ?fault tolerance?