# Saturday, November 15, 2025

# 1. IMPORT LIBRARIES

In [1]:
import nfl_data_py as nfl
import pandas as pd

# 2. LOAD THE DATASET
####  Load all REGULAR seasons only

In [None]:
# we want all regular season from 1999 to 2025
seasons = [1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
           2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016,
           2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025]
regular = True

# load play-by-play data
dataset = nfl.import_pbp_data(seasons, downcast=True, cache=False)

# keep regular season only
if regular and "season_type" in dataset.columns:
    dataset = dataset[dataset["season_type"] == "REG"]

2025 done.
2024 done.
2023 done.
2022 done.
2021 done.
2020 done.
2019 done.
2018 done.
2017 done.
2016 done.
2015 done.
2014 done.
2013 done.
2012 done.
2011 done.
2010 done.
2009 done.
2008 done.
2007 done.
2006 done.
2005 done.
2004 done.
2003 done.
2002 done.
2001 done.
2000 done.
1999 done.
Downcasting floats.


- Save the dataset for the first time
- All other times just load the dataset

In [42]:
# save the dataset
dataset.to_csv('NFL[1999-2025].csv')

In [2]:
# load our saved dataset
dataset = pd.read_csv('NFL[1999-2025].csv', low_memory=False)

# 3. CLEAN THE DATASET
## Offensive Metrics:
### [ Off_EPA | DSR | Off_ DVOA | AY/A | QB_EPA ]

#### Off_EPA:
- Offensive Expected Points Added by the possession team
#### DSR:
- Drive Success Rate = number_of_drive_first_downs / number_of_drives
#### Off_DVOA:
- Offensive Defense-Adjusted Value Over Average = (team_off_epa - league_off_epa) / absolute_value_of(league_off_epa)
#### AY/A:
- Adjusted Yards Per Attempt = (number_of_passing_yards + 20 * number_of_pass_touchdowns - 45 * number_of_interceptions) / number_of_pass_attempts
#### QB_EPA:
- Quarter Back's Expected Points Added by the possesion team

## Defensive Metrics:
### [ Def_EPA | Def_DVOA | NPRG ]
#### Def_EPA:
- Defensive Expected Points Allowed by the defense team
#### Def_DVOA:
- Defensive Defense-Adjusted Value Over Average = (team_def_epa - league_def_epa) / absolute_value_of(league_def_epa)
### NPRG:
- Net Pressure Rate Gained = number_of_times_qb_was_pressured / number_of_dropbacks

## Other Metrics
### [ Year | Week | Home_Team | Away_Team | Result ]
#### Year & Week:
- The years will also represent the weights for the dataset. The more recent the year, the heavier the weight. This helps the model prioritize recent games over outdated games
#### Home & Away Teams:
- Data will keep track of which teams were home and away for each game
#### Result:
- This is the boolean data we want to predict. True means home won, False means home lost

##### NOTE! All data will be based on Game-By-Game

### 3.1 Get The Offensive & Defensive Metrics

In [54]:
# keep only scrimmage plays that actually gain/lose yards: runs or passes
# (This implicitly excludes penalties-only, timeouts, spikes, kneels, punts, etc.)
play_mask = (dataset.get("rush", 0) == 1) | (dataset.get("pass", 0) == 1)
dataset = dataset[play_mask].copy()

# note! we'll calculate offensive and defensive DVOA later

# OFFENSE metrics (by possession team)
# epa: expected points added - how good offense is
offense = (
    # for each possession team in each game, get the average offensive metrics
    # we'll get the average of dsr and ay/a later
    dataset.groupby(["game_id", "posteam"])
       .agg(
           off_epa=("epa", "mean"), # average epa & dvoa
           qb_epa=("qb_epa", "mean"), # average QB epa
           # for dsr:
           # total drives and number of drive first downs
           total_drives=("drive", "sum"),
           successful_drives=("drive_first_downs", "count"),
           # for ay/a:
           # number of passing yards, pass touchdowns, interceptons, and pass attempts
           pass_yards=("passing_yards", "sum"),
           pass_touchdowns=("pass_touchdown", "sum"),
           interceptions=("interception", "sum"),
           pass_attempts=("pass_attempt", "sum")
       )
       .rename_axis(["game_id", "team"])
       .reset_index()
)

# DEFENSE metrics (what each defense allowed)
# epa_allowed: mean offensive EPA by opponents vs this defense - how weak defense is
defense = (
    # for each defense team in each game, get the average defensive metrics
    # we'll get the average of nprg later
    dataset.groupby(["game_id", "defteam"])
       .agg(
           def_epa=("epa", "mean"), # average epa & dvoa
           # for nprg:
           # total number of times QB was pressured and dropbacks
           pressures=("was_pressure", "count"), # true/false values. NaN/0.0
           dropbacks=("qb_dropback", "count")
       )
       .rename_axis(["game_id", "team"])
       .reset_index()
)

# combine the metrics we got from offense and defense
team_stats = offense.merge(defense, on=["game_id", "team"], how="outer")  # merge on 'team'
team_stats

Unnamed: 0,game_id,team,off_epa,qb_epa,total_drives,successful_drives,pass_yards,pass_touchdowns,interceptions,pass_attempts,def_epa,pressures,dropbacks
0,1999_01_ARI_PHI,ARI,-0.124676,-0.124676,1546.0,87,274.0,1.0,3.0,50.0,-0.250785,0,60
1,1999_01_ARI_PHI,PHI,-0.250785,-0.250785,941.0,61,91.0,2.0,2.0,30.0,-0.124676,0,87
2,1999_01_BUF_IND,BUF,-0.209526,-0.209526,834.0,68,300.0,1.0,2.0,47.0,0.103372,0,61
3,1999_01_BUF_IND,IND,0.103372,0.103372,699.0,62,284.0,2.0,2.0,33.0,-0.209526,0,68
4,1999_01_CAR_NO,CAR,-0.350204,-0.350204,914.0,56,207.0,1.0,1.0,39.0,-0.204681,0,65
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13679,2025_10_PHI_GB,PHI,-0.069291,-0.069291,593.0,63,183.0,1.0,0.0,26.0,-0.152826,0,67
13680,2025_10_PIT_LAC,LAC,-0.111648,-0.111648,938.0,72,220.0,1.0,0.0,38.0,-0.372950,0,52
13681,2025_10_PIT_LAC,PIT,-0.372950,-0.372950,696.0,52,161.0,1.0,2.0,34.0,-0.111648,0,72
13682,2025_11_NYJ_NE,NE,0.138452,0.138452,556.0,64,281.0,1.0,0.0,35.0,-0.085097,0,57


In [55]:
# it's time to find the averages of off_dvoa, def_dvoa, dsr, ay/a, and nprg
team_eff = team_stats[['game_id', 'team', 'off_epa', 'def_epa', 'qb_epa']].copy()

# METRIC CALCULATIONS
# DVOA (offensive and defensive) = (team_epa - league_epa) / abs(league_epa)
# calculate the offensive and defensive league epa first
off_league_epa = team_stats['off_epa'].mean()
def_league_epa = team_stats['def_epa'].mean()
# now calculate the offensive and defensive DVOA
team_eff['off_dvoa'] = (team_stats['off_epa'] - off_league_epa) / abs(off_league_epa)
team_eff['def_dvoa'] = (team_stats['def_epa'] - def_league_epa) / abs(def_league_epa)

# DSR = successful_drives / total_drives
team_eff['dsr'] = team_stats['successful_drives'] / team_stats['total_drives']

# AY/A = (passing_yards + 20 * pass_touchdowns - 45 * interceptions) / pass_attempts
team_eff['ay/a'] = (team_stats['pass_yards'] + 20 * team_stats['pass_touchdowns']
                    - 45 * team_stats['interceptions']) / team_stats['pass_attempts']

# NPRG = pressures / dropbacks
team_eff['nprg'] = team_stats['pressures'] / team_stats['dropbacks']

team_eff

Unnamed: 0,game_id,team,off_epa,def_epa,qb_epa,off_dvoa,def_dvoa,dsr,ay/a,nprg
0,1999_01_ARI_PHI,ARI,-0.124676,-0.250785,-0.124676,-8.238406,-17.582923,0.056274,3.180000,0.0
1,1999_01_ARI_PHI,PHI,-0.250785,-0.124676,-0.250785,-17.582923,-8.238406,0.064825,1.366667,0.0
2,1999_01_BUF_IND,BUF,-0.209526,0.103372,-0.209526,-14.525645,8.659734,0.081535,4.893617,0.0
3,1999_01_BUF_IND,IND,0.103372,-0.209526,0.103372,8.659734,-14.525645,0.088698,7.090909,0.0
4,1999_01_CAR_NO,CAR,-0.350204,-0.204681,-0.350204,-24.949785,-14.166638,0.061269,4.666667,0.0
...,...,...,...,...,...,...,...,...,...,...
13679,2025_10_PHI_GB,PHI,-0.069291,-0.152826,-0.069291,-4.134427,-10.324250,0.106239,7.807692,0.0
13680,2025_10_PIT_LAC,LAC,-0.111648,-0.372950,-0.111648,-7.273024,-26.635203,0.076759,6.315789,0.0
13681,2025_10_PIT_LAC,PIT,-0.372950,-0.111648,-0.372950,-26.635203,-7.273024,0.074713,2.676471,0.0
13682,2025_11_NYJ_NE,NE,0.138452,-0.085097,0.138452,11.259125,-5.305635,0.115108,8.600000,0.0


### 3.2. Get The Away & Home Teams

In [56]:
# split into away and home DataFrames
away = team_eff.iloc[1::2].reset_index(drop=True)  # odd indices → away team
home = team_eff.iloc[::2].reset_index(drop=True)  # even indices → home team

# merge side by side
gbg = pd.concat([away, home], axis=1, keys=['away', 'home'])

# flatten the MultiIndex columns (e.g., 'away_team', 'home_team')
gbg.columns = [f"{side}_{col}" for side, col in gbg.columns]

# keep only one copy of game_id
gbg = gbg.drop(columns=['home_game_id'])
gbg = gbg.rename(columns={'away_game_id': 'game_id'})

### 3.3. Get The Years And Weeks

In [57]:
# split game_id into parts
date = dataset[['game_id']].drop_duplicates(['game_id'])
date[['year', 'week', 'away', 'home']] = date['game_id'].str.split('_', expand=True, n=3)
date = date.sort_values('game_id', ascending=True).reset_index()
date['year'] = date['year'].astype(int)
date['week'] = date['week'].astype(int)
gbg = gbg.join(date[['year', 'week']])

gbg

Unnamed: 0,game_id,away_team,away_off_epa,away_def_epa,away_qb_epa,away_off_dvoa,away_def_dvoa,away_dsr,away_ay/a,away_nprg,...,home_off_epa,home_def_epa,home_qb_epa,home_off_dvoa,home_def_dvoa,home_dsr,home_ay/a,home_nprg,year,week
0,1999_01_ARI_PHI,PHI,-0.250785,-0.124676,-0.250785,-17.582923,-8.238406,0.064825,1.366667,0.0,...,-0.124676,-0.250785,-0.124676,-8.238406,-17.582923,0.056274,3.180000,0.0,1999,1
1,1999_01_BUF_IND,IND,0.103372,-0.209526,0.103372,8.659734,-14.525645,0.088698,7.090909,0.0,...,-0.209526,0.103372,-0.209526,-14.525645,8.659734,0.081535,4.893617,0.0,1999,1
2,1999_01_CAR_NO,NO,-0.204681,-0.350204,-0.204681,-14.166638,-24.949785,0.057093,5.846154,0.0,...,-0.350204,-0.204681,-0.350204,-24.949785,-14.166638,0.061269,4.666667,0.0,1999,1
3,1999_01_CIN_TEN,TEN,0.039889,0.173752,0.122393,3.955703,13.874885,0.076709,10.470588,0.0,...,0.173752,0.039889,0.173752,13.874885,3.955703,0.080271,3.972973,0.0,1999,1
4,1999_01_DAL_WAS,WAS,0.200565,0.277992,0.200565,15.861709,21.598945,0.071992,11.722222,0.0,...,0.277992,0.200565,0.277992,21.598945,15.861709,0.065918,6.540000,0.0,1999,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6837,2025_10_NO_CAR,NO,0.142840,-0.300061,0.142840,11.584328,-21.234240,0.089779,10.733333,0.0,...,-0.300061,0.142840,-0.300061,-21.234240,11.584328,0.113978,2.925926,0.0,2025,10
6838,2025_10_NYG_CHI,NYG,-0.005457,0.076580,-0.005457,0.595623,6.674534,0.084687,7.175000,0.0,...,0.076580,-0.005457,0.076580,6.674534,0.595623,0.091413,6.666667,0.0,2025,10
6839,2025_10_PHI_GB,PHI,-0.069291,-0.152826,-0.069291,-4.134427,-10.324250,0.106239,7.807692,0.0,...,-0.152826,-0.069291,-0.152826,-10.324250,-4.134427,0.091033,4.631579,0.0,2025,10
6840,2025_10_PIT_LAC,PIT,-0.372950,-0.111648,-0.372950,-26.635203,-7.273024,0.074713,2.676471,0.0,...,-0.111648,-0.372950,-0.111648,-7.273024,-26.635203,0.076759,6.315789,0.0,2025,10


### 3.4. Get The Injuries

In [None]:
def GetInjuries(tables):
    injured = []
    for table_div in tables:
        if table_div is not None:
            table = table_div.find("table")  # get the actual <table> tag inside the div
            rows = table.find("tbody").find_all("tr")
            injured.extend([
                sum("QB" in r.text for r in rows),
                sum("WR" in r.text for r in rows),
                sum("DT" in r.text for r in rows),
                sum("OL" in r.text for r in rows),
            ])
        else:
            # If table div exists but no table found, fill zeros
            injured.extend([0, 0, 0, 0])
    return injured

from bs4 import BeautifulSoup
import requests

# columns for our injury dataframe
injuries = pd.DataFrame(columns=['year', 'week', 'away_team', 'home_team',
                                 'away_qb_injured', 'away_wr_injured', 'away_dt_injured', 'away_ol_injured',
                                 'home_qb_injured', 'home_wr_injured', 'home_dt_injured', 'home_ol_injured'])

# seasons from 2008 and before have no record of injuries so let's exclude those
seasons = [2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016,
           2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025]

for year in seasons:
    # add the year to the url
    url = f"https://www.nfl.com/injuries/league/{year}/reg"
    week = 1

    # 17 weeks in a season
    limit = 17
    # 2025 doesn't have the full 17 weeks yet
    if year == 2025:
        limit = 11

    while week <= limit:
        # add the week to the url
        url_week = url
        url_week += f"{week}"
        url_request = requests.get(url_week).text
        soup = BeautifulSoup(url_request, 'lxml')

        units = soup.find_all("section", class_="nfl-o-injury-report__unit")

        teams = []
        injured = []

        for unit in units:

            # ---- 1. Get team abbreviation ----
            team_tags = unit.find_all("span", class_="nfl-c-matchup-strip__team-abbreviation")
            away = team_tags[0].text.strip()
            home = team_tags[1].text.strip()
            teams.extend([away, home])
            # ---- 2. Try to find a table ----
            tables = unit.find_all("div", class_="d3-o-table--horizontal-scroll")
            if len(tables) % 2 != 0:
                tables.append(None)
            injured.extend(GetInjuries(tables))                         

        # counter for team names
        i = 0
        n = len(teams)
        # counter for injured
        j = 0
        m = len(injured)


        while i < n and j < m:
            injuries.loc[-1] = [year, week, teams[i], teams[i+1],
                        injured[j], injured[j+1], injured[j+2], injured[j+3],
                        injured[j+4], injured[j+5], injured[j+6], injured[j+7]]
            injuries.index += 1
            injuries = injuries.sort_index()
            i += 2
            j += 8
        
        week += 1

injuries = injuries.sort_values(['year', 'week'], ascending=True).reset_index()
injuries = injuries.drop(columns=['index'])
injuries

Unnamed: 0,year,week,away_team,home_team,away_qb_injured,away_wr_injured,away_dt_injured,away_ol_injured,home_qb_injured,home_wr_injured,home_dt_injured,home_ol_injured
0,2009,1,SD,OAK,0,0,0,0,0,2,0,0
1,2009,1,BUF,NE,0,0,1,0,1,3,1,0
2,2009,1,STL,SEA,1,0,0,0,0,1,0,0
3,2009,1,DET,NO,1,2,1,0,0,0,0,0
4,2009,1,KC,BAL,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
4255,2025,11,CHI,MIN,0,3,1,0,1,0,2,0
4256,2025,11,GB,NYG,0,5,0,0,1,1,1,0
4257,2025,11,TB,BUF,0,1,1,0,0,2,3,0
4258,2025,11,WAS,MIA,1,2,0,0,0,1,0,0


In [None]:
injuries.to_csv('NFL_injuries[2009-2025].csv')

In [58]:
gbg = gbg.merge(injuries, on=['year', 'week', 'away_team', 'home_team'], how='inner')
gbg

Unnamed: 0,game_id,away_team,away_off_epa,away_def_epa,away_qb_epa,away_off_dvoa,away_def_dvoa,away_dsr,away_ay/a,away_nprg,...,year,week,away_qb_injured,away_wr_injured,away_dt_injured,away_ol_injured,home_qb_injured,home_wr_injured,home_dt_injured,home_ol_injured
0,2009_01_DEN_CIN,DEN,-0.063401,-0.156457,-0.063401,-3.697942,-10.593334,0.081325,8.218750,0.0,...,2009,1,2,1,1,0,1,1,0,0
1,2009_01_KC_BAL,KC,-0.141449,0.178693,-0.141449,-9.481256,14.240999,0.080139,8.037037,0.0,...,2009,1,1,1,0,0,0,0,0,0
2,2009_01_MIA_ATL,MIA,-0.368126,-0.024849,-0.156578,-26.277765,-0.841321,0.080054,4.441176,0.0,...,2009,1,0,0,0,0,0,0,0,0
3,2009_01_MIN_CLE,MIN,0.187801,-0.198595,0.187801,14.915910,-13.715712,0.085250,5.000000,0.0,...,2009,1,0,1,0,0,0,0,1,0
4,2009_01_NYJ_HOU,NYJ,0.136241,-0.371862,0.136241,11.095356,-26.554604,0.093671,7.967742,0.0,...,2009,1,1,1,0,0,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1760,2025_10_NO_CAR,NO,0.142840,-0.300061,0.142840,11.584328,-21.234240,0.089779,10.733333,0.0,...,2025,10,1,0,2,0,0,2,1,0
1761,2025_10_NYG_CHI,NYG,-0.005457,0.076580,-0.005457,0.595623,6.674534,0.084687,7.175000,0.0,...,2025,10,0,1,1,0,0,4,1,0
1762,2025_10_PHI_GB,PHI,-0.069291,-0.152826,-0.069291,-4.134427,-10.324250,0.106239,7.807692,0.0,...,2025,10,0,1,1,0,1,5,1,0
1763,2025_10_PIT_LAC,PIT,-0.372950,-0.111648,-0.372950,-26.635203,-7.273024,0.074713,2.676471,0.0,...,2025,10,1,1,1,0,0,2,0,0


### 3.5. Get The Game Results
##### 'Result' column will have True and False boolean values
- True = Home wins!
- False = Home loses...

In [None]:
# find the result of each game
result = dataset[['game_id', 'home_score', 'away_score']].drop_duplicates(['game_id'])
result['result'] = result['home_score'] > result['away_score']

# reset the indices to match the ones in gbg dataframe
result = result.sort_values('game_id', ascending=True).reset_index()

result

Unnamed: 0,index,game_id,home_score,away_score,result
0,1160281,1999_01_ARI_PHI,24,25,False
1,1160482,1999_01_BUF_IND,31,14,True
2,1160664,1999_01_CAR_NO,19,10,True
3,1160838,1999_01_CIN_TEN,36,35,True
4,1161027,1999_01_DAL_WAS,35,41,False
...,...,...,...,...,...
6837,24883,2025_10_NO_CAR,7,17,False
6838,25036,2025_10_NYG_CHI,24,20,True
6839,25214,2025_10_PHI_GB,7,10,False
6840,25386,2025_10_PIT_LAC,25,10,True


### 3.6. Tie It All Together

In [None]:
# join the result dataframe to our main dataframe
gbg = gbg.join(result[['result']])

# organize the dataframe
cols = ['year', 'week',
        'away_team',
        'away_off_epa', 'away_def_epa', 'away_qb_epa', 'away_off_dvoa', 'away_def_dvoa', 'away_dsr', 'away_ay/a', 'away_nprg',
        'home_team',
        'home_off_epa', 'home_def_epa', 'home_qb_epa', 'home_off_dvoa', 'home_def_dvoa', 'home_dsr', 'home_ay/a', 'home_nprg',
        'result']
gbg = gbg[cols]

gbg

Unnamed: 0,year,week,away_team,away_off_epa,away_def_epa,away_qb_epa,away_off_dvoa,away_def_dvoa,away_dsr,away_ay/a,...,home_team,home_off_epa,home_def_epa,home_qb_epa,home_off_dvoa,home_def_dvoa,home_dsr,home_ay/a,home_nprg,result
0,1999,1,ARI,-0.124676,-0.250785,-0.124676,-8.238406,-17.582923,0.056274,3.180000,...,PHI,-0.250785,-0.124676,-0.250785,-17.582923,-8.238406,0.064825,1.366667,0.0,False
1,1999,1,BUF,-0.209526,0.103372,-0.209526,-14.525645,8.659734,0.081535,4.893617,...,IND,0.103372,-0.209526,0.103372,8.659734,-14.525645,0.088698,7.090909,0.0,True
2,1999,1,CAR,-0.350204,-0.204681,-0.350204,-24.949785,-14.166638,0.061269,4.666667,...,NO,-0.204681,-0.350204,-0.204681,-14.166638,-24.949785,0.057093,5.846154,0.0,True
3,1999,1,CIN,0.173752,0.039889,0.173752,13.874885,3.955703,0.080271,3.972973,...,TEN,0.039889,0.173752,0.122393,3.955703,13.874885,0.076709,10.470588,0.0,True
4,1999,1,DAL,0.277992,0.200565,0.277992,21.598945,15.861709,0.065918,6.540000,...,WAS,0.200565,0.277992,0.200565,15.861709,21.598945,0.071992,11.722222,0.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6837,2025,10,CAR,-0.300061,0.142840,-0.300061,-21.234240,11.584328,0.113978,2.925926,...,NO,0.142840,-0.300061,0.142840,11.584328,-21.234240,0.089779,10.733333,0.0,False
6838,2025,10,CHI,0.076580,-0.005457,0.076580,6.674534,0.595623,0.091413,6.666667,...,NYG,-0.005457,0.076580,-0.005457,0.595623,6.674534,0.084687,7.175000,0.0,True
6839,2025,10,GB,-0.152826,-0.069291,-0.152826,-10.324250,-4.134427,0.091033,4.631579,...,PHI,-0.069291,-0.152826,-0.069291,-4.134427,-10.324250,0.106239,7.807692,0.0,False
6840,2025,10,LAC,-0.111648,-0.372950,-0.111648,-7.273024,-26.635203,0.076759,6.315789,...,PIT,-0.372950,-0.111648,-0.372950,-26.635203,-7.273024,0.074713,2.676471,0.0,True


In [None]:
# version number
v = 3.1

# save the dataframe for the next steps
gbg.to_csv(f'V{v}_NFL_gbg_data[1999-2025].csv')

### 4. PREP FOR TRAINING
- The model should know EPA, YPP, home & away scores
- The home & away teams are only for our eyes, not the model

In [None]:
import pandas as pd

dataset = pd.read_csv(f'V{v}_NFL_gbg_data[1999-2025].csv')

In [None]:
team_map = {
    'NO': 1,
    'BUF': 2,
    'JAX': 3,
    'CLE': 4,
    'PHI': 5,
    'GB': 6,
    'LA': 7,
    'LAC': 8,
    'NE': 9,
    'IND': 10,
    'CHI': 11,
    'WAS': 12,
    'NYJ': 13,
    'SEA': 14,
    'ATL': 15,
    'DEN': 16,
    'MIN': 17,
    'ARI': 18,
    'DET': 19,
    'BAL': 20,
    'CIN': 21,
    'LV': 22,
    'TEN': 23,
    'MIA': 24,
    'DAL': 25,
    'KC': 26,
    'PIT': 27,
    'HOU': 28,
    'SF': 29,
    'CAR': 30,
    'NYG': 31,
    'TB': 32
}

dataset['away_team'] = dataset['away_team'].map(team_map)
dataset['home_team'] = dataset['home_team'].map(team_map)

# Removing unnamed columns using drop function
dataset.drop(dataset.columns[dataset.columns.str.contains(
    'unnamed', case=False)], axis=1, inplace=True)

dataset

Unnamed: 0,year,week,away_team,away_off_epa,away_def_epa,away_qb_epa,away_off_dvoa,away_def_dvoa,away_dsr,away_ay/a,...,home_team,home_off_epa,home_def_epa,home_qb_epa,home_off_dvoa,home_def_dvoa,home_dsr,home_ay/a,home_nprg,result
0,1999,1,18,-0.124676,-0.250785,-0.124676,-8.238406,-17.582923,0.056274,3.180000,...,5,-0.250785,-0.124676,-0.250785,-17.582923,-8.238406,0.064825,1.366667,0.0,False
1,1999,1,2,-0.209526,0.103372,-0.209526,-14.525645,8.659734,0.081535,4.893617,...,10,0.103372,-0.209526,0.103372,8.659734,-14.525645,0.088698,7.090909,0.0,True
2,1999,1,30,-0.350204,-0.204681,-0.350204,-24.949785,-14.166638,0.061269,4.666667,...,1,-0.204681,-0.350204,-0.204681,-14.166638,-24.949785,0.057093,5.846154,0.0,True
3,1999,1,21,0.173752,0.039889,0.173752,13.874885,3.955703,0.080271,3.972973,...,23,0.039889,0.173752,0.122393,3.955703,13.874885,0.076709,10.470588,0.0,True
4,1999,1,25,0.277992,0.200565,0.277992,21.598945,15.861709,0.065918,6.540000,...,12,0.200565,0.277992,0.200565,15.861709,21.598945,0.071992,11.722222,0.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6837,2025,10,30,-0.300061,0.142840,-0.300061,-21.234240,11.584328,0.113978,2.925926,...,1,0.142840,-0.300061,0.142840,11.584328,-21.234240,0.089779,10.733333,0.0,False
6838,2025,10,11,0.076580,-0.005457,0.076580,6.674534,0.595623,0.091413,6.666667,...,31,-0.005457,0.076580,-0.005457,0.595623,6.674534,0.084687,7.175000,0.0,True
6839,2025,10,6,-0.152826,-0.069291,-0.152826,-10.324250,-4.134427,0.091033,4.631579,...,5,-0.069291,-0.152826,-0.069291,-4.134427,-10.324250,0.106239,7.807692,0.0,False
6840,2025,10,8,-0.111648,-0.372950,-0.111648,-7.273024,-26.635203,0.076759,6.315789,...,27,-0.372950,-0.111648,-0.372950,-26.635203,-7.273024,0.074713,2.676471,0.0,True


In [None]:
# set the training and testing variables
from sklearn.model_selection import train_test_split

X = dataset.drop(columns=['result'])
y = dataset['result']
sample_weight = dataset['year']
X_train, X_test, y_train, y_test, sw_train, sw_test = train_test_split(X, y, sample_weight, test_size=0.2, random_state=42) # 80% train / 20% test

### 5. AI MODEL TRAINING


In [None]:
import xgboost as xgb
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score
import cupy as cp

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
X = cp.asarray(X.values)
y = cp.asarray(y.values)

def objective(trial):
    param = {
        "max_depth": trial.suggest_int("max_depth", 3, 30),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0, 5),
        # "tree_method": "hist",
        "device": "cuda",
    }

    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    scores = []
    for tr, va in skf.split(
        cp.asnumpy(X), cp.asnumpy(y)
    ):  # indices on CPU are fine
        model = xgb.XGBClassifier(**param)
        model.fit(X[tr], y[tr])
        # GPU inference (no warning)
        p = model.predict_proba(X[va])[:, 1]
        # metric on CPU
        scores.append(roc_auc_score(cp.asnumpy(y[va]), cp.asnumpy(p)))
    return float(cp.asarray(scores).mean())

In [None]:
# Create and run the optimization process with 100 trials
study = optuna.create_study(study_name="xgboost_study_gpu_cupy", direction="maximize")
study.optimize(objective, n_trials=100, show_progress_bar=True, n_jobs=-1)

# Retrieve the best parameter values
best_params = study.best_params
print(f"\nBest parameters: {best_params}")

[I 2025-11-16 18:53:31,802] A new study created in memory with name: xgboost_study_gpu_cupy
Best trial: 3. Best value: 0.57306:   1%|          | 1/100 [00:14<23:12, 14.06s/it]

[I 2025-11-16 18:53:46,090] Trial 3 finished with value: 0.5730601637148328 and parameters: {'max_depth': 3, 'learning_rate': 0.054714023625026746, 'n_estimators': 148, 'subsample': 0.6450556225694122, 'colsample_bytree': 0.6582141725036098, 'min_child_weight': 3, 'gamma': 4.2315153994498775}. Best is trial 3 with value: 0.5730601637148328.


Best trial: 8. Best value: 0.578396:   2%|▏         | 2/100 [00:19<14:33,  8.91s/it]

[I 2025-11-16 18:53:51,418] Trial 8 finished with value: 0.5783957605463343 and parameters: {'max_depth': 6, 'learning_rate': 0.03954324939810724, 'n_estimators': 188, 'subsample': 0.7935691360874388, 'colsample_bytree': 0.7468089157089168, 'min_child_weight': 7, 'gamma': 3.9821509857026482}. Best is trial 8 with value: 0.5783957605463343.


Best trial: 5. Best value: 0.58208:   3%|▎         | 3/100 [00:36<20:21, 12.60s/it] 

[I 2025-11-16 18:54:08,391] Trial 5 finished with value: 0.5820798973449876 and parameters: {'max_depth': 4, 'learning_rate': 0.016453270741231353, 'n_estimators': 315, 'subsample': 0.5419970870105528, 'colsample_bytree': 0.5017978544776562, 'min_child_weight': 2, 'gamma': 2.3453357021678993}. Best is trial 5 with value: 0.5820798973449876.


Best trial: 5. Best value: 0.58208:   4%|▍         | 4/100 [00:53<23:20, 14.59s/it]

[I 2025-11-16 18:54:26,034] Trial 10 finished with value: 0.5751452528174659 and parameters: {'max_depth': 20, 'learning_rate': 0.04596803087543049, 'n_estimators': 663, 'subsample': 0.7757240432246433, 'colsample_bytree': 0.7011565246254579, 'min_child_weight': 9, 'gamma': 3.663756210412812}. Best is trial 5 with value: 0.5820798973449876.


Best trial: 5. Best value: 0.58208:   5%|▌         | 5/100 [00:54<15:14,  9.63s/it]

[I 2025-11-16 18:54:26,873] Trial 2 finished with value: 0.5699892547415878 and parameters: {'max_depth': 26, 'learning_rate': 0.09207417284627195, 'n_estimators': 613, 'subsample': 0.5240772823618753, 'colsample_bytree': 0.8182051475649001, 'min_child_weight': 4, 'gamma': 4.071918873904037}. Best is trial 5 with value: 0.5820798973449876.


Best trial: 5. Best value: 0.58208:   6%|▌         | 6/100 [00:55<10:17,  6.56s/it]

[I 2025-11-16 18:54:27,485] Trial 4 finished with value: 0.5648399389809126 and parameters: {'max_depth': 8, 'learning_rate': 0.06943260768368369, 'n_estimators': 501, 'subsample': 0.5851024074744862, 'colsample_bytree': 0.542191566558774, 'min_child_weight': 5, 'gamma': 2.639777104732808}. Best is trial 5 with value: 0.5820798973449876.


Best trial: 5. Best value: 0.58208:   7%|▋         | 7/100 [00:58<08:11,  5.28s/it]

[I 2025-11-16 18:54:30,131] Trial 12 finished with value: 0.5689860222575408 and parameters: {'max_depth': 27, 'learning_rate': 0.04079396342429756, 'n_estimators': 148, 'subsample': 0.9220537122483554, 'colsample_bytree': 0.9971136818139122, 'min_child_weight': 3, 'gamma': 1.2368772653604214}. Best is trial 5 with value: 0.5820798973449876.
[I 2025-11-16 18:54:30,220] Trial 7 finished with value: 0.5751191155106729 and parameters: {'max_depth': 28, 'learning_rate': 0.027726427889279842, 'n_estimators': 133, 'subsample': 0.8687487750385876, 'colsample_bytree': 0.7918233289050429, 'min_child_weight': 5, 'gamma': 1.6210844991137456}. Best is trial 5 with value: 0.5820798973449876.


Best trial: 5. Best value: 0.58208:   9%|▉         | 9/100 [01:02<05:38,  3.72s/it]

[I 2025-11-16 18:54:34,242] Trial 14 finished with value: 0.5812753156852682 and parameters: {'max_depth': 29, 'learning_rate': 0.06292200049871494, 'n_estimators': 321, 'subsample': 0.7848035545228729, 'colsample_bytree': 0.7484048461543635, 'min_child_weight': 9, 'gamma': 4.267252531658815}. Best is trial 5 with value: 0.5820798973449876.


Best trial: 5. Best value: 0.58208:  10%|█         | 10/100 [01:10<07:15,  4.84s/it]

[I 2025-11-16 18:54:42,305] Trial 11 finished with value: 0.5768585374484173 and parameters: {'max_depth': 4, 'learning_rate': 0.0298292547747451, 'n_estimators': 964, 'subsample': 0.7681865279359721, 'colsample_bytree': 0.7691587419860788, 'min_child_weight': 10, 'gamma': 4.279010819186828}. Best is trial 5 with value: 0.5820798973449876.


Best trial: 5. Best value: 0.58208:  11%|█         | 11/100 [01:32<14:08,  9.53s/it]

[I 2025-11-16 18:55:04,638] Trial 1 finished with value: 0.5622901056976429 and parameters: {'max_depth': 8, 'learning_rate': 0.052070562343107446, 'n_estimators': 924, 'subsample': 0.550095165295717, 'colsample_bytree': 0.9503343258043817, 'min_child_weight': 7, 'gamma': 3.2299295246105117}. Best is trial 5 with value: 0.5820798973449876.


Best trial: 13. Best value: 0.583647:  12%|█▏        | 12/100 [01:39<12:57,  8.84s/it]

[I 2025-11-16 18:55:11,670] Trial 13 finished with value: 0.5836467209561863 and parameters: {'max_depth': 6, 'learning_rate': 0.01182569734662597, 'n_estimators': 807, 'subsample': 0.590359542082279, 'colsample_bytree': 0.5042109621741726, 'min_child_weight': 2, 'gamma': 3.7649637420190816}. Best is trial 13 with value: 0.5836467209561863.


Best trial: 13. Best value: 0.583647:  13%|█▎        | 13/100 [01:46<11:49,  8.16s/it]

[I 2025-11-16 18:55:18,118] Trial 17 finished with value: 0.5676414017337343 and parameters: {'max_depth': 15, 'learning_rate': 0.08439680939337582, 'n_estimators': 684, 'subsample': 0.8877325832308292, 'colsample_bytree': 0.9799089312993532, 'min_child_weight': 5, 'gamma': 2.3255677440185076}. Best is trial 13 with value: 0.5836467209561863.


Best trial: 13. Best value: 0.583647:  14%|█▍        | 14/100 [01:59<13:46,  9.61s/it]

[I 2025-11-16 18:55:31,298] Trial 20 finished with value: 0.5683891602840495 and parameters: {'max_depth': 26, 'learning_rate': 0.07702491939991024, 'n_estimators': 415, 'subsample': 0.8829339907448701, 'colsample_bytree': 0.5549949127510152, 'min_child_weight': 4, 'gamma': 0.9719931346024474}. Best is trial 13 with value: 0.5836467209561863.


Best trial: 13. Best value: 0.583647:  15%|█▌        | 15/100 [02:03<11:23,  8.04s/it]

[I 2025-11-16 18:55:35,535] Trial 15 finished with value: 0.5790198893846039 and parameters: {'max_depth': 19, 'learning_rate': 0.026448868513803757, 'n_estimators': 628, 'subsample': 0.5859562224918856, 'colsample_bytree': 0.8324649953229049, 'min_child_weight': 8, 'gamma': 3.4010921112643766}. Best is trial 13 with value: 0.5836467209561863.


Best trial: 13. Best value: 0.583647:  16%|█▌        | 16/100 [02:16<13:26,  9.60s/it]

[I 2025-11-16 18:55:48,883] Trial 6 finished with value: 0.5636025877389637 and parameters: {'max_depth': 24, 'learning_rate': 0.0371849346874929, 'n_estimators': 660, 'subsample': 0.5205336163107595, 'colsample_bytree': 0.714632921953261, 'min_child_weight': 5, 'gamma': 1.5881224116556574}. Best is trial 13 with value: 0.5836467209561863.
[I 2025-11-16 18:55:48,940] Trial 0 finished with value: 0.561070806837811 and parameters: {'max_depth': 15, 'learning_rate': 0.05477114124281026, 'n_estimators': 376, 'subsample': 0.5546798944226252, 'colsample_bytree': 0.8410718607340623, 'min_child_weight': 2, 'gamma': 0.35148015730814497}. Best is trial 13 with value: 0.5836467209561863.


Best trial: 13. Best value: 0.583647:  18%|█▊        | 18/100 [02:30<11:16,  8.26s/it]

[I 2025-11-16 18:56:02,197] Trial 16 finished with value: 0.5588700945722792 and parameters: {'max_depth': 27, 'learning_rate': 0.07627380875906638, 'n_estimators': 629, 'subsample': 0.553954787058135, 'colsample_bytree': 0.7524667635272191, 'min_child_weight': 2, 'gamma': 1.1789605702614554}. Best is trial 13 with value: 0.5836467209561863.


Best trial: 13. Best value: 0.583647:  19%|█▉        | 19/100 [02:51<15:28, 11.46s/it]

[I 2025-11-16 18:56:23,475] Trial 18 finished with value: 0.5536226931263358 and parameters: {'max_depth': 20, 'learning_rate': 0.08583797062947637, 'n_estimators': 925, 'subsample': 0.8147606347250833, 'colsample_bytree': 0.8048881145912198, 'min_child_weight': 7, 'gamma': 0.7555465393423261}. Best is trial 13 with value: 0.5836467209561863.


Best trial: 13. Best value: 0.583647:  20%|██        | 20/100 [03:28<24:01, 18.01s/it]

[I 2025-11-16 18:57:00,145] Trial 28 finished with value: 0.5833742296034848 and parameters: {'max_depth': 9, 'learning_rate': 0.015994703818947208, 'n_estimators': 826, 'subsample': 0.6818391530755474, 'colsample_bytree': 0.5021774788129547, 'min_child_weight': 1, 'gamma': 4.860393270515921}. Best is trial 13 with value: 0.5836467209561863.


Best trial: 27. Best value: 0.584894:  21%|██        | 21/100 [03:31<18:25, 14.00s/it]

[I 2025-11-16 18:57:03,338] Trial 27 finished with value: 0.584893714593005 and parameters: {'max_depth': 12, 'learning_rate': 0.011429329341869958, 'n_estimators': 797, 'subsample': 0.6736464192784861, 'colsample_bytree': 0.5002696110088647, 'min_child_weight': 1, 'gamma': 4.835607018799738}. Best is trial 27 with value: 0.584893714593005.


Best trial: 27. Best value: 0.584894:  22%|██▏       | 22/100 [03:47<18:59, 14.60s/it]

[I 2025-11-16 18:57:19,500] Trial 9 finished with value: 0.5580496578596535 and parameters: {'max_depth': 12, 'learning_rate': 0.03037341358430714, 'n_estimators': 931, 'subsample': 0.785151318414584, 'colsample_bytree': 0.7452102554347921, 'min_child_weight': 7, 'gamma': 0.35515886987166567}. Best is trial 27 with value: 0.584893714593005.


Best trial: 25. Best value: 0.586094:  23%|██▎       | 23/100 [03:58<17:27, 13.60s/it]

[I 2025-11-16 18:57:30,590] Trial 25 finished with value: 0.5860937815968164 and parameters: {'max_depth': 13, 'learning_rate': 0.010182778742372344, 'n_estimators': 796, 'subsample': 0.6751554671776667, 'colsample_bytree': 0.5000599650652335, 'min_child_weight': 1, 'gamma': 2.765943401110309}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  24%|██▍       | 24/100 [04:14<17:55, 14.15s/it]

[I 2025-11-16 18:57:46,083] Trial 21 finished with value: 0.5793191907191101 and parameters: {'max_depth': 12, 'learning_rate': 0.01038545862844743, 'n_estimators': 426, 'subsample': 0.6569927674350806, 'colsample_bytree': 0.5132878261048397, 'min_child_weight': 1, 'gamma': 0.17684504174366245}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  25%|██▌       | 25/100 [04:21<15:08, 12.12s/it]

[I 2025-11-16 18:57:53,277] Trial 29 finished with value: 0.5833758953401729 and parameters: {'max_depth': 12, 'learning_rate': 0.01364257251646099, 'n_estimators': 837, 'subsample': 0.6731621080112322, 'colsample_bytree': 0.5028160524906846, 'min_child_weight': 1, 'gamma': 2.660929331785753}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  26%|██▌       | 26/100 [04:33<14:59, 12.16s/it]

[I 2025-11-16 18:58:05,544] Trial 24 finished with value: 0.5803392001399884 and parameters: {'max_depth': 12, 'learning_rate': 0.010325660530352342, 'n_estimators': 404, 'subsample': 0.6794516707405681, 'colsample_bytree': 0.5062809074324377, 'min_child_weight': 1, 'gamma': 0.528575894017743}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  27%|██▋       | 27/100 [04:42<13:35, 11.18s/it]

[I 2025-11-16 18:58:14,391] Trial 31 finished with value: 0.5847459475484248 and parameters: {'max_depth': 11, 'learning_rate': 0.013966227918960992, 'n_estimators': 817, 'subsample': 0.6786450142478208, 'colsample_bytree': 0.6160385381241196, 'min_child_weight': 1, 'gamma': 4.779554455678298}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  28%|██▊       | 28/100 [04:47<11:18,  9.43s/it]

[I 2025-11-16 18:58:19,688] Trial 30 finished with value: 0.5810935561999465 and parameters: {'max_depth': 11, 'learning_rate': 0.010227321744362344, 'n_estimators': 814, 'subsample': 0.6749889470369462, 'colsample_bytree': 0.6176326891159769, 'min_child_weight': 1, 'gamma': 2.7142378955465927}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  29%|██▉       | 29/100 [04:49<08:34,  7.24s/it]

[I 2025-11-16 18:58:21,787] Trial 32 finished with value: 0.5843915038764114 and parameters: {'max_depth': 12, 'learning_rate': 0.01023240520511284, 'n_estimators': 816, 'subsample': 0.6842044136513428, 'colsample_bytree': 0.6034085708381737, 'min_child_weight': 1, 'gamma': 4.962489248603152}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  30%|███       | 30/100 [04:55<07:47,  6.68s/it]

[I 2025-11-16 18:58:27,142] Trial 33 finished with value: 0.5848530317432402 and parameters: {'max_depth': 11, 'learning_rate': 0.017458410138395337, 'n_estimators': 813, 'subsample': 0.6850886501092499, 'colsample_bytree': 0.6045741726025263, 'min_child_weight': 1, 'gamma': 4.862810958295297}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  31%|███       | 31/100 [05:11<10:52,  9.45s/it]

[I 2025-11-16 18:58:43,093] Trial 22 finished with value: 0.5802995375204297 and parameters: {'max_depth': 15, 'learning_rate': 0.011132101559953186, 'n_estimators': 374, 'subsample': 0.6678818591131951, 'colsample_bytree': 0.5435969257971546, 'min_child_weight': 1, 'gamma': 0.19825917389320358}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  32%|███▏      | 32/100 [05:13<08:13,  7.25s/it]

[I 2025-11-16 18:58:45,197] Trial 34 finished with value: 0.5829105061487808 and parameters: {'max_depth': 12, 'learning_rate': 0.01170069379728144, 'n_estimators': 788, 'subsample': 0.7007249353944266, 'colsample_bytree': 0.616564661874312, 'min_child_weight': 1, 'gamma': 4.798159045487925}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  33%|███▎      | 33/100 [05:19<07:44,  6.93s/it]

[I 2025-11-16 18:58:51,368] Trial 35 finished with value: 0.5821846823536259 and parameters: {'max_depth': 11, 'learning_rate': 0.019969317478214385, 'n_estimators': 781, 'subsample': 0.7093407486396237, 'colsample_bytree': 0.5855405705427272, 'min_child_weight': 1, 'gamma': 4.82992810339551}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  34%|███▍      | 34/100 [05:23<06:40,  6.06s/it]

[I 2025-11-16 18:58:55,417] Trial 36 finished with value: 0.5821644755376583 and parameters: {'max_depth': 10, 'learning_rate': 0.022972579996294557, 'n_estimators': 767, 'subsample': 0.7125101661742773, 'colsample_bytree': 0.5945499767635503, 'min_child_weight': 3, 'gamma': 4.715728141135144}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  35%|███▌      | 35/100 [05:33<07:55,  7.32s/it]

[I 2025-11-16 18:59:05,653] Trial 37 finished with value: 0.5709083735726989 and parameters: {'max_depth': 10, 'learning_rate': 0.0199190494823888, 'n_estimators': 774, 'subsample': 0.9971519420787243, 'colsample_bytree': 0.6010721665071274, 'min_child_weight': 3, 'gamma': 3.035448481201411}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  36%|███▌      | 36/100 [05:45<09:17,  8.72s/it]

[I 2025-11-16 18:59:17,639] Trial 38 finished with value: 0.5798585205995725 and parameters: {'max_depth': 17, 'learning_rate': 0.02202475222460408, 'n_estimators': 751, 'subsample': 0.7196139099005066, 'colsample_bytree': 0.6020144881507654, 'min_child_weight': 3, 'gamma': 4.9708785040686045}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  37%|███▋      | 37/100 [05:46<06:39,  6.33s/it]

[I 2025-11-16 18:59:18,412] Trial 39 finished with value: 0.5804299230272489 and parameters: {'max_depth': 17, 'learning_rate': 0.022868181099526333, 'n_estimators': 739, 'subsample': 0.7205736825884798, 'colsample_bytree': 0.6032524699057433, 'min_child_weight': 3, 'gamma': 4.988941618841755}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  38%|███▊      | 38/100 [05:54<07:04,  6.85s/it]

[I 2025-11-16 18:59:26,470] Trial 40 finished with value: 0.5808056987824005 and parameters: {'max_depth': 15, 'learning_rate': 0.02087246384619537, 'n_estimators': 736, 'subsample': 0.7256405429401023, 'colsample_bytree': 0.5832749712924838, 'min_child_weight': 3, 'gamma': 4.64336592494205}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  39%|███▉      | 39/100 [05:56<05:27,  5.38s/it]

[I 2025-11-16 18:59:28,406] Trial 41 finished with value: 0.5784363944580326 and parameters: {'max_depth': 15, 'learning_rate': 0.022092402085776494, 'n_estimators': 741, 'subsample': 0.7285983415133187, 'colsample_bytree': 0.5697922065709445, 'min_child_weight': 3, 'gamma': 4.576052016530198}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  40%|████      | 40/100 [06:14<09:11,  9.20s/it]

[I 2025-11-16 18:59:46,524] Trial 42 finished with value: 0.5804005533125906 and parameters: {'max_depth': 16, 'learning_rate': 0.022411736483397823, 'n_estimators': 738, 'subsample': 0.727162022710046, 'colsample_bytree': 0.5920590143788136, 'min_child_weight': 3, 'gamma': 4.654402231180257}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  41%|████      | 41/100 [06:15<06:33,  6.66s/it]

[I 2025-11-16 18:59:47,271] Trial 43 finished with value: 0.5801832558484813 and parameters: {'max_depth': 17, 'learning_rate': 0.021549070503701983, 'n_estimators': 747, 'subsample': 0.7329888503301677, 'colsample_bytree': 0.5739746576172581, 'min_child_weight': 3, 'gamma': 4.732136324101827}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  42%|████▏     | 42/100 [06:19<05:36,  5.80s/it]

[I 2025-11-16 18:59:51,065] Trial 19 finished with value: 0.5584269347877698 and parameters: {'max_depth': 23, 'learning_rate': 0.019160812746714787, 'n_estimators': 947, 'subsample': 0.524878806261802, 'colsample_bytree': 0.9965253784416417, 'min_child_weight': 7, 'gamma': 0.1447663635232732}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  43%|████▎     | 43/100 [06:21<04:41,  4.95s/it]

[I 2025-11-16 18:59:54,010] Trial 44 finished with value: 0.5808709480788724 and parameters: {'max_depth': 17, 'learning_rate': 0.021677263898556426, 'n_estimators': 736, 'subsample': 0.7269154721247566, 'colsample_bytree': 0.573679246738255, 'min_child_weight': 3, 'gamma': 4.561687543063101}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  44%|████▍     | 44/100 [06:41<08:43,  9.35s/it]

[I 2025-11-16 19:00:13,627] Trial 45 finished with value: 0.5836823450029919 and parameters: {'max_depth': 17, 'learning_rate': 0.020824598320463544, 'n_estimators': 877, 'subsample': 0.6144794826358013, 'colsample_bytree': 0.6526706011461053, 'min_child_weight': 2, 'gamma': 4.51771355522197}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  45%|████▌     | 45/100 [06:42<06:16,  6.85s/it]

[I 2025-11-16 19:00:14,646] Trial 50 finished with value: 0.5798667364506384 and parameters: {'max_depth': 7, 'learning_rate': 0.03447439482882449, 'n_estimators': 560, 'subsample': 0.632584550833774, 'colsample_bytree': 0.6524448711668888, 'min_child_weight': 2, 'gamma': 4.498201455237005}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  46%|████▌     | 46/100 [06:44<04:47,  5.33s/it]

[I 2025-11-16 19:00:16,432] Trial 49 finished with value: 0.5761806058761345 and parameters: {'max_depth': 14, 'learning_rate': 0.032774708122985226, 'n_estimators': 524, 'subsample': 0.6373100002216038, 'colsample_bytree': 0.8800270599763903, 'min_child_weight': 2, 'gamma': 4.455574661005593}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  47%|████▋     | 47/100 [06:46<03:58,  4.49s/it]

[I 2025-11-16 19:00:18,970] Trial 23 finished with value: 0.5749365357040425 and parameters: {'max_depth': 12, 'learning_rate': 0.012637688176645828, 'n_estimators': 784, 'subsample': 0.6560392330215803, 'colsample_bytree': 0.515438493978555, 'min_child_weight': 1, 'gamma': 0.2913026354954318}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  48%|████▊     | 48/100 [06:50<03:32,  4.09s/it]

[I 2025-11-16 19:00:22,127] Trial 46 finished with value: 0.584747866736933 and parameters: {'max_depth': 17, 'learning_rate': 0.021832317033776215, 'n_estimators': 871, 'subsample': 0.6278746797505035, 'colsample_bytree': 0.6552482288723993, 'min_child_weight': 2, 'gamma': 4.461796885633412}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  49%|████▉     | 49/100 [06:58<04:37,  5.44s/it]

[I 2025-11-16 19:00:30,705] Trial 48 finished with value: 0.5789250508165944 and parameters: {'max_depth': 14, 'learning_rate': 0.03466835781101934, 'n_estimators': 891, 'subsample': 0.6356377119865873, 'colsample_bytree': 0.8979972381633725, 'min_child_weight': 2, 'gamma': 4.47270506774311}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  50%|█████     | 50/100 [06:59<03:17,  3.95s/it]

[I 2025-11-16 19:00:31,188] Trial 47 finished with value: 0.5777574070978284 and parameters: {'max_depth': 16, 'learning_rate': 0.035316048625701604, 'n_estimators': 891, 'subsample': 0.6240756193964966, 'colsample_bytree': 0.6460967769328237, 'min_child_weight': 2, 'gamma': 4.578795215076145}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  51%|█████     | 51/100 [07:28<09:22, 11.48s/it]

[I 2025-11-16 19:01:00,219] Trial 26 finished with value: 0.5724876773213885 and parameters: {'max_depth': 13, 'learning_rate': 0.013627495539461458, 'n_estimators': 828, 'subsample': 0.6881350365043829, 'colsample_bytree': 0.5058831445759524, 'min_child_weight': 1, 'gamma': 0.3998588625842192}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  52%|█████▏    | 52/100 [07:28<06:31,  8.15s/it]

[I 2025-11-16 19:01:00,622] Trial 51 finished with value: 0.58034755121374 and parameters: {'max_depth': 7, 'learning_rate': 0.033910143464765286, 'n_estimators': 876, 'subsample': 0.6263461220708783, 'colsample_bytree': 0.6653684274568712, 'min_child_weight': 2, 'gamma': 4.430612190014969}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  53%|█████▎    | 53/100 [07:29<04:42,  6.02s/it]

[I 2025-11-16 19:01:01,650] Trial 52 finished with value: 0.5791274004404928 and parameters: {'max_depth': 7, 'learning_rate': 0.03391899208406078, 'n_estimators': 865, 'subsample': 0.6207535362709431, 'colsample_bytree': 0.6556836714499314, 'min_child_weight': 2, 'gamma': 4.054069717711376}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  54%|█████▍    | 54/100 [07:32<03:53,  5.07s/it]

[I 2025-11-16 19:01:04,516] Trial 55 finished with value: 0.5786112792960331 and parameters: {'max_depth': 7, 'learning_rate': 0.0340051480569202, 'n_estimators': 557, 'subsample': 0.6280856253928553, 'colsample_bytree': 0.6463982213797527, 'min_child_weight': 2, 'gamma': 4.0926861522444025}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  55%|█████▌    | 55/100 [07:34<03:01,  4.04s/it]

[I 2025-11-16 19:01:06,149] Trial 54 finished with value: 0.5840622988214581 and parameters: {'max_depth': 7, 'learning_rate': 0.0324015030863313, 'n_estimators': 874, 'subsample': 0.6411019433112727, 'colsample_bytree': 0.6528365366128696, 'min_child_weight': 2, 'gamma': 4.360402919648676}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  56%|█████▌    | 56/100 [07:42<03:52,  5.29s/it]

[I 2025-11-16 19:01:14,343] Trial 53 finished with value: 0.5795603265281505 and parameters: {'max_depth': 7, 'learning_rate': 0.032800883200657514, 'n_estimators': 870, 'subsample': 0.6201998134899437, 'colsample_bytree': 0.648013934844267, 'min_child_weight': 2, 'gamma': 4.395107644424768}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  58%|█████▊    | 58/100 [08:01<04:40,  6.69s/it]

[I 2025-11-16 19:01:33,606] Trial 56 finished with value: 0.5775055138685128 and parameters: {'max_depth': 14, 'learning_rate': 0.03323227808350662, 'n_estimators': 872, 'subsample': 0.6332490859586639, 'colsample_bytree': 0.8776639214724029, 'min_child_weight': 2, 'gamma': 3.956045555504856}. Best is trial 25 with value: 0.5860937815968164.
[I 2025-11-16 19:01:33,776] Trial 61 finished with value: 0.5777554126181759 and parameters: {'max_depth': 8, 'learning_rate': 0.049728193483974364, 'n_estimators': 691, 'subsample': 0.5976527174299864, 'colsample_bytree': 0.6724441231098109, 'min_child_weight': 6, 'gamma': 4.0340837860394485}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  59%|█████▉    | 59/100 [08:12<05:26,  7.97s/it]

[I 2025-11-16 19:01:44,735] Trial 60 finished with value: 0.5766713607919342 and parameters: {'max_depth': 8, 'learning_rate': 0.04948773661193613, 'n_estimators': 880, 'subsample': 0.6126136321342895, 'colsample_bytree': 0.636628213626196, 'min_child_weight': 4, 'gamma': 3.982075137881017}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  60%|██████    | 60/100 [08:16<04:25,  6.64s/it]

[I 2025-11-16 19:01:48,278] Trial 58 finished with value: 0.5795461494301074 and parameters: {'max_depth': 8, 'learning_rate': 0.01693223111607464, 'n_estimators': 880, 'subsample': 0.6218862725017695, 'colsample_bytree': 0.6907178191097318, 'min_child_weight': 2, 'gamma': 3.9243930583600344}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  61%|██████    | 61/100 [08:19<03:34,  5.49s/it]

[I 2025-11-16 19:01:51,097] Trial 57 finished with value: 0.580479554707138 and parameters: {'max_depth': 8, 'learning_rate': 0.016081771600339842, 'n_estimators': 886, 'subsample': 0.6121999550400072, 'colsample_bytree': 0.6831765368403232, 'min_child_weight': 2, 'gamma': 3.9218668428913634}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  62%|██████▏   | 62/100 [08:27<03:57,  6.24s/it]

[I 2025-11-16 19:01:59,080] Trial 59 finished with value: 0.582256784061794 and parameters: {'max_depth': 8, 'learning_rate': 0.016277979085921872, 'n_estimators': 1000, 'subsample': 0.6171023118580865, 'colsample_bytree': 0.678588057356277, 'min_child_weight': 6, 'gamma': 4.06154051384581}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  63%|██████▎   | 63/100 [08:50<07:01, 11.39s/it]

[I 2025-11-16 19:02:22,481] Trial 65 finished with value: 0.5838736402778398 and parameters: {'max_depth': 9, 'learning_rate': 0.016499686187329065, 'n_estimators': 684, 'subsample': 0.6009255556376168, 'colsample_bytree': 0.7181943834463532, 'min_child_weight': 4, 'gamma': 3.8309908294997435}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  64%|██████▍   | 64/100 [08:55<05:39,  9.42s/it]

[I 2025-11-16 19:02:27,301] Trial 62 finished with value: 0.5813345594699636 and parameters: {'max_depth': 9, 'learning_rate': 0.017160064442305376, 'n_estimators': 862, 'subsample': 0.5940363582978639, 'colsample_bytree': 0.6874784291725495, 'min_child_weight': 6, 'gamma': 4.095922261395224}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  65%|██████▌   | 65/100 [09:02<05:02,  8.65s/it]

[I 2025-11-16 19:02:34,162] Trial 64 finished with value: 0.5807059060959879 and parameters: {'max_depth': 9, 'learning_rate': 0.016815667970367133, 'n_estimators': 989, 'subsample': 0.7649955882509145, 'colsample_bytree': 0.687808705510485, 'min_child_weight': 1, 'gamma': 3.849076908929302}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  66%|██████▌   | 66/100 [09:09<04:45,  8.40s/it]

[I 2025-11-16 19:02:41,964] Trial 66 finished with value: 0.58166173101178 and parameters: {'max_depth': 9, 'learning_rate': 0.015900619676536297, 'n_estimators': 988, 'subsample': 0.7527222745642423, 'colsample_bytree': 0.6910105355106175, 'min_child_weight': 4, 'gamma': 3.65064294988883}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  67%|██████▋   | 67/100 [09:11<03:34,  6.50s/it]

[I 2025-11-16 19:02:44,026] Trial 63 finished with value: 0.580433139825009 and parameters: {'max_depth': 9, 'learning_rate': 0.01682930079026429, 'n_estimators': 989, 'subsample': 0.6037297008418322, 'colsample_bytree': 0.6898620329322538, 'min_child_weight': 4, 'gamma': 3.9497124105788552}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  68%|██████▊   | 68/100 [09:37<06:33, 12.29s/it]

[I 2025-11-16 19:03:09,818] Trial 68 finished with value: 0.5814867979577848 and parameters: {'max_depth': 9, 'learning_rate': 0.027112686486635017, 'n_estimators': 979, 'subsample': 0.5953259322329223, 'colsample_bytree': 0.6882144473543981, 'min_child_weight': 4, 'gamma': 3.4719643489719387}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  69%|██████▉   | 69/100 [09:47<05:56, 11.50s/it]

[I 2025-11-16 19:03:19,485] Trial 71 finished with value: 0.5830504655125327 and parameters: {'max_depth': 10, 'learning_rate': 0.017805482391207068, 'n_estimators': 978, 'subsample': 0.7636140196680957, 'colsample_bytree': 0.5320863795203956, 'min_child_weight': 4, 'gamma': 3.542521138150554}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  70%|███████   | 70/100 [09:54<05:07, 10.24s/it]

[I 2025-11-16 19:03:26,786] Trial 69 finished with value: 0.5764072461285626 and parameters: {'max_depth': 10, 'learning_rate': 0.01618007928914443, 'n_estimators': 703, 'subsample': 0.7502347932572871, 'colsample_bytree': 0.707838937299359, 'min_child_weight': 4, 'gamma': 1.9574333932949024}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  71%|███████   | 71/100 [10:05<05:00, 10.36s/it]

[I 2025-11-16 19:03:37,416] Trial 73 finished with value: 0.572646161355509 and parameters: {'max_depth': 4, 'learning_rate': 0.0286572554579726, 'n_estimators': 965, 'subsample': 0.7590125492015761, 'colsample_bytree': 0.7212639541511705, 'min_child_weight': 1, 'gamma': 1.631632177606215}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  72%|███████▏  | 72/100 [10:09<03:54,  8.37s/it]

[I 2025-11-16 19:03:41,149] Trial 72 finished with value: 0.5731472338573775 and parameters: {'max_depth': 4, 'learning_rate': 0.027274705569766716, 'n_estimators': 975, 'subsample': 0.5700772549175835, 'colsample_bytree': 0.5371025531841351, 'min_child_weight': 1, 'gamma': 1.8201441851971354}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  73%|███████▎  | 73/100 [10:21<04:20,  9.65s/it]

[I 2025-11-16 19:03:53,805] Trial 75 finished with value: 0.5808923325372762 and parameters: {'max_depth': 19, 'learning_rate': 0.028357698623783602, 'n_estimators': 950, 'subsample': 0.6531589754490166, 'colsample_bytree': 0.5323820424315446, 'min_child_weight': 1, 'gamma': 3.597814660683035}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  74%|███████▍  | 74/100 [10:23<03:09,  7.29s/it]

[I 2025-11-16 19:03:55,573] Trial 70 finished with value: 0.572823334905283 and parameters: {'max_depth': 10, 'learning_rate': 0.01688777080738103, 'n_estimators': 696, 'subsample': 0.5648097615101039, 'colsample_bytree': 0.7136706364601675, 'min_child_weight': 4, 'gamma': 1.9959343232092355}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  75%|███████▌  | 75/100 [10:25<02:22,  5.69s/it]

[I 2025-11-16 19:03:57,547] Trial 74 finished with value: 0.5711356097569341 and parameters: {'max_depth': 4, 'learning_rate': 0.02839345837098283, 'n_estimators': 955, 'subsample': 0.649838946342742, 'colsample_bytree': 0.5307247325440325, 'min_child_weight': 1, 'gamma': 2.0763876762846794}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  76%|███████▌  | 76/100 [10:25<01:38,  4.09s/it]

[I 2025-11-16 19:03:57,904] Trial 78 finished with value: 0.5802237475431226 and parameters: {'max_depth': 5, 'learning_rate': 0.029435915166826324, 'n_estimators': 942, 'subsample': 0.6939556684222122, 'colsample_bytree': 0.5429856419560954, 'min_child_weight': 1, 'gamma': 4.299065585112549}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  77%|███████▋  | 77/100 [10:29<01:34,  4.11s/it]

[I 2025-11-16 19:04:02,034] Trial 67 finished with value: 0.5703712416493266 and parameters: {'max_depth': 9, 'learning_rate': 0.027173376263535505, 'n_estimators': 990, 'subsample': 0.5808967980107259, 'colsample_bytree': 0.6962263529158155, 'min_child_weight': 4, 'gamma': 1.9340684069196385}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  78%|███████▊  | 78/100 [10:41<02:18,  6.28s/it]

[I 2025-11-16 19:04:13,380] Trial 79 finished with value: 0.5679293276070024 and parameters: {'max_depth': 3, 'learning_rate': 0.06054816761641746, 'n_estimators': 929, 'subsample': 0.6935350146292079, 'colsample_bytree': 0.5373844796882261, 'min_child_weight': 1, 'gamma': 4.290254203400799}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  79%|███████▉  | 79/100 [11:00<03:34, 10.21s/it]

[I 2025-11-16 19:04:32,754] Trial 81 finished with value: 0.5748800513834729 and parameters: {'max_depth': 3, 'learning_rate': 0.02568349593682339, 'n_estimators': 810, 'subsample': 0.5628291534837029, 'colsample_bytree': 0.535617501362441, 'min_child_weight': 1, 'gamma': 4.281528178109213}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  80%|████████  | 80/100 [11:13<03:40, 11.02s/it]

[I 2025-11-16 19:04:45,670] Trial 82 finished with value: 0.5752555190947769 and parameters: {'max_depth': 5, 'learning_rate': 0.06193416036977873, 'n_estimators': 936, 'subsample': 0.6526161904306667, 'colsample_bytree': 0.7250438563682062, 'min_child_weight': 1, 'gamma': 4.284998148232381}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  81%|████████  | 81/100 [11:19<03:01,  9.56s/it]

[I 2025-11-16 19:04:51,830] Trial 77 finished with value: 0.5791687197705394 and parameters: {'max_depth': 13, 'learning_rate': 0.02693373413543213, 'n_estimators': 921, 'subsample': 0.5743914910103534, 'colsample_bytree': 0.5322716690655802, 'min_child_weight': 1, 'gamma': 2.0043222975356176}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  82%|████████▏ | 82/100 [11:21<02:09,  7.20s/it]

[I 2025-11-16 19:04:53,518] Trial 76 finished with value: 0.5736726608989328 and parameters: {'max_depth': 19, 'learning_rate': 0.02687613772065787, 'n_estimators': 945, 'subsample': 0.6566286524356815, 'colsample_bytree': 0.5335081751378468, 'min_child_weight': 1, 'gamma': 1.7507362069922872}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  83%|████████▎ | 83/100 [11:31<02:14,  7.91s/it]

[I 2025-11-16 19:05:03,075] Trial 84 finished with value: 0.5822254816160793 and parameters: {'max_depth': 13, 'learning_rate': 0.02525853311222472, 'n_estimators': 803, 'subsample': 0.6946061036275457, 'colsample_bytree': 0.7274876944243529, 'min_child_weight': 1, 'gamma': 4.255947154998409}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  84%|████████▍ | 84/100 [11:34<01:45,  6.61s/it]

[I 2025-11-16 19:05:06,647] Trial 80 finished with value: 0.5717235043468961 and parameters: {'max_depth': 19, 'learning_rate': 0.05906921746984937, 'n_estimators': 923, 'subsample': 0.570124585887269, 'colsample_bytree': 0.531328378614973, 'min_child_weight': 1, 'gamma': 1.8248694682450561}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  85%|████████▌ | 85/100 [11:39<01:33,  6.22s/it]

[I 2025-11-16 19:05:11,961] Trial 85 finished with value: 0.5835238613838037 and parameters: {'max_depth': 13, 'learning_rate': 0.02506749258931297, 'n_estimators': 917, 'subsample': 0.6950039825899554, 'colsample_bytree': 0.6258358517099614, 'min_child_weight': 1, 'gamma': 4.245110556369324}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  86%|████████▌ | 86/100 [11:48<01:37,  6.93s/it]

[I 2025-11-16 19:05:20,557] Trial 83 finished with value: 0.577111608451104 and parameters: {'max_depth': 13, 'learning_rate': 0.025561572088722548, 'n_estimators': 922, 'subsample': 0.6527609934743747, 'colsample_bytree': 0.6177108929911844, 'min_child_weight': 1, 'gamma': 3.1591779008329266}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  87%|████████▋ | 87/100 [11:49<01:05,  5.03s/it]

[I 2025-11-16 19:05:21,138] Trial 86 finished with value: 0.5838822474409929 and parameters: {'max_depth': 13, 'learning_rate': 0.013726564769941038, 'n_estimators': 843, 'subsample': 0.6637018621851107, 'colsample_bytree': 0.6283882519723737, 'min_child_weight': 1, 'gamma': 4.271055091883526}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  88%|████████▊ | 88/100 [11:57<01:12,  6.04s/it]

[I 2025-11-16 19:05:29,559] Trial 89 finished with value: 0.5692237336956517 and parameters: {'max_depth': 13, 'learning_rate': 0.04224327784778161, 'n_estimators': 805, 'subsample': 0.665097978614731, 'colsample_bytree': 0.6211136215705504, 'min_child_weight': 5, 'gamma': 2.9857712602773043}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  89%|████████▉ | 89/100 [12:09<01:26,  7.87s/it]

[I 2025-11-16 19:05:41,693] Trial 87 finished with value: 0.582046375368541 and parameters: {'max_depth': 11, 'learning_rate': 0.01338857793613642, 'n_estimators': 805, 'subsample': 0.6598847937414218, 'colsample_bytree': 0.6221157017269151, 'min_child_weight': 1, 'gamma': 3.2159788604093182}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  90%|█████████ | 90/100 [12:24<01:39,  9.96s/it]

[I 2025-11-16 19:05:56,525] Trial 91 finished with value: 0.5832960033996512 and parameters: {'max_depth': 13, 'learning_rate': 0.013625047542541919, 'n_estimators': 850, 'subsample': 0.6677799643970004, 'colsample_bytree': 0.6270829679168067, 'min_child_weight': 10, 'gamma': 4.863269622527618}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  91%|█████████ | 91/100 [12:30<01:18,  8.76s/it]

[I 2025-11-16 19:06:02,483] Trial 88 finished with value: 0.5810765320283787 and parameters: {'max_depth': 13, 'learning_rate': 0.024768894738190037, 'n_estimators': 800, 'subsample': 0.5338155041485471, 'colsample_bytree': 0.6204315175371157, 'min_child_weight': 1, 'gamma': 3.0729641215928187}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  92%|█████████▏| 92/100 [12:33<00:57,  7.17s/it]

[I 2025-11-16 19:06:05,947] Trial 93 finished with value: 0.5805894294539335 and parameters: {'max_depth': 11, 'learning_rate': 0.01324896852370171, 'n_estimators': 843, 'subsample': 0.6711356495925984, 'colsample_bytree': 0.6259478925643881, 'min_child_weight': 2, 'gamma': 4.919564136524371}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  93%|█████████▎| 93/100 [12:35<00:39,  5.61s/it]

[I 2025-11-16 19:06:07,898] Trial 92 finished with value: 0.5829631900344895 and parameters: {'max_depth': 11, 'learning_rate': 0.012951536044748548, 'n_estimators': 847, 'subsample': 0.5342888617638161, 'colsample_bytree': 0.6265082768741779, 'min_child_weight': 2, 'gamma': 4.877742266151862}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  94%|█████████▍| 94/100 [12:37<00:26,  4.45s/it]

[I 2025-11-16 19:06:09,658] Trial 94 finished with value: 0.5834134139468953 and parameters: {'max_depth': 11, 'learning_rate': 0.013752761866618664, 'n_estimators': 846, 'subsample': 0.6660018166941497, 'colsample_bytree': 0.6288492766134078, 'min_child_weight': 2, 'gamma': 4.8717292404487935}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  95%|█████████▌| 95/100 [12:41<00:21,  4.25s/it]

[I 2025-11-16 19:06:13,434] Trial 95 finished with value: 0.5839711586350419 and parameters: {'max_depth': 11, 'learning_rate': 0.012980799221178167, 'n_estimators': 842, 'subsample': 0.5439832521027111, 'colsample_bytree': 0.6243204091281616, 'min_child_weight': 2, 'gamma': 4.886923703830052}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  96%|█████████▌| 96/100 [12:42<00:13,  3.30s/it]

[I 2025-11-16 19:06:14,519] Trial 90 finished with value: 0.5830998160728945 and parameters: {'max_depth': 13, 'learning_rate': 0.01261447980631593, 'n_estimators': 846, 'subsample': 0.6638352278884069, 'colsample_bytree': 0.767771623780218, 'min_child_weight': 1, 'gamma': 3.275773100611598}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  97%|█████████▋| 97/100 [12:43<00:08,  2.67s/it]

[I 2025-11-16 19:06:15,743] Trial 98 finished with value: 0.5834886083740359 and parameters: {'max_depth': 11, 'learning_rate': 0.013468148229167814, 'n_estimators': 842, 'subsample': 0.6662789976254881, 'colsample_bytree': 0.6346241928690333, 'min_child_weight': 10, 'gamma': 4.910175500805359}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094:  99%|█████████▉| 99/100 [12:45<00:01,  1.79s/it]

[I 2025-11-16 19:06:17,874] Trial 97 finished with value: 0.584264315171631 and parameters: {'max_depth': 11, 'learning_rate': 0.013328445620282417, 'n_estimators': 844, 'subsample': 0.6697501055073043, 'colsample_bytree': 0.5569140824244473, 'min_child_weight': 2, 'gamma': 4.911243155800612}. Best is trial 25 with value: 0.5860937815968164.
[I 2025-11-16 19:06:17,980] Trial 99 finished with value: 0.5840016464836472 and parameters: {'max_depth': 11, 'learning_rate': 0.01349311517074966, 'n_estimators': 830, 'subsample': 0.5436187205207846, 'colsample_bytree': 0.55597533186064, 'min_child_weight': 2, 'gamma': 4.8392964603102895}. Best is trial 25 with value: 0.5860937815968164.


Best trial: 25. Best value: 0.586094: 100%|██████████| 100/100 [12:46<00:00,  7.66s/it]

[I 2025-11-16 19:06:18,339] Trial 96 finished with value: 0.5835482383474999 and parameters: {'max_depth': 16, 'learning_rate': 0.013052150955111525, 'n_estimators': 654, 'subsample': 0.5450894774975558, 'colsample_bytree': 0.6309335511329779, 'min_child_weight': 2, 'gamma': 3.290521800550715}. Best is trial 25 with value: 0.5860937815968164.

Best parameters: {'max_depth': 13, 'learning_rate': 0.010182778742372344, 'n_estimators': 796, 'subsample': 0.6751554671776667, 'colsample_bytree': 0.5000599650652335, 'min_child_weight': 1, 'gamma': 2.765943401110309}





In [None]:
# iterations:
# max_depth=14, learning_rate=0.010573629786541423, n_estimators=140, subsample=0.6926815018566853, colsample_bytree=0.6226063389661705, min_child_weight=4, gamma=2.9740841251220034   | 57.90%
# max_depth=30, learning_rate=0.03192091052817414, n_estimators=105, subsample=0.5270193265047416, colsample_bytree=0.5489363584973745, min_child_weight=4, gamma=3.733593582255768     | 58.45%
# max_depth=30, learning_rate=0.03192091052817414, n_estimators=1000, subsample=0.5270193265047416, colsample_bytree=0.5489363584973745, min_child_weight=4, gamma=3.733593582255768    | 57.46%
# max_depth=30, learning_rate=0.03192091052817414, n_estimators=105, subsample=0.6926815018566853, colsample_bytree=0.6226063389661705, min_child_weight=4, gamma=3.733593582255768     | 57.57%

classifier = xgb.XGBClassifier(max_depth=13, learning_rate=0.010182778742372344, n_estimators=796, subsample=0.6751554671776667, colsample_bytree=0.5000599650652335, min_child_weight=1, gamma=2.765943401110309)
classifier.fit(X_train, y_train, sw_train)

# test the model
y_pred = classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)



# cross-validation
from sklearn.model_selection import cross_val_score, KFold

# define number of k-folds
num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# perform k-fold cross-validation
cross_val_results = cross_val_score(classifier, X_train, y_train, cv=kf)

# evaluation metrics
print("Cross-Validation Results (Accuracy): ")
for i, result in enumerate(cross_val_results, 1):
    print(f" Fold {i}: {result * 100:.2f}%")

print(f"Mean Accuracy: {cross_val_results.mean() * 100:.2f}%")



Accuracy: 0.5682980277574872
Cross-Validation Results (Accuracy): 
 Fold 1: 58.08%
 Fold 2: 57.81%
 Fold 3: 56.26%
 Fold 4: 57.13%
 Fold 5: 56.22%
Mean Accuracy: 57.10%


In [None]:
import joblib

# save the model
joblib.dump(classifier, f'./V{v}_NFL[1999-2025].joblib')

['./V3.1_NFL[1999-2025].joblib']

#### 6. TEST THE MODEL WITH NEW DATA

In [None]:
import joblib
import pandas as pd

# load the model
rf = joblib.load(f'./V{v}_NFL[1999-2025].joblib')

# load the dataset
df = pd.read_csv(f'V{v}_NFL_gbg_data[1999-2025].csv')
df.drop(
    df.columns[df.columns.str.contains(
    'unnamed', case=False)], axis=1, inplace=True
    )

df

Unnamed: 0,year,week,away_team,away_off_epa,away_def_epa,away_qb_epa,away_off_dvoa,away_def_dvoa,away_dsr,away_ay/a,...,home_team,home_off_epa,home_def_epa,home_qb_epa,home_off_dvoa,home_def_dvoa,home_dsr,home_ay/a,home_nprg,result
0,1999,1,ARI,-0.124676,-0.250785,-0.124676,-8.238406,-17.582923,0.056274,3.180000,...,PHI,-0.250785,-0.124676,-0.250785,-17.582923,-8.238406,0.064825,1.366667,0.0,False
1,1999,1,BUF,-0.209526,0.103372,-0.209526,-14.525645,8.659734,0.081535,4.893617,...,IND,0.103372,-0.209526,0.103372,8.659734,-14.525645,0.088698,7.090909,0.0,True
2,1999,1,CAR,-0.350204,-0.204681,-0.350204,-24.949785,-14.166638,0.061269,4.666667,...,NO,-0.204681,-0.350204,-0.204681,-14.166638,-24.949785,0.057093,5.846154,0.0,True
3,1999,1,CIN,0.173752,0.039889,0.173752,13.874885,3.955703,0.080271,3.972973,...,TEN,0.039889,0.173752,0.122393,3.955703,13.874885,0.076709,10.470588,0.0,True
4,1999,1,DAL,0.277992,0.200565,0.277992,21.598945,15.861709,0.065918,6.540000,...,WAS,0.200565,0.277992,0.200565,15.861709,21.598945,0.071992,11.722222,0.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6837,2025,10,CAR,-0.300061,0.142840,-0.300061,-21.234240,11.584328,0.113978,2.925926,...,NO,0.142840,-0.300061,0.142840,11.584328,-21.234240,0.089779,10.733333,0.0,False
6838,2025,10,CHI,0.076580,-0.005457,0.076580,6.674534,0.595623,0.091413,6.666667,...,NYG,-0.005457,0.076580,-0.005457,0.595623,6.674534,0.084687,7.175000,0.0,True
6839,2025,10,GB,-0.152826,-0.069291,-0.152826,-10.324250,-4.134427,0.091033,4.631579,...,PHI,-0.069291,-0.152826,-0.069291,-4.134427,-10.324250,0.106239,7.807692,0.0,False
6840,2025,10,LAC,-0.111648,-0.372950,-0.111648,-7.273024,-26.635203,0.076759,6.315789,...,PIT,-0.372950,-0.111648,-0.372950,-26.635203,-7.273024,0.074713,2.676471,0.0,True


In [None]:
# dictionary for team name mapping. team name -> number for the model's interpretation
team_map = {
    'NO': 1,
    'BUF': 2,
    'JAX': 3,
    'CLE': 4,
    'PHI': 5,
    'GB': 6,
    'LA': 7,
    'LAC': 8,
    'NE': 9,
    'IND': 10,
    'CHI': 11,
    'WAS': 12,
    'NYJ': 13,
    'SEA': 14,
    'ATL': 15,
    'DEN': 16,
    'MIN': 17,
    'ARI': 18,
    'DET': 19,
    'BAL': 20,
    'CIN': 21,
    'LV': 22,
    'TEN': 23,
    'MIA': 24,
    'DAL': 25,
    'KC': 26,
    'PIT': 27,
    'HOU': 28,
    'SF': 29,
    'CAR': 30,
    'NYG': 31,
    'TB': 32
}

# the values we want to test: year, week, home & away teams
year, week           = 2025 ,  11
away_team, home_team = 'DET', 'PHI'
# columns needed to find the average of these metrics for both teams
cols = ['off_epa', 'def_epa', 'qb_epa', 'off_dvoa', 'def_dvoa', 'dsr', 'ay/a', 'nprg']
# max last number of games
n = 5

i = 1
while i <= n:
    # GET THE AVERAGES OF THE METRICS FROM THE LAST N GAMES
    # look at columns where this team is either away or home, whichever is most recent for last n games
    # away team
    away_data = df[(df['away_team'] == away_team) | (df['home_team'] == away_team)].tail(i)
    away_data = away_data[[f'away_{col}' for col in cols]].mean()

    # home team
    home_data = df[(df['away_team'] == home_team) | (df['home_team'] == home_team)].tail(i)
    home_data = home_data[[f'away_{col}' for col in cols]].mean()

    # CREATE A DICTIONARY TO HOLD THE NEW DATA
    new_data = {
        'year': year, 'week': week,
        # away stats
        'away_team': team_map[away_team],
        'away_off_epa': 0, 'away_def_epa': 0, 'away_qb_epa': 0,
        'away_off_dvoa': 0, 'away_def_dvoa': 0,
        'away_dsr': 0, 'away_ay/a': 0, 'away_nprg': 0,
        # home stats
        'home_team': team_map[home_team],
        'home_off_epa': 0, 'home_def_epa': 0, 'home_qb_epa': 0,
        'home_off_dvoa': 0, 'home_def_dvoa': 0,
        'home_dsr': 0, 'home_ay/a': 0, 'home_nprg': 0,
    }

    # assign the metrics to new data dictionary
    for col, metric in away_data.items():
        new_data[col] = metric
    for col, metric in home_data.items():
        new_data[col] = metric

    # create the new dataframe
    new_X = pd.DataFrame(data=new_data, index=[0])

    # PREDICT THE NEW VALUES
    pred = rf.predict(new_X)
    if pred == 1:
        print(f"{home_team} will win! Based on last {i} games.")
    else:
        print(f"{away_team} will win! Based on last {i} games.")

    i += 1

# W10: 61% -> 30% -> 42.8%
# W 9: 24% -> 50% -> 64.2%

PHI will win! Based on last 1 games.
PHI will win! Based on last 2 games.
PHI will win! Based on last 3 games.
PHI will win! Based on last 4 games.
PHI will win! Based on last 5 games.
