# Scrape stats from Pro Football Reference
## Keep stats as rolling averages for the season

In [None]:
###Now create a function that does all of this for a given team and year
import pandas as pd
import time
import random 

abbr_list = ['crd', 'atl', 'rav', 'buf', 'car', 'chi', 'cin', 'cle', 'dal', 'den', 'det', 'gnb', 'htx', 'clt', 'jax', 'kan', 'rai',
             'sdg', 'ram', 'mia', 'min', 'nwe', 'nor', 'nyg', 'nyj', 'phi', 'pit', 'sea', 'sfo', 'tam', 'oti', 'was']

stats = {'Result':'Rslt', 'Location':'Unnamed: 5', 'Week':'Week', 'Pass Yds':'Yds',  'Rush Yds':'Yds.2', 'Points per game':'Pts', 'Sacks':'Sk', 'Cmp':'Cmp%',
              'Plays':'Ply', '3DConv':'3DConv', 'Yards per Play':'Y/P'}


def create_stat_df(team:str, year:str, main_df:pd.DataFrame):
    ###Get the stat table we'll be scraping from
    url = 'https://www.pro-football-reference.com/teams/' + team + '/' + year + '/gamelog/'
    team_df = pd.read_html(url, header=1, attrs={'id':'table_pfr_team-year_game-logs_team-year-regular-season-game-log'})[0]
    opp_df = pd.read_html(url, header=1, attrs={'id':'table_pfr_team-year_game-logs_team-year-regular-season-opponent-game-log'})[0]
    ###Loop over the stats getting both team and opp info for each one
    print(f'Scraping stats for {team}, {year}')
    
    teamname_col = [f'{team}'] * (len(team_df) - 1)
    teamname_df = pd.DataFrame(teamname_col)
    teamname_df.columns = ['Team']
    main_df = pd.concat([main_df, teamname_df], axis=1)

    for stat in stats:
        if stat == 'Location' or stat == 'Week' or stat == "Result":
            ####Just loop once b/c we dont need opp info
            #Get stat
            season_tot = []
            for i in range(len(team_df)-1):                           
                season_tot.append(team_df[stats[stat]][i])
            
            #Concat to df
            tot_df = pd.DataFrame(season_tot)
            tot_df.columns = [stat]
            main_df = pd.concat([main_df, tot_df], axis=1)


        elif stat == "3DConv":
            third_pg, third_pg_a = [], []
            perc, perc_a = 0, 0

            for i in range(len(team_df)-1):
                perc += team_df['3DConv'][i] / team_df['3DAtt'][i]
                avg_third = perc / team_df['Week'][i]
                third_pg.append(avg_third.item())
            
                perc_a += opp_df['3DConv'][i] / opp_df['3DAtt'][i]
                avg_third_a = perc_a / opp_df['Week'][i]
                third_pg_a.append(avg_third_a.item())
            
            third_df = pd.DataFrame(third_pg)
            third_df.columns = ['3DConv']
            third_df_a = pd.DataFrame(third_pg_a)
            third_df_a.columns = ['3DConv Allowed']

            main_df = pd.concat([main_df, third_df, third_df_a], axis=1)


        else:
            #print(f'On stat: {stat}')
            team_pg, opp_pg = [], []
            team, opp = 0,0
            for i in range(len(team_df)-1):
                #print(f'team value is: {team}')
                team += team_df[stats[stat]][i]
                avg_team = team / team_df['Week'][i]
                team_pg.append(avg_team.item())

                #print(f'opp value is: {opp}')
                opp += opp_df[stats[stat]][i]
                avg_opp = opp / opp_df['Week'][i]
                opp_pg.append(avg_opp.item())

            cur_team_df = pd.DataFrame(team_pg)
            cur_team_df.columns = [stat]
            cur_opp_df = pd.DataFrame(opp_pg)
            cur_opp_df.columns = ["Opp " + stat]

            main_df = pd.concat([main_df, cur_team_df, cur_opp_df], axis=1)
            

    time.sleep(random.randint(8, 10))

    return main_df



# Generalize for all teams and all years

In [None]:
abbr_list = ['crd', 'atl', 'rav', 'buf', 'car', 'chi', 'cin', 'cle', 'dal', 'den', 'det', 'gnb', 'htx', 'clt', 'jax', 'kan', 'rai',
             'sdg', 'ram', 'mia', 'min', 'nwe', 'nor', 'nyg', 'nyj', 'phi', 'pit', 'sea', 'sfo', 'tam', 'oti', 'was']

#For now we'll scrape one year at a time for the purpose of merging with the weather df later (will update to scrape all years at once)
years = [str(season) for season in range(2024, 2025)]

overall_stats_df = pd.DataFrame()
main_df = pd.DataFrame()

for year in years:
    for team in abbr_list:
        complete_team_df = create_stat_df(team, year, main_df)
        overall_stats_df = pd.concat([overall_stats_df, complete_team_df], axis=0)

overall_stats_df

Scraping stats for crd, 2017
Scraping stats for atl, 2017
Scraping stats for rav, 2017
Scraping stats for buf, 2017
Scraping stats for car, 2017
Scraping stats for chi, 2017
Scraping stats for cin, 2017
Scraping stats for cle, 2017
Scraping stats for dal, 2017
Scraping stats for den, 2017
Scraping stats for det, 2017
Scraping stats for gnb, 2017
Scraping stats for htx, 2017
Scraping stats for clt, 2017
Scraping stats for jax, 2017
Scraping stats for kan, 2017
Scraping stats for rai, 2017
Scraping stats for sdg, 2017
Scraping stats for ram, 2017
Scraping stats for mia, 2017
Scraping stats for min, 2017
Scraping stats for nwe, 2017
Scraping stats for nor, 2017
Scraping stats for nyg, 2017
Scraping stats for nyj, 2017
Scraping stats for phi, 2017
Scraping stats for pit, 2017
Scraping stats for sea, 2017
Scraping stats for sfo, 2017
Scraping stats for tam, 2017
Scraping stats for oti, 2017
Scraping stats for was, 2017


Unnamed: 0,Team,Result,Location,Week,Pass Yds,Opp Pass Yds,Rush Yds,Opp Rush Yds,Points per game,Opp Points per game,Sacks,Opp Sacks,Cmp,Opp Cmp,Plays,Opp Plays,3DConv,3DConv Allowed,Yards per Play,Opp Yards per Play
0,crd,L,@,1.0,263.000000,285.000000,45.000000,82.000000,23.000000,23.000000,1.000000,1.000000,56.300000,70.700000,67.000000,69.000000,0.400000,0.500000,4.600000,5.320000
1,crd,W,@,2.0,284.500000,237.500000,64.000000,79.000000,19.500000,19.500000,2.500000,2.500000,54.550000,62.400000,66.000000,69.500000,0.400000,0.472222,5.290000,4.560000
2,crd,L,,3.0,284.000000,216.333333,59.000000,85.666667,18.666667,18.666667,3.666667,2.000000,56.500000,65.666667,69.000000,61.333333,0.414815,0.388889,5.003333,5.063333
3,crd,W,,4.0,292.250000,214.750000,57.000000,88.000000,18.500000,18.500000,4.250000,2.250000,58.550000,61.500000,71.500000,65.750000,0.389236,0.357456,4.917500,4.762500
4,crd,L,@,5.0,289.000000,231.200000,51.800000,94.800000,16.200000,16.200000,3.800000,2.000000,59.560000,63.200000,69.200000,65.400000,0.368532,0.414536,4.958000,5.120000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11,was,L,@,13.0,230.230769,205.307692,92.615385,108.615385,20.923077,20.923077,2.692308,2.230769,61.615385,56.615385,58.692308,57.615385,0.332468,0.370274,5.071538,4.973846
12,was,L,@,14.0,223.500000,213.071429,90.642857,113.285714,20.357143,20.357143,2.642857,2.214286,61.185714,56.564286,58.071429,58.571429,0.320625,0.372397,4.996429,5.109286
13,was,W,,15.0,221.066667,208.533333,86.666667,115.133333,20.333333,20.333333,2.533333,2.400000,61.720000,55.393333,57.333333,60.000000,0.306657,0.361606,4.972667,5.007333
14,was,W,,16.0,225.937500,206.187500,86.687500,117.875000,20.750000,20.750000,2.375000,2.500000,61.075000,55.550000,57.875000,60.687500,0.311529,0.357388,5.027500,4.985000


# Scrape weather data

In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random


def scrape_weather(year:str, base_df:pd.DataFrame):
    #for year in year:
    #We need to account for the switch to an 18 week season in 2021
    if int(year) <= 2020:
        weeks = [str(week) for week in range(1,18)]
    else:
        weeks = [str(week) for week in range(1,19)]
        
    for week in weeks:
        url = 'https://www.nflweather.com/week/' + year + '/week-' + week
        response = requests.get(url)
        response.raise_for_status()  # Raise error if request failed
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all game weather rows
        games = soup.find_all(class_='mx-2')
        #Get correspoinding team names
        teams = soup.find_all('span', class_="fw-bold")

        #Get team names from html for matchups
        team_list = []
        for i in range(len(teams) - 1):
            text = teams[i].get_text()
            if text != '@':
                team_list.append(text)
        #Create a list of matchups from team_list that correspond to the weather data
        matchups = []
        n = 0
        while n < len(team_list):
            vs = team_list[n] + " " + team_list[n+1]
            matchups.append(vs)
            n += 2  
        #Get the temperature and weather data from html
        m = 0
        temp = []
        weather = []
        while m < len(games):
            stripped_temp = games[m]
            stripped_weat = games[m+1]
            #print(stripped_temp.get_text(), stripped_weat.get_text())
            temp.append(games[m].get_text())
            weather.append(games[m+1].get_text())
            m += 2

        Matchup_df = pd.DataFrame(matchups)
        temp_df = pd.DataFrame(temp)
        weather_df = pd.DataFrame(weather)
        combined = pd.concat([Matchup_df, temp_df, weather_df], axis=1, ignore_index=True)
        combined.columns = ["Matchup", "Temperature", "Weather"]
        combined['Week'] = float(week)

        base_df = pd.concat([base_df, combined], axis=0)

        time.sleep(random.randint(8, 10))

            

    return base_df



In [None]:
###Now we can scrape the weather logs for a given year
year = str(2024)
base_df = pd.DataFrame()
weather_2024 = scrape_weather(year=year, base_df=base_df)

weather_2024

Unnamed: 0,Matchup,Temperature,Weather,Week
0,Chiefs Patriots,62 °F,Clear,1.0
1,Jets Bills,61 °F,Partly Cloudy,1.0
2,Eagles Washington,69 °F,Clear,1.0
3,Raiders Titans,73 °F,Clear,1.0
4,Jaguars Texans,80 °F,Clear,1.0
...,...,...,...,...
11,Bills Dolphins,71 °F,Clear,17.0
12,Raiders Chargers,62 °F,Clear,17.0
13,Chiefs Broncos,16 °F,Mostly Cloudy,17.0
14,49ers Rams,63 °F,Clear,17.0


In [None]:
team_name_conversion = {'Cardinals':'crd', 'Falcons':'atl', 'Ravens':'rav', 'Bills':'buf', 'Panthers':'car', 'Bears':'chi', 'Bengals':'cin', 'Browns':'cle', 'Cowboys':'dal', 'Broncos':'den', 'Lions':'det', 'Packers':'gnb', 'Texans':'htx', 'Colts':'clt', 'Jaguars':'jax', 'Chiefs':'kan', 'Raiders':'rai',
             'Chargers':'sdg', 'Rams':'ram', 'Dolphins':'mia', 'Vikings':'min', 'Patriots':'nwe', 'Saints':'nor', 'Giants':'nyg', 'Jets':'nyj', 'Eagles':'phi', 'Steelers':'pit', 'Seahawks':'sea', '49ers':'sfo', 'Buccaneers':'tam', 'Titans':'oti', 'Washington':'was'}

def edit_weather_columns(weather_df):
    ###This function will replace the team names in the 'Matchup' column with thier representation in the stats_df
    two_team_list = weather_df['Matchup']
    Matchup_array = []

    for i in two_team_list:
        fir_name, sec_name = i.split()
        for j in team_name_conversion:
            if j == fir_name:
                first = team_name_conversion[j]
            elif j == sec_name:
                second = team_name_conversion[j]

        matchup = first + " " + second
        Matchup_array.append(matchup)

    fixed_matchup_df = pd.DataFrame(Matchup_array)
    fixed_matchup_df.columns = ['Matchup']

    #Drop the column with the old team names
    dropped = weather_df.drop(['Matchup'], axis=1)
    dropped = dropped.reset_index(drop=True)
    #dropped


    #Concatenate the dataframe with the adjusted team names
    Weather_df_fin = pd.concat([fixed_matchup_df, dropped], axis=1, ignore_index=True)
    Weather_df_fin = Weather_df_fin.rename(columns={0:'Matchup', 1:'Temperature', 2:'Weather', 3:'Week'})

    return Weather_df_fin

In [None]:
renamed_2024_weather_df = edit_weather_columns(weather_2024)

renamed_2024_weather_df

Unnamed: 0,Matchup,Temperature,Weather,Week
0,kan nwe,62 °F,Clear,1.0
1,nyj buf,61 °F,Partly Cloudy,1.0
2,phi was,69 °F,Clear,1.0
3,rai oti,73 °F,Clear,1.0
4,jax htx,80 °F,Clear,1.0
...,...,...,...,...
251,buf mia,71 °F,Clear,17.0
252,rai sdg,62 °F,Clear,17.0
253,kan den,16 °F,Mostly Cloudy,17.0
254,sfo ram,63 °F,Clear,17.0


# Combine weather dataframe with statistics dataframe

In [None]:
##Split 'Matchup' and make two new columns, each with one of the team names
def merge_stats_weather(stats_df:pd.DataFrame, weather_df:pd.DataFrame):

    weather_df[['Team1', 'Team2']] = weather_df['Matchup'].str.split(' ', expand=True)
    weather_df = weather_df.drop('Matchup', axis=1)
    #weather_df

    #We do this so we can join with the stats df on Team1 or Team2 = Team
    flipped_teams = weather_df.copy()
    flipped_teams[['Team1', 'Team2']] = flipped_teams[['Team2', 'Team1']]

    merge_ready_weather_df = pd.DataFrame(pd.concat([weather_df, flipped_teams]))

    merged_df = pd.merge(stats_df, merge_ready_weather_df, left_on=['Team', 'Week'], right_on=['Team1', 'Week'], how='inner')

    return merged_df


In [None]:
merged_2024_df = merge_stats_weather(overall_stats_df, renamed_2024_weather_df)

merged_2024_df

Unnamed: 0,Team,Result,Location,Week,Pass Yds,Opp Pass Yds,Rush Yds,Opp Rush Yds,Points per game,Opp Points per game,...,Plays,Opp Plays,3DConv,3DConv Allowed,Yards per Play,Opp Yards per Play,Temperature,Weather,Team1,Team2
0,crd,L,@,1.0,263.000000,285.000000,45.000000,82.000000,23.000000,23.000000,...,67.000000,69.000000,0.400000,0.500000,4.600000,5.320000,64 °F,Partly Cloudy,crd,det
1,crd,W,@,2.0,284.500000,237.500000,64.000000,79.000000,19.500000,19.500000,...,66.000000,69.500000,0.400000,0.472222,5.290000,4.560000,84 °F,Clear,crd,clt
2,crd,L,,3.0,284.000000,216.333333,59.000000,85.666667,18.666667,18.666667,...,69.000000,61.333333,0.414815,0.388889,5.003333,5.063333,85 °F,Clear,crd,dal
3,crd,W,,4.0,292.250000,214.750000,57.000000,88.000000,18.500000,18.500000,...,71.500000,65.750000,0.389236,0.357456,4.917500,4.762500,93 °F,Clear,crd,sfo
4,crd,L,@,5.0,289.000000,231.200000,51.800000,94.800000,16.200000,16.200000,...,69.200000,65.400000,0.368532,0.414536,4.958000,5.120000,77 °F,Light Rain,crd,phi
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
507,was,L,@,13.0,230.230769,205.307692,92.615385,108.615385,20.923077,20.923077,...,58.692308,57.615385,0.332468,0.370274,5.071538,4.973846,58 °F,Clear,was,dal
508,was,L,@,14.0,223.500000,213.071429,90.642857,113.285714,20.357143,20.357143,...,58.071429,58.571429,0.320625,0.372397,4.996429,5.109286,79 °F,Clear,was,sdg
509,was,W,,15.0,221.066667,208.533333,86.666667,115.133333,20.333333,20.333333,...,57.333333,60.000000,0.306657,0.361606,4.972667,5.007333,47 °F,Clear,was,crd
510,was,W,,16.0,225.937500,206.187500,86.687500,117.875000,20.750000,20.750000,...,57.875000,60.687500,0.311529,0.357388,5.027500,4.985000,41 °F,Partly Cloudy,was,den


In [None]:
merged_2024_df.to_csv("NFL_2024_dataframe.csv", index=False)

# Feature Engineering

In [88]:
#We'll use the scikit-learn library to encode, standardize, and normalize our data
#First, we'll combine the datasets from years 2018-2023 to serve as our training data, with 2024 being the test dataset
dataset_2017 = pd.read_csv("NFL_2017_dataframe.csv")
dataset_2018 = pd.read_csv("NFL_2018_dataframe.csv")
dataset_2019 = pd.read_csv("NFL_2019_dataframe.csv")
dataset_2020 = pd.read_csv("NFL_2020_dataframe.csv")
dataset_2021 = pd.read_csv("NFL_2021_dataframe.csv")
dataset_2022 = pd.read_csv("NFL_2022_dataframe.csv")
dataset_2023 = pd.read_csv("NFL_2023_dataframe.csv")

train_data_raw = pd.DataFrame(pd.concat([dataset_2017, dataset_2018, dataset_2019, dataset_2020, dataset_2021, dataset_2022, dataset_2023]))
train_data_raw = train_data_raw.drop(['Team1', 'Team2'], axis=1)

test_data_raw = pd.read_csv("NFL_2024_dataframe.csv")
test_data_raw = test_data_raw.drop(['Team1', 'Team2'], axis=1)
test_data_raw

Unnamed: 0,Team,Result,Location,Week,Pass Yds,Opp Pass Yds,Rush Yds,Opp Rush Yds,Points per game,Opp Points per game,...,Cmp,Opp Cmp,Plays,Opp Plays,3DConv,3DConv Allowed,Yards per Play,Opp Yards per Play,Temperature,Weather
0,crd,L,@,1.0,146.000000,222.000000,124.000000,130.000000,28.000000,28.000000,...,67.700000,78.300000,60.000000,58.000000,0.538462,0.333333,4.500000,6.070000,61 °F,Mostly Cloudy
1,crd,W,,2.0,202.000000,207.000000,177.500000,91.500000,34.500000,34.500000,...,74.350000,74.350000,61.000000,55.000000,0.587413,0.257576,6.195000,5.390000,99 °F,Clear
2,crd,L,,3.0,201.333333,200.000000,144.000000,123.333333,27.333333,27.333333,...,70.166667,75.666667,58.333333,59.333333,0.428645,0.338384,5.873333,5.423333,91 °F,Mostly Cloudy
3,crd,L,,4.0,179.750000,208.250000,153.250000,146.500000,24.000000,24.000000,...,70.800000,78.425000,58.250000,61.250000,0.412393,0.441288,5.680000,5.742500,106 °F,Clear
4,crd,W,@,5.0,181.600000,212.800000,156.400000,147.800000,24.000000,24.000000,...,69.300000,73.600000,58.000000,61.600000,0.389915,0.462121,5.800000,5.814000,91 °F,Clear
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
539,was,W,,13.0,220.692308,190.692308,156.923077,137.000000,28.923077,28.923077,...,70.900000,65.015385,63.923077,59.384615,0.437908,0.418729,5.873846,5.487692,42 °F,Partly Cloudy
540,was,W,@,15.0,203.866667,177.000000,145.133333,123.333333,26.400000,26.400000,...,66.820000,59.473333,60.333333,54.866667,0.414815,0.381080,5.384667,5.076000,74 °F,Clear
541,was,W,,16.0,207.062500,173.875000,143.125000,128.812500,27.000000,27.000000,...,66.487500,59.106250,60.562500,55.875000,0.422543,0.368981,5.407500,5.056250,31 °F,Clear
542,was,W,,17.0,206.411765,176.058824,147.411765,128.647059,27.176471,27.176471,...,66.500000,58.823529,61.529412,56.117647,0.424837,0.359041,5.404118,5.089412,58 °F,Overcast


In [97]:
###Import the necessary processing objects and functions from sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import FeatureHasher
from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler


In [None]:
###Start by encoding the target column
le = LabelEncoder()
label_column = le.fit_transform(train_data_raw['Result'])

In [None]:
###The Location column can be binarized for home or away (NaN or @ here)

In [None]:
###The weather column will be one hot encoded (may use hash encoding instead for memory conservation)
ohe = OneHotEncoder()
Weather = ohe.fit_transform(pd.DataFrame(train_data_raw['Weather']))


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 3678 stored elements and shape (3678, 38)>

In [None]:
###The team names will be hash encoded

In [None]:
###The rest of the features are numerical, so we will standardize and normalize (temperature is included here, we just need to strip off the °F)