# Scrape stats from Pro Football Reference
## Keep stats as rolling averages for the season

In [None]:
###Now create a function that does all of this for a given team and year
import pandas as pd
import time
import random 

abbr_list = ['crd', 'atl', 'rav', 'buf', 'car', 'chi', 'cin', 'cle', 'dal', 'den', 'det', 'gnb', 'htx', 'clt', 'jax', 'kan', 'rai',
             'sdg', 'ram', 'mia', 'min', 'nwe', 'nor', 'nyg', 'nyj', 'phi', 'pit', 'sea', 'sfo', 'tam', 'oti', 'was']

stats = {'Result':'Rslt', 'Location':'Unnamed: 5', 'Week':'Week', 'Pass Yds':'Yds',  'Rush Yds':'Yds.2', 'Points per game':'Pts', 'Sacks':'Sk', 'Cmp':'Cmp%',
              'Plays':'Ply', '3DConv':'3DConv', 'Yards per Play':'Y/P'}


def create_stat_df(team:str, year:str, main_df:pd.DataFrame):
    ###Get the stat table we'll be scraping from
    url = 'https://www.pro-football-reference.com/teams/' + team + '/' + year + '/gamelog/'
    team_df = pd.read_html(url, header=1, attrs={'id':'table_pfr_team-year_game-logs_team-year-regular-season-game-log'})[0]
    opp_df = pd.read_html(url, header=1, attrs={'id':'table_pfr_team-year_game-logs_team-year-regular-season-opponent-game-log'})[0]
    ###Loop over the stats getting both team and opp info for each one
    print(f'Scraping stats for {team}, {year}')
    
    teamname_col = [f'{team}'] * (len(team_df) - 1)
    teamname_df = pd.DataFrame(teamname_col)
    teamname_df.columns = ['Team']
    main_df = pd.concat([main_df, teamname_df], axis=1)

    for stat in stats:
        if stat == 'Location' or stat == 'Week' or stat == "Result":
            ####Just loop once b/c we dont need opp info
            #Get stat
            season_tot = []
            for i in range(len(team_df)-1):                           
                season_tot.append(team_df[stats[stat]][i])
            
            #Concat to df
            tot_df = pd.DataFrame(season_tot)
            tot_df.columns = [stat]
            main_df = pd.concat([main_df, tot_df], axis=1)


        elif stat == "3DConv":
            third_pg, third_pg_a = [], []
            perc, perc_a = 0, 0

            for i in range(len(team_df)-1):
                perc += team_df['3DConv'][i] / team_df['3DAtt'][i]
                avg_third = perc / team_df['Week'][i]
                third_pg.append(avg_third.item())
            
                perc_a += opp_df['3DConv'][i] / opp_df['3DAtt'][i]
                avg_third_a = perc_a / opp_df['Week'][i]
                third_pg_a.append(avg_third_a.item())
            
            third_df = pd.DataFrame(third_pg)
            third_df.columns = ['3DConv']
            third_df_a = pd.DataFrame(third_pg_a)
            third_df_a.columns = ['3DConv Allowed']

            main_df = pd.concat([main_df, third_df, third_df_a], axis=1)


        else:
            #print(f'On stat: {stat}')
            team_pg, opp_pg = [], []
            team, opp = 0,0
            for i in range(len(team_df)-1):
                #print(f'team value is: {team}')
                team += team_df[stats[stat]][i]
                avg_team = team / team_df['Week'][i]
                team_pg.append(avg_team.item())

                #print(f'opp value is: {opp}')
                opp += opp_df[stats[stat]][i]
                avg_opp = opp / opp_df['Week'][i]
                opp_pg.append(avg_opp.item())

            cur_team_df = pd.DataFrame(team_pg)
            cur_team_df.columns = [stat]
            cur_opp_df = pd.DataFrame(opp_pg)
            cur_opp_df.columns = ["Opp " + stat]

            main_df = pd.concat([main_df, cur_team_df, cur_opp_df], axis=1)
            

    time.sleep(random.randint(8, 10))

    return main_df



# Generalize for all teams and all years

In [None]:
abbr_list = ['crd', 'atl', 'rav', 'buf', 'car', 'chi', 'cin', 'cle', 'dal', 'den', 'det', 'gnb', 'htx', 'clt', 'jax', 'kan', 'rai',
             'sdg', 'ram', 'mia', 'min', 'nwe', 'nor', 'nyg', 'nyj', 'phi', 'pit', 'sea', 'sfo', 'tam', 'oti', 'was']

#For now we'll scrape one year at a time for the purpose of merging with the weather df later (will update to scrape all years at once)
years = [str(season) for season in range(2024, 2025)]

overall_stats_df = pd.DataFrame()
main_df = pd.DataFrame()

for year in years:
    for team in abbr_list:
        complete_team_df = create_stat_df(team, year, main_df)
        overall_stats_df = pd.concat([overall_stats_df, complete_team_df], axis=0)

overall_stats_df

# Scrape weather data

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random


def scrape_weather(year:str, base_df:pd.DataFrame):
    #for year in year:
    #We need to account for the switch to an 18 week season in 2021
    if int(year) <= 2020:
        weeks = [str(week) for week in range(1,18)]
    else:
        weeks = [str(week) for week in range(1,19)]
        
    for week in weeks:
        url = 'https://www.nflweather.com/week/' + year + '/week-' + week
        response = requests.get(url)
        response.raise_for_status()  # Raise error if request failed
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all game weather rows
        games = soup.find_all(class_='mx-2')
        #Get correspoinding team names
        teams = soup.find_all('span', class_="fw-bold")

        #Get team names from html for matchups
        team_list = []
        for i in range(len(teams) - 1):
            text = teams[i].get_text()
            if text != '@':
                team_list.append(text)
        #Create a list of matchups from team_list that correspond to the weather data
        matchups = []
        n = 0
        while n < len(team_list):
            vs = team_list[n] + " " + team_list[n+1]
            matchups.append(vs)
            n += 2  
        #Get the temperature and weather data from html
        m = 0
        temp = []
        weather = []
        while m < len(games):
            stripped_temp = games[m]
            stripped_weat = games[m+1]
            #print(stripped_temp.get_text(), stripped_weat.get_text())
            temp.append(games[m].get_text())
            weather.append(games[m+1].get_text())
            m += 2

        Matchup_df = pd.DataFrame(matchups)
        temp_df = pd.DataFrame(temp)
        weather_df = pd.DataFrame(weather)
        combined = pd.concat([Matchup_df, temp_df, weather_df], axis=1, ignore_index=True)
        combined.columns = ["Matchup", "Temperature", "Weather"]
        combined['Week'] = float(week)

        base_df = pd.concat([base_df, combined], axis=0)

        time.sleep(random.randint(8, 10))

            

    return base_df



In [None]:
###Now we can scrape the weather logs for a given year
year = str(2024)
base_df = pd.DataFrame()
weather_2024 = scrape_weather(year=year, base_df=base_df)

weather_2024

In [None]:
team_name_conversion = {'Cardinals':'crd', 'Falcons':'atl', 'Ravens':'rav', 'Bills':'buf', 'Panthers':'car', 'Bears':'chi', 'Bengals':'cin', 'Browns':'cle', 'Cowboys':'dal', 'Broncos':'den', 'Lions':'det', 'Packers':'gnb', 'Texans':'htx', 'Colts':'clt', 'Jaguars':'jax', 'Chiefs':'kan', 'Raiders':'rai',
             'Chargers':'sdg', 'Rams':'ram', 'Dolphins':'mia', 'Vikings':'min', 'Patriots':'nwe', 'Saints':'nor', 'Giants':'nyg', 'Jets':'nyj', 'Eagles':'phi', 'Steelers':'pit', 'Seahawks':'sea', '49ers':'sfo', 'Buccaneers':'tam', 'Titans':'oti', 'Washington':'was'}

def edit_weather_columns(weather_df):
    ###This function will replace the team names in the 'Matchup' column with thier representation in the stats_df
    two_team_list = weather_df['Matchup']
    Matchup_array = []

    for i in two_team_list:
        fir_name, sec_name = i.split()
        for j in team_name_conversion:
            if j == fir_name:
                first = team_name_conversion[j]
            elif j == sec_name:
                second = team_name_conversion[j]

        matchup = first + " " + second
        Matchup_array.append(matchup)

    fixed_matchup_df = pd.DataFrame(Matchup_array)
    fixed_matchup_df.columns = ['Matchup']

    #Drop the column with the old team names
    dropped = weather_df.drop(['Matchup'], axis=1)
    dropped = dropped.reset_index(drop=True)
    #dropped


    #Concatenate the dataframe with the adjusted team names
    Weather_df_fin = pd.concat([fixed_matchup_df, dropped], axis=1, ignore_index=True)
    Weather_df_fin = Weather_df_fin.rename(columns={0:'Matchup', 1:'Temperature', 2:'Weather', 3:'Week'})

    return Weather_df_fin

In [None]:
renamed_2024_weather_df = edit_weather_columns(weather_2024)

renamed_2024_weather_df

# Combine weather dataframe with statistics dataframe

In [None]:
##Split 'Matchup' and make two new columns, each with one of the team names
def merge_stats_weather(stats_df:pd.DataFrame, weather_df:pd.DataFrame):

    weather_df[['Team1', 'Team2']] = weather_df['Matchup'].str.split(' ', expand=True)
    weather_df = weather_df.drop('Matchup', axis=1)
    #weather_df

    #We do this so we can join with the stats df on Team1 or Team2 = Team
    flipped_teams = weather_df.copy()
    flipped_teams[['Team1', 'Team2']] = flipped_teams[['Team2', 'Team1']]

    merge_ready_weather_df = pd.DataFrame(pd.concat([weather_df, flipped_teams]))

    merged_df = pd.merge(stats_df, merge_ready_weather_df, left_on=['Team', 'Week'], right_on=['Team1', 'Week'], how='inner')

    ###Get rid of redundant information
    merged_df = merged_df.drop('Team1', axis=1)

    return merged_df


In [None]:
merged_2024_df = merge_stats_weather(overall_stats_df, renamed_2024_weather_df)

merged_2024_df

In [None]:
merged_2024_df.to_csv("NFL_2024_dataframe.csv", index=False)

# Feature Engineering

In [None]:
#We'll use the scikit-learn library to encode, standardize, and normalize our data
#First, we'll combine the datasets from years 2018-2023 to serve as our training data, with 2024 being the test dataset
import pandas as pd

dataset_2017 = pd.read_csv("NFL_2017_dataframe.csv")
dataset_2018 = pd.read_csv("NFL_2018_dataframe.csv")
dataset_2019 = pd.read_csv("NFL_2019_dataframe.csv")
dataset_2020 = pd.read_csv("NFL_2020_dataframe.csv")
dataset_2021 = pd.read_csv("NFL_2021_dataframe.csv")
dataset_2022 = pd.read_csv("NFL_2022_dataframe.csv")
dataset_2023 = pd.read_csv("NFL_2023_dataframe.csv")


train_data_raw = pd.DataFrame(pd.concat([dataset_2017, dataset_2018, dataset_2019, dataset_2020, dataset_2021, dataset_2022, dataset_2023], ignore_index=True))
train_data_raw.rename(columns={'Team2':'Opp Team'}, inplace=True)

test_data_raw = pd.read_csv("NFL_2024_dataframe.csv")
test_data_raw.rename(columns={'Team2':'Opp Team'}, inplace=True)


train_data_raw

Unnamed: 0,Team,Result,Location,Week,Pass Yds,Opp Pass Yds,Rush Yds,Opp Rush Yds,Points per game,Opp Points per game,...,Opp Cmp,Plays,Opp Plays,3DConv,3DConv Allowed,Yards per Play,Opp Yards per Play,Temperature,Weather,Opp Team
0,crd,L,@,1.0,263.000000,285.000000,45.000000,82.000000,23.000000,23.000000,...,70.700000,67.000000,69.000000,0.400000,0.500000,4.600000,5.320000,64 °F,Partly Cloudy,det
1,crd,W,@,2.0,284.500000,237.500000,64.000000,79.000000,19.500000,19.500000,...,62.400000,66.000000,69.500000,0.400000,0.472222,5.290000,4.560000,84 °F,Clear,clt
2,crd,L,,3.0,284.000000,216.333333,59.000000,85.666667,18.666667,18.666667,...,65.666667,69.000000,61.333333,0.414815,0.388889,5.003333,5.063333,85 °F,Clear,dal
3,crd,W,,4.0,292.250000,214.750000,57.000000,88.000000,18.500000,18.500000,...,61.500000,71.500000,65.750000,0.389236,0.357456,4.917500,4.762500,93 °F,Clear,sfo
4,crd,L,@,5.0,289.000000,231.200000,51.800000,94.800000,16.200000,16.200000,...,63.200000,69.200000,65.400000,0.368532,0.414536,4.958000,5.120000,77 °F,Light Rain,phi
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3673,was,L,,13.0,236.230769,266.000000,99.923077,113.846154,20.076923,20.076923,...,64.661538,65.538462,63.230769,0.372382,0.408583,5.080000,6.042308,53 °F,Fog,mia
3674,was,L,@,15.0,219.266667,247.133333,91.866667,111.733333,18.733333,18.733333,...,61.093333,60.666667,59.666667,0.337017,0.391606,4.744000,5.643333,73 °F,Clear,ram
3675,was,L,@,16.0,214.500000,245.250000,92.500000,115.000000,19.312500,19.312500,...,60.718750,60.500000,61.250000,0.331578,0.382755,4.711250,5.570625,45 °F,Overcast,nyj
3676,was,L,,17.0,211.470588,244.000000,90.705882,119.058824,18.764706,18.764706,...,61.770588,59.470588,61.647059,0.331681,0.386384,4.741765,5.595882,51 °F,Partly Cloudy,sfo


In [None]:
###Import the necessary processing objects and functions from sklearn   (we'll start by focusing on the training data)
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

In [32]:
###Start by encoding the target column
le = LabelEncoder()
Results_encoded = le.fit_transform(train_data_raw['Result'])

In [None]:
###Now for the features, encode each different type of weather
Weather_encoded = le.fit_transform(train_data_raw['Weather'])
Weather_encoded = pd.DataFrame(Weather_encoded)
#Weather_encoded

In [100]:
###Now we scale this feature so it doesn't dominate the model's predictions 
scaler = MinMaxScaler()
Weather_encoded = pd.DataFrame(scaler.fit_transform(Weather_encoded))
Weather_encoded.columns = ['Weather_lab']
#Weather_encoded

In [101]:
###For the team names and opponent team names, we'll encode this manually as we want to have the same encoding for both columns
###Make a dict with our mapping
Teams = [x for x in train_data_raw['Team'].unique()]
Team_mapping = {}
for i in range(len(Teams)):
    Team_mapping[Teams[i]] = i

###Now encode the Team and Opp Team features with this mapping
Team_encoded = []
for i in train_data_raw['Team']:
    Team_encoded.append(Team_mapping[i])
Team_encoded = pd.DataFrame(Team_encoded)

Opp_Team_encoded = []
for i in train_data_raw['Opp Team']:
    Opp_Team_encoded.append(Team_mapping[i])
Opp_Team_encoded = pd.DataFrame(Opp_Team_encoded)


In [102]:
#Now these need to be scaled
Team_encoded = pd.DataFrame(scaler.fit_transform(Team_encoded))
Team_encoded.columns = ['Team']
Opp_Team_encoded = pd.DataFrame(scaler.fit_transform(Opp_Team_encoded))
Opp_Team_encoded.columns = ['Opp Team']

In [56]:
###The Location column can be binarized for home or away (NaN or @ here)
Home_Away_encoded = train_data_raw['Location'].fillna(0)
Home_Away_encoded = Home_Away_encoded.replace('@', 1)
Home_Away_encoded = pd.DataFrame(Home_Away_encoded)
Home_Away_encoded.columns = ['Home Away']
#Home_Away_encoded

  Home_Away_encoded = Home_Away_encoded.replace('@', 1)


In [None]:
###The rest of the features are numerical, so we will standardize (temperature is included here, we just need to strip off the °F)
num_feats = ['Week', 'Pass Yds', 'Opp Pass Yds', 'Rush Yds', 'Opp Rush Yds', 'Points per game', 'Opp Points per game', 'Sacks', 'Opp Sacks',
             'Cmp', 'Opp Cmp', 'Plays', 'Opp Plays', '3DConv', '3DConv Allowed', 'Yards per Play', 'Opp Yards per Play', 'Temperature']

In [58]:
###Strip off the Fahrenheit symbol from the temperature readings
train_data_raw['Temperature'] = train_data_raw['Temperature'].str.replace(' °F', '')
train_data_raw['Temperature'] = pd.to_numeric(train_data_raw['Temperature'])

In [None]:
###Make a loop to scale all of the remaining features
scaled_feat_df = pd.DataFrame()
for feature in num_feats:
    scaled_feat = pd.DataFrame(scaler.fit_transform(train_data_raw[[feature]]))
    scaled_feat.columns = [feature]
    scaled_feat_df = pd.concat([scaled_feat_df, scaled_feat], axis=1)

scaled_feat_df

Unnamed: 0,Week,Pass Yds,Opp Pass Yds,Rush Yds,Opp Rush Yds,Points per game,Opp Points per game,Sacks,Opp Sacks,Cmp,Opp Cmp,Plays,Opp Plays,3DConv,3DConv Allowed,Yards per Play,Opp Yards per Play,Temperature
0,0.000000,0.496278,0.550868,0.098361,0.282353,0.389831,0.389831,0.100000,0.100000,0.413192,0.606159,0.553719,0.626866,0.444444,0.555556,0.351310,0.446910,0.663717
1,0.058824,0.549628,0.433002,0.176230,0.270588,0.330508,0.330508,0.250000,0.250000,0.386657,0.471637,0.537190,0.634328,0.444444,0.524691,0.457627,0.326466,0.840708
2,0.117647,0.548387,0.380480,0.155738,0.296732,0.316384,0.316384,0.366667,0.200000,0.416224,0.524581,0.586777,0.512438,0.460905,0.432099,0.413457,0.406233,0.849558
3,0.176471,0.568859,0.376551,0.147541,0.305882,0.313559,0.313559,0.425000,0.225000,0.447309,0.457050,0.628099,0.578358,0.432485,0.397173,0.400231,0.358558,0.920354
4,0.235294,0.560794,0.417370,0.126230,0.332549,0.274576,0.274576,0.380000,0.200000,0.462623,0.484603,0.590083,0.573134,0.409480,0.460596,0.406471,0.415214,0.778761
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3673,0.705882,0.429853,0.503722,0.323455,0.407240,0.340287,0.340287,0.446154,0.269231,0.542194,0.508291,0.529561,0.540758,0.413758,0.453982,0.425270,0.561380,0.566372
3674,0.823529,0.387758,0.456907,0.290437,0.398954,0.317514,0.317514,0.393333,0.253333,0.464544,0.450459,0.449036,0.487562,0.374463,0.435117,0.373498,0.498151,0.743363
3675,0.882353,0.375931,0.452233,0.293033,0.411765,0.327331,0.327331,0.375000,0.237500,0.451289,0.444388,0.446281,0.511194,0.368420,0.425284,0.368451,0.486628,0.495575
3676,0.941176,0.368413,0.449132,0.285680,0.427682,0.318046,0.318046,0.358824,0.229412,0.452972,0.461436,0.429266,0.517120,0.368535,0.429316,0.373153,0.490631,0.548673
