# Scrape stats from Pro Football Reference
## Keep stats as rolling averages for the season

In [None]:
###Now create a function that does all of this for a given team and year
import pandas as pd
import time
import random 

# Team abbreviations; create_stat_df places each one in the
# Pro Football Reference game-log URL.
abbr_list = (
    'crd atl rav buf car chi cin cle dal den det gnb htx clt jax kan rai '
    'sdg ram mia min nwe nor nyg nyj phi pit sea sfo tam oti was'
).split()

# Output column name -> column header in the scraped game-log table.
stats = {
    'Result': 'Rslt',
    'Location': 'Unnamed: 5',
    'Week': 'Week',
    'Pass Yds': 'Yds',
    'Rush Yds': 'Yds.2',
    'Points per game': 'Pts',
    'Sacks': 'Sk',
    'Cmp': 'Cmp%',
    'Plays': 'Ply',
    '3DConv': '3DConv',
    'Yards per Play': 'Y/P',
}


# Stats copied straight from the team table (no averaging, no opponent column).
_PASSTHROUGH_STATS = ('Result', 'Location', 'Week')


def _running_mean(values: pd.Series) -> list:
    """Return the mean of ``values`` up to and including each row, as floats."""
    games_played = pd.Series(range(1, len(values) + 1), index=values.index)
    # skipna=False: a missing value propagates to every later average, the
    # same way a plain running total would, instead of being silently skipped.
    running_total = values.cumsum(skipna=False)
    return (running_total / games_played).astype(float).tolist()


def create_stat_df(team: str, year: str, main_df: pd.DataFrame) -> pd.DataFrame:
    """Scrape one team's regular-season game log into season-to-date averages.

    Reads the team and opponent game-log tables from Pro Football Reference
    and builds one row per game containing the team abbreviation, the
    pass-through columns (Result, Location, Week) and, for every other entry
    in ``stats``, the running per-game mean for the team and for its
    opponents. Third-down conversions are averaged as per-game conversion
    rates (3DConv / 3DAtt).

    Running means are divided by the number of games played so far rather
    than by the ``Week`` value, so they stay true per-game averages even when
    Week values are not consecutive (e.g. after a bye week).
    NOTE: datasets generated before this change divided by ``Week``;
    regenerate them if they are to be combined with new output.

    Args:
        team: Team abbreviation used in the game-log URL (see ``abbr_list``).
        year: Season year as a string.
        main_df: Frame the new columns are appended to (column-wise). It is
            not modified; callers in this file pass an empty DataFrame.

    Returns:
        ``main_df`` with the scraped columns concatenated along axis 1.
    """
    ###Get the stat table we'll be scraping from
    url = 'https://www.pro-football-reference.com/teams/' + team + '/' + year + '/gamelog/'
    team_df = pd.read_html(url, header=1, attrs={'id':'table_pfr_team-year_game-logs_team-year-regular-season-game-log'})[0]
    opp_df = pd.read_html(url, header=1, attrs={'id':'table_pfr_team-year_game-logs_team-year-regular-season-opponent-game-log'})[0]
    print(f'Scraping stats for {team}, {year}')

    # The last row of the table is left out, as before
    # (presumably a season-totals row -- TODO confirm).
    team_games = team_df.iloc[:-1].reset_index(drop=True)
    opp_games = opp_df.iloc[:len(team_games)].reset_index(drop=True)

    columns = {'Team': [team] * len(team_games)}
    for stat, source_col in stats.items():
        if stat in _PASSTHROUGH_STATS:
            # Descriptive columns: copied as-is, team table only.
            columns[stat] = team_games[source_col]
        elif stat == '3DConv':
            # Average the per-game conversion rate, not the raw counts.
            columns['3DConv'] = _running_mean(team_games['3DConv'] / team_games['3DAtt'])
            columns['3DConv Allowed'] = _running_mean(opp_games['3DConv'] / opp_games['3DAtt'])
        else:
            columns[stat] = _running_mean(team_games[source_col])
            columns['Opp ' + stat] = _running_mean(opp_games[source_col])

    main_df = pd.concat([main_df, pd.DataFrame(columns)], axis=1)

    # Pause between teams so the site is not hit in rapid succession.
    time.sleep(random.randint(8, 10))

    return main_df



# Generalize for all teams and all years

In [None]:
abbr_list = (
    'crd atl rav buf car chi cin cle dal den det gnb htx clt jax kan rai '
    'sdg ram mia min nwe nor nyg nyj phi pit sea sfo tam oti was'
).split()

# For now we scrape one year at a time so the result can be merged with that
# year's weather df later (will update to scrape all years at once).
years = [str(season) for season in range(2024, 2025)]

# Every call starts from an empty frame; create_stat_df returns it with the
# team's columns appended.
main_df = pd.DataFrame()

# Collect the per-team frames and stack them once at the end.
team_frames = []
for year in years:
    for team in abbr_list:
        complete_team_df = create_stat_df(team, year, main_df)
        team_frames.append(complete_team_df)

overall_stats_df = pd.concat(team_frames, axis=0) if team_frames else pd.DataFrame()

overall_stats_df

# Scrape weather data

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random


def _consecutive_pairs(items, what: str) -> list:
    """Group a flat sequence into (first, second) tuples of adjacent items."""
    if len(items) % 2:
        raise ValueError(f"Expected an even number of {what}, got {len(items)}")
    return list(zip(items[::2], items[1::2]))


def scrape_weather(year: str, base_df: pd.DataFrame):
    """Scrape the per-game weather listing for every week of a season.

    For each week page on nflweather.com the team names are paired into
    matchups and the 'mx-2' elements are read as alternating temperature /
    weather entries. One row per game is produced with the columns
    Matchup, Temperature, Weather and Week.

    Args:
        year: Season year as a string; seasons up to 2020 have 17 weeks,
            later seasons 18.
        base_df: Frame the scraped rows are appended to (row-wise). It is
            not modified.

    Returns:
        ``base_df`` followed by the rows of every scraped week.

    Raises:
        requests.HTTPError: If a week page returns an error status.
        requests.Timeout: If a week page does not respond in time.
        ValueError: If team names or weather entries cannot be paired up.
    """
    #We need to account for the switch to an 18 week season in 2021
    last_week = 17 if int(year) <= 2020 else 18

    weekly_frames = []
    for week in range(1, last_week + 1):
        url = 'https://www.nflweather.com/week/' + year + '/week-' + str(week)
        # A timeout keeps one stalled connection from hanging the whole scrape.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Raise error if request failed
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all game weather rows
        games = soup.find_all(class_='mx-2')
        # Get corresponding team names
        teams = soup.find_all('span', class_="fw-bold")

        # The final bold span is skipped, as before, and the '@' separators
        # between the two team names are filtered out.
        team_list = [span.get_text() for span in teams[:-1]]
        team_list = [text for text in team_list if text != '@']

        # Adjacent team names form one matchup.
        matchups = [
            first + " " + second
            for first, second in _consecutive_pairs(team_list, 'team names')
        ]

        # Weather elements alternate: temperature, then weather description.
        temp = []
        weather = []
        for temp_tag, weather_tag in _consecutive_pairs(games, 'weather entries'):
            temp.append(temp_tag.get_text())
            weather.append(weather_tag.get_text())

        # Built from Series so that columns of unequal length are padded with
        # NaN, and a page without games yields an empty frame.
        combined = pd.DataFrame({
            "Matchup": pd.Series(matchups),
            "Temperature": pd.Series(temp),
            "Weather": pd.Series(weather),
        })
        combined['Week'] = float(week)
        weekly_frames.append(combined)

        # Pause between requests so the site is not hit in rapid succession.
        time.sleep(random.randint(8, 10))

    return pd.concat([base_df, *weekly_frames], axis=0)



In [None]:
### Scrape the weather logs for one season
year = "2024"
# scrape_weather appends its rows to this (empty) starting frame.
base_df = pd.DataFrame()
weather_2024 = scrape_weather(base_df=base_df, year=year)

weather_2024

In [None]:
# Team name as it appears in the weather 'Matchup' column -> abbreviation
# used in the stats dataframe.
team_name_conversion = {
    'Cardinals': 'crd', 'Falcons': 'atl', 'Ravens': 'rav', 'Bills': 'buf',
    'Panthers': 'car', 'Bears': 'chi', 'Bengals': 'cin', 'Browns': 'cle',
    'Cowboys': 'dal', 'Broncos': 'den', 'Lions': 'det', 'Packers': 'gnb',
    'Texans': 'htx', 'Colts': 'clt', 'Jaguars': 'jax', 'Chiefs': 'kan',
    'Raiders': 'rai', 'Chargers': 'sdg', 'Rams': 'ram', 'Dolphins': 'mia',
    'Vikings': 'min', 'Patriots': 'nwe', 'Saints': 'nor', 'Giants': 'nyg',
    'Jets': 'nyj', 'Eagles': 'phi', 'Steelers': 'pit', 'Seahawks': 'sea',
    '49ers': 'sfo', 'Buccaneers': 'tam', 'Titans': 'oti', 'Washington': 'was',
    # Other nicknames of the Washington franchise, in case the source uses them.
    'Commanders': 'was', 'Redskins': 'was',
}


def edit_weather_columns(weather_df):
    """Replace the team names in 'Matchup' with their stats-dataframe codes.

    Each 'Matchup' value must be two whitespace-separated team names that
    are keys of ``team_name_conversion``; it becomes
    ``"<first code> <second code>"``.

    Args:
        weather_df: Frame with a 'Matchup' column. It is not modified.

    Returns:
        A new frame with a fresh 0..n-1 index, the converted 'Matchup' as the
        first column, and the remaining columns of ``weather_df`` after it.

    Raises:
        ValueError: If a matchup does not contain exactly two names, or a
            name is not in ``team_name_conversion``. (Previously an unknown
            name silently reused the code from the preceding row.)
    """
    converted = []
    for matchup in weather_df['Matchup']:
        names = matchup.split()
        if len(names) != 2:
            raise ValueError(f"Expected two team names in matchup {matchup!r}")
        try:
            first, second = (team_name_conversion[name] for name in names)
        except KeyError as err:
            raise ValueError(
                f"Unknown team name {err.args[0]!r} in matchup {matchup!r}"
            ) from err
        converted.append(first + " " + second)

    # Drop the column with the old team names and renumber the rows so they
    # line up with the converted list.
    renamed = weather_df.drop(['Matchup'], axis=1).reset_index(drop=True)
    renamed.insert(0, 'Matchup', converted)

    return renamed

In [None]:
# Convert the scraped team names in 'Matchup' to the abbreviations used by the stats dataframe
renamed_2024_weather_df = edit_weather_columns(weather_2024)

renamed_2024_weather_df

# Combine weather dataframe with statistics dataframe

In [None]:
##Split 'Matchup' and make two new columns, each with one of the team names
def merge_stats_weather(stats_df: pd.DataFrame, weather_df: pd.DataFrame):
    """Attach each game's weather to the matching team/week rows of the stats.

    'Matchup' is split into two team columns and every game is listed twice,
    once per team, so that each team's stats row for that week finds its
    game's weather through an inner join on (Team, Week).

    Args:
        stats_df: Frame with 'Team' and 'Week' columns.
        weather_df: Frame with 'Week' and a 'Matchup' column holding two
            space-separated team codes. It is not modified.

    Returns:
        The inner-merged frame: the stats columns, the weather columns and
        'Team2' (the other team of the matchup).

    Raises:
        ValueError: If the 'Matchup' values do not split into exactly two
            parts.
    """
    ##Split 'Matchup' and make two new columns, each with one of the team names
    teams = weather_df['Matchup'].str.split(' ', expand=True)
    if teams.shape[1] != 2:
        raise ValueError("Each 'Matchup' must contain exactly two team names")

    # drop/assign return new frames, so the caller's weather_df is untouched.
    as_listed = weather_df.drop('Matchup', axis=1).assign(Team1=teams[0], Team2=teams[1])

    #We do this so we can join with the stats df on Team1 or Team2 = Team
    flipped_teams = as_listed.rename(columns={'Team1': 'Team2', 'Team2': 'Team1'})

    merge_ready_weather_df = pd.concat([as_listed, flipped_teams])

    merged_df = pd.merge(stats_df, merge_ready_weather_df, left_on=['Team', 'Week'], right_on=['Team1', 'Week'], how='inner')

    ###Get rid of redundant information (Team1 duplicates Team after the join)
    return merged_df.drop('Team1', axis=1)


In [None]:
# Join the 2024 team stats with the 2024 weather on team and week
merged_2024_df = merge_stats_weather(overall_stats_df, renamed_2024_weather_df)

merged_2024_df

In [None]:
# Save the merged 2024 data; the feature-engineering cells read this file back as the test set
merged_2024_df.to_csv("NFL_2024_dataframe.csv", index=False)

# Feature Engineering

In [None]:
#We'll use the scikit-learn library to encode, standardize, and normalize our data
#First, we'll combine the datasets from years 2017-2023 to serve as our training data, with 2024 being the test dataset
dataset_2017 = pd.read_csv("NFL_2017_dataframe.csv")
dataset_2018 = pd.read_csv("NFL_2018_dataframe.csv")
dataset_2019 = pd.read_csv("NFL_2019_dataframe.csv")
dataset_2020 = pd.read_csv("NFL_2020_dataframe.csv")
dataset_2021 = pd.read_csv("NFL_2021_dataframe.csv")
dataset_2022 = pd.read_csv("NFL_2022_dataframe.csv")
dataset_2023 = pd.read_csv("NFL_2023_dataframe.csv")

train_data_raw = pd.concat([dataset_2017, dataset_2018, dataset_2019, dataset_2020, dataset_2021, dataset_2022, dataset_2023])
# merge_stats_weather already removes 'Team1' before the CSV is written, so
# tolerate either column being absent instead of raising KeyError.
train_data_raw = train_data_raw.drop(columns=['Team1', 'Team2'], errors='ignore')

test_data_raw = pd.read_csv("NFL_2024_dataframe.csv")
test_data_raw = test_data_raw.drop(columns=['Team1', 'Team2'], errors='ignore')
test_data_raw

In [None]:
###Import the necessary processing objects and functions from sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import FeatureHasher
from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler


In [None]:
###Start by encoding the target column ('Result') of the training data as integer class labels
# NOTE(review): the encoder is fitted on the training data only; the fitted `le`
# should presumably be reused (le.transform) for the test labels -- confirm.
le = LabelEncoder()
label_column = le.fit_transform(train_data_raw['Result'])

In [None]:
###The Location column can be binarized for home or away (NaN = home, '@' = away here)

In [None]:
### One-hot encode the weather column (hash encoding may be used instead to save memory)
# NOTE(review): with the default settings the encoder may reject weather
# values it did not see while fitting -- confirm before transforming test data.
ohe = OneHotEncoder()
# Double brackets select a one-column DataFrame, the 2-D input the encoder expects.
Weather = ohe.fit_transform(train_data_raw[['Weather']])


In [None]:
###The team names will be hash encoded

In [None]:
###The rest of the features are numerical, so we will standardize and normalize (temperature is included here, we just need to strip off the °F)