### Author: Rodolfo Elenes

Date Created: 9/3/2025

Change log:
9/3/2025 - Initialized

In [None]:
import pandas as pd
import numpy as np
import duckdb
import time
from pathlib import Path
import warnings
# Do not limit the number of columns shown when a DataFrame is displayed
pd.set_option('display.max_columns', None)
# Suppress every Python warning raised for the rest of the notebook session
warnings.filterwarnings("ignore")

In [None]:
# Execute the common_utils notebook from this notebook.
# NOTE(review): save_df and concatenate_all_files (called in main) are not defined
# in this notebook - presumably they are provided by common_utils; confirm.
%run ./common_utils.ipynb

##### Notebook Functions

In [None]:
def collect_szn_matchups(tm_name, tm_code, seasons=None, delay=6):
#   Function name: collect_szn_matchups
#   Description: Reads one team's season pages from pro-football-reference.com and builds
#                a table with one row per dated game (Team, Season, Week, Date, Opp)
#   Parameters:
#        tm_name(str): Team name written into the 'Team' column of every row
#        tm_code(str): Team code placed in the pro-football-reference URL
#        seasons(iterable of int, optional): Seasons to collect; defaults to 1990 through 2024
#        delay(int or float, optional): Seconds to sleep before each request; defaults to 6
#   Return values: df_temp
#        df_temp(DataFrame): Every collected season stacked together (empty when nothing was collected)

    if seasons is None:
        seasons = range(1990, 2025)

    # Collect one frame per season and concatenate once at the end
    season_frames = []
    for year in seasons:
        print(f"{year} season.")

        # Pause before every request (presumably to respect the site's rate limit)
        time.sleep(delay)
        url = f"https://www.pro-football-reference.com/teams/{tm_code}/{year}.htm"
        try:
            df_temp2 = pd.read_html(url, header=[0, 1])[1]
        except Exception as e:
            # Best effort: report the season that could not be read and move on
            print(f"Unavailable link. Please check: {url}")
            print("\nError:", e, "\n")
            continue

        # Fill top-level header missing values forward
        cols = pd.DataFrame(df_temp2.columns.tolist())
        cols.iloc[:, 0] = cols.iloc[:, 0].replace("Unnamed:.*", pd.NA, regex=True).ffill()
        # Rebuild MultiIndex
        df_temp2.columns = pd.MultiIndex.from_frame(cols)
        # NOTE(review): keeps only the columns whose top-level label is 'NaN', i.e. the
        # leading columns that had no named top-level header to fill forward from.
        # Confirm this key matches how the installed pandas represents those labels.
        df_temp2 = df_temp2[['NaN']].copy()

        # Flatten to the second-level label; fall back to the top-level label when it is empty
        df_temp2.columns = [
            f"{b}".strip('_') if b else a
            for a, b in df_temp2.columns
        ]

        df_temp2['Team'] = tm_name
        df_temp2['Season'] = year

        # Rows whose date cannot be parsed become NaT and are dropped
        df_temp2['Date'] = pd.to_datetime(df_temp2.Date + ", " + df_temp2.Season.astype(str), errors='coerce')
        df_temp2 = df_temp2.dropna(subset=['Date']).copy()
        if df_temp2.empty:
            # Without at least one dated game there is nothing to anchor the season on
            print(f"No dated games found. Please check: {url}")
            continue

        # Correct the year the game was played in for games in January + February:
        # any game dated before the first listed game belongs to the next calendar year
        strt_dt = df_temp2.Date.tolist()[0]
        df_temp2['Date'] = np.where(df_temp2.Date < strt_dt, df_temp2.Date + pd.DateOffset(years=1), df_temp2.Date)

        df_temp2 = df_temp2[['Team', 'Season', 'Week', 'Date', 'Opp']].reset_index(drop=True)

        # Convert post season rows to numerical values: non-numeric weeks become 0 first
        df_temp2['Week'] = pd.to_numeric(df_temp2.Week, errors='coerce').fillna(0).astype(int)
        reg_szn_weeks = [wk for wk in df_temp2.Week.unique().tolist() if wk != 0]
        if reg_szn_weeks:
            # Every row after the last numbered week continues the count from that week
            lst_gm_reg_szn = reg_szn_weeks[-1]
            row = df_temp2[df_temp2.Week == lst_gm_reg_szn].index[0]
            for i in range(row + 1, df_temp2.shape[0]):
                lst_gm_reg_szn += 1
                df_temp2.loc[i, 'Week'] = lst_gm_reg_szn

        season_frames.append(df_temp2)

    if not season_frames:
        return pd.DataFrame()

    df_temp = pd.concat(season_frames)
    return df_temp

In [None]:
def create_exempt_list():
#   Function name: create_exempt_list
#   Description: Checks for existing nfl_team_schedules files, so they arent reprocessed again
#   Return values: exempt_list
#        exempt_list(list): The list of teams that have already been processed by this notebook

    save_location = Path("../tables/nfl_team_schedules/teams")

    # Create the folder (and any missing parents) so the listing below works on a first run
    save_location.mkdir(parents=True, exist_ok=True)

    # Schedule files are saved as '<team name>_schedule.csv'; stripping that exact suffix
    # keeps team names intact even when they contain an underscore
    suffix = "_schedule.csv"

    exempt_list = []
    for entry in save_location.iterdir():
        # Sub-folders are ignored; only files count as processed teams
        if not entry.is_file():
            continue
        file_name = entry.name
        if file_name.endswith(suffix):
            exempt_list.append(file_name[:-len(suffix)])
        else:
            # Any other file: keep the text before the first underscore
            exempt_list.append(file_name.split("_")[0])

    return exempt_list

In [None]:
def apply_aliases(df, tm_name):
#   Function name: apply_aliases
#   Description: Replaces a team's name with its alternate names (Team2 / Team3 in
#                team_info_xref.csv) for the seasons given by TmLegacy2 / TmLegacy3
#   Parameters:
#        df(DataFrame): Rows with 'Team' and 'Season' columns; the 'Team' column is overwritten
#        tm_name(str): Team name looked up in the 'Team' column of team_info_xref.csv
#   Return values: df
#        df(DataFrame): The DataFrame that was passed in

    xref = pd.read_csv("../tables/team_info_xref.csv")
    has_second_name = xref.Team2.notna()
    is_requested_team = xref.Team == tm_name
    matches = xref[has_second_name & is_requested_team].reset_index(drop=True)

    # No entry with a second name for this team: nothing to rename
    if matches.shape[0] == 0:
        return df

    entry = matches.loc[0]
    current_name = entry.loc['Team']
    second_name = entry.loc['Team2']
    third_name = entry.loc['Team3']
    second_span = entry.loc['TmLegacy2']
    third_span = entry.loc['TmLegacy3']

    # The second name is always applied; the third only when its cell is not empty (NaN)
    renames = [(second_name, second_span)]
    if str(third_name) != 'nan':
        renames.append((third_name, third_span))

    for alias, span in renames:
        # A span is written as '<first season>-<last season>', both ends included
        bounds = span.split("-")
        years = list(range(int(bounds[0]), int(bounds[1]) + 1))
        in_span = (df.Team == current_name) & (df.Season.isin(years))
        df['Team'] = np.where(in_span, alias, df.Team)

    return df

##### Main

In [None]:
def main():
#   Function name: main
#   Description: The entry function of the notebook. Collects and saves the schedule of every
#                team in team_info_xref.csv that has no saved file yet, then combines the files

    # Teams returned by create_exempt_list already have a file and are left out
    all_teams = pd.read_csv("../tables/team_info_xref.csv")
    exempt_list = create_exempt_list()
    already_done = all_teams.Team.isin(exempt_list)
    df_teams = all_teams[~already_done].reset_index(drop=True)
    print(f"Will acquire bye weeks data for the following teams: {df_teams.Team.tolist()}")
    display(df_teams[['Team', 'ABV', 'PFR_ABV']])

    # One pass per remaining team: collect, rename legacy seasons, show, save
    for tm_name, tm_code in zip(df_teams['Team'], df_teams['PFR_ABV']):
        print(f"Collecting {tm_name} NFL schedules")
        team_schedule = collect_szn_matchups(tm_name, tm_code)
        team_schedule = apply_aliases(team_schedule, tm_name)
        display(team_schedule)
        save_df(team_schedule, '../tables/nfl_team_schedules/teams', f'{tm_name}_schedule.csv')

    # Only rebuild the combined output when at least one team was processed
    if len(df_teams) > 0:
        concatenate_all_files('nfl_team_schedules', 'teams')
    else:
        print("Files are up to date!")
    
# Run the notebook's entry function when this cell is executed
main()