### Author: Rodolfo Elenes

Date Created: 8/15/2025

Change log:
8/15/2025 - Initialized

##### Imports

In [None]:
import pandas as pd
import numpy as np
import duckdb
import time
from pathlib import Path
import warnings
pd.set_option('display.max_columns', None)
warnings.filterwarnings("ignore")

##### Notebook Functions

In [None]:
def collect_year_calndr(con_memory, year):
#   Function name: collect_year_calndr
#   Description: Reads one season's schedule table from pro-football-reference.com and
#                expands it into a daily calendar: every game date plus the days between
#                games, each labelled with a numeric NFL week and the weekday name.
#   Parameters: con_memory, year
#        con_memory(duckdb connection): the notebook's duckdb session. It runs SQL against
#                the local dataframe `df` (no explicit register call is made for it, so
#                duckdb presumably resolves the name from the Python variable -- confirm)
#                and registers the temporary view 'curr_season'
#        year(int or str): The season to process. It is interpolated into the URL and
#                written to the Season column of filled-in rows (main passes an int)
#   Return values: df
#        df(pandas dataframe): Columns Season, Week, Date, Day for the input NFL season
    
    # Pause before every request. Presumably this keeps the scraper under the site's
    # rate limit -- TODO confirm the delay that is actually required.
    time.sleep(6)
    # Only the first HTML table found on the page is used
    df = pd.read_html(f"https://www.pro-football-reference.com/years/{year}/games.htm")[0]

    df['Date'] = pd.to_datetime(df.Date, errors = 'coerce') # Values that cannot be parsed become NaT
    df['Season'] = df.Date.dt.year.astype('Int64').min() # Earliest calendar year among the parsed dates, applied to every row
    df = df[['Season', 'Week', 'Date']]
    df = df.drop_duplicates() # Several games on one day collapse into a single (Season, Week, Date) row
    df = df.dropna(subset=['Date']).reset_index(drop=True) # Discard rows whose date could not be parsed

    # Convert Postseason game names to week numbers.
    # Each name is replaced by its position in the list of distinct Week values plus one.
    # This assumes the regular-season weeks come first in the table so the numbering
    # simply continues -- TODO confirm. list.index raises ValueError when a name is absent.
    postseason = ['WildCard', 'Division', 'ConfChamp', 'SuperBowl']
    weeks = df.Week.unique().tolist()
    for gm in postseason:
        df['Week'] = np.where(df['Week'] == gm, weeks.index(gm) + 1, df.Week)
    df['Week'] = df['Week'].astype(float).astype(int)

    # For every week except the last one in the list, add one placeholder row (Week = NaN)
    # per calendar day between the first listed row of week i and the last listed row of
    # week i + 1. Row order is whatever the scraped table provided.
    weeks2 = df.Week.unique().tolist()
    for i in weeks2:
        i = int(i)
        if i == weeks2[-1]:
            break
        df_temp1 = df[(df['Week'] == i)]
        df_temp2 = df[(df['Week'] == i + 1)]
        date1 = pd.to_datetime(df_temp1.Date.dt.strftime('%Y-%m-%d').iloc[0]) + pd.Timedelta(days=1) # Day after the first listed row of week i
        date2 = pd.to_datetime(df_temp2.Date.dt.strftime('%Y-%m-%d').iloc[-1]) - pd.Timedelta(days=1) # Day before the last listed row of week i + 1

        # Fill in new onseason dates between rows
        new_days = pd.date_range(date1, date2)  
        df_temp3 = pd.DataFrame({'Season': year, 'Week': np.nan, 'Date': new_days})
        df = pd.concat([df, df_temp3])

    # Drop duplicate dates with null week values.
    # dropna() removes every row that contains a NaN, so when a date has several rows and
    # all of them are placeholders, the date disappears completely at this point. The
    # gap-filling loop further down appears to exist to put such days back.
    df_temp5 = pd.DataFrame()
    for i in df.Date.unique().tolist():
        df_temp4 = df[(df['Date'] == i)]
        if df_temp4.shape[0] > 1:
            df_temp4 = df_temp4.dropna()
        df_temp5 = pd.concat([df_temp5, df_temp4])

    # Fill in missing NFL week identifier and days of the week
    df = df_temp5.sort_values('Date').reset_index(drop=True)
    df['Week'] = df.Week.ffill() # Fill mid week dates with correct NFL week identifier
    df['Day'] = df.Date.dt.day_name() # Attach correct name for day of the week

    # Fill in missing days in the middle of the weeks for needed weeks.
    # date_diff is the distance to the previous row's date; anything other than exactly
    # one day marks a gap (the very first row has no previous row and is not selected).
    df['date_diff'] = df['Date'].diff()
    weeks3 = df.Week.unique().tolist()
    for week in weeks3:
        # Snapshot of the week's rows, exposed to SQL as the view 'curr_season'
        df_temp1 = con_memory.execute(f"""SELECT * FROM df WHERE Week = {week}""").fetch_df()
        con_memory.register('curr_season', df_temp1)
        # Rows of this week that do not directly follow the previous calendar day
        df_temp1 = con_memory.execute(f"""SELECT * FROM df WHERE Week = {week} AND date_diff != '1 days'""").fetch_df()
        if df_temp1.empty != True:
            df_temp2 = con_memory.execute("""SELECT * FROM curr_season""").fetch_df()    
            date1 = pd.to_datetime(df_temp1.Date.dt.strftime('%Y-%m-%d').iloc[0]) - pd.Timedelta(days=1) # Day before the first row that follows a gap
            date2 = pd.to_datetime(df_temp2.Date.dt.strftime('%Y-%m-%d').iloc[0]) + pd.Timedelta(days=1) # Day after the week's first row

            # Fill in new onseason dates between rows
            new_days = pd.date_range(date2, date1)  
            df_temp3 = pd.DataFrame({'Season': year, 'Week': np.nan, 'Date': new_days})
            df = pd.concat([df, df_temp3]).sort_values('Date').reset_index(drop=True)
            df['date_diff'] = df['Date'].diff()

        # For rare cases that date_diff > 1 days, twice in the same NFL week.
        # The week is appended to the list that is being iterated, so the loop visits it
        # again later.
        # NOTE(review): a row whose date_diff can never become exactly one day (for
        # example a repeated date) would re-queue its week forever -- confirm that this
        # cannot happen with the scraped data.
        df_temp4 = con_memory.execute(f"""SELECT * FROM df WHERE Week = {week} AND date_diff != '1 days'""").fetch_df()
        if df_temp4.shape[0] > 0:
            weeks3.append(week)


    # Final df arrangements
    df = df.sort_values('Date').reset_index(drop=True)
    df['Week'] = df.Week.ffill() # Fill mid week dates with correct NFL week identifier
    df['Week'] = df['Week'].astype(float).astype(int)
    df['Day'] = df.Date.dt.day_name() # Attach correct name for day of the week
    df = df.drop('date_diff', axis=1) # Helper column is not part of the result
    df = df.drop_duplicates()
    
    return df

In [None]:
def add_offszn_dates(con_memory, df):
#   Function name: add_offszn_dates
#   Description: This function is used to add offseason dates (Week = 0)
#                to help identify activities such as suspensions, injuries, transactions, etc.
#                The days between the last date of one season and the first date of the
#                next season are attributed to the next season.
#   Parameters: con_memory, df
#        con_memory(duckdb connection): not used any more; the parameter is kept so that
#                existing callers keep working
#        df(pandas dataframe): The NFL calendar without offseason dates
#                (columns Season, Week, Date, Day), holding consecutive seasons
#   Return values: df
#        df(pandas dataframe): The NFL calendar with offseason dates, sorted by Date.
#                The input is returned unchanged when it holds fewer than two seasons.
#   Raises: ValueError when a season has no following season in df

    seasons = df.Season.unique().tolist()

    # Build one block of offseason rows per pair of consecutive seasons. Everything is
    # derived from the input frame: offseason rows always precede the season's first
    # game, so they never change a season's first or last date.
    offseason_parts = []
    for season in seasons[:-1]:
        season = int(season)
        this_season = df.loc[df['Season'] == season, 'Date']
        next_season = df.loc[df['Season'] == season + 1, 'Date']
        if next_season.empty:
            raise ValueError(f"Season {season + 1} is missing, cannot build the offseason after {season}")

        # min/max instead of positional first/last so the result does not depend on the
        # row order of the input. The range deliberately overlaps both seasons by two
        # days; the overlap is resolved in favour of the in-season rows below.
        range_start = this_season.max().normalize() - pd.Timedelta(days=1)
        range_end = next_season.min().normalize() + pd.Timedelta(days=1)

        offseason = pd.DataFrame({'Season': season + 1, 'Week': 0,
                                  'Date': pd.date_range(range_start, range_end)})
        offseason['Day'] = offseason.Date.dt.day_name()
        offseason_parts.append(offseason)

    if not offseason_parts:
        return df

    combined = pd.concat([df, *offseason_parts], ignore_index=True)

    # Remove duplicate rows (duplicate dates for superbowl and first game of season
    # listed as week 0): when a date has an in-season row, only its in-season rows
    # survive. This is done once for the whole table instead of once per season.
    in_season = combined['Week'].notna() & (combined['Week'] != 0)
    in_season_dates = combined.loc[in_season, 'Date']
    keep = in_season | ~combined['Date'].isin(in_season_dates)

    # drop_duplicates mirrors the distinct-row semantics of the former SQL UNION
    return (combined[keep]
            .drop_duplicates()
            .sort_values('Date', kind='stable')
            .reset_index(drop=True))

##### Other Functions

In [None]:
def save_df(df, save_location, csv_name):
#   Function name: save_df
#   Description: Writes a dataframe to <save_location>/<csv_name> as a csv without the
#                index column and reports progress on stdout
#   Parameters: df, save_location, csv_name
#        df(pandas dataframe): The dataframe to write
#        save_location(str): Folder that receives the csv file
#        csv_name(str): File name of the csv

    # Make sure the target folder, including missing parents, exists
    Path(save_location).mkdir(parents=True, exist_ok=True)

    destination = f"{save_location}/{csv_name}"
    print(f"Saving {csv_name} at {destination}")
    df.to_csv(destination, index=False)
    print(f"Successfully saved {csv_name}!")

##### Main

In [None]:
def main(start_szn=1990, end_szn=2024):
#   Function name: main
#   Description: The entry function of the notebook. Collects the calendar of every
#                season, adds the offseason dates, saves the result as
#                ../tables/nfl_dates_xref.csv and displays it.
#   Parameters: start_szn, end_szn
#        start_szn(int): First season kept in the output
#        end_szn(int): Last season kept in the output

    con_memory = duckdb.connect(database=':memory:')
    try:
        # The season before start_szn is collected as well: its last game marks where
        # the offseason of start_szn begins
        seasons = range(start_szn - 1, end_szn + 1)

        # Collect the yearly calendars first and concatenate once, instead of copying
        # the growing dataframe on every iteration
        yearly_calendars = []
        for year in seasons:
            print(f"Collecting {year} season")
            yearly_calendars.append(collect_year_calndr(con_memory, year))
        df = pd.concat(yearly_calendars)

        df = add_offszn_dates(con_memory, df)
        df = df[(df['Season'] != start_szn - 1)] # Filter out season to keep start_szn's offseason
        save_df(df, '../tables', 'nfl_dates_xref.csv')
        display(df)
    finally:
        # Release the duckdb session even when a step above fails
        con_memory.close()
    
# Run the whole pipeline when this cell is executed
main()