### Author: Rodolfo Elenes

Date Created: 8/5/2025

Change log:
8/5/2025 - Initialized

##### Imports

In [None]:
import pandas as pd
import numpy as np
import duckdb
import time
from pathlib import Path
import warnings
import traceback
pd.set_option('display.max_columns', None)
warnings.filterwarnings("ignore")

In [None]:
%run ./common_utils.ipynb

##### Create gamelog functions

In [None]:
def create_player_gamelog_csv(con_memory, player_name, pfr_id):
#   Function name: create_player_gamelog_csv
#   Description: Scrapes a player's gamelog from Pro Football Reference (regular season, plus playoffs
#                when that table can be read) and transforms it into the final gamelog dataframe
#   Parameters: con_memory, player_name, pfr_id
#        con_memory(DuckDB object): Connect to DuckDB session
#        player_name(str): First and Last name of a player, ex: Saquon Barkley
#        pfr_id(str): Pro Football Reference id used in each players URL to retrieve all gamelog information
#   Return values: df, status
#        df(pandas dataframe): The final dataframe that will be exported as a csv file. It is empty when the
#                              regular season scrape fails, and is returned as it stood at the point of
#                              failure when the transformation fails
#        status(str): "Save", "Insufficient data", or "" when scraping/transforming failed

    min_rush_att = 55  # players at or below this many logged rushing attempts are not saved
    status = ""

    # The URL directory letter must be the LAST name initial. player_name starts with the first
    # name, so the letter is taken from the pfr id instead (ex: BarkSa00 -> B).
    # NOTE(review): assumes pfr ids always begin with the last name initial - confirm against PFR.
    # str() keeps a missing id (NaN) from raising here; the bad URL then fails inside the try below.
    page_letter = str(pfr_id)[:1].upper()
    time.sleep(6)  # to respect website scraping policies
    url = f"https://www.pro-football-reference.com/players/{page_letter}/{pfr_id}/gamelog/"

    # Pickup regular season data
    try:
        df = build_career_gmlog_df(url)
    except Exception as e:
        print("\nError:", e)
        print(f"Please check: {url}")
        traceback.print_exc()
        return pd.DataFrame(), status  # Have to return a value for df for pd.read_html() failures

    # Add playoffs data if available
    try:
        playoffs_df = build_career_gmlog_df(url, playoffs=True)
        df = pd.concat([df, playoffs_df])
    except Exception:
        # Best effort: a player without a readable playoffs table keeps the regular season rows only.
        # Exception (not a bare except) so that an interrupt still stops the run.
        pass

    try:
        df = df_rebuild(con_memory, df, player_name)
        total_rush_att = df['RushAtt'].sum()
        if total_rush_att <= min_rush_att:
            print(f"Insufficient gamelog data for {player_name}. Player has {total_rush_att} rushing attempts logged.")
            status = "Insufficient data"
        else:
            print(f"Gamelog for {player_name}")
            display(df)
            status = "Save"
    except Exception as e:
        # status keeps whatever value it had when the failure happened
        print(f"Unsupported gamelog schema for {player_name}. Please check: {url}")
        print("\nError:", e, "\n")
        traceback.print_exc()

    return df, status

In [None]:
def build_career_gmlog_df(url, playoffs=False):
#   Function name: build_career_gmlog_df
#   Description: This function is used to scrape the raw dataframe of the player's gamelog from pfr's website
#   Parameters: url, playoffs
#        url(str): The URL that points to a player's gamelog
#        playoffs(boolean): Tells the function to extract the regular season or playoffs table
#   Return values: df
#        df(pandas dataframe): The raw dataframe that will be transformed into the final dataframe,
#                              with flattened "<group>_<column>" names and a Season_type column

    # The first table read from the page is used as regular season, the second as playoffs.
    # Asking for the playoffs table raises when the page only yields one table.
    table_idx = 1 if playoffs else 0
    df = pd.read_html(url, header=[0, 1])[table_idx]

    # Fill top-level header missing values forward
    # (.ffill() replaces fillna(method='ffill'), which newer pandas versions deprecate)
    cols = pd.DataFrame(df.columns.tolist())
    cols.iloc[:, 0] = cols.iloc[:, 0].replace("Unnamed:.*", pd.NA, regex=True).ffill()
    # Rebuild MultiIndex
    df.columns = pd.MultiIndex.from_frame(cols)
    # NOTE(review): the un-named leading header group is selected with the label 'NaN' - confirm
    # this matches how the installed pandas version labels the missing top level.
    df = df[['NaN', 'Rushing', 'Receiving', 'Snap Counts']]

    # Then flatten the two header levels into single "<group>_<column>" names
    df.columns = [
        f"{a}_{b}".strip('_') if b else a
        for a, b in df.columns
    ]

    df['Season_type'] = "POST" if playoffs else "REG"

    df = df.loc[:, ~df.columns.duplicated()] # Drops duplicate columns

    return df

In [None]:
def df_rebuild(con_memory, df, player_name):
#   Function name: df_rebuild
#   Description: Takes the raw scraped dataframe and applies every transformation needed to
#                produce the final gamelog (column cleanup, Season column, schema, DNP/BYE/SUSP rows)
#   Parameters: con_memory, df, player_name
#        con_memory(DuckDB object): Connect to DuckDB session
#        df(pandas dataframe): The raw input dataframe
#        player_name(str): Name of the player
#   Return values: df
#        df(pandas dataframe): The final dataframe that will be saved as a csv file

    # Drop the "nan_" prefix (Date, GS, ...) and rename Gcar to CarGm
    df.columns = [
        'CarGm' if name == 'nan_Gcar' else name.replace("nan_", "")
        for name in df.columns
    ]

    # Snap count columns are low priority: create them as zeros when the table has none
    core_cols = ['CarGm', 'Date', 'GS', 'Season_type', 'Week', 'Team', 'Rushing_Att', 'Rushing_Yds', 'Rushing_TD', 'Receiving_Tgt', 'Receiving_Rec', 'Receiving_Yds', 'Receiving_TD']
    snap_cols = ['Snap Counts_OffSnp', 'Snap Counts_STSnp']
    for name in snap_cols:
        if name not in df.columns:
            df[name] = 0
    df = df[core_cols + snap_cols]

    # Rows without a parseable date are not games (header rows, summary rows, etc.)
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
    df = df.dropna(subset=['Date']).reset_index(drop = True)

    # Create Season column: a season spans Aug 1 (exclusive) through Mar 1 of the next year (inclusive)
    first_year = df['Date'].min().year - 1  # one year earlier for players who debuted in the next calendar year
    last_year = df['Date'].max().year
    df['Season'] = ''
    for year in reversed(range(first_year, last_year + 1)):
        in_season = (df['Date'] > f"{year}-08-01") & (df['Date'] <= f"{year + 1}-03-01")
        df["Season"] = np.where(in_season, year, df["Season"])

    # Final column rename (positional, same order as the selection above plus Season)
    final_columns = ['CarGm', 'Date', 'GS', 'Season_type', 'Week', 'Team', 'RushAtt', 'RushYds', 'RushTD', 'Tgt', 'Rec', 'RecYds', 'RecTD', 'OffSnp', 'STSnp', 'Season']
    df.columns = final_columns[:df.shape[1]]

    df['GS'] = np.where(df['GS'] == '*', 1, 0)  # make Game Started column binary
    df = apply_schema(df)

    # Final order, sorted chronologically
    column_order = ['CarGm', 'Date', 'Season', 'Season_type', 'Week', 'Team', 'GS', 'RushAtt', 'RushYds', 'RushTD', 'Tgt', 'Rec', 'RecYds', 'RecTD', 'OffSnp', 'STSnp']
    df = df[column_order].sort_values("Date").reset_index(drop=True)
    df['CarGm'] = range(1, len(df) + 1) # Number the career games by date order

    # Fill in the weeks the player did not play, then classify them
    df = add_DNP_rows(df, player_name)
    df = trnsfrm_bye_weeks(df)
    df = add_DNP_dates(con_memory, df)
    df = trnsfrm_susp_weeks(df, player_name)

    return df

In [None]:
def apply_schema(df):
#   Function name: apply_schema
#   Description: Casts the gamelog columns to their final datatypes (integers for counters and
#                stats, 'YYYY-MM-DD' strings for Date). The input dataframe is modified in place.
#   Parameters: df
#        df(pandas dataframe): The input dataframe
#   Return values: df
#        df(pandas dataframe): The same dataframe with the correct datatypes schema

    id_cols = ['CarGm', 'Season', 'Week', 'GS']
    stat_cols = ['RushAtt', 'RushYds', 'RushTD', 'Tgt', 'Rec', 'RecYds', 'RecTD', 'OffSnp', 'STSnp']

    # Identifier columns are cast as-is: a missing value here is an error, not a zero
    for name in id_cols:
        df[name] = df[name].astype(float).astype(int)

    # Stat columns treat a missing value as 0 before the cast
    for name in stat_cols:
        if df[name].isna().any():
            df[name] = df[name].fillna(0)
        df[name] = df[name].astype(float).astype(int)

    # Unparseable dates are coerced to missing values rather than raising
    parsed_dates = pd.to_datetime(df['Date'], errors='coerce')
    df['Date'] = parsed_dates.dt.strftime('%Y-%m-%d')

    return df

##### DNP rows transformation functions

In [None]:
def _regular_season_weeks(season):
#   Function name: _regular_season_weeks
#   Description: Lists the regular season week numbers used for a season (1-18 from 2021 on, 1-17 before)
#   Parameters: season
#        season(int): The season year
#   Return values: list of week numbers

    last_week = 18 if season >= 2021 else 17
    return list(range(1, last_week + 1))


def add_DNP_rows(input_df, player_name):
#   Function name: add_DNP_rows
#   Description: This function is used to add missing DNP rows to the dataframe: one row per regular
#                season week without a game, plus whole seasons found on the roster but not in the gamelog
#   Parameters: input_df, player_name
#        input_df(pandas dataframe): The input dataframe
#        player_name(str): Name of the player
#   Return values: df
#        df(pandas dataframe): The transformed dataframe with DNP rows

    con_memory = duckdb.connect(database=':memory:')
    try:  # the finally below closes the connection even when a step raises
        career_seasons = input_df['Season'].unique().tolist() # get the seasons in a list

        df_gamelog = input_df[(input_df['Season_type'] == 'REG')] # exclude playoffs rows
        df_playoffs = input_df[(input_df['Season_type'] == 'POST')] # store playoffs rows
        final_columns = input_df.columns
        df = pd.DataFrame(columns = final_columns)

        # split df_gamelog by seasons
        for season in career_seasons:
            df_season = df_gamelog[(df_gamelog['Season']) == season]
            week_games = _regular_season_weeks(season)

            df_DNP = pd.DataFrame(columns = final_columns)
            games_played = df_season['Week'].tolist()  # get weeks value from season
            weeks_missed = list(set(week_games) - set(games_played))  # get games missed

            #add DNP rows
            for week in weeks_missed:
                df_DNP.loc[-1] = {'Week': week, 'Season_type': 'DNP', 'Season': season}  # Add DNP row
                df_DNP = df_DNP.reset_index(drop=True)

            df_season = pd.concat([df_season, df_DNP]).sort_values('Week').reset_index(drop=True)
            df_season['Team'] = df_season['Team'].ffill()  # fills DNP week with correct team
            if df_season['Team'].isna().any(): # in case forward fill doesnt work due to a player beginning the season injured
                df_season['Team'] = df_season['Team'].bfill()
            df_season = df_season.fillna(0)
            df = pd.concat([df, df_season])

        df = pd.concat([df, df_playoffs])
        df = df.sort_values(by = ['Season', 'Week']).reset_index(drop=True)

        # Add missing seasons
        df_sched = pd.read_csv("../tables/nfl_team_schedules/nfl_team_schedules.csv")
        df_roster = construct_df_roster(con_memory)
        df_roster = df_roster[(df_roster.Player == player_name) & (df_roster.ABV.isin(df.Team.unique().tolist()))]

        gamelog_szns = df.Season.unique().tolist()
        roster_szns = df_roster.Season.unique().tolist()

        # Seasons are only back-filled when the gamelog and roster season lists differ AND the
        # gamelog's own seasons are not consecutive (i.e. there is a gap in the gamelog)
        has_season_gap = gamelog_szns != roster_szns and not all(
            b - a == 1 for a, b in zip(gamelog_szns, gamelog_szns[1:])
        )

        if has_season_gap:
            missing_szns = list(set(roster_szns) - set(gamelog_szns))
            for year in missing_szns:
                week_games = _regular_season_weeks(year)

                # previous season's rows are only used as a frame with the right columns
                df_temp = df[(df.Season == (year - 1))].reset_index(drop=True)
                df_temp2 = df_roster[(df_roster.Season == year)].reset_index(drop=True)

                df_mss_sched = df_sched[(df_sched.Team == df_temp2.iloc[0].loc['Team']) & (df_sched.Season == year)]
                # NOTE(review): the roster row is read with both 'Team' (above) and 'team' (below).
                # The roster schema is not visible here - confirm that both columns exist and that
                # 'team' holds the abbreviation expected in the gamelog's Team column.
                df_temp.loc[-1] = {'CarGm': 0, 'Date': df_mss_sched.Date.tolist(), 'Season': year, 'Season_type': 'DNP', 'Team': df_temp2.iloc[0].loc['team'], 'Week': df_mss_sched.Week.tolist()}
                df_temp = df_temp.explode(['Date', 'Week'])  # one row per scheduled game
                df_temp = df_temp[(df_temp.Season == year) & (df_temp.Week.isin(week_games))].fillna(0)
                df = pd.concat([df, df_temp]).sort_values(by=['Season', 'Week']).reset_index(drop=True)

        return apply_schema(df)
    finally:
        con_memory.close()

In [None]:
def trnsfrm_bye_weeks(df):
#   Function name: trnsfrm_bye_weeks
#   Description: This function is used to correctly identify the bye weeks on DNP rows. A DNP row whose
#                week equals its team's bye week for that season is relabelled as BYE. The input
#                dataframe is modified in place.
#   Parameters: df
#        df(pandas dataframe): The input dataframe
#   Return values: df
#        df(pandas dataframe): The transformed dataframe with BYE week rows

    df_fltrd = df[(df["Season_type"] == 'DNP')]
    teams_played_for = df_fltrd['Team'].unique().tolist()
    years_played_for = df_fltrd['Season'].unique().tolist()

    df_teams = construct_df_teams()

    # Get all the bye weeks relevant to the player
    df_bye = pd.read_csv("../tables/bye_weeks_xref/bye_weeks_xref.csv")
    df_bye = pd.merge(df_bye, df_teams, on=['Team', 'PFR_ABV'], how='inner')
    df_bye = df_bye[df_bye.ABV.isin(teams_played_for) & df_bye.Season.isin(years_played_for)]
    df_bye = df_bye[['ABV', 'Season', 'Bye Week']]

    # Match the correct bye week to the players log on DNP rows
    merged_df = pd.merge(df_fltrd, df_bye, left_on=['Team', 'Season'], right_on=['ABV', 'Season'], how='inner')
    bye_hits = merged_df[(merged_df["Week"] == merged_df["Bye Week"])]

    # Apply BYE to the final df. Only DNP rows are targeted, so a non-DNP row that shares the same
    # Season/Week can never be overwritten, and only Season_type changes on the matched row.
    for season, week in zip(bye_hits['Season'], bye_hits['Week']):
        is_bye_row = (df['Season'] == season) & (df['Week'] == week) & (df['Season_type'] == 'DNP')
        df.loc[is_bye_row, 'Season_type'] = "BYE"

    return df

In [None]:
def add_DNP_dates(con_memory, df):
#   Function name: add_DNP_dates
#   Description: Fill in correct dates for DNP rows by looking up each row's team, season and week
#                in the team schedules table
#   Parameters: con_memory, df
#        con_memory(DuckDB object): Connect to DuckDB session
#        df(pandas dataframe): The input dataframe
#   Return values: df
#        df(pandas dataframe): The transformed dataframe with filled out dates column, sorted by
#                              Season and Week

    # NOTE: the SQL below refers to the local dataframes (df_sched, df_DNP, df_teams) by their
    # Python variable names, so these variables must not be renamed without editing the queries.
    df_sched = pd.read_csv("../tables/nfl_team_schedules/nfl_team_schedules.csv")
    df_DNP = df[(df["Season_type"] == 'DNP')]
    df_teams = construct_df_teams()

    # Add the team abbreviation (ABV) to every schedule row by matching on the team name
    df_sched = con_memory.execute(f"""SELECT df_sched.*, df_teams.ABV FROM df_sched 
                                 JOIN df_teams ON df_sched.Team = df_teams.Team""").fetchdf()

    # Add the team name (Team_Name) to every DNP row by matching its Team against the abbreviation
    df_DNP = con_memory.execute(f"""SELECT df_DNP.*, df_teams.Team as Team_Name FROM df_DNP 
                                 JOIN df_teams ON df_DNP.Team = df_teams.ABV""").fetchdf()

    # Replace each DNP row's Date with the schedule's Date for the same team name, season and week.
    # Team_Name is dropped again; CarGm and Date are listed first and excluded from the wildcard.
    # This is a plain JOIN, so a DNP row without a matching schedule row is not returned.
    df_DNP = con_memory.execute(f""" SELECT CarGm, df_sched.Date, df_DNP.* EXCLUDE (Team_Name, Date, CarGm)
                                 FROM df_sched JOIN df_DNP 
                                 ON df_sched.Team = df_DNP.Team_Name 
                                 AND df_sched.Season = df_DNP.Season AND df_sched.Week = df_DNP.Week 
                                 ORDER BY df_sched.Date""").fetchdf()
    
    # Swap the undated DNP rows for the dated ones
    df = df[(df['Season_type'] != 'DNP')]
    df = pd.concat([df, df_DNP]).sort_values(by=['Season', 'Week']).reset_index(drop=True)

    return df

In [None]:
def trnsfrm_susp_weeks(df, player_name):
#   Function name: trnsfrm_susp_weeks
#   Description: Transform DNP weeks into SUSP (suspended) rows based off the susp_weeks_xref.csv table.
#                The input dataframe is modified in place.
#   Parameters: df, player_name
#        df(pandas dataframe): The input dataframe
#        player_name(str): Name of the player
#   Return values: df
#        df(pandas dataframe): The transformed dataframe with target DNP rows transformed to SUSP

    szns_plyd = df.Season.unique().tolist()
    df_susp = pd.read_csv("../tables/susp_weeks_xref.csv")
    df_susp = df_susp[(df_susp.Player == player_name) & (df_susp.Season.isin(szns_plyd))]

    for _, susp_entry in df_susp.iterrows():
        # Candidates are recomputed per entry so rows already marked SUSP are not counted twice
        on_or_after = (df.Date >= susp_entry['Date']) & (df["Season_type"] == 'DNP')

        if "game" in susp_entry['Susp_len'].lower():          # For entries of x games suspensions
            susp_len = int(susp_entry['Susp_len'].split(" ")[0])
            # the first x DNP rows on or after the suspension date, in dataframe order
            target_idx = df.index[on_or_after][:susp_len]
        else:                                                 # For entries of Entire xxxx Season suspensions
            target_idx = df.index[on_or_after & (df["Season"] == susp_entry['Season'])]

        # Only Season_type changes; assigning through df.loc avoids editing a filtered copy
        df.loc[target_idx, 'Season_type'] = 'SUSP'

    return df

###### Main

In [None]:
def main():
#   Function name: main
#   Description: The entry function of the notebook. Scrapes the gamelog of every player whose
#                gm_log_rtrvd flag is 0 in players_xref.csv, saves it when usable, and records the
#                outcome back into players_xref.csv after each player
#   gm_log_rtrvd values written: 1 = saved, 2 = insufficient data, 3 = failed (needs debugging)

    players_xref_path = "../tables/players_xref.csv"
    status_codes = {"Save": 1, "Insufficient data": 2}  # anything else is recorded as 3

    con_memory = duckdb.connect(database=':memory:')
    try:  # the finally below closes the connection even when a player raises
        print("All players that will have gamelog data scraped.")
        player_db = pd.read_csv(players_xref_path)
        pending = player_db[player_db['gm_log_rtrvd'] == 0]
        total_players = pending.shape[0]
        display(pending.reset_index(drop = True))

        # pending keeps player_db's row labels, so row can be used to write the result back
        for player_count, (row, player_entry) in enumerate(pending.iterrows(), start=1):
            player_name = player_entry.loc['full_name']
            pfr_id = player_entry.loc['pfr_id']

            print(f"Player ({player_count}/{total_players}): {player_name}")
            df, status = create_player_gamelog_csv(con_memory, player_name, pfr_id)

            if status == "Save":
                save_df(df, "../tables/players_gamelogs/players", f"{player_name}_gamelog.csv")
            gm_log_entry = status_codes.get(status, 3)

            # Written after every player so an interrupted run keeps its progress
            player_db.loc[row, 'gm_log_rtrvd'] = gm_log_entry
            player_db.to_csv(players_xref_path, index = False)
            print(f"Updated gm_log_rtrvd entry to {gm_log_entry} in players_xref.csv for {player_name}")

        print(f"Completed acquiring gamelog for {total_players} players.")
    finally:
        con_memory.close()

    if total_players > 0:
        concatenate_all_files('players_gamelogs', 'players')

main()

# Side Work

In [None]:
# Scratch cell: joins the suspension xref rows to the roster rows of the same player and
# displays the result.
# NOTE(review): the connection opened here is not closed in this cell.
con_memory = duckdb.connect(database=':memory:')
df_susp = pd.read_csv("../tables/susp_weeks_xref.csv")
df_roster = construct_df_roster(con_memory)
# display(df_susp)
# display(df_roster)

# The query refers to the df_susp and df_roster dataframes by their Python variable names.
# NOTE(review): this joins on df_roster.name, while add_DNP_rows filters on df_roster.Player -
# confirm which column construct_df_roster actually returns.
df_susp = con_memory.execute("""SELECT * FROM df_susp JOIN df_roster
                                ON df_susp.Player = df_roster.name
                                """).fetchdf()
display(df_susp)

Prototype code:

In [None]:
# # adding missing seasons in add_DNP_rows()

# player = 'Roosevelt Potts'
# df = pd.read_csv(f"../tables/players_gamelog/{player}_gamelog.csv")
# df_sched = pd.read_csv(f"../tables/nfl_team_schedules.csv")
# df_teams = construct_df_teams()
# df_roster = construct_df_roster(duckdb.connect(database=':memory:'))
# df_roster = pd.merge(df_roster, df_teams, left_on = 'team', right_on = 'ABV')
# df_roster = df_roster[(df_roster.name == player) & (df_roster.team.isin(df.Team.unique().tolist()))]

# gamelog_szns = df.Season.unique().tolist()
# roster_szns = df_roster.season.unique().tolist()

# if gamelog_szns != roster_szns:
#     missing_szns_chk = all(b - a == 1 for a, b in zip(gamelog_szns, gamelog_szns[1:]))
# else:
#     missing_szns_chk = True

# if missing_szns_chk == False:
#     missing_szns = list(set(roster_szns) - set(gamelog_szns))
#     for year in missing_szns:
#         if year >= 2021:
#             week_games = list(range(1, 19))
#         else:
#             week_games = list(range(1, 18))
        
#         df_temp = df[(df.Season == (year - 1))].reset_index(drop=True)
#         df_temp2 = df_roster[(df_roster.season == year)].reset_index(drop=True)
        
#         df_mss_sched = df_sched[(df_sched.Team == df_temp2.iloc[0].loc['Team']) & (df_sched.Season == year)]   
#         df_temp.loc[-1] = {'CarGm': 0, 'Date': df_mss_sched.Date.tolist(), 'Season': year, 'Season_type': 'DNP', 'Team': df_temp2.iloc[0].loc['team'], 'Week': df_mss_sched.Week.tolist()}
#         df_temp = df_temp.explode(['Date', 'Week'])
#         df_temp = df_temp[(df_temp.Season == year) & (df_temp.Week.isin(week_games))].fillna(0)
#         df = pd.concat([df, df_temp]).sort_values(by=['Season', 'Week']).reset_index(drop=True)
# else:
#     print('No missing seasons')
    
# display(df)

##### Injury data exploration (come back later)

In [None]:
# import nfl_data_py as nfl
# injuries_df = nfl.import_injuries(list(range(2009, 2025))) # Specify the years you want to retrieve

In [None]:
# # Actual Injuries that I will count
# query = injuries_df[(injuries_df['position'] == 'RB') & (injuries_df['full_name'] == 'Kyren Williams')
#                     & (~injuries_df['report_primary_injury'].str.contains("Coach|Decision|Not Injury|Non Injury|COVID|Personal|Non-Football Illness|Non Football Illness|non football injury", case=False, na=False))].dropna(subset=['report_status'])
# query = query[['season', 'game_type', 'team', 'week', 'full_name', 'report_primary_injury', 'report_secondary_injury', 'report_status']]
# for i in ['season', 'week']:
#     query[i] = query[i].astype(int)
# display(query)

# ##############################################################################################################################################################################

# display(pd.read_csv("../tables/players_gamelog/Kyren Williams_gamelog.csv").query("Season_type == 'DNP'"))

# ##############################################################################################################################################################################

# # Non injuries leaves (will stay as DNP, but will be logged)
# query = injuries_df[(injuries_df['position'] == 'RB') & (injuries_df['full_name'] == 'Kyren Williams')
#                     & (injuries_df['report_primary_injury'].str.contains("Coach|Decision|Not Injury|Non Injury|COVID|Personal|Non-Football Illness|Non Football Illness|non football injury", case=False))].dropna(subset=['report_status'])
# query = query[['season', 'game_type', 'team', 'week', 'full_name', 'report_primary_injury', 'report_secondary_injury', 'report_status']]
# for i in ['season', 'week']:
#     query[i] = query[i].astype(int)
# display(query)

In [None]:
# # Make function that checks consecutive DNP rows, and flags when player has big injury

# lst = [4, 5, 6, 9, 10, 11, 13, 14]
# lst.sort(reverse=True)
# print(lst)

# injury_list = []
# sub_list = []
# big_injury = False
# for i in lst:
#     if sub_list == []:
#         sub_list.append(i)
#         continue
#     prev_num = sub_list.pop()
#     subtract = prev_num - i
#     print(subtract)
#     sub_list.append(i)
#     if subtract == 1:
#         injury_list.append(subtract)
#     else:
#         injury_list.clear()

#     if len(injury_list) >= 3:
#         big_injury = True        
        
# if big_injury == True:
#     print('player had big injury this year')
# else:
#     print('player had no big injuries')