### Author: Rodolfo Elenes

Date Created: 8/5/2025

Change log:
8/5/2025 - Initialized

# Notebook to do list
    1.) Add a column that flags whether the player had a major injury that season
    
# Enhancements
    1.) Add data validation steps
    2.) SQL server implementation over CSV files
    3.) Drive the notebook from JSON config files, so that it can be reused for other positions
        and so that long-lived information (like season mappings) is stored outside the code

##### Imports

In [1]:
import pandas as pd
import numpy as np
import time
from pathlib import Path
import warnings
import traceback
pd.set_option('display.max_columns', None)
warnings.filterwarnings("ignore")

##### Create gamelog functions

In [2]:
def create_player_gamelog_csv(player_name, pfr_id):
#   Function name: create_player_gamelog_csv
#   Description: This function is used to generate a dataframe that contains a player's gamelog.
#                The regular season table is always scraped; the playoffs table is appended when
#                it can be scraped.
#   Parameters: player_name, pfr_id
#        player_name(str): First and Last name of a player, ex: Saquon Barkley
#        pfr_id(str): Pro Football Reference id used in each players URL to retrieve all gamelog information
#   Return values: df, status
#        df(pandas dataframe): The final dataframe that will be exported as a csv file
#        status(str): Tells the parent function this function's run result:
#                     "Save" (gamelog was transformed), "Insufficient data" (too few rows scraped)
#                     or "" (the transformation raised an error)

    # Get last name initial. player_name is "First Last", so its first character would be the
    # FIRST name initial; the pfr_id begins with the last name (ex: BarkSa00), so its first
    # character is used for the URL directory instead.
    lst_nm_initial = str(pfr_id)[:1].upper()

    time.sleep(6) # to respect website scraping policies
    url = f"https://www.pro-football-reference.com/players/{lst_nm_initial}/{pfr_id}/gamelog/"
    df = build_career_gmlog_df(url)

    # The playoffs table is best-effort: when it cannot be built (e.g. the player has no playoffs
    # table) the regular season rows are kept on their own. Only Exception is caught so that a
    # KeyboardInterrupt still stops the run.
    try:
        playoffs_df = build_career_gmlog_df(url, playoffs=True)
        df = pd.concat([df, playoffs_df])
    except Exception:
        pass

    status = ""
    if df.shape[0] <= 8:   # sets minimum game requirement for players to be saved
        print(f"Insufficient gamelog data for {player_name}. Player has {df.shape[0]} games logged.")
        status = "Insufficient data"
    else:
        try:
            df = df_rebuild(df)
        except Exception as e:
            print(f"Unsupported gamelog schema for {player_name}. Please check: {url}")
            print("\nError:", e, "\n")
            traceback.print_exc()
            return df, status  # status is still "", which the caller treats as a failed save
        print(f"Gamelog for {player_name}")
        display(df)
        status = "Save"

    return df, status

In [3]:
def build_career_gmlog_df(url, playoffs=False):
#   Function name: build_career_gmlog_df
#   Description: This function is used to scrape the raw dataframe of the player's gamelog from pfr's website
#   Parameters: url, playoffs
#        url(str): The URL that points to a player's gamelog
#        playoffs(boolean): Tells the function to extract the regular season or playoffs table
#   Return values: df
#        df(pandas dataframe): The raw dataframe that will be transformed into the final dataframe

    # read_html returns every table found on the page: the regular season table is taken from
    # position 0 and the playoffs table from position 1 (an IndexError is raised when the
    # requested table is not on the page)
    table_position = 1 if playoffs else 0
    df = pd.read_html(url, header=[0, 1])[table_position]

    # Fill top-level header missing values forward
    cols = pd.DataFrame(df.columns.tolist())
    # .ffill() replaces the deprecated fillna(method='ffill') spelling
    cols.iloc[:, 0] = cols.iloc[:, 0].replace("Unnamed:.*", pd.NA, regex=True).ffill()
    # Rebuild MultiIndex
    df.columns = pd.MultiIndex.from_frame(cols)
    # NOTE(review): selecting the unlabeled leading columns through the 'NaN' key depends on how
    # pandas exposes missing top-level labels - confirm this still holds when upgrading pandas.
    # .copy() makes the subset an independent frame before a new column is added to it.
    df = df[['NaN', 'Rushing', 'Receiving', 'Snap Counts']].copy()

    # Then flatten as before
    df.columns = [
        f"{a}_{b}".strip('_') if b else a
        for a, b in df.columns
    ]

    # Tag every row with the table it came from
    df['Season_type'] = "POST" if playoffs else "REG"

    return df

In [4]:
def df_rebuild(df):
#   Function name: df_rebuild
#   Description: This function takes the raw scraped dataframe and applies every transformation
#                needed to reach the final gamelog layout
#   Parameters: df
#        df(pandas dataframe): The raw input dataframe (its column labels are renamed in place)
#   Return values: df
#        df(pandas dataframe): The final dataframe that will be saved as a csv file

    # Strip the "nan_" prefix from the ungrouped columns; Gcar is exclusively renamed to CarGm
    cleaned_names = []
    for name in df.columns:
        if "nan" not in name:
            cleaned_names.append(name)
            continue
        if name == 'nan_Gcar':
            name = 'CarGm'
        cleaned_names.append(name.replace("nan_", ""))
    df.columns = cleaned_names

    # Keep only the columns that are processed further
    process_columns = [
        'CarGm', 'Date', 'GS', 'Season_type', 'Week', 'Team',
        'Rushing_Att', 'Rushing_Yds', 'Rushing_TD',
        'Receiving_Tgt', 'Receiving_Rec', 'Receiving_Yds', 'Receiving_TD',
        'Snap Counts_OffSnp', 'Snap Counts_Off%', 'Snap Counts_STSnp', 'Snap Counts_ST%',
    ]
    df = df[process_columns]

    # Rows without a parseable date are not games (header rows, summary rows, etc.)
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
    df = df.dropna(subset=['Date']).reset_index(drop = True)

    # Create Season column: a season spans the day after Aug 1 through Mar 1 of the next year
    earliest_year = df['Date'].min().year - 1  # one year back for debuts in the next calendar year
    latest_year = df['Date'].max().year
    df['Season'] = ''
    for season_year in reversed(range(earliest_year, latest_year + 1)):
        season_start = f"{season_year}-08-01"
        season_end = f"{season_year + 1}-03-01"
        in_season = (df['Date'] > season_start) & (df['Date'] <= season_end)
        df["Season"] = np.where(in_season, season_year, df["Season"])

    # Final column rename, applied by position
    final_columns = [
        'CarGm', 'Date', 'GS', 'Season_type', 'Week', 'Team',
        'RushAtt', 'RushYds', 'RushTD', 'Tgt', 'Rec', 'RecYds', 'RecTD',
        'OffSnp', 'OffSnp%', 'STSnp', 'STSnp%', 'Season',
    ]
    df.columns = final_columns[:df.shape[1]]

    df['GS'] = np.where(df['GS'] == '*', 1, 0)  # make Game Started column binary
    df = apply_schema(df)

    # Final order, sorted chronologically
    column_order = [
        'CarGm', 'Date', 'Season', 'Season_type', 'Week', 'Team', 'GS',
        'RushAtt', 'RushYds', 'RushTD', 'Tgt', 'Rec', 'RecYds', 'RecTD',
        'OffSnp', 'OffSnp%', 'STSnp', 'STSnp%',
    ]
    df = df[column_order]
    df = df.sort_values("Date")
    df = df.reset_index(drop=True)
    df['CarGm'] = range(1, len(df) + 1) # Numerize the Career Games based off the Dates

    # Add the weeks without a game, then label the bye weeks among them
    return find_bye_weeks(add_DNP_rows(df))

In [5]:
def apply_schema(df):
#   Function name: apply_schema
#   Description: This function casts the gamelog columns to their expected datatypes. The input
#                dataframe is modified in place and also returned.
#   Parameters: df
#        df(pandas dataframe): The input dataframe
#   Return values: df
#        df(pandas dataframe): The same dataframe with the datatypes schema applied

    # whole-number columns: go through float first so values such as "12.0" can be cast to int
    integer_columns = (
        'CarGm', 'Season', 'Week', 'GS',
        'RushAtt', 'RushYds', 'RushTD',
        'Tgt', 'Rec', 'RecYds', 'RecTD',
        'OffSnp', 'STSnp',
    )
    for name in integer_columns:
        as_float = df[name].astype(float)
        df[name] = as_float.astype(int)

    # snap percentage columns stay floats
    for name in ('OffSnp%', 'STSnp%'):
        df[name] = df[name].astype(float)

    # values that cannot be parsed as a date become NaT
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

    return df

##### DNP rows manipulation functions

In [6]:
def add_DNP_rows(input_df):
#   Function name: add_DNP_rows
#   Description: This function adds one DNP row for every regular season week that has no game
#                row in the player's gamelog
#   Parameters: input_df
#        input_df(pandas dataframe): The input dataframe
#   Return values: df
#        df(pandas dataframe): The transformed dataframe with DNP rows

    seasons_played = input_df['Season'].unique().tolist() # every season found in the gamelog

    is_regular = input_df['Season_type'] == 'REG'
    is_playoffs = input_df['Season_type'] == 'POST'
    regular_rows = input_df[is_regular] # DNP weeks are only looked for among these rows
    playoff_rows = input_df[is_playoffs] # set aside, appended back untouched at the end
    gamelog_columns = input_df.columns
    df = pd.DataFrame(columns = gamelog_columns)

    # handle the regular season rows one season at a time
    for season in seasons_played:
        season_rows = regular_rows[regular_rows['Season'] == season]

        # seasons from 2021 on are checked for weeks 1-18, earlier seasons for weeks 1-17
        last_week = 18 if season >= 2021 else 17
        week_games = list(range(1, last_week + 1))

        dnp_rows = pd.DataFrame(columns = gamelog_columns)
        weeks_played = season_rows['Week'].tolist()
        weeks_missed = list(set(week_games) - set(weeks_played))  # weeks without a game row

        # one DNP row per missed week; only Week, Season_type and Season are set here
        for week in weeks_missed:
            dnp_rows.loc[-1] = {'Week': week, 'Season_type': 'DNP', 'Season': season}
            dnp_rows = dnp_rows.reset_index(drop=True)

        season_rows = pd.concat([season_rows, dnp_rows]).sort_values('Week').reset_index(drop=True)

        # a DNP week takes the team of the previous week; when the season opens with DNP weeks
        # there is no previous week, so those are filled from the following week instead
        season_rows['Team'] = season_rows['Team'].ffill()
        if season_rows['Team'].isna().any():
            season_rows['Team'] = season_rows['Team'].bfill()

        # NOTE(review): this fills every remaining empty cell of the DNP rows with 0, including
        # Date - confirm the resulting placeholder date is what downstream steps expect
        season_rows = season_rows.fillna(0)
        df = pd.concat([df, season_rows])

    df = pd.concat([df, playoff_rows])
    df = df.sort_values(by = ['Season', 'Week']).reset_index(drop=True)
    df = apply_schema(df)

    return df

In [7]:
def find_bye_weeks(df):
#   Function name: find_bye_weeks
#   Description: This function relabels the DNP rows that fall on the team's bye week as BYE rows,
#                using the team and bye week reference tables stored under ../tables
#   Parameters: df
#        df(pandas dataframe): The input dataframe (modified in place)
#   Return values: df
#        df(pandas dataframe): The transformed dataframe with BYE week rows

    dnp_rows = df[df["Season_type"] == 'DNP']
    teams_played_for = dnp_rows['Team'].unique().tolist()
    years_played_for = dnp_rows['Season'].unique().tolist()

    df_teams = pd.read_csv("../tables/team_info_xref.csv")
    teams_with_alt_abv = df_teams.dropna(subset=['ABV2']).reset_index(drop=True)

    # Flatten team_info_xref so that every extra abbreviation gets a row of its own.
    # New rows are appended under negative labels, which cannot clash with the existing ones.
    next_label = -1
    for row in teams_with_alt_abv.index:
        team_entry = teams_with_alt_abv.loc[row]
        team_name = team_entry.loc['Team']
        ABV2 = team_entry.loc['ABV2']
        ABV3 = team_entry.loc['ABV3']
        df_teams.loc[next_label] = {'Team': team_name, 'ABV': ABV2}  # secondary abbreviation
        next_label -= 1
        if str(ABV3) != 'nan':
            df_teams.loc[next_label] = {'Team': team_name, 'ABV': ABV3}  # third abbreviation
            next_label -= 1
    df_teams = df_teams[['Team', 'ABV']].sort_values('Team').reset_index(drop=True)

    # Get all the bye weeks relevant to the player
    df_bye = pd.read_csv("../tables/bye_weeks_xref/bye_weeks_xref.csv")
    df_bye = pd.merge(df_bye, df_teams, on='Team', how='inner')

    # Remove Titans HOU ABV (meant for the oilers) where the season >= 1997
    titans_as_hou = (
        (df_bye['Team'] == 'Tennessee Titans')
        & (df_bye['ABV'] == 'HOU')
        & (df_bye['Season'] >= 1997)
    )
    df_bye = df_bye.drop(df_bye[titans_as_hou].index)

    df_bye = df_bye[df_bye['ABV'].isin(teams_played_for)].reset_index(drop=True)
    df_bye = df_bye[df_bye['Season'].isin(years_played_for)].reset_index(drop=True)
    df_bye = df_bye[['ABV', 'Season', 'Bye Week']]

    # Match the correct bye week to the players log on DNP rows
    bye_rows = pd.merge(dnp_rows, df_bye, left_on=['Team', 'Season'], right_on=['ABV', 'Season'], how='inner')
    bye_rows = bye_rows[bye_rows["Week"] == bye_rows["Bye Week"]]
    bye_rows['Season_type'] = "BYE"
    bye_rows = bye_rows[df.columns].reset_index(drop=True)

    # Apply BYE week rows to the final df: each one overwrites the first row of the same
    # season and week
    for row in bye_rows.index:
        bye_entry = bye_rows.loc[row]
        same_season = df['Season'] == bye_entry.loc['Season']
        same_week = df['Week'] == bye_entry.loc['Week']
        target_label = df[same_season & same_week].index[0]
        df.loc[target_label] = bye_entry

    return df

##### Other functions

In [8]:
def save_player_tbl(df, player_name):
#   Function name: save_player_tbl
#   Description: This function writes the final dataframe to
#                ../tables/players_gamelog/<player_name>_gamelog.csv (without the index)
#   Parameters: df, player_name
#        df(pandas dataframe): The final dataframe
#        player_name(str): The first and last name of the player

    # make sure the players folder exists before anything is written into it
    Path('../tables/players_gamelog').mkdir(parents=True, exist_ok=True)

    save_loctn = "../tables/players_gamelog/{}_gamelog.csv".format(player_name)
    print(f"Saving gamelog data for {player_name} to {save_loctn}.")
    df.to_csv(save_loctn, index = False)
    print("Gamelog data saved!")

###### Main

In [15]:
def main():
#   Function name: main
#   Description: The entry function of the notebook. Scrapes the gamelog of every player whose
#                gm_log_rtrvd flag in players_xref.csv is 0 and records the outcome in that flag.

    print("All players that will have gamelog data scraped.")
    players_xref_path = "../tables/players_xref.csv"
    player_db = pd.read_csv(players_xref_path)
    pending_players = player_db[player_db['gm_log_rtrvd'] == 0].reset_index(drop = True)
    total_pending = pending_players.shape[0]
    display(pending_players)

    # gm_log_rtrvd code stored for each scrape status; any other status is a failed save
    result_codes = {
        "Save": 1,               # Successful save
        "Insufficient data": 2,  # Insufficient data
    }
    failed_code = 3              # Failed save (lets me know to debug)

    player_count = 1
    for row in player_db.index:
        player_entry = player_db.loc[row]
        player_name = player_entry.loc['full_name']
        pfr_id = player_entry.loc['pfr_id']
        gm_log_rtrvd = player_entry.loc['gm_log_rtrvd']

        # players that were already processed are skipped
        if gm_log_rtrvd != 0:
            continue

        print(f"Player ({player_count}/{total_pending}): {player_name}")
        df, status = create_player_gamelog_csv(player_name, pfr_id)

        if status == "Save":
            save_player_tbl(df, player_name)
        player_db.loc[row, 'gm_log_rtrvd'] = result_codes.get(status, failed_code)

        # the xref file is rewritten after every player, so progress survives an interrupted run
        player_db.to_csv(players_xref_path, index = False)

        print(f"Updated gm_log_rtrvd entry in players_ref.csv for {player_name}")
        player_count += 1

    print(f"Completed acquiring gamelog for {total_pending} players.")
# Entry point: runs the whole scraping workflow when this cell is executed
main()

All players that will have gamelog data scraped.


Unnamed: 0,season,team,position,full_name,height,weight,age,years_exp,pfr_id,gm_log_rtrvd


Completed acquiring gamelog for 0 players.
