### Author: Rodolfo Elenes

Date Created: 9/24/2025

Purpose: Pull injury information from the available sources (NFL.com weekly injury reports and the downloaded NFLVerse injuries dataset) and combine the injury logs into a single table

Change log:
- 9/24/2025 - Initialized

In [None]:
import pandas as pd
import numpy as np
import duckdb
import time
from pathlib import Path
import warnings
# Show every column when a dataframe is displayed instead of truncating wide tables
pd.set_option('display.max_columns', None)
# NOTE(review): this silences every warning for the rest of the session, not just a
# specific known-noisy one -- consider narrowing the filter so real problems stay visible
warnings.filterwarnings("ignore")

In [None]:
# Execute the shared helper notebook so that its definitions are available in this session
# (construct_df_roster, called in main(), is presumably defined there -- TODO confirm)
%run ./common_utils.ipynb

In [None]:
def collect_inj_nfl_web(con):
#   Function name: collect_inj_nfl_web
#   Description: Collect injuries from NFL.com weekly injury reports. One page is requested per
#                regular-season week listed in ../tables/nfl_dates_xref.csv, every table on the
#                page that has the expected injury columns is kept, and the combined result is
#                written to ../src/nfl_web_injuries.csv
#   Parameters: con
#        con(duckdb database): Used to connect to duckdb session
#   Return values: df
#        df(pandas dataframe): The dataframe with weekly NFL.com injuries. Columns are Season,
#                              Week, Player, Position, Injuries, Practice Status, Game Status.
#                              It has those columns but no rows if nothing could be collected

    start_szn = 2001 # Expect to see injuries report come up around 2002 week 1
    end_szn = 2025 # Exclusive upper bound, so 2024 is the last season requested
    report_cols = ['Player', 'Position', 'Injuries', 'Practice Status', 'Game Status']
    output_cols = ['Season', 'Week'] + report_cols

    # Keep this as a local of this function: the SQL below refers to it by its variable name
    df_dates = pd.read_csv("../tables/nfl_dates_xref.csv")

    frames = []
    for year in range(start_szn, end_szn):
        df_weeks = con.execute(f"""SELECT * FROM df_dates WHERE Season = '{year}' AND Season_type = 'REG'""").fetchdf()
        display(df_weeks)
        for week in df_weeks.Week.unique().tolist():
            print(f"Collecting Week {week} from the {year} Season.")
            url = f"https://www.nfl.com/injuries/league/{year}/reg{week}"
            time.sleep(4) # Pause before every request (presumably to avoid hammering the site)
            try:
                tables = pd.read_html(url)
            except Exception as e:
                # Best effort: a week whose page cannot be read or parsed is reported and skipped
                print(e)
                continue

            tm_count = 0
            for table in tables:
                # Skip tables on the page that are not injury reports instead of relying on
                # an exception to filter them out
                if not set(report_cols).issubset(table.columns):
                    continue
                report = table[report_cols].copy()
                report.insert(0, 'Week', week)
                report.insert(0, 'Season', year)
                frames.append(report)
                tm_count += 1
            print(f"{tm_count} injury reports collected.")

    # Concatenate once at the end; growing a dataframe inside the loop re-copies it every time
    df = pd.concat(frames) if frames else pd.DataFrame(columns=output_cols)
    df.to_csv("../src/nfl_web_injuries.csv", index=False)

    return df

In [None]:
def collect_inj_nflverse(con):
#   Function name: collect_inj_nflverse
#   Description: Collect injuries from the downloaded NFLVerse dataset. Every file in
#                ../src/injuries is read, the rows are limited to regular-season FB/RB entries
#                that have a report status, team abbreviations are normalised and the primary
#                and secondary injuries are merged into a single Injuries column
#   Parameters: con
#        con(duckdb database): Used to connect to duckdb session
#   Return values: df
#        df(pandas dataframe): The dataframe with weekly NFLVerse injuries. Columns are Season,
#                              Week, Player, Team, Position, Injuries, report_status
#   Raises: FileNotFoundError when ../src/injuries is missing or contains no files

    # Gather all injuries files; sorted so the read order does not depend on the filesystem
    save_location = "../src/injuries"
    file_paths = sorted(entry for entry in Path(save_location).iterdir() if entry.is_file())
    if not file_paths:
        raise FileNotFoundError(f"No injury files found in {save_location}")
    df = pd.concat([pd.read_csv(file_path) for file_path in file_paths])

    # Copy the filtered rows so the column assignments below do not write into a slice
    keep = (df.game_type == 'REG') & df.position.isin(['FB', 'RB']) & df.report_status.notnull()
    df = df[keep].copy()

    # Make team names consistent with team_info_xref table
    team_nm_fixes = {'GB': 'GNB', 'KC': 'KAN', 'LA': 'LAR', 'LV': 'LVR', 'NE': 'NWE', 'NO': 'NOR',
                     'SD': 'SDG', 'SF': 'SFO', 'TB': 'TAM'}
    df['team'] = df['team'].replace(team_nm_fixes)

    # Join the non-missing injuries as "primary, secondary"; rows with neither become NaN.
    # Built as a list so that it also works when no rows survive the filter above.
    injury_parts = df[['report_primary_injury', 'report_secondary_injury']].fillna('')
    df['Injuries'] = [
        ', '.join(part for part in parts if part != '') or np.nan
        for parts in injury_parts.itertuples(index=False, name=None)
    ]

    # df must stay a local of this function: the SQL below refers to it by its variable name
    df = con.execute("""SELECT season as Season, week as Week, full_name as Player, team as Team, position as Position, Injuries, report_status FROM df""").fetchdf()

    return df

In [None]:
def main():
#   Function name: main
#   Description: The entry function of the notebook, also does final transformations for final table.
#                The NFL.com injuries are limited to RB/FB/HB rows that have a game status and are
#                joined to the roster table to pick up the team abbreviation. They are then unioned
#                with the NFLVerse injuries and written to ../tables/injuries_xref.csv
#   Parameters: None
#   Return values: None

    con = duckdb.connect(database=':memory:')
    try:
        df = collect_inj_nfl_web(con)
#         df = pd.read_csv("../src/nfl_web_injuries.csv") # To perform quicker testing

        display(df)
        df = df[~(df['Game Status'].isnull()) & (df.Position.isin(['RB', 'FB', 'HB']))]
        df = con.execute("""SELECT Season, Week, Player, Position, Injuries, "Game Status" as report_status FROM df""").fetchdf()

        # construct_df_roster is not defined in this notebook (presumably loaded from
        # common_utils); the join below reads its Season, Player and ABV columns
        df_roster = construct_df_roster(con)
        df = con.execute("""SELECT df.Season, df.Week, df.Player, df_roster.ABV as Team, df.Position, df.Injuries, df.report_status FROM df 
                             JOIN df_roster ON df.Season = df_roster.Season AND df.Player = df_roster.Player""").fetchdf()

        df2 = collect_inj_nflverse(con)

        # UNION drops rows that appear in both sources. It gives no ordering guarantee, so the
        # result is sorted to keep the saved table identical from one run to the next
        df3 = con.execute("""SELECT * EXCLUDE (report_status), report_status AS "Game Status" FROM(SELECT * FROM df UNION SELECT * FROM df2)
                             ORDER BY Season, Week, Team, Player, Position, Injuries, "Game Status" """).fetchdf()
        display(df3)
        df3.to_csv("../tables/injuries_xref.csv", index=False)
    finally:
        # Release the duckdb session even when one of the steps above fails
        con.close()

main()