In [None]:
def save_df(df, save_location, csv_name):
    """Write a dataframe to ``<save_location>/<csv_name>`` as a CSV file.

    Description: Creates the destination folder when it is missing, then
    saves the dataframe without its index and reports progress on stdout.

    Parameters:
        df (pandas dataframe): The target dataframe.
        save_location (str): Folder the csv file is saved into.
        csv_name (str): Name of the csv file.
    """
    # Ensure the destination folder (and any missing parents) exists
    Path(save_location).mkdir(parents=True, exist_ok=True)

    destination = "{}/{}".format(save_location, csv_name)
    print(f"Saving {csv_name} at {destination}")
    df.to_csv(destination, index=False)
    print(f"Successfully saved {csv_name}!")

In [None]:
def construct_df_roster(con_memory):
    """Create the base of the roster table (running backs only).

    Description: Reads every file in ../src/rosters, aligns the team
    abbreviations with the team_info_xref naming, joins each roster row to
    the expanded teams table on abbreviation and season, and keeps the
    players whose position is RB, FB or HB (all relabelled as 'RB').

    Parameters:
        con_memory (duckdb object): used to run the join query; must expose
            ``execute(sql).fetchdf()``.

    Return values:
        df (pandas dataframe): columns Team, ABV, Season, Position, Player,
            Birth_date, Height, Weight, pfr_id, Years_exp, sorted by
            Season, Team, Player.

    Raises:
        FileNotFoundError: if ../src/rosters contains no files.
    """
    # Collect all roster files
    roster_dir = Path("../src/rosters")
    roster_files = [entry for entry in roster_dir.iterdir() if entry.is_file()]
    if not roster_files:
        raise FileNotFoundError(f"No roster files found in {roster_dir}")

    # Read everything first and concatenate once (avoids re-copying the
    # accumulated frame for every file).
    # NOTE: the SQL below refers to the local dataframes `df` and
    # `df_teams_exp` by name, so these two variables must keep their names.
    df = pd.concat([pd.read_csv(roster_file) for roster_file in roster_files], ignore_index=True)

    # Make team names consistent with team_info_xref table
    team_nm_fixes = {
        'ARZ': 'ARI', 'BLT': 'BAL', 'CLV': 'CLE', 'GB': 'GNB', 'HST': 'HOU',
        'KC': 'KAN', 'LA': 'LAR', 'LV': 'LVR', 'NE': 'NWE', 'NO': 'NOR',
        'SD': 'SDG', 'SF': 'SFO', 'SL': 'STL', 'TB': 'TAM',
    }
    df['team'] = df['team'].replace(team_nm_fixes)

    # One row per team alias and season, used to resolve each roster row
    df_teams_exp = construct_df_teams(expanded=True)

    df = con_memory.execute("""SELECT df.* EXCLUDE(team), df_teams_exp.* FROM df JOIN df_teams_exp 
                               ON df.team = df_teams_exp.ABV AND df.season = df_teams_exp.Season
                               WHERE position IN ('RB', 'FB', 'HB')""").fetchdf()

    # Capitalising every header turns e.g. 'ABV' into 'Abv', so the columns
    # that need a specific spelling are re-created under their final names.
    df.columns = df.columns.str.capitalize()
    df['Position'] = 'RB'
    df['ABV'] = df['Abv']
    df['pfr_id'] = df['Pfr_id']
    # Drop special characters from player names
    df['Player'] = df['Full_name'].str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)

    output_cols = ['Team', 'ABV', 'Season', 'Position', 'Player', 'Birth_date',
                   'Height', 'Weight', 'pfr_id', 'Years_exp']
    df = df[output_cols].sort_values(by=['Season', 'Team', 'Player']).reset_index(drop=True)

    return df

In [None]:
def construct_df_teams(expanded=False, current_season=2024):
    """Create the base of the teams table from ../src/team_info_xref.csv.

    Description: Flattens the cross-reference file, which stores a primary
    Team/ABV/TmLegacy per row plus numbered alias columns (Team2..Team5,
    ABV2..ABV5, TmLegacy2..TmLegacy5), into one row per team alias. With
    expanded=True every alias row is further exploded into one row per
    season covered by its 'start-end' TmLegacy range.

    Parameters:
        expanded (boolean)[default-False]: To return the expanded or regular
            version of the table.
        current_season (int)[default-2024]: Season substituted for the word
            'present' in TmLegacy ranges.

    Return values: df_teams OR df_teams_exp
        df_teams (pandas dataframe): All NFL teams and their aliases with
            columns Team, ABV, PFR_ABV, short_name, TmLegacy, sorted by
            PFR_ABV.
        df_teams_exp (pandas dataframe): One row per alias and season with
            columns Team, ABV, PFR_ABV, short_name, Season.
    """
    base_cols = ["Team", "ABV", "PFR_ABV", "TmLegacy"]
    season_str = str(current_season)

    # Construct df_teams by flattening team_info_xref.csv
    raw = pd.read_csv("../src/team_info_xref.csv")

    # Only rows that have at least a second abbreviation carry aliases
    alias_rows = []
    for _, team_entry in raw.dropna(subset=['ABV2']).iterrows():
        for i in (2, 3, 4, 5):
            abv = team_entry.get(f"ABV{i}")
            # Skip if ABV is NaN (or the numbered column does not exist)
            if pd.isna(abv):
                continue
            alias_rows.append({
                "Team": team_entry.get(f"Team{i}"),
                "ABV": abv,
                "PFR_ABV": team_entry["PFR_ABV"],
                "TmLegacy": team_entry.get(f"TmLegacy{i}"),
            })

    # Primary rows first, alias rows after, appended in a single concat
    df_teams = raw[base_cols].copy()
    if alias_rows:
        df_teams = pd.concat([df_teams, pd.DataFrame(alias_rows, columns=base_cols)], ignore_index=True)

    df_teams['short_name'] = df_teams['Team'].str.split(" ").str[-1]
    df_teams['short_name'] = np.where(df_teams['Team'] == 'Washington Football Team', 'Washington',
                                      df_teams['short_name'])
    df_teams['TmLegacy'] = df_teams['TmLegacy'].str.replace('present', season_str, regex=False)
    df_teams = (df_teams[['Team', 'ABV', 'PFR_ABV', 'short_name', 'TmLegacy']]
                .sort_values('PFR_ABV')
                .reset_index(drop=True))

    if not expanded:
        return df_teams

    # Expand every franchise into one row per alias and season
    franchise_frames = []
    for franchise in df_teams['PFR_ABV'].unique().tolist():
        # Work on an explicit copy so the column assignments below never
        # target a slice of df_teams
        df_temp = df_teams[df_teams['PFR_ABV'] == franchise].copy()
        df_temp['TmLegacy'] = np.where(df_temp['TmLegacy'] == '', f'-{season_str}', df_temp['TmLegacy'])
        yr_parts = df_temp['TmLegacy'].str.split("-")
        df_temp['strt_yr'] = yr_parts.str[0]
        df_temp['end_yr'] = yr_parts.str[1]
        df_temp = df_temp.sort_values('end_yr').reset_index(drop=True)

        # NOTE(review): `yrs` holds at least three entries whenever the
        # franchise has rows, so this equality looks unreachable as written;
        # kept unchanged pending confirmation of the intent.
        yrs = sorted(df_temp['strt_yr'].unique().tolist() + df_temp['end_yr'].unique().tolist() + ['1990'])
        if yrs == ['1990', season_str]:
            df_temp['strt_yr'] = '1990'

        # The most recent alias starts the season after the previous alias
        # ended. Written through .loc so the value lands in df_temp itself
        # rather than in a detached copy of the row.
        if df_temp.shape[0] > 1:
            prev_end_yr = df_temp['end_yr'].iloc[-2]
            df_temp.loc[df_temp.index[-1], 'strt_yr'] = str(int(prev_end_yr) + 1)

        # Explode out the rows for each season per team
        df_temp['strt_yr'] = pd.to_numeric(df_temp['strt_yr']).astype(int)
        df_temp['end_yr'] = pd.to_numeric(df_temp['end_yr']).astype(int)
        seasons = pd.Series(
            [list(range(start, end + 1)) for start, end in zip(df_temp['strt_yr'], df_temp['end_yr'])],
            index=df_temp.index,
            dtype=object,
        )
        df_temp = (df_temp.assign(Season=seasons)
                   .explode("Season", ignore_index=True)[["Team", "ABV", "PFR_ABV", "short_name", "Season"]])
        franchise_frames.append(df_temp)

    if not franchise_frames:
        return pd.DataFrame()

    # df teams expanded contains the seasons each franchise was active in,
    # in row by row format
    return pd.concat(franchise_frames).reset_index(drop=True)

In [None]:
def concatenate_all_files(file_name, sub_folder):
    """Combine every sub csv file of a table into one final csv file.

    Description: Reads all files in ../tables/<file_name>/<sub_folder>,
    stacks them into a single dataframe, displays it, and saves it as
    ../tables/<file_name>/<file_name>.csv via save_df.

    Parameters:
        file_name (str): name of the main folder and also main file name.
        sub_folder (str): name of the sub_folder inside the main folder used
            to hold sub .csv files.
    """
    folder_location = f"../tables/{file_name}"
    sub_dir = Path(folder_location + f"/{sub_folder}")

    # iterdir() yields entries in arbitrary order; sorting keeps the row
    # order of the combined file reproducible between runs.
    part_paths = sorted(entry for entry in sub_dir.iterdir() if entry.is_file())

    # Read all parts, then concatenate once instead of growing the frame
    # file by file. An empty folder still yields an empty dataframe.
    parts = [pd.read_csv(part_path) for part_path in part_paths]
    df = pd.concat(parts) if parts else pd.DataFrame()

    display(df)
    save_df(df, folder_location, f'{file_name}.csv')

In [None]:
# Switchboard of the helper functions this notebook exposes; set a value to
# False to leave that function out of the import report below.
import_dict = {"save_df()": True, "construct_df_roster()": True, "construct_df_teams()": True,
               "concatenate_all_files()": True}

# Names of the functions that are switched on, in declaration order
import_list = [function for function, enabled in import_dict.items() if enabled]

# Joining the names prints the same text as stripping brackets and quotes
# from str(import_list), without nesting double quotes inside the f-string
# (a syntax error before Python 3.12).
print(f"Importing following functions: {', '.join(import_list)}")