### Author: Rodolfo Elenes

Date Created: 8/5/2025

Purpose: To create the pool of players that will be consumed in create_players_gamelogs

Change log:
 - 8/5/2025 - Initialized to only have data from 2024 players
 - 9/10/2025 - Brought in all players since 1990 

##### Imports

In [None]:
import pandas as pd
import duckdb
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

In [None]:
%run ./common_utils.ipynb

# Get Running Backs

In [None]:
# Build the starting DataFrame of running backs.
# construct_df_roster is not defined in this notebook (presumably it is loaded
# from common_utils.ipynb by the %run cell above - TODO confirm); it receives the
# in-memory DuckDB connection and its result is used here as a roster DataFrame.

con_memory = duckdb.connect(database=':memory:')
df_roster = construct_df_roster(con_memory)
# Keep only rows whose Position is 'RB', ordered by Player and then Season.
df = df_roster[(df_roster['Position'] == 'RB')].sort_values(by=['Player', 'Season']).reset_index(drop=True)

# For every player with at least one row in Season >= 1990, compute Entry_year:
# the earliest Season on record for that (Player, Birth_date) pair.
# The SQL below refers to the DataFrames `df` and `df_temp` by their Python
# variable names, so those names must not be changed.
df_temp = con_memory.execute("""SELECT Player, Birth_date, min(Season) as Entry_year FROM df 
                                WHERE Player IN (SELECT Player FROM df WHERE Season >= 1990)
                                GROUP BY Player, Birth_date""").fetchdf()
# Attach Entry_year to every roster row. This is an inner join, so rows for
# players without any Season >= 1990 row are dropped here.
# NOTE(review): the join compares Birth_date with `=`; rows whose Birth_date is
# NULL would not match and would be dropped as well - confirm this is intended.
df = con_memory.execute("""SELECT df.*, df_temp.Entry_year FROM df JOIN df_temp 
                                ON df.Player = df_temp.Player AND df.Birth_date = df_temp.Birth_date
                                """).fetchdf()

# Fill in missing pfr_ids for players who have them: within each Player name,
# carry known ids forward and then backward across that name's rows.
# NOTE(review): grouping is by name only, so two different players who share a
# name would share ids here - verify against the data.
df['pfr_id'] = df.groupby('Player')['pfr_id'].transform(lambda x: x.ffill().bfill())

# Take most recent season of each player: one row per Player, the one with the
# highest Season (DISTINCT ON combined with ORDER BY Player, Season DESC).
df = con_memory.execute("""SELECT DISTINCT ON (Player) * FROM df ORDER BY Player, Season DESC""").fetchdf()

# Append the manually maintained rows, keeping only those that carry a pfr_id
df2 = pd.read_csv("../src/null_pfr_ids.csv")
df2 = df2[df2['pfr_id'].notna()]
df = pd.concat([df, df2])

# Manual name edits: key is the spelling found in the data, value is its replacement.
# Applied one at a time, in the order listed.
name_fixes = {
    'Kenneth Walker III': 'Kenneth Walker',
    'Jeffery Wilson': 'Jeff Wilson',
    'Bam Knight': 'Zonovan Knight',
    'Lew Nichols III': 'Lew Nichols',
    'Chris Rodriguez Jr': 'Chris Rodriguez',
    'Jamycal Hasty': 'JaMycal Hasty',
}
for current_name, corrected_name in name_fixes.items():
    df['Player'] = np.where(df['Player'] == current_name, corrected_name, df['Player'])

# One row per pfr_id: after ordering by Season, the last (latest-season) row wins
df = df.sort_values('Season')
df = df.drop_duplicates(subset="pfr_id", keep="last")

# Correct birthdate column: values may be written as m/d/Y or as Y-m-d.
# Parse with each format separately (unparseable values become NaT), then take
# the m/d/Y result where it exists and the Y-m-d result otherwise.
birth_raw = df['Birth_date']
birth_mdy = pd.to_datetime(birth_raw, format='%m/%d/%Y', errors='coerce')
birth_ymd = pd.to_datetime(birth_raw, format='%Y-%m-%d', errors='coerce')
df['Birth_date'] = np.where(birth_mdy.isnull(), birth_ymd, birth_mdy)

# Drop players that still have no pfr_id and put the table in Season/Player order
has_pfr_id = df['pfr_id'].notnull()
df = df[has_pfr_id].sort_values(by=['Season', 'Player']).reset_index(drop=True)

# Final table transformation
# Years_exp: where it is missing, fall back to the number of seasons between the
# row's Season and the player's Entry_year.
df['Years_exp'] = df['Years_exp'].fillna(df['Season'] - df['Entry_year'])
for col in ['Height', 'Weight', 'Years_exp']:
    df[col] = df[col].astype(int)
df['Birth_date'] = pd.to_datetime(df['Birth_date'])

# Age = completed years of age on January 1 of `Season`.
# This is computed with calendar arithmetic rather than (days elapsed / 365):
# the fixed 365-day divisor ignores leap days, which made players born in the
# first days of January (from Jan 2) come out one year too old.
# On January 1 the birthday has only been reached by players born on January 1,
# so everyone else has one year subtracted.
season_start = pd.to_datetime(df['Season'], format='%Y')
birth_month = df['Birth_date'].dt.month
birth_day = df['Birth_date'].dt.day
born_on_jan_1 = (birth_month == 1) & (birth_day == 1)
years_apart = season_start.dt.year - df['Birth_date'].dt.year
# A missing Birth_date yields NaN here, and astype(int) then raises, as before.
df['Age'] = (years_apart - (~born_on_jan_1).astype(int)).astype(int)

# Flag column, initialised to 0 for every player
df['gm_log_rtrvd'] = 0
df = df[['Team', 'ABV', 'Season', 'Position', 'Player', 'pfr_id',
         'Birth_date', 'Age', 'Years_exp', 'Entry_year', 'Height', 'Weight', 'gm_log_rtrvd']]
# NOTE(review): the DuckDB connection is never closed; the close call is left
# disabled as in the original - confirm whether a later cell still needs it.
# con_memory.close()

# Export final table
# Inclusive season window for the export. Both the filter and the log message
# read these two values, so they cannot drift apart.
strt_yr = 1990
end_yr = 2024

# Row count before the season filter, reported as the denominator below
total_players = df.shape[0]

in_window = df['Season'].isin(range(strt_yr, end_yr + 1))
df = (
    df[in_window]
    .sort_values(by=['Season', 'Years_exp'], ascending=[True, False])
    .reset_index(drop=True)
)
df.to_csv('../tables/players_xref_all.csv', index=False)
print(f"Exported players from {strt_yr}-{end_yr}.\n{df.shape[0]}/{total_players} Available players exported.")
display(df)

# Exploratory Data Work

In [None]:
display(df.head())  # visualize the table
# check data structure; info() prints its own report and returns None, so it is
# called directly instead of being wrapped in print() (which added a stray "None")
df.info()
print("\nNulls check:")
print(df.isnull().any())  # per column: True when the column holds at least one null

# check duplicate rows
if df.duplicated().any():
    print('\nDuplicates found.')
else:
    print('\nNo duplicates found.')

In [None]:
# check splits
def check_splits(col):
    """Print and display how many rows of the global ``df`` hold each value of ``col``.

    Only numeric columns of ``df`` are considered, so asking for a non-numeric
    column raises ``KeyError``. The displayed table has two columns, ``col`` and
    ``count``, ordered by ascending ``col`` value.
    """
    print(f"Splits for {col} column.")
    tally = (
        df.select_dtypes(include=['number'])[col]
        .value_counts()
        .reset_index()
        .set_axis([col, 'count'], axis=1)
        .sort_values(col)
        .reset_index(drop=True)
    )
    display(tally)

# Show the value distribution of each derived numeric column
for split_col in ("Years_exp", "Age"):
    check_splits(split_col)