### Author: Rodolfo Elenes

Date Created: 8/5/2025

Change log:
 - 8/5/2025 - Initialized to only have data from 2024 players
 - 9/10/2025 - Brought in all players since 1990 

##### Imports

In [1]:
import pandas as pd
import duckdb
from datetime import datetime
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

In [2]:
%run ./common_utils.ipynb

Importing following functions: save_df(), construct_df_roster(), construct_df_teams(), concatenate_all_files()


# Get Runningbacks

In [3]:
# Setup the starting dataframe

con_memory = duckdb.connect(database=':memory:')
df_roster = construct_df_roster(con_memory)
df = df_roster[(df_roster['Position'] == 'RB')].sort_values(by=['Player', 'Season']).reset_index(drop=True)

# Take every player since 1990 and get their years experience
df_temp = con_memory.execute("""SELECT Player, Birth_date, min(Season) as Entry_year FROM df 
                                WHERE Player IN (SELECT Player FROM df WHERE Season >= 1990)
                                GROUP BY Player, Birth_date""").fetchdf()
df = con_memory.execute("""SELECT df.*, df_temp.Entry_year FROM df JOIN df_temp 
                                ON df.Player = df_temp.Player AND df.Birth_date = df_temp.Birth_date
                                """).fetchdf()

# Fill in missing pfr_ids for players who have them
df['pfr_id'] = df.groupby('Player')['pfr_id'].transform(lambda x: x.ffill().bfill())

# Take most recent season of each player
df = con_memory.execute("""SELECT DISTINCT ON (Player) * FROM df ORDER BY Player, Season DESC""").fetchdf()

# Bring table with manually inserted pfr_ids
df2 = pd.read_csv("../src/null_pfr_ids.csv")
df2 = df2[(df2.pfr_id.notnull())]
df = pd.concat([df, df2])

# Manual edits
df['Player'] = np.where(df.Player == 'Kenneth Walker III', 'Kenneth Walker', df.Player)
df['Player'] = np.where(df.Player == 'Jeffery Wilson', 'Jeff Wilson', df.Player)
df['Player'] = np.where(df.Player == 'Bam Knight', 'Zonovan Knight', df.Player)
df['Player'] = np.where(df.Player == 'Lew Nichols III', 'Lew Nichols', df.Player)
df['Player'] = np.where(df.Player == 'Chris Rodriguez Jr', 'Chris Rodriguez', df.Player)
df['Player'] = np.where(df.Player == 'Jamycal Hasty', 'JaMycal Hasty', df.Player)
df = df.sort_values('Season').drop_duplicates(subset="pfr_id", keep="last")

# Correct birthdate column
df['Birth_date1'] = pd.to_datetime(df['Birth_date'], format='%m/%d/%Y', errors = 'coerce')
df['Birth_date2'] = pd.to_datetime(df['Birth_date'], format='%Y-%m-%d', errors = 'coerce')
df['Birth_date1'] = np.where(df.Birth_date1.isnull(), df.Birth_date2, df.Birth_date1)
df['Birth_date'] = df.Birth_date1
df = df.drop(['Birth_date1', 'Birth_date2'], axis=1)

df = df[(df.pfr_id.notnull())].sort_values(by=['Season', 'Player']).reset_index(drop=True)

# Final table transformation
df['Years_exp'] = df.Years_exp.fillna((df.Season - df.Entry_year))
for col in ['Height', 'Weight', 'Years_exp']:
    df[col] = df[col].astype(int)
df['Birth_date'] = pd.to_datetime(df['Birth_date'])
df["season_dt"] = pd.to_datetime(df["Season"], format="%Y")
df['Age'] = ((df['season_dt'] - df['Birth_date']) / pd.Timedelta(days=365)).astype(int)
df = df.drop('season_dt', axis=1)
df['gm_log_rtrvd'] = 0
df = df[['Team', 'ABV', 'Season', 'Position', 'Player', 'pfr_id', 
         'Birth_date', 'Age', 'Years_exp', 'Entry_year', 'Height','Weight', 'gm_log_rtrvd']]
# con_memory.close()

# Export final table
strt_yr = 1990
total_players = df.shape[0]
df = df[(df.Season.isin(list(range(strt_yr, 2025))))].sort_values(by=['Season', 'Years_exp'], ascending=[True, False]).reset_index(drop=True)
df.to_csv('../tables/players_xref_all.csv', index = False)
print(f"Exported players from {strt_yr}-2024.\n{df.shape[0]}/{total_players} Available players exported.")
display(df)

Exported players from 1990-2024.
1239/1239 Available players exported.


Unnamed: 0,Team,ABV,Season,Position,Player,pfr_id,Birth_date,Age,Years_exp,Entry_year,Height,Weight,gm_log_rtrvd
0,Detroit Lions,DET,1990,RB,James Wilder,WildJa00,1958-05-12,31,9,1981,75,225,0
1,Miami Dolphins,MIA,1990,RB,Tony Collins,CollTo01,1959-05-27,30,9,1981,71,203,0
2,Cleveland Browns,CLE,1990,RB,Barry Redden,ReddBa00,1960-07-21,29,8,1982,70,205,0
3,Denver Broncos,DEN,1990,RB,Sammy Winder,WindSa00,1959-07-15,30,8,1982,71,203,0
4,Los Angeles Rams,RAM,1990,RB,Curt Warner,WarnCu00,1961-03-18,28,7,1983,71,205,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1234,Detroit Lions,DET,2024,RB,Sione Vaki,VakiSi00,2001-07-30,22,0,2024,71,208,0
1235,New England Patriots,NWE,2024,RB,Terrell Jennings,JennTe00,2001-01-03,23,0,2024,72,225,0
1236,Arizona Cardinals,ARI,2024,RB,Trey Benson,BensTr01,2002-07-23,21,0,2024,73,215,0
1237,New York Giants,NYG,2024,RB,Tyrone Tracy Jr,TracTy00,1999-11-23,24,0,2024,71,210,0


# Exploratory Data Work

In [4]:
display(df.head()) #visualize the table
print(df.info()) #check data structure
print("\nDuplicates check:")
print(df.isnull().any()) #check for nulls

#check duplicate rows
if df.duplicated().any():
    print('\nDuplicates found.')
else:
    print('\nNo duplicates found.')

Unnamed: 0,Team,ABV,Season,Position,Player,pfr_id,Birth_date,Age,Years_exp,Entry_year,Height,Weight,gm_log_rtrvd
0,Detroit Lions,DET,1990,RB,James Wilder,WildJa00,1958-05-12,31,9,1981,75,225,0
1,Miami Dolphins,MIA,1990,RB,Tony Collins,CollTo01,1959-05-27,30,9,1981,71,203,0
2,Cleveland Browns,CLE,1990,RB,Barry Redden,ReddBa00,1960-07-21,29,8,1982,70,205,0
3,Denver Broncos,DEN,1990,RB,Sammy Winder,WindSa00,1959-07-15,30,8,1982,71,203,0
4,Los Angeles Rams,RAM,1990,RB,Curt Warner,WarnCu00,1961-03-18,28,7,1983,71,205,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1239 entries, 0 to 1238
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Team          1239 non-null   object        
 1   ABV           1239 non-null   object        
 2   Season        1239 non-null   int64         
 3   Position      1239 non-null   object        
 4   Player        1239 non-null   object        
 5   pfr_id        1239 non-null   object        
 6   Birth_date    1239 non-null   datetime64[ns]
 7   Age           1239 non-null   int64         
 8   Years_exp     1239 non-null   int64         
 9   Entry_year    1239 non-null   int64         
 10  Height        1239 non-null   int64         
 11  Weight        1239 non-null   int64         
 12  gm_log_rtrvd  1239 non-null   int64         
dtypes: datetime64[ns](1), int64(7), object(5)
memory usage: 126.0+ KB
None

Duplicates check:
Team            False
ABV             False
Seas

In [5]:
# check splits
def check_splits(col):
    print(f"Splits for {col} column.")
    df_numeric = df.select_dtypes(include=['number'])
    df_numeric = df_numeric[col].value_counts().reset_index()
    df_numeric.columns = [col, 'count']
    display(df_numeric.sort_values(col).reset_index(drop=True))

check_splits("Years_exp")
check_splits("Age")

Splits for Years_exp column.


Unnamed: 0,Years_exp,count
0,0,122
1,1,140
2,2,170
3,3,146
4,4,138
5,5,141
6,6,91
7,7,85
8,8,65
9,9,59


Splits for Age column.


Unnamed: 0,Age,count
0,19,1
1,20,3
2,21,30
3,22,69
4,23,128
5,24,156
6,25,160
7,26,156
8,27,134
9,28,122
