# Rookie's Data

We need to load and clean the data for each draft class so that we can predict the rookies' fantasy scores.

In [90]:
# imports
import pandas as pd
import numpy as np  

# load the data: one raw draft export per year, each tagged with its draft year
DRAFT_YEARS = range(2020, 2026)
draft_frames = {}
for draft_year in DRAFT_YEARS:
    # header=[0,1] reads the first two rows of each file as a two-level column header
    draft_df = pd.read_csv(f'newdata/{draft_year}draftraw.csv', header=[0,1])
    # add the year column
    draft_df['Year'] = draft_year
    draft_frames[draft_year] = draft_df

# keep the per-year names available for any later cell that refers to them
draft2020, draft2021, draft2022, draft2023, draft2024, draft2025 = (
    draft_frames[y] for y in DRAFT_YEARS
)

# combine the data
drafts_raw = pd.concat(list(draft_frames.values()), ignore_index=True)
drafts_raw.head()

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Misc,Misc,Unnamed: 9_level_0,...,Receiving,Receiving,Receiving,Unnamed: 24_level_0,Unnamed: 25_level_0,Unnamed: 26_level_0,Unnamed: 27_level_0,Unnamed: 28_level_0,-additional,Year
Unnamed: 0_level_1,Rnd,Pick,Tm,Player,Pos,Age,To,AP1,PB,St,...,Rec,Yds,TD,Solo,Int,Sk,College/Univ,Unnamed: 28_level_1,-9999,Unnamed: 21_level_1
0,1,1,CIN,Joe Burrow,QB,23.0,2025.0,0,2,5,...,0.0,0.0,0.0,1.0,,,LSU,College Stats,BurrJo01,2020
1,1,2,WAS,Chase Young,DE,21.0,2024.0,0,1,1,...,0.0,0.0,0.0,88.0,,22.0,Ohio St.,College Stats,YounCh04,2020
2,1,3,DET,Jeff Okudah,CB,21.0,2025.0,0,0,2,...,0.0,0.0,0.0,150.0,2.0,,Ohio St.,College Stats,OkudJe00,2020
3,1,4,NYG,Andrew Thomas,T,21.0,2025.0,0,0,5,...,1.0,2.0,1.0,7.0,,,Georgia,College Stats,ThomAn02,2020
4,1,5,MIA,Tua Tagovailoa,QB,22.0,2025.0,0,1,5,...,0.0,0.0,0.0,,,,Alabama,College Stats,TagoTu00,2020


### Fix the positions

The draft is for real NFL teams to pick players that they need in every position. Not all of these positions are fantasy relevant, so we are only going to look at the players in the following positions: WR, QB, RB, TE, FB.

In [91]:
# keep only the rows whose position is one of the fantasy-relevant ones
pos_col = ('Unnamed: 4_level_0', 'Pos')
fantasy_positions = ['WR', 'QB', 'RB', 'TE', 'FB']
is_fantasy_position = drafts_raw[pos_col].isin(fantasy_positions)
drafts_raw = drafts_raw[is_fantasy_position]

# confirm which positions are left after filtering
drafts_raw[pos_col].unique()

array(['QB', 'WR', 'RB', 'TE', 'FB'], dtype=object)

#### Clean the column names

In [92]:
# Flatten the two-level header into single "<top>_<bottom>" strings.
# Guarded so the cell is safe to re-run: joining already-flat string names
# would insert '_' between every character and corrupt them.
if isinstance(drafts_raw.columns, pd.MultiIndex):
    drafts_raw.columns = ['_'.join(col).strip() for col in drafts_raw.columns.values]

# drop underscores and dashes that are not needed
drafts_raw.columns = [c.replace('__', '_').replace('-', '').strip('_') for c in drafts_raw.columns]

# rename the identifier columns to readable names
drafts_raw = drafts_raw.rename(columns={"Unnamed: 0_level_0_Rnd": "Round",
                                        "Unnamed: 1_level_0_Pick": "Pick",
                                        "Unnamed: 2_level_0_Tm": "Team",
                                        "Unnamed: 3_level_0_Player": "Player",
                                        "Unnamed: 4_level_0_Pos": "Position",
                                        "Unnamed: 5_level_0_Age": "Age",
                                        "Unnamed: 27_level_0_College/Univ": "College"})

# check column names
drafts_raw.columns
                                      

Index(['Round', 'Pick', 'Team', 'Player', 'Position', 'Age',
       'Unnamed: 6_level_0_To', 'Misc_AP1', 'Misc_PB', 'Unnamed: 9_level_0_St',
       'Approx Val_wAV', 'Approx Val_DrAV', 'Unnamed: 12_level_0_G',
       'Passing_Cmp', 'Passing_Att', 'Passing_Yds', 'Passing_TD',
       'Passing_Int', 'Rushing_Att', 'Rushing_Yds', 'Rushing_TD',
       'Receiving_Rec', 'Receiving_Yds', 'Receiving_TD',
       'Unnamed: 24_level_0_Solo', 'Unnamed: 25_level_0_Int',
       'Unnamed: 26_level_0_Sk', 'College',
       'Unnamed: 28_level_0_Unnamed: 28_level_1', 'additional_9999', 'Year'],
      dtype='object')

#### Now we can select only the data that we need

In [93]:
# Select only the data that we need, strip the * and + markers from player
# names, and create a dual key with name and year.
# Done as one chain: .assign works on a fresh copy, which avoids the
# SettingWithCopyWarning raised by assigning into a column subset.
drafts_raw = (
    drafts_raw[['Year', 'Round', 'Pick', 'Team', 'Player', 'Position', 'Age', 'College']]
    .assign(Player=lambda df: df['Player'].str.replace(r'[\*\+]', '', regex=True))
    .set_index(['Player','Year'])
)
drafts_raw.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Round,Pick,Team,Position,Age,College
Player,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Joe Burrow,2020,1,1,CIN,QB,23.0,LSU
Tua Tagovailoa,2020,1,5,MIA,QB,22.0,Alabama
Justin Herbert,2020,1,6,LAC,QB,22.0,Oregon
Henry Ruggs III,2020,1,12,LVR,WR,21.0,Alabama
Jerry Jeudy,2020,1,15,DEN,WR,21.0,Alabama


Now that we have finished cleaning the draft data, we can move on to cleaning each player's college stats from the season before they were drafted.

# Clean College Stats

In [94]:
# load and combine the data by stat type
def load_and_combine_data(stat_type, first_season=2019, last_season=2023, data_dir='college stats'):
    """Load one CSV per college season for a stat category and stack them.

    Files are read from ``{data_dir}/{season}_{stat_type}.csv``. Each season's
    frame gets a ``Draft_Year`` column equal to ``season + 1`` so the rows line
    up with the draft class that follows that season.

    Parameters
    ----------
    stat_type : str
        Category used in the file name, e.g. "passing", "rushing", "receiving".
        "passing" files are read with pandas' default single header row; every
        other category is read with a two-row header (``header=[0, 1]``).
    first_season, last_season : int
        Inclusive range of seasons to load. The defaults (2019-2023) cover
        draft years 2020-2024; pass ``last_season=2024`` to also cover the
        2025 draft class once that season's files are available.
    data_dir : str
        Directory that contains the season CSV files.

    Returns
    -------
    pandas.DataFrame
        All seasons concatenated with a fresh integer index.

    Raises
    ------
    ValueError
        If the season range is empty.
    FileNotFoundError
        If a season file is missing (raised by ``pd.read_csv``).
    """
    if last_season < first_season:
        raise ValueError(
            f"empty season range: first_season={first_season}, last_season={last_season}"
        )

    # only the non-passing exports need the explicit two-row header
    read_kwargs = {} if stat_type == "passing" else {"header": [0, 1]}

    df_list = []
    for year in range(first_season, last_season + 1):
        df = pd.read_csv(f'{data_dir}/{year}_{stat_type}.csv', **read_kwargs)
        df['Draft_Year'] = year + 1  # add 1 to align with draft year
        df_list.append(df)
    return pd.concat(df_list, ignore_index=True)

# build one combined frame per stat category
passing_raw = load_and_combine_data('passing')
rushing_raw = load_and_combine_data('rushing')
receiving_raw = load_and_combine_data('receiving')

# preview the passing frame
passing_raw.head()

Unnamed: 0,Rk,Player,Team,Conf,G,Cmp,Att,Cmp%,Yds,TD,...,Int,Int%,Y/A,AY/A,Y/C,Y/G,Rate,Awards,Player-additional,Draft_Year
0,1,Joe Burrow*,LSU,SEC,15,402,527,76.3,5671,60,...,6,1.1,10.8,12.53,14.1,378.1,202.0,H-1MaxwellAACamp,joe-burrow-1,2020
1,2,Anthony Gordon*,Washington State,Pac-12,13,493,689,71.6,5579,48,...,16,2.3,8.1,8.45,11.3,429.2,157.9,,anthony-gordon-2,2020
2,3,Cole McDonald*,Hawaii,MWC,14,326,511,63.8,4135,33,...,14,2.7,8.1,8.15,12.7,295.4,147.6,,cole-mcdonald-1,2020
3,4,Brady White*,Memphis,American,14,269,420,64.0,4014,33,...,11,2.6,9.6,9.95,14.9,286.7,165.0,,brady-white-1,2020
4,5,Brock Purdy*,Iowa State,Big 12,13,312,475,65.7,3982,27,...,9,1.9,8.4,8.67,12.8,306.3,151.1,,brock-purdy-1,2020


In [95]:
# preview the first five rows of the combined rushing stats
rushing_raw.head(n=5)

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Rushing,Rushing,Rushing,Rushing,Rushing,...,Receiving,Receiving,Receiving,Scrimmage,Scrimmage,Scrimmage,Scrimmage,Unnamed: 19_level_0,-additional,Draft_Year
Unnamed: 0_level_1,Rk,Player,Team,Conf,G,Att,Yds,Y/A,TD,Y/G,...,Y/R,TD,Y/G,Plays,Yds,Avg,TD,Awards,-9999,Unnamed: 21_level_1
0,1,Chuba Hubbard*,Oklahoma State,Big 12,13,328,2094,6.4,21,161.1,...,8.6,0,15.2,351,2292,6.5,21,H-8AA,chuba-hubbard-1,2020
1,2,Malcolm Perry*,Navy,American,13,295,2017,6.8,21,155.2,...,,0,0.0,295,2017,6.8,21,,malcolm-perry-1,2020
2,3,J.K. Dobbins*,Ohio State,Big Ten,14,301,2003,6.7,21,143.1,...,10.7,2,17.6,324,2250,6.9,23,H-6,jk-dobbins-1,2020
3,4,Jonathan Taylor*,Wisconsin,Big Ten,14,320,2003,6.3,21,143.1,...,9.7,5,18.0,346,2255,6.5,26,H-5AA,jonathan-taylor-1,2020
4,5,Jaret Patterson*,Buffalo,MAC,13,312,1799,5.8,19,138.4,...,16.1,1,16.1,325,2008,6.2,20,,jaret-patterson-1,2020


In [96]:
# preview the first five rows of the combined receiving stats
receiving_raw.head(n=5)

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Receiving,Receiving,Receiving,Receiving,Receiving,...,Rushing,Rushing,Rushing,Scrimmage,Scrimmage,Scrimmage,Scrimmage,Unnamed: 19_level_0,-additional,Draft_Year
Unnamed: 0_level_1,Rk,Player,Team,Conf,G,Rec,Yds,Y/R,TD,Y/G,...,Y/A,TD,Y/G,Plays,Yds,Avg,TD,Awards,-9999,Unnamed: 21_level_1
0,1,Ja'Marr Chase*,LSU,SEC,14,84,1780,21.2,20,127.1,...,5.0,0,0.4,85,1785,21.0,20,AA,jamarr-chase-1,2020
1,2,Omar Bayless*,Arkansas State,Sun Belt,13,93,1653,17.8,17,127.2,...,,0,0.0,93,1653,17.8,17,,omar-bayless-1,2020
2,3,Justin Jefferson*,LSU,SEC,15,111,1540,13.9,18,102.7,...,,0,0.0,111,1540,13.9,18,,justin-jefferson-1,2020
3,4,Antonio Gandy-Golden*,Liberty,Ind,13,79,1396,17.7,10,107.4,...,,0,0.0,79,1396,17.7,10,,antonio-gandy-golden-1,2020
4,5,Devin Duvernay*,Texas,Big 12,13,106,1386,13.1,9,106.6,...,2.4,1,1.8,116,1410,12.2,10,,devin-duvernay-1,2020


#### Fix Column names and row entries

In [97]:
# check column names for passing (this frame was read with a single header row,
# so the names are already flat strings and need no flattening step)
passing_raw.columns

Index(['Rk', 'Player', 'Team', 'Conf', 'G', 'Cmp', 'Att', 'Cmp%', 'Yds', 'TD',
       'TD%', 'Int', 'Int%', 'Y/A', 'AY/A', 'Y/C', 'Y/G', 'Rate', 'Awards',
       'Player-additional', 'Draft_Year'],
      dtype='object')

In [98]:
# passing columns that are not carried forward into the combined dataset
unused_passing_cols = ["Rk","Cmp%","TD%","Y/A","AY/A","Y/C","Y/G","Rate","Awards","Player-additional","Int%"]
passing_raw = passing_raw.drop(columns=unused_passing_cols)

# show what is left
passing_raw.columns

Index(['Player', 'Team', 'Conf', 'G', 'Cmp', 'Att', 'Yds', 'TD', 'Int',
       'Draft_Year'],
      dtype='object')

In [99]:
# prefix the generic stat names so they stay distinguishable from rushing/receiving
passing_renames = {"Att": "Pass_Att", "Yds": "Pass_Yds", "TD": "Pass_TD"}

# player names with the * and + markers removed
clean_player_names = passing_raw['Player'].str.replace(r'[\*\+]', '', regex=True)

passing_raw = passing_raw.rename(columns=passing_renames).assign(Player=clean_player_names)
passing_raw.head()

Unnamed: 0,Player,Team,Conf,G,Cmp,Pass_Att,Pass_Yds,Pass_TD,Int,Draft_Year
0,Joe Burrow,LSU,SEC,15,402,527,5671,60,6,2020
1,Anthony Gordon,Washington State,Pac-12,13,493,689,5579,48,16,2020
2,Cole McDonald,Hawaii,MWC,14,326,511,4135,33,14,2020
3,Brady White,Memphis,American,14,269,420,4014,33,11,2020
4,Brock Purdy,Iowa State,Big 12,13,312,475,3982,27,9,2020


In [100]:
# key every passing row by (player name, draft year)
passing_key = ['Player','Draft_Year']
passing_raw = passing_raw.set_index(passing_key)
passing_raw.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Team,Conf,G,Cmp,Pass_Att,Pass_Yds,Pass_TD,Int
Player,Draft_Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Joe Burrow,2020,LSU,SEC,15,402,527,5671,60,6
Anthony Gordon,2020,Washington State,Pac-12,13,493,689,5579,48,16
Cole McDonald,2020,Hawaii,MWC,14,326,511,4135,33,14
Brady White,2020,Memphis,American,14,269,420,4014,33,11
Brock Purdy,2020,Iowa State,Big 12,13,312,475,3982,27,9


In [101]:
# move onto rushing
# Combine both header rows into one "<top>_<bottom>" name per column.
# Guarded so the cell is safe to re-run: joining already-flat string names
# would insert '_' between every character and corrupt them.
if isinstance(rushing_raw.columns, pd.MultiIndex):
    rushing_raw.columns = ['_'.join(col).strip() for col in rushing_raw.columns.values]

rushing_raw.columns

Index(['Unnamed: 0_level_0_Rk', 'Unnamed: 1_level_0_Player',
       'Unnamed: 2_level_0_Team', 'Unnamed: 3_level_0_Conf',
       'Unnamed: 4_level_0_G', 'Rushing_Att', 'Rushing_Yds', 'Rushing_Y/A',
       'Rushing_TD', 'Rushing_Y/G', 'Receiving_Rec', 'Receiving_Yds',
       'Receiving_Y/R', 'Receiving_TD', 'Receiving_Y/G', 'Scrimmage_Plays',
       'Scrimmage_Yds', 'Scrimmage_Avg', 'Scrimmage_TD',
       'Unnamed: 19_level_0_Awards', '-additional_-9999', 'Draft_Year_'],
      dtype='object')

In [102]:
# map the flattened identifier headers to short names
rushing_renames = {
    "Unnamed: 1_level_0_Player": "Player",
    "Unnamed: 2_level_0_Team": "Team",
    "Unnamed: 3_level_0_Conf": "Conf",
    "Unnamed: 4_level_0_G": "G",
    "Draft_Year_": "Draft_Year",
}

# rushing columns carried forward
rushing_keep = ['Player','Team','Conf', 'Draft_Year','G', 'Rushing_Att', 'Rushing_Yds', 'Rushing_TD']

rushing_raw = rushing_raw.rename(columns=rushing_renames)[rushing_keep]
rushing_raw.head()

Unnamed: 0,Player,Team,Conf,Draft_Year,G,Rushing_Att,Rushing_Yds,Rushing_TD
0,Chuba Hubbard*,Oklahoma State,Big 12,2020,13,328,2094,21
1,Malcolm Perry*,Navy,American,2020,13,295,2017,21
2,J.K. Dobbins*,Ohio State,Big Ten,2020,14,301,2003,21
3,Jonathan Taylor*,Wisconsin,Big Ten,2020,14,320,2003,21
4,Jaret Patterson*,Buffalo,MAC,2020,13,312,1799,19


In [103]:
# Remove the * and + markers from player names, then create a dual key with
# name and year.
# .assign works on a fresh copy, which avoids the SettingWithCopyWarning raised
# by writing into the column subset selected in the previous cell.
rushing_raw = (
    rushing_raw
    .assign(Player=lambda df: df['Player'].str.replace(r'[\*\+]', '', regex=True))
    .set_index(['Player','Draft_Year'])
)
rushing_raw.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Team,Conf,G,Rushing_Att,Rushing_Yds,Rushing_TD
Player,Draft_Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Chuba Hubbard,2020,Oklahoma State,Big 12,13,328,2094,21
Malcolm Perry,2020,Navy,American,13,295,2017,21
J.K. Dobbins,2020,Ohio State,Big Ten,14,301,2003,21
Jonathan Taylor,2020,Wisconsin,Big Ten,14,320,2003,21
Jaret Patterson,2020,Buffalo,MAC,13,312,1799,19


In [104]:
# now for receiving
# Combine both header rows into one "<top>_<bottom>" name per column.
# Guarded so the cell is safe to re-run: joining already-flat string names
# would insert '_' between every character and corrupt them.
if isinstance(receiving_raw.columns, pd.MultiIndex):
    receiving_raw.columns = ['_'.join(col).strip() for col in receiving_raw.columns.values]
receiving_raw.columns

Index(['Unnamed: 0_level_0_Rk', 'Unnamed: 1_level_0_Player',
       'Unnamed: 2_level_0_Team', 'Unnamed: 3_level_0_Conf',
       'Unnamed: 4_level_0_G', 'Receiving_Rec', 'Receiving_Yds',
       'Receiving_Y/R', 'Receiving_TD', 'Receiving_Y/G', 'Rushing_Att',
       'Rushing_Yds', 'Rushing_Y/A', 'Rushing_TD', 'Rushing_Y/G',
       'Scrimmage_Plays', 'Scrimmage_Yds', 'Scrimmage_Avg', 'Scrimmage_TD',
       'Unnamed: 19_level_0_Awards', '-additional_-9999', 'Draft_Year_'],
      dtype='object')

In [105]:
# map the flattened identifier headers (and the receptions column) to short names
receiving_renames = {
    "Unnamed: 1_level_0_Player": "Player",
    "Unnamed: 2_level_0_Team": "Team",
    "Unnamed: 3_level_0_Conf": "Conf",
    "Unnamed: 4_level_0_G": "G",
    "Draft_Year_": "Draft_Year",
    "Receiving_Rec": "Rec",
}

# receiving columns carried forward
receiving_keep = ['Player','Team','Conf', 'Draft_Year', 'G','Rec', 'Receiving_Yds', 'Receiving_TD']

receiving_raw = receiving_raw.rename(columns=receiving_renames)[receiving_keep]
receiving_raw.head()

Unnamed: 0,Player,Team,Conf,Draft_Year,G,Rec,Receiving_Yds,Receiving_TD
0,Ja'Marr Chase*,LSU,SEC,2020,14,84,1780,20
1,Omar Bayless*,Arkansas State,Sun Belt,2020,13,93,1653,17
2,Justin Jefferson*,LSU,SEC,2020,15,111,1540,18
3,Antonio Gandy-Golden*,Liberty,Ind,2020,13,79,1396,10
4,Devin Duvernay*,Texas,Big 12,2020,13,106,1386,9


In [106]:
# Strip the * and + markers from player names, then create a dual key with
# name and year.
# .assign works on a fresh copy, which avoids the SettingWithCopyWarning raised
# by writing into the column subset selected in the previous cell.
receiving_raw = (
    receiving_raw
    .assign(Player=lambda df: df['Player'].str.replace(r'[\*\+]', '', regex=True))
    .set_index(['Player','Draft_Year'])
)
receiving_raw.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Team,Conf,G,Rec,Receiving_Yds,Receiving_TD
Player,Draft_Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Ja'Marr Chase,2020,LSU,SEC,14,84,1780,20
Omar Bayless,2020,Arkansas State,Sun Belt,13,93,1653,17
Justin Jefferson,2020,LSU,SEC,15,111,1540,18
Antonio Gandy-Golden,2020,Liberty,Ind,13,79,1396,10
Devin Duvernay,2020,Texas,Big 12,13,106,1386,9


#### Now we need to combine all three categories into one big dataset

In [108]:
# Stack passing, rushing and receiving side by side, one row per player-season.
# Rows are matched on the (Player, Draft_Year) index levels plus Team, Conf and G.
merge_keys = ['Player','Draft_Year','Team','Conf','G']

combined_stats = (
    passing_raw
    # outer joins keep players that appear in only one of the stat tables
    .merge(rushing_raw, how='outer', on=merge_keys, suffixes=('_pass', '_rush'))
    .merge(receiving_raw, how='outer', on=merge_keys)
    # a stat missing from a table becomes NaN after the outer join; store it as 0
    .fillna(0)
)
combined_stats.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,Team,Conf,G,Cmp,Pass_Att,Pass_Yds,Pass_TD,Int,Rushing_Att,Rushing_Yds,Rushing_TD,Rec,Receiving_Yds,Receiving_TD
Player,Draft_Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
A'Jon Vivens,2021,Colorado State,MWC,4,0.0,0.0,0.0,0.0,0.0,49.0,210.0,1.0,1.0,1.0,0.0
A'Jon Vivens,2022,Colorado State,MWC,9,0.0,0.0,0.0,0.0,0.0,90.0,324.0,0.0,10.0,73.0,0.0
A'Jon Vivens,2023,Colorado State,MWC,5,0.0,0.0,0.0,0.0,0.0,39.0,104.0,0.0,4.0,26.0,0.0
A'Marion Peterson,2024,USC,Pac-12,2,0.0,0.0,0.0,0.0,0.0,2.0,14.0,0.0,0.0,0.0,0.0
A'Montae Spivey,2020,Arkansas,SEC,2,0.0,0.0,0.0,0.0,0.0,7.0,24.0,0.0,0.0,0.0,0.0
