# Rookie Data

We need to load and clean the data from the 2020–2025 NFL draft classes so that we can predict the drafted rookies' fantasy scores.

In [1]:
# imports
import pandas as pd
import numpy as np  

def load_draft_class(year, data_dir='newdata'):
    """Load one draft class CSV and tag every row with its draft year.

    Parameters
    ----------
    year : int
        Draft year; selects the file ``<data_dir>/<year>draftraw.csv``.
    data_dir : str, default 'newdata'
        Folder that holds the raw draft CSV files.

    Returns
    -------
    pandas.DataFrame
        The raw table (two-level column headers) with an added ``Year`` column.
    """
    # the raw files carry two header rows, so both are read as column levels
    draft = pd.read_csv(f'{data_dir}/{year}draftraw.csv', header=[0, 1])
    # the file name and the Year label come from the same argument, so they cannot drift apart
    draft['Year'] = year
    return draft


# load the data and add the year column (one frame per draft class)
draft2020 = load_draft_class(2020)
draft2021 = load_draft_class(2021)
draft2022 = load_draft_class(2022)
draft2023 = load_draft_class(2023)
draft2024 = load_draft_class(2024)
draft2025 = load_draft_class(2025)

# combine the data into one frame with a fresh 0..n-1 row index
drafts_raw = pd.concat(
    [draft2020, draft2021, draft2022, draft2023, draft2024, draft2025],
    ignore_index=True,
)
drafts_raw.head()

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Misc,Misc,Unnamed: 9_level_0,...,Receiving,Receiving,Receiving,Unnamed: 24_level_0,Unnamed: 25_level_0,Unnamed: 26_level_0,Unnamed: 27_level_0,Unnamed: 28_level_0,-additional,Year
Unnamed: 0_level_1,Rnd,Pick,Tm,Player,Pos,Age,To,AP1,PB,St,...,Rec,Yds,TD,Solo,Int,Sk,College/Univ,Unnamed: 28_level_1,-9999,Unnamed: 21_level_1
0,1,1,CIN,Joe Burrow,QB,23.0,2025.0,0,2,5,...,0.0,0.0,0.0,1.0,,,LSU,College Stats,BurrJo01,2020
1,1,2,WAS,Chase Young,DE,21.0,2024.0,0,1,1,...,0.0,0.0,0.0,88.0,,22.0,Ohio St.,College Stats,YounCh04,2020
2,1,3,DET,Jeff Okudah,CB,21.0,2025.0,0,0,2,...,0.0,0.0,0.0,150.0,2.0,,Ohio St.,College Stats,OkudJe00,2020
3,1,4,NYG,Andrew Thomas,T,21.0,2025.0,0,0,5,...,1.0,2.0,1.0,7.0,,,Georgia,College Stats,ThomAn02,2020
4,1,5,MIA,Tua Tagovailoa,QB,22.0,2025.0,0,1,5,...,0.0,0.0,0.0,,,,Alabama,College Stats,TagoTu00,2020


### Fix the positions

In the draft, NFL teams pick players at every position. Not all of these positions are fantasy-relevant, so we keep only the players at the following positions: WR, QB, RB, TE, FB.

In [2]:
# the frame still has two-level headers here, so the position column is addressed by a tuple
pos_col = ('Unnamed: 4_level_0', 'Pos')
fantasy_positions = ['WR', 'QB', 'RB', 'TE', 'FB']

# keep only the rows whose position is fantasy relevant
is_fantasy_position = drafts_raw[pos_col].isin(fantasy_positions)
drafts_raw = drafts_raw[is_fantasy_position]

# show the distinct positions that remain
drafts_raw[pos_col].unique()

array(['QB', 'WR', 'RB', 'TE', 'FB'], dtype=object)

#### Clean the column names

In [3]:
def _flatten_column(col):
    """Collapse one column label into a single cleaned-up string.

    Two-level labels arrive as tuples and are joined with '_'. A label that is
    already a plain string is left unjoined, so re-running this cell does not
    split names into characters ('_'.join on a str iterates its characters).
    """
    name = '_'.join(col).strip() if isinstance(col, tuple) else col
    # drop underscores that are not needed, and the '-' in '-additional'
    return name.replace('__', '_').replace('-', '').strip('_')


# flatten the two header levels into single-level column names
drafts_raw.columns = [_flatten_column(col) for col in drafts_raw.columns.values]

# rename the columns we keep; their raw names carry the 'Unnamed: N_level_0' prefix
drafts_raw = drafts_raw.rename(columns={
    "Unnamed: 0_level_0_Rnd": "Round",
    "Unnamed: 1_level_0_Pick": "Pick",
    "Unnamed: 2_level_0_Tm": "Team",
    "Unnamed: 3_level_0_Player": "Player",
    "Unnamed: 4_level_0_Pos": "Position",
    "Unnamed: 5_level_0_Age": "Age",
    "Unnamed: 27_level_0_College/Univ": "College",
})

# check column names
drafts_raw.columns
                                      

Index(['Round', 'Pick', 'Team', 'Player', 'Position', 'Age',
       'Unnamed: 6_level_0_To', 'Misc_AP1', 'Misc_PB', 'Unnamed: 9_level_0_St',
       'Approx Val_wAV', 'Approx Val_DrAV', 'Unnamed: 12_level_0_G',
       'Passing_Cmp', 'Passing_Att', 'Passing_Yds', 'Passing_TD',
       'Passing_Int', 'Rushing_Att', 'Rushing_Yds', 'Rushing_TD',
       'Receiving_Rec', 'Receiving_Yds', 'Receiving_TD',
       'Unnamed: 24_level_0_Solo', 'Unnamed: 25_level_0_Int',
       'Unnamed: 26_level_0_Sk', 'College',
       'Unnamed: 28_level_0_Unnamed: 28_level_1', 'additional_9999', 'Year'],
      dtype='object')

#### Now we can select only the data that we need

In [None]:
# columns needed downstream; everything else from the raw draft table is dropped
keep_columns = ['Year', 'Round', 'Pick', 'Team', 'Player', 'Position', 'Age', 'College']

drafts_raw = (
    # select only the data that we need
    drafts_raw.loc[:, keep_columns]
    # strip *+ from player names; .assign works on a new frame, so no column is
    # written onto a slice of the previous frame (avoids SettingWithCopyWarning)
    .assign(Player=lambda df: df['Player'].str.replace(r'[\*\+]', '', regex=True))
    # create a dual key with name and year
    .set_index(['Player', 'Year'])
)
drafts_raw.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Round,Pick,Team,Position,Age,College
Player,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Joe Burrow,2020,1,1,CIN,QB,23.0,LSU
Tua Tagovailoa,2020,1,5,MIA,QB,22.0,Alabama
Justin Herbert,2020,1,6,LAC,QB,22.0,Oregon
Henry Ruggs III,2020,1,12,LVR,WR,21.0,Alabama
Jerry Jeudy,2020,1,15,DEN,WR,21.0,Alabama
