# Cleaning up the Data

We are going to use this notebook to show all of the steps we took to clean our data from Football Reference

In [1]:
import pandas as pd
import numpy as np
path = "newdata/"

### NFL data
We will start by cleaning the data that is from the actual NFL season that happened previously

In [2]:
# read in the data and view in raw form, we have two rows of headers
stats2019 = pd.read_csv(path + "2019statsraw.csv",header=[0,1])
stats2020 = pd.read_csv(path + "2020statsraw.csv",header=[0,1])
stats2021 = pd.read_csv(path + "2021statsraw.csv",header=[0,1])
stats2022 = pd.read_csv(path + "2022statsraw.csv",header=[0,1])
stats2023 = pd.read_csv(path + "2023statsraw.csv",header=[0,1])
stats2024 = pd.read_csv(path + "2024statsraw.csv",header=[0,1])
stats2019.head()

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Games,Games,Passing,Passing,Passing,...,Scoring,Scoring,Fantasy,Fantasy,Fantasy,Fantasy,Fantasy,Fantasy,Fantasy,-additional
Unnamed: 0_level_1,Rk,Player,Tm,FantPos,Age,G,GS,Cmp,Att,Yds,...,2PM,2PP,FantPt,PPR,DKPt,FDPt,VBD,PosRank,OvRank,-9999
0,1,Christian McCaffrey*+,CAR,RB,23,16,16,0,2,0,...,1.0,,355.0,471.2,477.2,413.2,215.0,1,1.0,McCaCh01
1,2,Lamar Jackson*+,BAL,QB,22,15,15,265,401,3127,...,,,416.0,415.7,429.7,421.7,152.0,1,2.0,JackLa00
2,3,Derrick Henry*,TEN,RB,25,15,15,0,0,0,...,,,277.0,294.6,303.6,285.6,136.0,2,3.0,HenrDe00
3,4,Aaron Jones,GNB,RB,25,16,16,0,0,0,...,,,266.0,314.8,322.8,290.3,125.0,3,4.0,JoneAa00
4,5,Ezekiel Elliott*,DAL,RB,24,16,16,0,0,0,...,,,258.0,311.7,319.7,284.7,117.0,4,5.0,ElliEz00


Fixing the column formatting

In [3]:
# label each dataframe with the year
stats2019[('Year',' ')] = 2019
stats2020[('Year',' ')] = 2020
stats2021[('Year',' ')] = 2021
stats2022[('Year',' ')] = 2022
stats2023[('Year',' ')] = 2023
stats2024[('Year',' ')] = 2024

# combine all dataframes into one
stats_raw = pd.concat([stats2019,stats2020,stats2021,stats2022,stats2023,stats2024])
stats_raw.columns = ['_'.join(col).strip() for col in stats_raw.columns.values]

# drop underscores that are not needed
stats_raw.columns = [c.replace('__', '_').replace('-', '').strip('_') for c in stats_raw.columns]

# check column names
stats_raw.columns

# rename some columns for easier access
stats_raw = stats_raw.rename(columns={'Unnamed: 0_level_0_Rk':'Rank', 'Unnamed: 1_level_0_Player':'Name',
       'Unnamed: 2_level_0_Tm':'Team', 'Unnamed: 3_level_0_FantPos':'Position',
       'Unnamed: 4_level_0_Age':'Age', 'Games_G':'Games', 'Games_GS':'Games_Started',
       'Fumbles_Fmb':'Fumbles', 'Fumbles_FL':'Fumbles_Lost',
       'Scoring_2PM':'2PM', 'Scoring_2PP':'2PP'})

# drop unnecessary columns
stats = stats_raw.drop(columns=['Rank', 'Games_Started', 'Fumbles', 'Scoring_TD','Rushing_Y/A','Receiving_Y/R','Fantasy_FantPt',
       'Fantasy_DKPt', 'Fantasy_FDPt', 'Fantasy_VBD', 'Fantasy_PosRank','Fantasy_OvRank', 'additional_9999'])

# strip *+ from player names
stats['Name'] = stats['Name'].str.replace(r'[\*\+]', '', regex=True)

# create a multi key index with player name and year
stats = stats.set_index(['Name','Year'])

stats.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Team,Position,Age,Games,Passing_Cmp,Passing_Att,Passing_Yds,Passing_TD,Passing_Int,Rushing_Att,Rushing_Yds,Rushing_TD,Receiving_Tgt,Receiving_Rec,Receiving_Yds,Receiving_TD,Fumbles_Lost,2PM,2PP,Fantasy_PPR
Name,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Christian McCaffrey,2019,CAR,RB,23,16,0.0,2.0,0.0,0.0,0.0,287.0,1387.0,15.0,142.0,116.0,1005.0,4.0,0.0,1.0,,471.2
Lamar Jackson,2019,BAL,QB,22,15,265.0,401.0,3127.0,36.0,6.0,176.0,1206.0,7.0,0.0,0.0,0.0,0.0,2.0,,,415.7
Derrick Henry,2019,TEN,RB,25,15,0.0,0.0,0.0,0.0,0.0,303.0,1540.0,16.0,24.0,18.0,206.0,2.0,3.0,,,294.6
Aaron Jones,2019,GNB,RB,25,16,0.0,0.0,0.0,0.0,0.0,236.0,1084.0,16.0,68.0,49.0,474.0,3.0,2.0,,,314.8
Ezekiel Elliott,2019,DAL,RB,24,16,0.0,0.0,0.0,0.0,0.0,301.0,1357.0,12.0,71.0,54.0,420.0,2.0,2.0,,,311.7


Now we need to replace some of the NAs present with 0s or other corresponding values

In [4]:
# see where the NAs are
stats.isna().sum()


Team                0
Position            0
Age                 0
Games               0
Passing_Cmp         2
Passing_Att         2
Passing_Yds         2
Passing_TD          2
Passing_Int         2
Rushing_Att         2
Rushing_Yds         2
Rushing_TD          2
Receiving_Tgt       2
Receiving_Rec       2
Receiving_Yds       2
Receiving_TD        2
Fumbles_Lost        2
2PM              3552
2PP              3715
Fantasy_PPR       471
dtype: int64

In [5]:
# See who has NAs for each Column
# first the Passing stats
stats[stats['Passing_Cmp'].isna()]
# These players did not play this year, so we can just remove them
# remove players with no data for the year
stats = stats[~stats['Passing_Cmp'].isna()]


In [6]:
# Check NAs again
stats.isna().sum()

# Now most of the NAs are in columns related to 2 point conversions, which can just be replaced with 0s
stats['2PM'] = stats['2PM'].fillna(0)
stats['2PP'] = stats['2PP'].fillna(0)

# Check NAs again
stats.isna().sum()

# Now the NAs are in Fantasy PPR Column, lets see who has them
stats[stats['Fantasy_PPR'].isna()]

Unnamed: 0_level_0,Unnamed: 1_level_0,Team,Position,Age,Games,Passing_Cmp,Passing_Att,Passing_Yds,Passing_TD,Passing_Int,Rushing_Att,Rushing_Yds,Rushing_TD,Receiving_Tgt,Receiving_Rec,Receiving_Yds,Receiving_TD,Fumbles_Lost,2PM,2PP,Fantasy_PPR
Name,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Jerell Adams,2019,HOU,TE,27,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
Stephen Anderson,2019,LAC,TE,26,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
Evan Baylis,2019,GNB,TE,26,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
Kendall Blanton,2019,LAR,TE,24,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
Deante Burton,2019,DAL,WR,25,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Rodney Williams,2024,PIT,TE,26,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
Trayveon Williams,2024,CIN,RB,27,17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
Roman Wilson,2024,PIT,WR,23,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
Colson Yankoff,2024,WAS,TE,24,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


Since there are a lot of rows that have NAs in the PPR fantasy points column, we can do our own calculation based on standard PPR scoring

We can also verify this using players that already have a PPR score

In [7]:
ppr_formula = (stats['Passing_Yds'] * 0.04 + stats['Passing_TD'] * 4 - stats['Passing_Int'] * 2 + stats['Rushing_Yds'] * 0.1  + stats['Rushing_TD'] * 6 + 0.1 * stats['Receiving_Yds'] + stats['Receiving_TD'] * 6 + stats['Receiving_Rec'] * 1 - stats['Fumbles_Lost'] * 2 + stats['2PM'] * 2 + stats['2PP'] * 2)

# Fill in the NAs with our calculated PPR values
stats['Fantasy_PPR'] = stats['Fantasy_PPR'].fillna(ppr_formula)

# Check NAs again
stats.isna().sum()

Team             0
Position         0
Age              0
Games            0
Passing_Cmp      0
Passing_Att      0
Passing_Yds      0
Passing_TD       0
Passing_Int      0
Rushing_Att      0
Rushing_Yds      0
Rushing_TD       0
Receiving_Tgt    0
Receiving_Rec    0
Receiving_Yds    0
Receiving_TD     0
Fumbles_Lost     0
2PM              0
2PP              0
Fantasy_PPR      0
dtype: int64

Now we are done with cleaning the data.

We just need to make it so the previous seasons stats are associated with the following season's fantasy score
This will allow us to predict the season's score based on the previous season's stats

In [8]:
# move the fantasy score to the next year and renmame the column to PPR_NextYear
stats['Fantasy_PPR_NextYear'] = stats.groupby(level=0)['Fantasy_PPR'].shift(-1)

#add the players team next year as well
stats['Team_NextYear'] = stats.groupby(level=0)['Team'].shift(-1)

# check if NAs are in the new column
stats.isna().sum()

# Check who has NAs in the new column
stats[stats['Fantasy_PPR_NextYear'].isna()]

# These players did not play the next year, so we can just remove them
# In the case of 2024, we do not have next years data at the time of this project
# we can store 2024 data for future use
data_2024 = stats.xs(2024, level=1)
data_2024.head()

clean_stats = stats.dropna(subset=['Fantasy_PPR_NextYear'])
clean_stats.shape


(2554, 22)

In [9]:
# Save the 2024 data for future use
data_2024.to_csv("clean_data/data_2024.csv")

# Save the cleaned data to a new csv
clean_stats.to_csv("clean_data/clean_stats.csv")
