# Missing Values

Different techniques for handling missing values in datasets.

In [1]:
# Module imports
import numpy as np
import pandas as pd

In [2]:
# Pin NumPy's global random state so later random draws are repeatable
np.random.seed(seed=0)

In [3]:
# Load data
# NOTE(review): this is an absolute, machine-specific Windows path; the cell
# only runs where the CSV lives at exactly this location.
data_string = r'C:\Developer\scratch-pad-python\Datasets\NFL Play by Play 2009-2017 (v4).csv'

# The stored output of this cell looks like the tail of a pandas warning
# (presumably the mixed-type DtypeWarning raised by chunked parsing -- confirm
# on re-run). low_memory=False parses the file in a single pass, so each
# column's dtype is inferred from all of its values instead of chunk by chunk.
nfl_data = pd.read_csv(data_string, low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# Preview the first five rows to get a feel for the columns
nfl_data.head(n=5)

Unnamed: 0,Date,GameID,Drive,qtr,down,time,TimeUnder,TimeSecs,PlayTimeDiff,SideofField,...,yacEPA,Home_WP_pre,Away_WP_pre,Home_WP_post,Away_WP_post,Win_Prob,WPA,airWPA,yacWPA,Season
0,2009-09-10,2009091000,1,1,,15:00,15,3600.0,0.0,TEN,...,,0.485675,0.514325,0.546433,0.453567,0.485675,0.060758,,,2009
1,2009-09-10,2009091000,1,1,1.0,14:53,15,3593.0,7.0,PIT,...,1.146076,0.546433,0.453567,0.551088,0.448912,0.546433,0.004655,-0.032244,0.036899,2009
2,2009-09-10,2009091000,1,1,2.0,14:16,15,3556.0,37.0,PIT,...,,0.551088,0.448912,0.510793,0.489207,0.551088,-0.040295,,,2009
3,2009-09-10,2009091000,1,1,3.0,13:35,14,3515.0,41.0,PIT,...,-5.031425,0.510793,0.489207,0.461217,0.538783,0.510793,-0.049576,0.106663,-0.156239,2009
4,2009-09-10,2009091000,1,1,4.0,13:27,14,3507.0,8.0,PIT,...,,0.461217,0.538783,0.558929,0.441071,0.461217,0.097712,,,2009


# Identifying Missing Data

Looking at the percentage of missing values in the dataset gives a sense of how widespread the problem is.

In [9]:
# Count the missing (NaN) entries in every column
isnull_count = nfl_data.isna().sum(axis=0)

print(isnull_count)

Date             0
GameID           0
Drive            0
qtr              0
down         61154
             ...  
Win_Prob     25009
WPA           5541
airWPA      248501
yacWPA      248762
Season           0
Length: 102, dtype: int64


In [10]:
# Calculating percent of missing values across whole dataset

# Denominator: total number of cells (rows * columns). DataFrame.size returns
# this directly as a plain Python int. (np.product, a deprecated alias of
# np.prod, was removed in NumPy 2.0 and would raise AttributeError there.)
total_cells = nfl_data.size
# Numerator: missing cells summed over all columns
total_missing = isnull_count.sum()

# Calculate percentage
percent_missing = (total_missing / total_cells) * 100
print(percent_missing)

24.87214126835169


# Handling Missing Data

Is the value missing because it wasn't recorded, or because it doesn't exist?

Doesn't exist -> Leave as NaN

Wasn't recorded -> Infer it from other values in the dataset (imputation).

Missing data can also be dropped by column or row.

In [11]:
# Drop every column that contains at least one missing value

dropped_columns = nfl_data.dropna(axis="columns")
dropped_columns.head(n=5)

Unnamed: 0,Date,GameID,Drive,qtr,TimeUnder,ydstogo,ydsnet,PlayAttempted,Yards.Gained,sp,...,Timeout_Indicator,Timeout_Team,posteam_timeouts_pre,HomeTimeouts_Remaining_Pre,AwayTimeouts_Remaining_Pre,HomeTimeouts_Remaining_Post,AwayTimeouts_Remaining_Post,ExPoint_Prob,TwoPoint_Prob,Season
0,2009-09-10,2009091000,1,1,15,0,0,1,39,0,...,0,,3,3,3,3,3,0.0,0.0,2009
1,2009-09-10,2009091000,1,1,15,10,5,1,5,0,...,0,,3,3,3,3,3,0.0,0.0,2009
2,2009-09-10,2009091000,1,1,15,5,2,1,-3,0,...,0,,3,3,3,3,3,0.0,0.0,2009
3,2009-09-10,2009091000,1,1,14,8,2,1,0,0,...,0,,3,3,3,3,3,0.0,0.0,2009
4,2009-09-10,2009091000,1,1,14,8,2,1,0,0,...,0,,3,3,3,3,3,0.0,0.0,2009
