# NFL BIG DATA BOWL 2020 - 2021

# 0. Introduction
In this notebook we continue on from the 'BDB20-21_EDA_and_cleanup' notebook, which performs the Exploratory Data Analysis, and does some basic data wrangling on this data.  
This notebook focuses on some data visualizations to gain a better understanding on the data, and does some more elaborate feature engineering to create new data points in preparation for a machine learning algorithm

# 1. Notebook preparation
In this section of the notebook we load in all the necessary libraries, we read in the data and perform the data wrangling actions as found necessary in the BDB20-21_EDA_and_cleanup notebook.

In [1]:
# Load libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(palette = 'deep')
import datetime as dt
import time
from datetime import date
from tqdm.notebook import tqdm
import math
import cv2
import plotly as py
import plotly.express as px
import plotly.graph_objs as go
import ipywidgets as widgets
py.offline.init_notebook_mode(connected=True)

In [2]:
# Load the data
# Import the data contained in individual files
games = pd.read_csv("../Data/games.csv")
players = pd.read_csv("../Data/players.csv")
plays = pd.read_csv("../Data/plays.csv")

# Import the weekly tracking data
weeks = range(1,18)
tracking = pd.DataFrame()
for i in tqdm(weeks):
    week_data = pd.read_csv('../Data/week' + str(i) + '.csv')
    week_data['week'] = i
    tracking = tracking.append(week_data)

HBox(children=(FloatProgress(value=0.0, max=17.0), HTML(value='')))




In [3]:
# Clean up the games dataset based on EDA notebook
games['gameDateTime'] = games.gameDate + " - " + games.gameTimeEastern
games.gameDateTime = pd.to_datetime(games.gameDateTime, format = '%m/%d/%Y - %H:%M:%S')
games['gameMonth'] = games.gameDateTime.dt.month_name()
games['gameWeekDay'] = games.gameDateTime.dt.day_name()
games['gameHour'] = games.gameDateTime.dt.hour
games['gameTimeSlot'] = np.NaN
games.loc[games.gameHour < 12, 'gameTimeSlot'] = 'Morning'
games.loc[(games.gameHour >=12) & (games.gameHour < 15), 'gameTimeSlot'] = 'Noon'
games.loc[(games.gameHour >=15) & (games.gameHour < 18), 'gameTimeSlot'] = 'Afternoon'
games.loc[games.gameHour >=18, 'gameTimeSlot'] = 'Evening'
games.drop(['gameHour', 'gameDate', 'gameTimeEastern'], axis = 1, inplace = True)
games['week'] = games.week.astype(str)

In [4]:
# Clean up the players dataset based on the EDA notebook
players['nflId'] = players.nflId.astype(str)
players.height = np.where(players.height.str.contains("-"), players.height, "0-" + players.height)
players.height = (players.height.str.split("-").str[0].astype(int) *12) + players.height.str.split("-").str[-1].astype(int)
players['birthDate'] = pd.to_datetime(players.birthDate)
players['age'] = ((pd.to_datetime("09/01/2018", format = "%m/%d/%Y") - 
                  players.birthDate).dt.days/365.25).apply(np.floor).astype(int)

players['consPosition'] = players.position
players.consPosition = players.consPosition.replace('OLB','LB')
players.consPosition = players.consPosition.replace('ILB','LB')
players.consPosition = players.consPosition.replace('MLB','LB')
players.consPosition = players.consPosition.replace('FS','S')
players.consPosition = players.consPosition.replace('SS','S')
players.consPosition = players.consPosition.replace('LS','S')
players.consPosition = players.consPosition.replace('HB','RB')
players.consPosition = players.consPosition.replace('NT','DT')
players.consPosition = players.consPosition.replace('DB','CB')

In [5]:
# Clean up the plays dataset based on the EDA notebook
plays.yardlineSide = plays.yardlineSide.replace(np.NaN, 'Midfield')

plays['offenseQB'] = 0
plays['offenseRB'] = 0
plays['offenseWR'] = 0
plays['offenseTE'] = 0
plays['offenseOL'] = 0
plays['offenseDL'] = 0
plays['offenseDB'] = 0
plays['offenseP'] = 0
plays['offenseLS'] = 0
plays['offenseK'] = 0
plays['offenseLB'] = 0

for i in tqdm(range(0,len(plays))):
    if str(plays.loc[i,'personnelO']) == 'nan':
        plays.loc[i,'offenseRB'] = np.nan
        plays.loc[i,'offenseWR'] = np.nan
        plays.loc[i,'offenseTE'] = np.nan
        plays.loc[i,'offenseQB'] = np.nan
        plays.loc[i,'offenseOL'] = np.nan
        plays.loc[i,'offenseDL'] = np.nan
        plays.loc[i,'offenseDB'] = np.nan
        plays.loc[i,'offenseP'] = np.nan
        plays.loc[i,'offenseLS'] = np.nan
        plays.loc[i,'offenseK'] = np.nan
        plays.loc[i,'offenseLB'] = np.nan
        continue
    personnelO_split = plays.personnelO.loc[i].split(",")
    personnelO_split = [x.strip(' ') for x in personnelO_split]
    for j in personnelO_split:
        if "RB" in j:
            plays.loc[i,'offenseRB'] = int(j[0])
        if "WR" in j:
            plays.loc[i,'offenseWR'] = int(j[0])
        if "TE" in j:
            plays.loc[i,'offenseTE'] = int(j[0])
        if "QB" in j:
            plays.loc[i,'offenseQB'] = int(j[0])
        if "OL" in j:
            plays.loc[i,'offenseOL'] = int(j[0])
        if "DL" in j:
            plays.loc[i,'offenseDL'] = int(j[0])
        if "DB" in j:
            plays.loc[i,'offenseDB'] = int(j[0])
        if "P" in j:
            plays.loc[i,'offenseP'] = int(j[0])
        if "LS" in j:
            plays.loc[i,'offenseLS'] = int(j[0])
        if "K" in j:
            plays.loc[i,'offenseK'] = int(j[0])
        if "LB" in j:
            plays.loc[i,'offenseLB'] = int(j[0])
positions = ['offenseRB', 'offenseWR', 'offenseTE', 'offenseQB', 'offenseOL', 'offenseDL',
            'offenseDB', 'offenseP', 'offenseLS', 'offenseK', 'offenseLB']
plays['positions_summed'] = plays[positions].sum(axis=1)
for i in tqdm(range(0,len(plays))):
    if str(plays.loc[i,'personnelO']) == 'nan':
        continue
    if (("QB" not in plays.personnelO.loc[i]) & (plays.positions_summed.loc[i] < 11) &
       ("K" not in plays.personnelO.loc[i]) & ("P" not in plays.personnelO.loc[i])):
        plays.loc[i,'offenseQB'] = 1
plays['positions_summed'] = plays[positions].sum(axis=1)
for i in tqdm(range(0,len(plays))):
    if str(plays.loc[i,'personnelO']) == 'nan':
        continue
    if (("OL" not in plays.personnelO.loc[i]) & (plays.positions_summed.loc[i] < 11)):
        plays.loc[i,'offenseOL'] = 11 - plays.loc[i,'positions_summed']
plays.drop('positions_summed', axis = 1, inplace = True)   
        
plays['defenseDL'] = 0
plays['defenseLB'] = 0
plays['defenseDB'] = 0
plays['defenseWR'] = 0
plays['defenseOL'] = 0
plays['defenseTE'] = 0
plays['defenseRB'] = 0
plays['defenseQB'] = 0
for i in tqdm(range(0,len(plays))):
    if str(plays.personnelD.loc[i]) == 'nan':
        plays.loc[i,'defenseDL'] = np.nan
        plays.loc[i,'defenseLB'] = np.nan
        plays.loc[i,'defenseDB'] = np.nan
        plays.loc[i,'defenseWR'] = np.nan
        plays.loc[i,'defenseOL'] = np.nan
        plays.loc[i,'defenseTE'] = np.nan
        plays.loc[i,'defenseRB'] = np.nan
        plays.loc[i,'defenseQB'] = np.nan
        continue
    personnelD_split = plays.personnelD.loc[i].split(",")
    personnelD_split = [x.strip(' ') for x in personnelD_split]
    for j in personnelD_split:
        if "DL" in j:
            plays.loc[i,'defenseDL'] = float(j[0])
        if "LB" in j:
            plays.loc[i,'defenseLB'] = float(j[0])
        if "DB" in j:
            plays.loc[i,'defenseDB'] = float(j[0])
        if "WR" in j:
            plays.loc[i,'defenseWR'] = float(j[0])
        if "OL" in j:
            plays.loc[i,'defenseOL'] = float(j[0])
        if "TE" in j:
            plays.loc[i,'defenseTE'] = float(j[0])
        if "RB" in j:
            plays.loc[i,'defenseRB'] = float(j[0])
        if "QB" in j:
            plays.loc[i,'defenseQB'] = float(j[0])
      
plays['gameClock_minutes'] = plays.gameClock.str.split(":").str[0].astype(float)
plays['gameClock_seconds'] = plays.gameClock.str.split(":").str[1].astype(float)
plays['secondsInQuarter'] = plays.gameClock_minutes * 60 + plays.gameClock_seconds
plays.drop(['gameClock_minutes', 'gameClock_seconds'], axis = 1, inplace = True)

plays['quarter_reversed'] = (plays.quarter - 4) * (-1)
plays['secondsInGame'] = plays.quarter_reversed * 900 + plays.secondsInQuarter
plays[plays['secondsInGame'] < 0].secondsInGame = np.NaN
plays.drop('quarter_reversed', axis = 1, inplace = True)

plays.absoluteYardlineNumber = plays.absoluteYardlineNumber - 10


penalties = plays.penaltyCodes.str.split(";", expand=True).stack().str.get_dummies().sum(level=0)
penalties = penalties[['DPI','OPI','ICT','DH']].copy()
plays = plays.join(penalties)
plays[['DPI','OPI','ICT','DH']] = plays[['DPI','OPI','ICT','DH']].fillna(0)
plays.ICT = plays.ICT.astype(str).replace("2.0","1.0").astype(float)
plays.DH = plays.DH.astype(str).replace("2.0","1.0").astype(float)
plays.drop('penaltyCodes', axis = 1, inplace = True)

plays.drop('penaltyJerseyNumbers', axis = 1, inplace = True)

plays = plays[plays['passResult'] != 'R'].reset_index(drop=True)

plays['uniquePlay'] = plays['gameId'].astype(str) + '_' + plays['playId'].astype(str)

HBox(children=(FloatProgress(value=0.0, max=19239.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=19239.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=19239.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=19239.0), HTML(value='')))






A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [6]:
# Clean tracking dataset based on the EDA notebook
tracking.time = pd.to_datetime(tracking.time, format = '%Y-%m-%dT%H:%M:%S.%f')

tracking = tracking.drop(tracking[(tracking['gameId'] == 2018102101) & (tracking['playId'] == 3078)].index)
plays = plays.drop(plays[(plays['gameId'] == 2018102101) & (plays['playId'] == 3078)].index)

tracking['s'].at[18040204] = 2.74

tracking = tracking.drop(tracking[(tracking['gameId'] == 2018091602) & (tracking['playId'] == 3122)].index)
plays = plays.drop(plays[(plays['gameId'] == 2018091602) & (plays['playId'] == 3122)].index)

tracking.a.at[18040204] = 2.43

tracking.drop('jerseyNumber', axis = 1, inplace = True)

tracking['consPosition'] = tracking.position
tracking.consPosition = tracking.consPosition.replace('OLB','LB')
tracking.consPosition = tracking.consPosition.replace('ILB','LB')
tracking.consPosition = tracking.consPosition.replace('MLB','LB')
tracking.consPosition = tracking.consPosition.replace('FS','S')
tracking.consPosition = tracking.consPosition.replace('SS','S')
tracking.consPosition = tracking.consPosition.replace('LS','S')
tracking.consPosition = tracking.consPosition.replace('HB','RB')
tracking.consPosition = tracking.consPosition.replace('NT','DT')
tracking.consPosition = tracking.consPosition.replace('DB','CB')
tracking.drop('position', axis = 1, inplace = True)

tracking['uniqueFrame'] = tracking['gameId'].astype(str) + '_' + tracking['playId'].astype(str) + '_' + tracking['frameId'].astype(str)
tracking['uniquePlay'] = tracking['gameId'].astype(str) + '_' + tracking['playId'].astype(str)

trick_events = ['qb_spike', 'punt_fake', 'field_goal_fake', 'field_goal_blocked', 'field_goal_play']
plays_to_remove = tracking[tracking['event'].isin(trick_events)].uniquePlay.unique()
tracking = tracking[~tracking['uniquePlay'].isin(plays_to_remove)]
plays = plays[~plays['uniquePlay'].isin(plays_to_remove)]

multi_event_frames = tracking[['uniqueFrame','event']].groupby(by = 'uniqueFrame').nunique()
multi_event_frames = multi_event_frames[multi_event_frames['event'] > 1]
multi_event_frames = list(multi_event_frames.index)
multi_event_frames_count = tracking[tracking['uniqueFrame'].isin(multi_event_frames)][['uniquePlay','uniqueFrame','event','time']].groupby(by = ['uniquePlay','uniqueFrame','event']).count()
multi_event_plays = list(multi_event_frames_count.index.unique(level = 'uniquePlay'))
tracking = tracking[~tracking['uniquePlay'].isin(multi_event_plays)]
plays = plays[~plays['uniquePlay'].isin(multi_event_plays)]

time_values_check = tracking.groupby(by = ['gameId', 'uniquePlay', 'uniqueFrame', 'frameId']).agg({'time': ['min','max']}).reset_index()
time_values_check.columns = [' '.join(col).strip() for col in time_values_check.columns.values]
time_values_check['differential'] = time_values_check['time max'] - time_values_check['time min']
time_values_check['differential'] = time_values_check['differential'].astype('timedelta64[s]')
plays_to_validate = time_values_check[time_values_check['differential'] > 0].reset_index(drop = True)
events = ['pass_arrived', 'pass_outcome_caught', 'tackle', 'pass_outcome_incomplete', 'out_of_bounds', 'qb_sack',
          'touchdown', 'pass_outcome_touchdown', 'pass_outcome_interception', 'fumble', 'qb_strip_sack',
          'fumble_defense_recovered', 'fumble_offense_recovered', 'qb_spike']
plays_to_remove = []
frames_to_remove = []
for i in tqdm(plays_to_validate.uniquePlay.unique()):
    smallest_frame = plays_to_validate[plays_to_validate['uniquePlay'] == str(i)].frameId.min()
    largest_frame = plays_to_validate[plays_to_validate['uniquePlay'] == str(i)].frameId.max()
    play_data = tracking[tracking['uniquePlay'] == str(i)]
    largest_overall_frame = play_data.frameId.max()
    if largest_frame < largest_overall_frame:
        plays_to_remove.append(play_data.uniquePlay.iloc[0])
    else:
        problem_frames = play_data[play_data['frameId'] >= smallest_frame]
        
        if problem_frames.event.isin(events).any():
            plays_to_remove.append(problem_frames.uniquePlay.iloc[0])
        else:
            for j in problem_frames.uniqueFrame.unique():
                frames_to_remove.append(j)
tracking = tracking[~tracking['uniqueFrame'].isin(frames_to_remove)]
tracking = tracking[~tracking['uniquePlay'].isin(plays_to_remove)]
plays = plays[~plays['uniquePlay'].isin(plays_to_remove)]

time_diffs = tracking[['time', 'gameId', 'playId', 'frameId']].drop_duplicates().reset_index(drop = True)
time_diffs['time_diff'] = time_diffs['time'].diff()
time_diffs['time_diff'] = time_diffs['time_diff'].dt.total_seconds()
time_diffs['time_diff'] = np.where((time_diffs.gameId != time_diffs.gameId.shift(1)) |
                                   (time_diffs.playId != time_diffs.playId.shift(1)),
                                  np.NaN, time_diffs.time_diff)
time_diffs.drop('time', axis = 1, inplace = True)
problem_data = time_diffs[(time_diffs['time_diff'] < 0.08) |
                          (time_diffs['time_diff'] > 0.12)]
problem_data['uniquePlay'] = problem_data['gameId'].astype(str) + '_' + problem_data['playId'].astype(str)
problem_data.groupby(by = 'uniquePlay').count()
plays_to_remove = list(problem_data.uniquePlay.unique())
tracking = tracking[~tracking['uniquePlay'].isin(plays_to_remove)]
plays = plays[~plays['uniquePlay'].isin(plays_to_remove)]
tracking[tracking['frameId'] == 1].dis = np.NaN

tracking.drop_duplicates(inplace = True)

tracking = tracking.merge(games[['gameId','homeTeamAbbr','visitorTeamAbbr']],how = 'left',on = 'gameId')
tracking = tracking.merge(plays[['gameId','playId','absoluteYardlineNumber','possessionTeam']],how = 'left',on = ['gameId','playId'])
tracking['side'] = np.NaN
tracking['side'] = np.where(tracking['team'] == 'football','football',tracking['side'])
tracking['side'] = np.where((tracking['team'] == 'home') & (tracking['homeTeamAbbr'] == tracking['possessionTeam']),'offense',tracking['side'])
tracking['side'] = np.where((tracking['team'] == 'home') & (tracking['visitorTeamAbbr'] == tracking['possessionTeam']),'defense',tracking['side'])
tracking['side'] = np.where((tracking['team'] == 'away') & (tracking['homeTeamAbbr'] == tracking['possessionTeam']),'defense',tracking['side'])
tracking['side'] = np.where((tracking['team'] == 'away') & (tracking['visitorTeamAbbr'] == tracking['possessionTeam']),'offense',tracking['side'])

HBox(children=(FloatProgress(value=0.0, max=181.0), HTML(value='')))






A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [7]:
# Function to plot data labels to scatterplots
def set_labels(x, y, val, ax):
    a = pd.concat({'x': x, 'y': y, 'val': val}, axis=1)
    for i, point in a.iterrows():
        ax.text(point['x']+.3, point['y']+.3, str(point['val']))

In [8]:
# Function to plot positions at a particular event on the field
# Valid event choices are: snap; pass; pass_arrived; pass_caught; tackle; contact; pass_cinomplete; play_action;
# out_of_bounds; line_set; sack; first; last

def plot_event_positions(event, gameId = 2018090600, playId = 75):
    # Set the event labels
    event_labels = {'snap':'ball_snap', 'pass':'pass_forward', 'pass_arrived':'pass_arrived',
                'pass_caught':'pass_outcome_caught', 'tackle':'tackle', 'contact':'first_contact',
                'pass_incomplete':'pass_outcome_incomplete', 'play_action':'play_action',
                'out_of_bounds':'out_of_bounds', 'line_set':'line_set', 'sack':'qb_sack'
                }
    
    # First we filter out the necessary data
    # We start by filtering out the relevant game and play data
    example_play = tracking[(tracking['gameId'] == gameId) & 
                            (tracking['playId'] == playId)]
    # Then we filter out the correct event
    if event == 'first':
        example_play = example_play[example_play['frameId'] == 1]
    elif event == 'last':
        example_play = example_play[example_play['frameId'] == max(example_play['frameId'])]
    elif event in event_labels.keys():
        if event_labels[event] in list(example_play.event):
            example_play = example_play[example_play['event'] == event_labels[event]]
        else:
            return(print('The selected event does not occur in the selected play'))
    else:
        return(print('The selected event is not valid'))
    example_play.reset_index(drop=True, inplace = True)
    
    # Set the line of scrimmage
    LOS = example_play.absoluteYardlineNumber[0] + 10
    
    # Next we use seaborn to create a scatterplot of the data
    fig, ax = plt.subplots(figsize = (14.4, 6.4))
    g = sns.scatterplot(data = example_play
                       , x = 'x', y = 'y'
                       , hue = 'side' # Do we need this if we also set style?
                       , s = 200
                       , style = 'side'
                       , markers = {'offense': 'X', 'defense':'o', 'football':'D'}
                       )
    plt.axvline(x = LOS, lw = 4, c = 'red')
    # Set the size of the plot to include the full football field
    g.set(xlim=(0,120))
    g.set(ylim=(0,53.3))
    
    # Use our custom set_labels function to assign labels to the markers
    set_labels(example_play.x, example_play.y, example_play.consPosition, plt.gca())
    
    # Add the picture of the football field to the background
    ### NOTE: THE ASSIGNMENT REQUIRES THAT NO EXTERNAL DATA IS ADDED, SO THIS WILL NEED
    ### TO BE REPLACED BY A WAY OF SHOWING THE FOOTBALL FIELD WITHOUT AN IMPORTED IMAGE
    with open('../Resources/football_field.png', 'rb') as file:
        img = plt.imread(file, 'PNG')
    plt.imshow(img, zorder = 0, extent = [0.0, 120.0, 0.00, 53.3])

In [9]:
# Function to plot routes on the field
def plot_routes(gameId = 2018090600, playId = 75):
    # First we filter out the necessary data
    # This means getting the data for the game and play
    example_play = tracking[(tracking['gameId'] == gameId) &
                            (tracking['playId'] == playId)
                           ]
    example_play.reset_index(drop = True, inplace = True)
    
    # Set the line of scrimmage
    LOS = example_play.absoluteYardlineNumber[0] + 10
    
    # Next we use plotly to create a scatterplot for the data where each player and ball is traced.
    # Initiate the plotting
    fig = go.Figure()
    
    # Add the offense players traces
    for i in pd.unique(example_play[example_play['side'] == 'offense'].nflId):
        fig.add_trace(go.Scatter(x = example_play[example_play['nflId'] == i].x
                                , y = example_play[example_play['nflId'] == i].y
                                , name = example_play[example_play['nflId'] == i].displayName.iloc[0]
                                , mode = 'lines+markers'
                                , marker_color = 'rgba(83, 51, 237, 1)'
                                )
                     )
        
    # Add the defensive players traces
    for i in pd.unique(example_play[example_play['side'] == 'defense'].nflId):
        fig.add_trace(go.Scatter(x = example_play[example_play['nflId'] == i].x
                                , y = example_play[example_play['nflId'] == i].y
                                , name = example_play[example_play['nflId'] == i].displayName.iloc[0]
                                , mode = 'lines+markers'
                                , marker_color = 'rgba(242, 38, 19, 1)'
                                )
                     )
        
    # Add the football trace
    fig.add_trace(go.Scatter(x = example_play[example_play['nflId'].isnull()].x
                            , y = example_play[example_play['nflId'].isnull()].y
                            , name = 'football'
                            , mode = 'lines+markers'
                            , marker_color = 'rgba(240, 255, 0, 1)'
                            )
                 )
    
    # Add the line of scrimmage
    fig.add_trace(go.Scatter(x = [LOS, LOS]
                            , y = [0, 53.3]
                            , name = 'line of scrimmage'
                            , mode = 'lines'
                            , line = dict(
                                color = 'rgba(255, 0, 0, 1)'
                                , width = 4
                                )
                            )
                 )
    
    # Set the axes of the plot to include the full football field
    fig.update_xaxes(range = [0, 120])
    fig.update_yaxes(range = [0, 53.3])
    
    # Add the background image of a football field
    ### NOTE: THE ASSIGNMENT REQUIRES THAT NO EXTERNAL DATA IS ADDED, SO THIS WILL NEED
    ### TO BE REPLACED BY A WAY OF SHOWING THE FOOTBALL FIELD WITHOUT AN IMPORTED IMAGE
    fig.add_layout_image(
        dict(
            source="../Resources/football_field.png"
            , xref="x"
            , yref="y"
            , x=0
            , y=53.3
            , sizex=120
            , sizey=53.3
            , sizing="stretch"
            , opacity=0.8
            , layer="below"
        )
    )
    
    # Show the plot
    fig.show()