# NFL BIG DATA BOWL 2020 - 2021

# 0. Introduction
In this notebook we continue on from the 'BDB20-21_EDA_and_cleanup' notebook, which performs the Exploratory Data Analysis, and does some basic data wrangling on this data.  
This notebook focuses on data visualizations to gain a better understanding of the data, and on more elaborate feature engineering to create new data points in preparation for a machine learning algorithm.

# 1. Notebook preparation
In this section of the notebook we load in all the necessary libraries, we read in the data and perform the data wrangling actions as found necessary in the BDB20-21_EDA_and_cleanup notebook.

In [1]:
# Load libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(palette = 'deep')
import datetime as dt
import time
from datetime import date
from tqdm.notebook import tqdm
import math
import cv2
import plotly as py
import plotly.express as px
import plotly.graph_objs as go
import ipywidgets as widgets
py.offline.init_notebook_mode(connected=True)

In [2]:
# Load the data
# Import the data contained in individual files
games = pd.read_csv("../Data/games.csv")
players = pd.read_csv("../Data/players.csv")
plays = pd.read_csv("../Data/plays.csv")

# Import the weekly tracking data.
# Each week is read into its own frame and everything is concatenated once at
# the end: growing a DataFrame inside the loop copies all previously loaded rows
# on every iteration, and DataFrame.append no longer exists in pandas >= 2.0.
weeks = range(1,18)
weekly_frames = []
for i in tqdm(weeks):
    week_data = pd.read_csv('../Data/week' + str(i) + '.csv')
    week_data['week'] = i
    weekly_frames.append(week_data)
# ignore_index gives every tracking row a unique label instead of repeating
# 0..n-1 for each weekly file.
tracking = pd.concat(weekly_frames, ignore_index = True)

HBox(children=(FloatProgress(value=0.0, max=17.0), HTML(value='')))




In [3]:
# Clean up the games dataset
def clean_games_data(games):
    """Derive date/time features for the games table and return the cleaned frame.

    Adds ``gameDateTime``, ``gameMonth``, ``gameWeekDay`` and ``gameTimeSlot``
    (Morning < 12h <= Noon < 15h <= Afternoon < 18h <= Evening), drops the raw
    ``gameDate`` / ``gameTimeEastern`` columns and casts ``week`` to string.

    Parameters
    ----------
    games : pandas.DataFrame
        Must contain ``gameDate`` (``%m/%d/%Y``), ``gameTimeEastern``
        (``%H:%M:%S``) and ``week``.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in; it is modified in place and
        returned so that ``games = clean_games_data(games)`` works.
    """
    games['gameDateTime'] = pd.to_datetime(games.gameDate + " - " + games.gameTimeEastern,
                                           format = '%m/%d/%Y - %H:%M:%S')
    games['gameMonth'] = games.gameDateTime.dt.month_name()
    games['gameWeekDay'] = games.gameDateTime.dt.day_name()

    # Bucket the kick-off hour into left-closed intervals; rows without a
    # timestamp stay missing.
    game_hour = games.gameDateTime.dt.hour
    games['gameTimeSlot'] = pd.cut(game_hour
                                   , bins = [0, 12, 15, 18, 24]
                                   , right = False
                                   , labels = ['Morning', 'Noon', 'Afternoon', 'Evening']
                                  ).astype(object)

    games.drop(['gameDate', 'gameTimeEastern'], axis = 1, inplace = True)
    games['week'] = games.week.astype(str)
    # Without this return the caller's `games = clean_games_data(games)` ends up with None.
    return games
    
# Run the games cleanup and rebind `games` to the function's return value
games = clean_games_data(games)

In [4]:
# Clean up the players dataset
def clean_players_data(players):
    """Normalise the players table and return the cleaned frame.

    * ``nflId`` is cast to string.
    * ``height`` is converted to inches; values are either ``"feet-inches"``
      (e.g. ``"6-2"``) or already a plain number of inches (e.g. ``"74"``).
    * ``birthDate`` is parsed and ``age`` (whole years on 2018-09-01) is added.
    * ``consPosition`` is added: ``position`` with related positions merged.

    Parameters
    ----------
    players : pandas.DataFrame
        Must contain ``nflId``, ``height``, ``birthDate`` and ``position``.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in; it is modified in place and
        returned so that ``players = clean_players_data(players)`` works.
    """
    players['nflId'] = players.nflId.astype(str)

    # Split "feet-inches"; a value without a dash is inches only, so it
    # contributes zero feet.
    height_text = players.height.astype(str)
    height_parts = height_text.str.split("-")
    feet = height_parts.str[0].astype(int).where(height_text.str.contains("-", regex = False), 0)
    inches = height_parts.str[-1].astype(int)
    players['height'] = feet * 12 + inches

    players['birthDate'] = pd.to_datetime(players.birthDate)
    season_start = pd.to_datetime("09/01/2018", format = "%m/%d/%Y")
    players['age'] = ((season_start - players.birthDate).dt.days/365.25).apply(np.floor).astype(int)

    # No target of this mapping is also a key, so a single replace is
    # equivalent to applying the substitutions one after another.
    consolidated_positions = {
        'OLB': 'LB', 'ILB': 'LB', 'MLB': 'LB',
        'FS': 'S', 'SS': 'S',
        # NOTE(review): 'LS' is grouped with the safeties here exactly as in the
        # original code; if 'LS' stands for long snapper this grouping is
        # questionable - confirm against the EDA notebook.
        'LS': 'S',
        'HB': 'RB',
        'NT': 'DT',
        'DB': 'CB',
    }
    players['consPosition'] = players.position.replace(consolidated_positions)
    # Without this return the caller's `players = clean_players_data(players)` ends up with None.
    return players
    
# Run the players cleanup and rebind `players` to the function's return value
players = clean_players_data(players)

In [5]:
# Clean up the plays dataset
def _count_personnel(personnel, positions, prefix):
    """Parse a personnel column ("1 RB, 1 TE, 3 WR") into one count column per position.

    Returns a float DataFrame indexed like `personnel` with columns
    `prefix + position`. A position that is not mentioned gets 0; rows where the
    personnel string itself is missing get NaN in every column.
    """
    personnel = personnel.astype(object)
    counts = pd.DataFrame(index = personnel.index)
    for position in positions:
        # "<digits> <POSITION>"; the trailing word boundary prevents a short
        # code from matching the start of a longer one.
        found = personnel.str.extract(r'(\d+)\s*' + position + r'\b', expand = False)
        counts[prefix + position] = pd.to_numeric(found, errors = 'coerce').fillna(0).astype(float)
    counts.loc[personnel.isna().to_numpy(), :] = np.nan
    return counts


def clean_plays_data(plays):
    """Clean the plays table, engineer play-level features and return the result.

    Steps, in order:

    1. Missing ``yardlineSide`` becomes ``'Midfield'``.
    2. ``personnelO`` is expanded into ``offense<POS>`` counts. When fewer than
       11 players are listed, one QB is assumed (unless a QB, K or P is listed)
       and the remainder is assigned to the offensive line (unless OL is listed).
    3. ``personnelD`` is expanded into ``defense<POS>`` counts, and their row sum
       is stored in ``positions_summed``.
    4. ``gameClock`` is converted into ``secondsInQuarter`` and ``secondsInGame``
       (seconds left in regulation; NaN when the value would be negative).
    5. 10 is subtracted from ``absoluteYardlineNumber``.
    6. ``penaltyCodes`` is expanded into ``DPI``, ``OPI``, ``ICT`` and ``DH``
       counts, with ``ICT`` and ``DH`` capped at 1.
    7. ``penaltyCodes``, ``penaltyJerseyNumbers`` and ``isDefensivePI`` are
       dropped, rows with ``passResult == 'R'`` are removed and the index is
       reset, then play 3078 of game 2018102101 is removed.

    Parameters
    ----------
    plays : pandas.DataFrame
        Raw plays table. It is not modified; the function works on a copy.

    Returns
    -------
    pandas.DataFrame
        The cleaned plays table.
    """
    plays = plays.copy()

    ###### Replace NaN for 'Midfield' on the yardlineSide variable ######
    plays['yardlineSide'] = plays['yardlineSide'].fillna('Midfield')

    ###### Count the number of players per position on offense for each play ######
    offense_positions = ['QB', 'RB', 'WR', 'TE', 'OL', 'DL', 'DB', 'P', 'LS', 'K', 'LB']
    personnel_o = plays['personnelO'].astype(object)
    offense = _count_personnel(personnel_o, offense_positions, 'offense')
    known_offense = personnel_o.notna()

    def mentions(code):
        # Plain substring test on the whole personnel string, as in the original rules.
        return personnel_o.str.contains(code, regex = False, na = False)

    # First assumption: fewer than 11 players listed and no QB mentioned means
    # there is 1 QB, except when a kicker or punter is on the field.
    listed = offense.sum(axis = 1)
    assume_qb = known_offense & (listed < 11) & ~mentions('QB') & ~mentions('K') & ~mentions('P')
    offense.loc[assume_qb, 'offenseQB'] = 1

    # Second assumption: whatever is still missing are offensive linemen,
    # unless the number of OL was given explicitly.
    listed = offense.sum(axis = 1)
    assume_ol = known_offense & (listed < 11) & ~mentions('OL')
    offense.loc[assume_ol, 'offenseOL'] = 11 - listed[assume_ol]

    for column in offense.columns:
        plays[column] = offense[column]

    ###### Count the defensive players per position on the play ######
    defense_positions = ['DL', 'LB', 'DB', 'WR', 'OL', 'TE', 'RB', 'QB']
    defense = _count_personnel(plays['personnelD'], defense_positions, 'defense')
    for column in defense.columns:
        plays[column] = defense[column]
    # NOTE(review): the original code leaves this defensive head count in the
    # output (the offensive one is dropped); it is kept in case later cells use it.
    plays['positions_summed'] = defense.sum(axis = 1)

    ###### Clean up the gameClock variable ######
    clock_parts = plays['gameClock'].str.split(":")
    plays['secondsInQuarter'] = clock_parts.str[0].astype(float) * 60 + clock_parts.str[1].astype(float)

    ###### Calculate the total number of seconds left in the game ######
    plays['secondsInGame'] = (4 - plays['quarter']) * 900 + plays['secondsInQuarter']
    # Only the derived value is blanked when it is negative (quarter > 4).
    # Assigning NaN to the whole row selection would also wipe gameId, playId
    # and every other column of those plays.
    plays.loc[plays['secondsInGame'] < 0, 'secondsInGame'] = np.nan

    ###### Calculate the absolute yardline number, excluding end zones ######
    plays['absoluteYardlineNumber'] = plays['absoluteYardlineNumber'] - 10

    ###### Split out the relevant penalty codes into separate columns ######
    # We're only interested in defensive and offensive pass interference,
    # illegal contact and defensive holding. Counting per code works even when
    # a code never occurs in the data; plays without penalties get 0.
    code_lists = plays['penaltyCodes'].fillna('').astype(str).str.split(";")
    for code in ['DPI', 'OPI', 'ICT', 'DH']:
        plays[code] = code_lists.map(lambda codes: codes.count(code)).astype(float)
    # Multiple illegal-contact / holding flags on one play count as one.
    plays['ICT'] = plays['ICT'].clip(upper = 1)
    plays['DH'] = plays['DH'].clip(upper = 1)

    ###### Drop the columns that are no longer needed ######
    plays = plays.drop(['penaltyCodes', 'penaltyJerseyNumbers', 'isDefensivePI'], axis = 1)

    ###### Drop rows where passresult is 'R' and reset the index ######
    plays = plays[plays['passResult'] != 'R'].reset_index(drop=True)

    ###### Drop the play where we see excessive player speeds ######
    excessive_speed_play = (plays['gameId'] == 2018102101) & (plays['playId'] == 3078)
    plays = plays[~excessive_speed_play]

    # Without this return the caller's `plays = clean_plays_data(plays)` ends up with None.
    return plays
    
# Run the plays cleanup and rebind `plays` to the function's return value
plays = clean_plays_data(plays)

HBox(children=(FloatProgress(value=0.0, max=19239.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=19239.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=19239.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=19239.0), HTML(value='')))




In [6]:
# Clean tracking data
########## TO DO #############

# 2. Data Visualization
We've learnt that in our tracking data, we have the following information:
* For each play we have a number of data points on most players on offense and defense (except for the offensive and defensive line)
* These data points represent the player's or ball's position on the field, direction and speed.
* A number of timestamps in the play are marked as specific events, such as the moment of the snap, the moment of the pass, the arrival of the pass, etc.

The above information allows us to visualize each play, which helps us gain more insight into the data we have access to.
To do this, we're going to visualize the following:
* The starting position of each tracked player and the ball on the field at the start of a play.
* The routes that each tracked player runs and the track of the ball on the field throughout the play.

Using a few custom functions, we can visualize the data.

In [None]:
# Function to plot data labels to scatterplots
def set_labels(x, y, val, ax, offset = .3):
    """Write a text label next to each point of a scatterplot.

    Parameters
    ----------
    x, y : pandas.Series
        Coordinates of the points.
    val : pandas.Series
        Label for each point; converted with ``str``.
    ax : matplotlib.axes.Axes
        Axes to draw the labels on (only ``ax.text`` is used).
    offset : float, default 0.3
        Distance, in data units, by which the label is shifted right and up
        from its point.
    """
    # Align the three series on their index, as one frame.
    points = pd.concat({'x': x, 'y': y, 'val': val}, axis=1)
    # Iterate column-wise rather than with iterrows: iterrows builds one Series
    # per row, which upcasts integer labels to float in an all-numeric frame
    # (12 would be printed as "12.0").
    for x_pos, y_pos, label in zip(points['x'], points['y'], points['val']):
        ax.text(x_pos + offset, y_pos + offset, str(label))

In [None]:
# Function to plot positions at a particular event on the field
# Valid event choices are: snap; pass; pass_arrived; pass_caught; tackle; contact; pass_incomplete; play_action;
# out_of_bounds; line_set; sack; first; last

def plot_event_positions(event, gameId = 2018090600, playId = 75):
    """Plot where every tracked player and the ball are at one moment of a play.

    Parameters
    ----------
    event : str
        ``'first'`` (frame 1), ``'last'`` (highest frame id) or one of the keys
        of ``event_labels`` below, which are translated to the values used in
        the ``event`` column of the tracking data.
    gameId, playId : int
        Identify the play in the module-level ``tracking`` frame.

    Notes
    -----
    Reads the global ``tracking`` frame and expects it to provide the columns
    ``gameId``, ``playId``, ``frameId``, ``event``, ``x``, ``y``, ``side``,
    ``position`` and ``absoluteYardlineNumber``.
    NOTE(review): ``side`` and ``absoluteYardlineNumber`` are not created by the
    loading cell visible in this notebook - presumably the pending tracking
    cleanup adds them; confirm.

    Prints a message and returns without plotting when the play, the event or
    the requested frame cannot be found.
    """
    # Set the event labels
    event_labels = {'snap':'ball_snap', 'pass':'pass_forward', 'pass_arrived':'pass_arrived',
                'pass_caught':'pass_outcome_caught', 'tackle':'tackle', 'contact':'first_contact',
                'pass_incomplete':'pass_outcome_incomplete', 'play_action':'play_action',
                'out_of_bounds':'out_of_bounds', 'line_set':'line_set', 'sack':'qb_sack'
                }

    # First we filter out the relevant game and play data
    example_play = tracking[(tracking['gameId'] == gameId) &
                            (tracking['playId'] == playId)]
    if example_play.empty:
        # max() of an empty column and the [0] lookup below would both raise.
        print('No tracking data found for the selected game and play')
        return

    # Then we filter out the correct event
    if event == 'first':
        example_play = example_play[example_play['frameId'] == 1]
    elif event == 'last':
        example_play = example_play[example_play['frameId'] == example_play['frameId'].max()]
    elif event in event_labels:
        at_event = example_play['event'] == event_labels[event]
        if not at_event.any():
            print('The selected event does not occur in the selected play')
            return
        example_play = example_play[at_event]
    else:
        print('The selected event is not valid')
        return

    # Re-assign instead of resetting in place: the filtered frame is a slice of
    # `tracking`, and in-place changes on a slice trigger SettingWithCopyWarning.
    example_play = example_play.reset_index(drop=True)
    if example_play.empty:
        print('No tracking data found for the selected event')
        return

    # Set the line of scrimmage
    LOS = example_play.absoluteYardlineNumber.iloc[0] + 10

    # Next we use seaborn to create a scatterplot of the data, drawing
    # everything explicitly on the axes created here.
    fig, ax = plt.subplots(figsize = (14.4, 6.4))
    sns.scatterplot(data = example_play
                   , x = 'x', y = 'y'
                   , hue = 'side'
                   , s = 200
                   , style = 'side'
                   , markers = {'offense': 'X', 'defense':'o', 'football':'D'}
                   , ax = ax
                   )
    ax.axvline(x = LOS, lw = 4, c = 'red')
    # Set the size of the plot to include the full football field
    ax.set(xlim=(0,120), ylim=(0,53.3))
    ax.set_title('Game ' + str(gameId) + ', play ' + str(playId) + ': positions at "' + str(event) + '"')

    # Use our custom set_labels function to assign labels to the markers
    set_labels(example_play.x, example_play.y, example_play.position, ax)

    # Add the picture of the football field to the background
    ### NOTE: THE ASSIGNMENT REQUIRES THAT NO EXTERNAL DATA IS ADDED, SO THIS WILL NEED
    ### TO BE REPLACED BY A WAY OF SHOWING THE FOOTBALL FIELD WITHOUT AN IMPORTED IMAGE
    with open('../Resources/football_field.png', 'rb') as file:
        img = plt.imread(file, 'PNG')
    ax.imshow(img, zorder = 0, extent = [0.0, 120.0, 0.00, 53.3])

In [None]:
# Function to plot routes on the field
def plot_routes(gameId = 2018090600, playId = 75):
    """Plot the path of every tracked player and of the ball during one play.

    Draws one plotly trace per offensive player, one per defensive player, one
    for the football (rows without an ``nflId``) and a vertical line at the
    line of scrimmage, then shows the figure.

    Parameters
    ----------
    gameId, playId : int
        Identify the play in the module-level ``tracking`` frame.

    Notes
    -----
    Reads the global ``tracking`` frame and expects it to provide the columns
    ``gameId``, ``playId``, ``nflId``, ``displayName``, ``x``, ``y``, ``side``
    and ``absoluteYardlineNumber``.
    NOTE(review): ``side`` and ``absoluteYardlineNumber`` are not created by the
    loading cell visible in this notebook - presumably the pending tracking
    cleanup adds them; confirm.

    Prints a message and returns without plotting when the play has no
    tracking rows.
    """
    # First we filter out the data for the game and play. The result is
    # re-assigned rather than reset in place, because the selection is a slice
    # of `tracking` (in-place changes trigger SettingWithCopyWarning).
    example_play = tracking[(tracking['gameId'] == gameId) &
                            (tracking['playId'] == playId)
                           ].reset_index(drop = True)
    if example_play.empty:
        # The line-of-scrimmage lookup below would raise on an empty frame.
        print('No tracking data found for the selected game and play')
        return

    # Set the line of scrimmage
    LOS = example_play.absoluteYardlineNumber.iloc[0] + 10

    # Next we use plotly to create a scatterplot for the data where each player and ball is traced.
    fig = go.Figure()

    # Add the player traces: offense first, then defense, one colour per side.
    side_colors = {'offense': 'rgba(83, 51, 237, 1)'
                  , 'defense': 'rgba(242, 38, 19, 1)'
                  }
    for side, color in side_colors.items():
        side_tracks = example_play[example_play['side'] == side]
        # sort = False keeps the players in order of first appearance.
        for _, player_track in side_tracks.groupby('nflId', sort = False):
            fig.add_trace(go.Scatter(x = player_track.x
                                    , y = player_track.y
                                    , name = player_track.displayName.iloc[0]
                                    , mode = 'lines+markers'
                                    , marker_color = color
                                    )
                         )

    # Add the football trace
    ball_track = example_play[example_play['nflId'].isnull()]
    fig.add_trace(go.Scatter(x = ball_track.x
                            , y = ball_track.y
                            , name = 'football'
                            , mode = 'lines+markers'
                            , marker_color = 'rgba(240, 255, 0, 1)'
                            )
                 )

    # Add the line of scrimmage
    fig.add_trace(go.Scatter(x = [LOS, LOS]
                            , y = [0, 53.3]
                            , name = 'line of scrimmage'
                            , mode = 'lines'
                            , line = dict(
                                color = 'rgba(255, 0, 0, 1)'
                                , width = 4
                                )
                            )
                 )

    # Set the axes of the plot to include the full football field
    fig.update_xaxes(range = [0, 120])
    fig.update_yaxes(range = [0, 53.3])

    # Add the background image of a football field
    ### NOTE: THE ASSIGNMENT REQUIRES THAT NO EXTERNAL DATA IS ADDED, SO THIS WILL NEED
    ### TO BE REPLACED BY A WAY OF SHOWING THE FOOTBALL FIELD WITHOUT AN IMPORTED IMAGE
    # NOTE(review): `source` is a relative file path; whether it renders depends
    # on how the notebook front end resolves it - confirm the image shows up.
    fig.add_layout_image(
        dict(
            source="../Resources/football_field.png"
            , xref="x"
            , yref="y"
            , x=0
            , y=53.3
            , sizex=120
            , sizey=53.3
            , sizing="stretch"
            , opacity=0.8
            , layer="below"
        )
    )

    # Show the plot
    fig.show()

In [None]:
# Draw the routes for the default example play (gameId 2018090600, playId 75)
plot_routes()