In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Data Preprocessing

In [None]:
# Load the competition training file into the working frame `df`.
df = pd.read_csv(
    filepath_or_buffer="../input/nfl-big-data-bowl-2020/train.csv",
    low_memory=False,
)

**Checking for misspellings in variables**

Correcting string values in the column "WindSpeed"

In [None]:
# Show the distinct raw WindSpeed entries so the non-numeric ones can be mapped by hand.
pd.unique(df["WindSpeed"])

In [None]:
# Hand-written mapping from non-numeric WindSpeed strings to a numeric value (or NaN).
# It is applied with Series.replace, so any raw value not listed here is left untouched.
WindSpeed_dict = {
    # Entries that only name a direction carry no speed information -> missing.
    'SSW': np.nan, 'E': np.nan, 'SE': np.nan, 'SSE': np.nan,
    # "Calm" means no wind. It used to map to NaN, which the later mean imputation
    # turned into an average wind speed; record it as 0 instead.
    'Calm': 0,
    # Ranges are replaced by their midpoint.
    '11-17': 14, '14-23': 18.5, '12-22': 17, '10-20': 15,
    # Values with a unit suffix: keep the number.
    '13 MPH': 13, '4 MPh': 4, '10MPH': 10, '10mph': 10, '7 MPH': 7, '6 mph': 6,
    '12mph': 12, '6mph': 6, '9mph': 9, '12 mph': 12,
    # "Gusts" entries: keep the first (base) speed and ignore the gust figure.
    '15 gusts up to 25': 15, '14 Gusting to 24': 14, '6 mph, Gusts to 10': 6,
    '2 mph, gusts to 5': 2, '9 mph, gusts to 13': 9, '10 mph, gusts to 15': 10,
}

In [None]:
# Swap the hand-mapped strings for numbers, then cast the whole column to float64.
df["WindSpeed"] = (
    df["WindSpeed"]
    .replace(WindSpeed_dict)
    .astype(np.float64)
)

Correcting the names of the teams in 'PossessionTeam' and 'FieldPosition'

In [None]:
# PossessionTeam codes that equal neither the home nor the visitor abbreviation of their row.
possession_unmatched = (
    df['PossessionTeam'].ne(df['HomeTeamAbbr'])
    & df['PossessionTeam'].ne(df['VisitorTeamAbbr'])
)
df.loc[possession_unmatched, 'PossessionTeam'].unique()

In [None]:
# FieldPosition codes that equal neither the home nor the visitor abbreviation of their row.
field_unmatched = (
    df['FieldPosition'].ne(df['HomeTeamAbbr'])
    & df['FieldPosition'].ne(df['VisitorTeamAbbr'])
)
df.loc[field_unmatched, 'FieldPosition'].unique()

In [None]:
# Distinct home-team abbreviations, used as the reference spelling for team codes.
pd.unique(df['HomeTeamAbbr'])

In [None]:
# Alternative team abbreviations -> the spelling used in the home/visitor abbreviation columns.
TeamAbbr_dict = dict(
    BLT='BAL',
    CLV='CLE',
    ARZ='ARI',
    HST='HOU',
)

In [None]:
# Assign the results back instead of calling inplace methods on a column selection:
# `df[col].method(..., inplace=True)` is chained assignment, which warns on recent
# pandas and leaves `df` unchanged when Copy-on-Write is enabled.
# Missing FieldPosition is stored as the string "None" so later equality checks work.
df['FieldPosition'] = df['FieldPosition'].fillna("None")
# Harmonise the alternative abbreviations in both team-code columns.
df['PossessionTeam'] = df['PossessionTeam'].replace(TeamAbbr_dict)
df['FieldPosition'] = df['FieldPosition'].replace(TeamAbbr_dict)

**Setting PlayerBirthDate to date format**

In [None]:
# Parse birth dates into datetime64. `infer_datetime_format` is deprecated (format
# inference is the default behaviour in pandas 2.x), so it is no longer passed.
df['PlayerBirthDate'] = pd.to_datetime(df['PlayerBirthDate'])

**Correcting Position**

In [None]:
# Merge the 'SAF' label into 'S'. Assigned back rather than using inplace=True on a
# column selection, which is chained assignment and is a no-op under Copy-on-Write.
df['Position'] = df['Position'].replace({'SAF': 'S'})

Correcting Stadium

In [None]:
# Misspelled / shortened stadium names -> one canonical spelling per stadium.
# Entries are grouped by canonical name; names not listed here are left as they are.
Stadium_dict = {
    'Broncos Stadium at Mile High': 'Broncos Stadium At Mile High',
    # CenturyLink Field
    'CenturyField': 'CenturyLink Field',
    'CenturyLink': 'CenturyLink Field',
    # FirstEnergy Stadium
    'First Energy Stadium': 'FirstEnergy Stadium',
    'FirstEnergyStadium': 'FirstEnergy Stadium',
    'FirstEnergy': 'FirstEnergy Stadium',
    # M&T Bank Stadium
    'M&T Stadium': 'M&T Bank Stadium',
    'M & T Bank Stadium': 'M&T Bank Stadium',
    # Stadiums with a single variant spelling
    'Tottenham Hotspur': 'Tottenham Hotspur Stadium',
    'Azteca Stadium': 'Estadio Azteca',
    'Twickenham': 'Twickenham Stadium',
    'MetLife': 'MetLife Stadium',
    'Los Angeles Memorial Coliesum': 'Los Angeles Memorial Coliseum',
    'Paul Brown Stdium': 'Paul Brown Stadium',
    'FedexField': 'FedExField',
    'Everbank Field': 'EverBank Field',
    'Mercedes-Benz Dome': 'Mercedes-Benz Superdome',
    'Lambeau field': 'Lambeau Field',
    'NRG': 'NRG Stadium',
}

In [None]:
# Apply the stadium-name corrections. Assigned back rather than using inplace=True on a
# column selection, which is chained assignment and is a no-op under Copy-on-Write.
df['Stadium'] = df['Stadium'].replace(Stadium_dict)

**Because no city has two or more stadiums, I'm using just the state from the Location variable. Some states, like Florida, have more than one stadium.**

In [None]:
# Distinct raw Location strings, to see every spelling that has to be mapped.
pd.unique(df['Location'])

In [None]:
# Raw Location spellings -> two-letter US state code ('ENG' / 'MEX' for the games played
# in London and Mexico City). Applied with Series.replace, so unlisted values are kept.
# Fixes relative to the previous version:
#   * 'Indianapolis, Ind.' mapped to 'ID' (Idaho); Indiana is 'IN'.
#   * 'Baltimore, Maryland' mapped to 'MA' (Massachusetts); Maryland is 'MD', matching
#     the other Baltimore entries.
Location_dict = {
    'Foxborough, MA' : 'MA',
    'Orchard Park NY': 'NY',
    'Chicago. IL' : 'IL',
    'Cincinnati, Ohio': 'OH',
    'Cleveland, Ohio' : 'OH',
    'Detroit, MI': 'MI',
    'Houston, Texas': 'TX',
    'Nashville, TN' : 'TN',
    'Landover, MD' : 'MD',
    'Los Angeles, Calif.' : 'CA',
    'Green Bay, WI' : 'WI',
    'Santa Clara, CA' : 'CA',
    'Arlington, Texas': 'TX',
    'Minneapolis, MN' : 'MN',
    'Denver, CO' : 'CO',
    'Baltimore, Md.' : 'MD',
    'Charlotte, North Carolina' : 'NC',
    'Indianapolis, Ind.' : 'IN',
    'Jacksonville, FL' : 'FL',
    'Kansas City, MO' : 'MO',
    'New Orleans, LA' : 'LA',
    'Pittsburgh' : 'PA',
    'Tampa, FL' : 'FL',
    'Carson, CA' : 'CA',
    'Oakland, CA' : 'CA',
    'Seattle, WA' : 'WA',
    'Atlanta, GA' : 'GA',
    'East Rutherford, NJ': 'NJ',
    'London, England' : 'ENG',
    'Chicago, IL' : 'IL',
    'Detroit' : 'MI',
    'Philadelphia, Pa.' : 'PA',
    'Glendale, AZ' : 'AZ',
    'Cleveland, OH' : 'OH',
    'Foxborough, Ma' : 'MA',
    'E. Rutherford, NJ' : 'NJ',
    'Miami Gardens, Fla.' : 'FL',
    'Houston, TX' : 'TX',
    'London':'ENG',
    'New Orleans, La.' : 'LA',
    'Mexico City' : 'MEX',
    'Baltimore, Maryland':'MD',
    'Arlington, TX' : 'TX',
    'Jacksonville, Fl' : 'FL',
    'Jacksonville, Florida' : 'FL',
    'Pittsburgh, PA': 'PA',
    'Charlotte, NC' : 'NC',
    'Cleveland,Ohio' : 'OH',
    'East Rutherford, N.J.' : 'NJ',
    'Philadelphia, PA' : 'PA',
    'Seattle' : 'WA',
    'Cleveland Ohio' : 'OH',
    'Miami Gardens, FLA' : 'FL',
    'Orchard Park, NY' : 'NY',
    'Cleveland' : 'OH',
    'Cincinnati, OH' : 'OH',
    'Kansas City,  MO' : 'MO',
    'Jacksonville Florida' : 'FL',
    'Los Angeles, CA' : 'CA',
    'New Orleans' : 'LA',
    'Chicago' : 'IL',
    'Charlotte North Carolina' : 'NC',
    'Miami Gardens, FL' : 'FL',
    'Denver CO' : 'CO',
    'Santa Clara, CSA' : 'CA',
    'Baltimore, MD' : 'MD',
    'Mexico City, Mexico' : 'MEX'
}

In [None]:
# Collapse every Location spelling to its state code. Assigned back rather than using
# inplace=True on a column selection, which is a no-op under Copy-on-Write.
df['Location'] = df['Location'].replace(Location_dict)

**Correcting StadiumType**

In [None]:
# Sorted array of the distinct raw StadiumType values.
df['StadiumType'].drop_duplicates().sort_values().to_numpy()

In [None]:
# Store missing stadium types as the string "None". Assigned back rather than using
# inplace=True on a column selection, which is a no-op under Copy-on-Write.
df['StadiumType'] = df['StadiumType'].fillna("None")

In [None]:
# Raw StadiumType spellings -> one of 'Outdoor', 'Closed', 'Open' or 'None'.
# Built from one list of raw spellings per target category.
StadiumType_dict = {
    **dict.fromkeys(
        [
            'Bowl', 'Heinz Field', 'OUTDOOR', 'Open', 'Oudoor', 'Ourdoor',
            'Outddors', 'Outdoors', 'Outdor', 'Outside',
        ],
        'Outdoor',
    ),
    **dict.fromkeys(
        [
            'Closed Dome', 'Dome', 'Dome, closed', 'Domed', 'Domed, closed',
            'Indoor', 'Indoor, Roof Closed', 'Indoors', 'indoor',
            'Retr. Roof - Closed', 'Retr. Roof Closed', 'Retr. Roof-Closed',
            'Retractable Roof', 'Retractable Roof - Closed',
        ],
        'Closed',
    ),
    **dict.fromkeys(
        [
            'Domed, Open', 'Domed, open', 'Indoor, Open Roof', 'Indoor, roof open',
            'Outdoor Retr Roof-Open', 'Retr. Roof - Open', 'Retr. Roof-Open',
        ],
        'Open',
    ),
    # A weather word recorded in the stadium-type field: treated as unknown.
    'Cloudy': 'None',
}

In [None]:
# Apply the stadium-type mapping. Assigned back rather than using inplace=True on a
# column selection, which is a no-op under Copy-on-Write.
df['StadiumType'] = df['StadiumType'].replace(StadiumType_dict)

**Correcting Turf**

In [None]:
# Sorted array of the distinct raw Turf values.
df['Turf'].drop_duplicates().sort_values().to_numpy()

In [None]:
# Raw Turf descriptions -> 'Artificial' or 'Natural'.
# Built from one list of raw spellings per target category.
Turf_dict = {
    **dict.fromkeys(
        [
            'A-Turf Titan', 'Artifical', 'DD GrassMaster',
            'Field Turf', 'Field turf', 'FieldTurf', 'FieldTurf 360', 'FieldTurf360',
            'SISGrass', 'Turf',
            'Twenty Four/Seven Turf', 'Twenty-Four/Seven Turf',
            'UBU Speed Series-S5-M', 'UBU Sports Speed S5-M', 'UBU-Speed Series-S5-M',
        ],
        'Artificial',
    ),
    **dict.fromkeys(
        [
            'Grass', 'Natural Grass', 'Natural grass', 'Naturall Grass',
            'grass', 'natural grass',
        ],
        'Natural',
    ),
}

In [None]:
# Apply the turf mapping. Assigned back rather than using inplace=True on a column
# selection, which is a no-op under Copy-on-Write.
df['Turf'] = df['Turf'].replace(Turf_dict)

In [None]:
# Sorted array of the distinct raw GameWeather values.
df['GameWeather'].drop_duplicates().sort_values().to_numpy()

In [None]:
# Store missing weather as the string 'None'. Assigned back rather than using
# inplace=True on a column selection, which is a no-op under Copy-on-Write.
df['GameWeather'] = df['GameWeather'].fillna('None')

In [None]:
# Raw GameWeather descriptions -> one of 'Clear', 'Cloudy', 'Rain', 'Snow' or 'None'.
# Forecast-style, indoor and temperature-only entries are mapped to the string 'None',
# the same placeholder used for missing values in this column.
# The mapping is applied with Series.replace, so values not listed here stay unchanged.
GameWeather_dict = {
    '30% Chance of Rain' : 'None',
    'Breezy': 'Clear',
    'Clear Skies' : 'Clear',
    'Clear and Cool' : 'Clear',
    'Clear and Sunny' : 'Clear',
    'Clear and cold' : 'Clear',
    'Clear and sunny' : 'Clear',
    'Clear and warm' : 'Clear',
    'Clear skies' : 'Clear',
    'Cloudy and Cool' : 'Cloudy',
    'Cloudy and cold' : 'Cloudy',
    'Cloudy with periods of rain, thunder possible. Winds shifting to WNW, 10-20 mph.' : 'Cloudy',
    'Cloudy with showers and wind' : 'Cloudy',
    'Cloudy, 50% change of rain' : 'Cloudy',
    'Cloudy, Rain' : 'Cloudy',
    'Cloudy, chance of rain' : 'Cloudy',
    'Cloudy, fog started developing in 2nd quarter' : 'Cloudy',
    'Cloudy, light snow accumulating 1-3"' : 'Snow',
    'Cold' : 'None',
    'Controlled Climate' : 'None',
    'Coudy' : 'Cloudy',
    'Fair' : 'Clear',
    'Hazy' : 'Cloudy',
    'Heavy lake effect snow' : 'Snow',
    'Indoor' : 'None',
    'Indoors' : 'None',
    'Light Rain' : 'Rain',
    'Light rain' : 'Rain',
    'Mostly Clear' : 'Clear',
    'Mostly Cloudy' :'Cloudy',
    'Mostly Coudy' : 'Cloudy',
    'Mostly Sunny' : 'Clear',
    'Mostly Sunny Skies' : 'Clear',
    'Mostly clear' : 'Clear',
    'Mostly cloudy' : 'Cloudy',
    'Mostly sunny' : 'Clear',
    'N/A (Indoors)' : 'None',
    'N/A Indoor' : 'None',
    'N/A Indoors' : 'None',
    'Overcast' : 'Cloudy',
    'Partly Cloudy' : 'Cloudy',
    'Partly Clouidy' : 'Cloudy',
    'Partly Sunny' : 'Clear',
    'Partly clear' : 'Clear',
    'Partly cloudy' : 'Cloudy',
    'Partly cloudy and mild' : 'Cloudy',
    'Partly sunny' : 'Clear',
    'Party Cloudy' : 'Cloudy',
    'Rain' : 'Rain',
    'Rain Chance 40%' : 'None',
    'Rain and Wind': 'Rain',
    'Rain likely, temps in low 40s.' : 'None',
    'Rain shower' : 'Rain',
    'Raining' : 'Rain',
    'Rainy' : 'Rain',
    'Scattered Showers' : 'Rain',
    'Showers' : 'Rain',
    'Sun & clouds' : 'Cloudy',
    'Sunny' : 'Clear',
    'Sunny Skies' : 'Clear',
    'Sunny and clear' : 'Clear',
    'Sunny and cold' : 'Clear',
    'Sunny and warm' : 'Clear',
    'Sunny, Windy' : 'Clear',
    'Sunny, highs to upper 80s' : 'Clear',
    'T: 51; H: 55; W: NW 10 mph' : 'None',
    'cloudy' : 'Cloudy',
    'overcast' : 'Cloudy',
    'partly cloudy' : 'Cloudy',
    'sUNNY' : 'Clear'
}

In [None]:
# Apply the weather mapping. Assigned back rather than using inplace=True on a column
# selection, which is a no-op under Copy-on-Write.
df['GameWeather'] = df['GameWeather'].replace(GameWeather_dict)

**Correcting WindDirection**

In [None]:
# Sorted array of the distinct raw WindDirection values.
df['WindDirection'].drop_duplicates().sort_values().to_numpy()

In [None]:
# Store missing wind directions as the string "None". Assigned back rather than using
# inplace=True on a column selection, which is a no-op under Copy-on-Write.
df['WindDirection'] = df['WindDirection'].fillna("None")

In [None]:
# Raw WindDirection spellings -> compass abbreviation (N, NNE, NE, ...).
# Purely numeric entries and 'Calm' carry no direction and are mapped to the string
# 'None', the same placeholder used for missing values in this column.
# "From X" entries are mapped to X itself.
# The mapping is applied with Series.replace, so values not listed here stay unchanged.
WindDirection_dict = {
    '1' : 'None',
    '13' : 'None',
    '8' : 'None',
    'Calm' : 'None',
    'EAST' : 'E',
    'East' : 'E',
    'East North East' : 'ENE',
    'East Southeast' : 'ESE',
    'From ESE' : 'ESE',
    'From NE' : 'NE',
    'From NNE' : 'NNE',
    'From NNW' : 'NNW',
    'From S' : 'S',
    'From SSE' : 'SSE',
    'From SSW' : 'SSW',
    'From SW' : 'SW',
    'From W' : 'W',
    'From WSW' : 'WSW',
    'N-NE' : 'NNE',
    'North' : 'N',
    'North East' : 'NE',
    'North/Northwest' : 'NNW',
    'NorthEast' : 'NE',
    'Northeast' : 'NE',
    'Northwest' : 'NW',
    'S-SW' : 'SSW',
    'South' : 'S',
    'South Southeast' : 'SSE',
    'South Southwest' : 'SSW',
    'South west' : 'SW',
    'South, Southeast' : 'SSE',
    'SouthWest' : 'SW',
    'Southeast' : 'SE',
    'Southerly' : 'S',    
    'Southwest' : 'SW',
    'W-NW' : 'WNW',
    'W-SW' : 'WSW',
    'West' : 'W',
    'West Northwest' : 'WNW',
    'West-Southwest' : 'WSW',
    'from W' : 'W',
    's' : 'S'
}

In [None]:
# Apply the wind-direction mapping. Assigned back rather than using inplace=True on a
# column selection, which is a no-op under Copy-on-Write.
df['WindDirection'] = df['WindDirection'].replace(WindDirection_dict)

**Adjusting OffenseFormation NAs**

In [None]:
# Missing formations are labelled "EMPTY". Assigned back rather than using inplace=True
# on a column selection, which is a no-op under Copy-on-Write.
df['OffenseFormation'] = df['OffenseFormation'].fillna("EMPTY")

**Transforming GameClock to seconds**

In [None]:
# Split GameClock on ':' into integer fields, combine the first two as
# first * 60 + second, then drop the original string column.
aux = df['GameClock'].str.split(':', expand=True).astype(int)
df['GameClockSeconds'] = aux[0].mul(60).add(aux[1])
df.drop(columns=['GameClock'], inplace=True)

Converting the height to a single number

In [None]:
# Split PlayerHeight on '-' into two integer fields and combine them as
# first * 12 + second, then drop the original string column.
# NOTE(review): if the raw format is "feet-inches", the result is total inches despite
# the "Ft" suffix in the column name - confirm.
aux = df['PlayerHeight'].str.split('-', expand=True).astype(int)
df['PlayerHeightFt'] = aux[0].mul(12).add(aux[1])
df.drop(columns=['PlayerHeight'], inplace=True)

Adjusting 'YardLine' so that it ranges from 0 to 100 and does not depend on 'FieldPosition' (NA values correspond to the 50-yard line)

In [None]:
# Every row either has FieldPosition equal to PossessionTeam (keep YardLine) or not
# (use 100 - YardLine), so one vectorised selection covers all rows.
# Rows whose FieldPosition is the "None" placeholder take the second branch, which
# yields 50 only when their YardLine is 50 - presumably always the case; verify.
ball_on_own_side = df['FieldPosition'] == df['PossessionTeam']
df['NewYardLine'] = np.where(ball_on_own_side, df['YardLine'], 100 - df['YardLine'])

Adding a flag to the runner

In [None]:
# 1 on the row whose NflId equals the play's NflIdRusher, 0 on every other row.
df['Rusher'] = df['NflIdRusher'].eq(df['NflId']).astype('int64')

PossessionTeam, HomeTeamAbbr and VisitorTeamAbbr all hold team abbreviations. It is useful for the model to have a flag indicating whether the player is on the possession team.

In [None]:
# 1 when the player's side ('home' / 'away' in Team) is the side whose abbreviation
# equals PossessionTeam, 0 otherwise.
home_has_ball = df['PossessionTeam'].eq(df['HomeTeamAbbr']) & df['Team'].eq('home')
away_has_ball = df['PossessionTeam'].eq(df['VisitorTeamAbbr']) & df['Team'].eq('away')
df['PossessionFlag'] = (home_has_ball | away_has_ball).astype('int64')

**Dealing with NA's in numerical columns**

* For simplicity, NAs in 'Orientation' are filled with zero, because the number of NAs is low. A better fill could take into account the direction of the play and whether the player is on offense or defense, filling with 90º or 270º, on the assumption that offensive players are likely to be facing the direction of the play and defensive players the opposite direction.
* It doesn't make sense to fill NAs in the 'Dir' column with the mean, because the column has three frequent values around 0, 180 and 360; there are also very few NAs.
* Around 9% of the rows have 'Humidity' == 0. Since 0 humidity is very unlikely, I treat 0 as NA and replace those values with the mean as well.
* NAs in DefendersInTheBox, Temperature and WindSpeed can be filled with the mean without big concerns.

In [None]:
# Each result is assigned back to its column. The previous `df[col].fillna(...,
# inplace=True)` form is chained assignment: it warns on recent pandas and leaves `df`
# unchanged when Copy-on-Write is enabled.
# Angles: missing values are filled with 0.
df["Orientation"] = df["Orientation"].fillna(0)
df["Dir"] = df["Dir"].fillna(0)
# Humidity: a reading of 0 is treated as missing, then imputed with the column mean.
# The mean is computed after the zeros are removed, so they do not drag it down.
df["Humidity"] = df["Humidity"].replace(0, np.nan)
df["Humidity"] = df["Humidity"].fillna(df["Humidity"].mean())
# Remaining numeric columns: mean imputation.
df["DefendersInTheBox"] = df["DefendersInTheBox"].fillna(df["DefendersInTheBox"].mean())
df["Temperature"] = df["Temperature"].fillna(df["Temperature"].mean())
df["WindSpeed"] = df["WindSpeed"].fillna(df["WindSpeed"].mean())

## Adjusting the dataframe to the model

In [None]:
# Display the column index of `df` as it stands after the preprocessing cells.
df.columns

In [None]:
# Columns selected for the play-level table (`plays`).
play_columns = [
    # identifiers
    'GameId', 'PlayId', 'Season', 'Quarter', 'PossessionTeam',
    # situation
    'Down', 'Distance', 'HomeScoreBeforePlay', 'VisitorScoreBeforePlay', 'NflIdRusher',
    'OffenseFormation', 'OffensePersonnel', 'DefendersInTheBox',
    'DefensePersonnel', 'PlayDirection', 'Yards',
    # game / venue
    'HomeTeamAbbr', 'VisitorTeamAbbr', 'Week', 'Stadium', 'Location',
    'StadiumType', 'Turf', 'GameWeather', 'Temperature', 'Humidity',
    'WindSpeed', 'WindDirection',
    # engineered
    'GameClockSeconds', 'NewYardLine',
]

In [None]:
# Columns selected for the player-level table (`players`); PlayId is the key back to plays.
player_columns = [
    'PlayId', 'Team',
    'X', 'Y', 'S', 'A', 'Dis', 'Orientation', 'Dir',
    'PlayerWeight', 'PlayerBirthDate', 'PlayerCollegeName', 'Position',
    'PlayerHeightFt', 'Rusher', 'PossessionFlag',
]

In [None]:
# Keep only the play-level columns and collapse identical rows, keeping the first
# occurrence of each distinct combination.
plays = df.loc[:, play_columns].drop_duplicates(keep='first')

In [None]:
# Number of rows in the play-level table.
len(plays)

In [None]:
# Take an explicit copy of the player-level columns. Without it `players` is a slice of
# `df`, and adding a column to it later raises SettingWithCopyWarning.
players = df[player_columns].copy()
players

In [None]:
# Number the rows within each play (0, 1, 2, ... in row order) so that they can become
# columns in the pivot. `assign` returns a new frame, which avoids writing a column
# into a frame that was derived by slicing `df` (SettingWithCopyWarning).
players = players.assign(N=players.groupby('PlayId').cumcount())

**Creating a DataFrame whose rows correspond to plays, with the players spread across columns**

In [None]:
# Reshape to one row per PlayId: every player attribute becomes a (attribute, N) column,
# where N is the within-play row number. 'first' takes the first value found in each
# (PlayId, N) cell.
player_value_cols = [
    'Team', 'X', 'Y', 'S', 'A', 'Dis', 'Orientation',
    'Dir', 'PlayerWeight', 'PlayerBirthDate',
    'PlayerCollegeName', 'Position', 'PlayerHeightFt',
    'Rusher', 'PossessionFlag',
]
players = pd.pivot_table(
    players,
    values=player_value_cols,
    index=['PlayId'],
    columns=['N'],
    aggfunc='first',
)

**Casting the multi-level columns index to a simple index**

In [None]:
# Flatten the two-level (attribute, N) column index into single names like "X_0".
players.columns = [f'{attribute}_{slot}' for attribute, slot in players.columns]

Separating the Categorical Variables in the Players DF

In [None]:
# Partition the pivoted columns by dtype: object columns are treated as categorical,
# everything else as numeric.
categ_players_cols = players.select_dtypes(include=[object]).columns
num_players_cols = players.select_dtypes(exclude=[object]).columns

Using get_dummies in the categorical variables, and concatenating with the numerical ones

In [None]:
# One-hot encode the categorical player columns, put them beside the numeric ones,
# and move PlayId from the index into a regular column.
players_numeric = players[num_players_cols]
players_onehot = pd.get_dummies(players[categ_players_cols])
players = pd.concat([players_numeric, players_onehot], axis=1).reset_index()

In [None]:
# Play-level columns that are one-hot encoded.
categ_plays_cols = [
    'PossessionTeam',
    'OffenseFormation',
    'OffensePersonnel',
    'DefensePersonnel',
    'PlayDirection',
    'Stadium',
    'Location',
    'StadiumType',
    'Turf',
    'GameWeather',
    'WindDirection',
]

In [None]:
# Play-level columns kept as they are (the PlayId key, numeric features and the Yards target).
num_plays_cols = [
    'PlayId',
    'Season', 'Quarter', 'Down', 'Distance',
    'HomeScoreBeforePlay', 'VisitorScoreBeforePlay',
    'DefendersInTheBox', 'Week',
    'Temperature',
    'Yards',
    'Humidity', 'WindSpeed',
    'GameClockSeconds', 'NewYardLine',
]

In [None]:
# One-hot encode the categorical play columns, put them beside the numeric ones,
# and renumber the rows from 0.
plays_numeric = plays[num_plays_cols]
plays_onehot = pd.get_dummies(plays[categ_plays_cols])
plays = pd.concat([plays_numeric, plays_onehot], axis=1).reset_index(drop=True)

In [None]:
# Join play-level and player-level features on the PlayId key.
# The previous positional concat (axis=1) paired row i of `plays` (first-appearance
# order in `df`) with row i of `players` (sorted by PlayId by pivot_table), which is
# only correct if `df` happens to be sorted by PlayId, and it produced two 'PlayId'
# columns. Merging on the key removes both problems.
#   how='inner'            keeps the row order of `plays`
#   validate='one_to_one'  raises if either side has more than one row per PlayId
model_df = plays.merge(players, on='PlayId', how='inner', validate='one_to_one')

One-hot encoding the target `Yards` into one column per value from -99 to 99 (TODO: replace with get_dummies + concat)

In [None]:
# One-hot encode the target: one column per possible value, 'Yards-99' ... 'Yards99',
# with a 1 in the column matching each play's Yards and 0 elsewhere.
# The previous version built a one-row DataFrame per play and concatenated them all,
# which is very slow; a single broadcast comparison gives the same table (same column
# order, 0..n-1 index, integer dtype).
yard_values = np.arange(-99, 100)
yards_onehot = plays['Yards'].to_numpy()[:, None] == yard_values  # shape (n_plays, 199)
yards_df = pd.DataFrame(
    yards_onehot.astype(int),
    columns=['Yards' + str(i) for i in range(-99, 100)],
)

In [None]:
# Append the one-hot target columns to the feature table (rows are aligned on the index)
# and preview the first rows.
# Note: re-running this cell appends the Yards columns a second time.
model_df = pd.concat(objs=[model_df, yards_df], axis='columns')
model_df.head()