In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing

import matplotlib.pyplot as plt
import matplotlib.patches as patches

from string import punctuation
import datetime
import re

In [None]:
from IPython.display import Image
import os
!ls ../input/

### In this competition you will predict how many yards an NFL player will gain after receiving a handoff.

![Imgur](https://i.imgur.com/bwHiMwM.gif)

**Handoff :** An exchange made by handing the ball to a teammate



![Imgur](https://i.imgur.com/iMmwaLO.png)

**Yards :** 1 yard = 0.9144 metre

- **The rectangular field of play of American football games measures 100 yards (91.44 m) long between the goal lines, and 160 feet (48.8 m) (53 1⁄3 yards) wide. In addition, there are end zones extending another 10 yards (9.144 m) past the goal lines to the "end lines", for a total length of 120 yards (109.7 m). When the "football field" is used as unit of measurement, it is usually understood to mean 100 yards (91.44 m), although technically the full length of the official field, including the end zones, is 120 yards (109.7 m).**







### Loading data

In [None]:
train_df = pd.read_csv("../input/nfl-big-data-bowl-2020/train.csv", low_memory=False)
train_df.head()

In [None]:
# https://www.kaggle.com/kabure/extensive-eda-and-modeling-xgb-hyperopt

def resumetable(df):
    """Build a one-row-per-column overview of ``df``.

    Prints the shape of ``df`` and returns a DataFrame with these columns:

    - ``Name``: column name
    - ``dtypes``: column dtype
    - ``Missing``: number of null entries
    - ``Uniques``: number of distinct non-null values
    - ``First Value`` / ``Second Value`` / ``Third Value``: the values found
      in the first three rows *by position* (NaN when ``df`` is shorter)
    - ``Entropy``: Shannon entropy (base 2, rounded to 2 decimals) of the
      column's normalised value counts
    """
    print(f"Dataset Shape: {df.shape}")
    summary = pd.DataFrame({'Name': df.columns, 'dtypes': df.dtypes.values})
    summary['Missing'] = df.isnull().sum().values
    summary['Uniques'] = df.nunique().values

    # Sample rows positionally: label-based df.loc[0] fails whenever the index
    # is not the default 0..n-1 range (e.g. after filtering or slicing).
    for position, label in enumerate(['First Value', 'Second Value', 'Third Value']):
        if position < len(df):
            summary[label] = df.iloc[position].values
        else:
            summary[label] = np.nan

    summary['Entropy'] = [
        round(stats.entropy(df[name].value_counts(normalize=True), base=2), 2)
        for name in summary['Name']
    ]

    return summary

## Function to reduce the DF size
def reduce_mem_usage(df, verbose=True, use_float16=True):
    """Downcast numeric columns of ``df`` to the smallest dtype that fits.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose int16/32/64 and float16/32/64 columns are downcast.
        Columns of any other dtype are left untouched.
    verbose : bool, default True
        Print the final memory usage and the percentage saved.
    use_float16 : bool, default True
        Allow float columns to be stored as float16. float16 keeps only about
        3 significant decimal digits, so pass ``False`` when values such as
        coordinates or angles must stay precise; float32 is then the smallest
        float type used.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float_targets = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a value equal to the dtype limit still fits.
            for target in int_targets:
                limits = np.iinfo(target)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
        else:
            for target in float_targets:
                limits = np.finfo(target)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
            else:
                # Out of range for the smaller types, or min/max is NaN/inf.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-sized starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df


## Knowing the data

In [None]:
resumetable(train_df)[:]

In [None]:
## Reducting memory
train_df = reduce_mem_usage(train_df)

In [None]:
train_df.shape

### Data Format

The shape of train data is 509762 x 49.

**Key Points :**

- A game is played by two teams, offense and defense; each team has 11 players on the field during a play.
- That gives a total of 22 players on the field.
- This dataset contains Next Gen Stats tracking data for running plays.
- The goal is to forecast the yardage gained on a play (`PlayId`) from the moment the ball is handed off (`TimeHandoff`).
- Each `PlayId` captures the 22 players on the field at the moment of the handoff.
- For that moment we have data for all 22 players.
- Using those players' data we want to predict "How many yards will an NFL player gain after receiving a handoff?"

In [None]:
print("Total number of games : ",train_df.GameId.nunique())
print("Total number of HandOff's : ",train_df.PlayId.nunique())
print("Total number of uniq players : ", train_df.NflId.nunique())

In [None]:
### Checking Train data sorted by PlayId and Team.

# https://www.kaggle.com/hukuda222/nfl-simple-model-using-lightgbm

def _constant_within_blocks(values, block_size):
    """Return True when every consecutive run of ``block_size`` entries holds one value."""
    arr = np.asarray(values)
    # An empty or ragged column cannot be split into whole blocks.
    if arr.size == 0 or arr.size % block_size != 0:
        return False
    blocks = arr.reshape(-1, block_size)
    # Compare every entry of a block with the block's first entry.
    return bool((blocks == blocks[:, :1]).all())


# Each play should occupy 22 consecutive rows ...
ok = _constant_within_blocks(train_df["PlayId"], 22)
print("train data is sorted by PlayId." if ok else "train data is not sorted by PlayId.")

# ... and within a play each team should occupy 11 consecutive rows.
ok = _constant_within_blocks(train_df["Team"], 11)
print("train data is sorted by Team." if ok else "train data is not sorted by Team.")

> Training data was sorted

## Exploratory Data Analysis

- Total number of columns 49
- EDA based on columns by columns


### 1. GameId

- A unique game identifier
- Every game have unique GameId
- In every game we have many number of handOff's

In [None]:
print("Total number of games : ",train_df.GameId.nunique())
print("Average number of HandOff's in every game : ", train_df.GameId.value_counts().mean())
print("Max number of HandOff's in one game : ", train_df.GameId.value_counts().max())
print("Min number of HandOff's in one game : ", train_df.GameId.value_counts().min())

## 2.PlayId
- A unique play identifier
- Every game (`GameId`) contains many handoffs
- Every handoff is identified by its `PlayId`
- Every `PlayId` has 22 data points
- Each data point holds one player's data at the moment of that particular handoff
- At every handoff there are 22 players on the field

In [None]:
print("Total number of HandOff's : ",train_df.PlayId.nunique())
print("Every HandOff have ", int(train_df.PlayId.value_counts().mean()), "Players Data")

- We need to create groupby using "PlayId"

In [None]:
playId_groupby = train_df.groupby("PlayId")

## 3.Team
- home or away
- This is categorical feature
- Every `PlayId` have 22 datapoints(players)
- 11 from home and 11 from away
- We already seen that train data sorted by Team

In [None]:
print("Total number of Teams :", train_df.Team.value_counts())
print("Every PlayId have ", playId_groupby["Team"].value_counts().max() , "players from each category")

In [None]:
plt.figure()
# Pass the vector by keyword: recent seaborn releases no longer accept it positionally.
sns.countplot(x=train_df["Team"])
plt.title("Away and Home team countplot")
plt.show()

## 4.X && 5.Y
- X : player position along the long axis of the field.
- Y : player position along the short axis of the field.

![Imgur](https://i.imgur.com/ZMyTDks.jpg)

- From X and Y we know the player position in field.
- Each player has their own position at the moment the handoff happens.

In [None]:
print("Total number of positions of X : ", train_df.X.shape[0])
print("Total number of positions of Y : ", train_df.Y.shape[0])
print("*"*50)
print("max of X : ", train_df.X.max())
print("max of Y : ", train_df.Y.max())
print("*"*50)
print("min of X : ", train_df.X.min())
print("min of Y : ", train_df.Y.min())
print("*"*50)
print("mean of X : ", train_df.X.values.mean())
print("mean of Y : ", train_df.Y.values.mean())

In [None]:
plt.figure(figsize=(16,6))
plt.subplot(121)
sns.distplot(train_df.X)
plt.vlines(train_df.X.values.mean(), plt.ylim()[0], plt.ylim()[1], color='r', linestyles='--');
plt.text(train_df.X.values.mean()-8, plt.ylim()[1]-0.001, "Mean of X", size=15, color='r')
plt.title("X axis Distribution")
plt.subplot(122)
sns.distplot(train_df.Y)
plt.vlines(train_df.Y.values.mean(), plt.ylim()[0], plt.ylim()[1], color='r', linestyles='--');
plt.title("Y axis Distribution")
plt.text(train_df.Y.values.mean()-8, plt.ylim()[1]-0.003, "Mean of Y", size=15, color='r')

- We observe that, those two distributions look linke identical it self
- Because of it very posible that two team player positions opposite to each other

In [None]:
plt.figure(figsize=(16,12))
# x/y must be keywords: recent seaborn releases reject positional data vectors.
sns.scatterplot(x=train_df["X"], y=train_df["Y"])
plt.xlabel('X axis', fontsize=12)
plt.ylabel('Y axis', fontsize=12)
plt.title("Players positions", fontsize=20)
plt.show()

- Player positions are spread like a tsunami: they cover the whole field.

### Football Field Plot

In [None]:
# https://www.kaggle.com/robikscube/nfl-big-data-bowl-plotting-player-position

def create_football_field(linenumbers=True,
                          endzones=True,
                          highlight_line=False,
                          highlight_line_number=50,
                          highlighted_name='Line of Scrimmage',
                          fifty_is_los=False,
                          figsize=(12*2, 6.33*2)):
    """
    Draw a 120 x 53.3 yard football field for viewing plays.

    Parameters
    ----------
    linenumbers : bool
        Write the yard numbers (10..50..10) along both sidelines.
    endzones : bool
        Shade the two 10-yard end zones; when False the hash marks are
        drawn across the full 120 yards instead of only between the goal lines.
    highlight_line : bool
        Draw a vertical yellow line labelled ``highlighted_name``.
    highlight_line_number : int
        Position of the highlighted line in yards from the left goal line
        (10 is added to account for the left end zone).
    highlighted_name : str
        Label written next to the highlighted line.
    fifty_is_los : bool
        Draw a gold line at midfield (x = 60) labelled as the player
        yardline at snap.
    figsize : tuple
        Size passed to ``plt.subplots``.

    Returns
    -------
    (fig, ax) : the matplotlib figure and axes holding the field.
    """
    field_length = 120
    field_width = 53.3
    endzone_depth = 10

    rect = patches.Rectangle((0, 0), field_length, field_width, linewidth=0.1,
                             edgecolor='r', facecolor='darkgreen', zorder=0)

    fig, ax = plt.subplots(1, figsize=figsize)
    ax.add_patch(rect)

    # One continuous white path tracing every 10-yard line and the outer boundary.
    plt.plot([10, 10, 10, 20, 20, 30, 30, 40, 40, 50, 50, 60, 60, 70, 70, 80,
              80, 90, 90, 100, 100, 110, 110, 120, 0, 0, 120, 120],
             [0, 0, 53.3, 53.3, 0, 0, 53.3, 53.3, 0, 0, 53.3, 53.3, 0, 0, 53.3,
              53.3, 0, 0, 53.3, 53.3, 0, 0, 53.3, 53.3, 53.3, 0, 0, 53.3],
             color='white')
    if fifty_is_los:
        plt.plot([60, 60], [0, field_width], color='gold')
        plt.text(62, 50, '<- Player Yardline at Snap', color='gold')
    # Endzones
    if endzones:
        # Rectangle takes (xy, width, height): both end zones are 10 yards deep.
        # The right one starts at x=110 and must not be given the field length.
        for left_edge in (0, field_length - endzone_depth):
            ax.add_patch(patches.Rectangle((left_edge, 0), endzone_depth, field_width,
                                           linewidth=0.1,
                                           edgecolor='r',
                                           facecolor='blue',
                                           alpha=0.2,
                                           zorder=0))
    plt.xlim(0, field_length)
    plt.ylim(-5, 58.3)
    plt.axis('off')
    if linenumbers:
        for x in range(20, 110, 10):
            # Yard numbers count up to 50 at midfield and back down again.
            numb = x
            if x > 50:
                numb = 120 - x
            plt.text(x, 5, str(numb - 10),
                     horizontalalignment='center',
                     fontsize=20,  # fontname='Arial',
                     color='white')
            # Same number on the far sideline, rotated to face the other way.
            plt.text(x - 0.95, field_width - 5, str(numb - 10),
                     horizontalalignment='center',
                     fontsize=20,  # fontname='Arial',
                     color='white', rotation=180)
    if endzones:
        hash_range = range(11, 110)
    else:
        hash_range = range(1, 120)

    # Four rows of 1-yard hash marks: one near each sideline and two inner rows.
    for x in hash_range:
        ax.plot([x, x], [0.4, 0.7], color='white')
        ax.plot([x, x], [53.0, 52.5], color='white')
        ax.plot([x, x], [22.91, 23.57], color='white')
        ax.plot([x, x], [29.73, 30.39], color='white')

    if highlight_line:
        hl = highlight_line_number + 10
        plt.plot([hl, hl], [0, field_width], color='yellow')
        plt.text(hl + 2, 50, '<- {}'.format(highlighted_name),
                 color='yellow')
    return fig, ax

create_football_field()
plt.show()

### Ball Carrier Direction Analysis

Let us take the playid '20181007011551' to start with.

- We will plot the home team using blue color and away team using orange color.
- Ball carrier is plotted using red color
- Direction of movement of the ball carrier is shown using arrow
- Yards covered (target) variable is available in the title

In [None]:
# https://www.kaggle.com/sudalairajkumar/simple-exploration-notebook-nfl

import math
def get_dx_dy(angle, dist):
    """Convert a heading and a length into Cartesian x/y offsets.

    ``angle`` is in degrees and is mapped to the standard counter-clockwise
    angle via ``450 - angle``, so 0 points along +y and 90 along +x.
    ``dist`` is the length of the resulting vector.

    Returns the tuple ``(dx, dy)``.
    """
    theta = (450 - angle) * math.pi / 180.0
    return dist * math.cos(theta), dist * math.sin(theta)

play_id = 20181007011551
fig, ax = create_football_field()

# `label=` names each series for the legend; `legend=` is only an on/off switch.
train_df.query("PlayId == @play_id and Team == 'away'") \
    .plot(x='X', y='Y', kind='scatter', ax=ax, color='orange', s=50, label='Away')
train_df.query("PlayId == @play_id and Team == 'home'") \
    .plot(x='X', y='Y', kind='scatter', ax=ax, color='blue', s=50, label='Home')

# Select the ball carrier once and reuse the row for the marker and the arrow.
rusher_row = train_df.query("PlayId == @play_id and NflIdRusher == NflId")
rusher_row.plot(x='X', y='Y', kind='scatter', ax=ax, color='red', s=100, label='Rusher')
yards_covered = rusher_row["Yards"].values[0]

x = rusher_row["X"].values[0]
y = rusher_row["Y"].values[0]
rusher_dir = rusher_row["Dir"].values[0]
rusher_speed = rusher_row["S"].values[0]
# The arrow points along the rusher's direction; its length is the speed value S.
dx, dy = get_dx_dy(rusher_dir, rusher_speed)

ax.arrow(x, y, dx, dy, length_includes_head=True, width=0.3)
plt.title(f'Play # {play_id} and yard distance is {yards_covered}', fontsize=20)
plt.legend()
plt.show()


- Using the X and Y features we can identify exactly where on the field the play was when the handoff happened.

## 6.S && 7.A

- S : speed in yards/second
- A : acceleration in yards/second^2

- S is the speed of player when HandOff happend
- A is the acceleration of player when HandOff happend

In [None]:
print("Total number of  S : ", train_df.S.shape[0])
print("Total number of  A : ", train_df.A.shape[0])
print("*"*50)
print("max of S : ", train_df.S.max())
print("max of A : ", train_df.A.max())
print("*"*50)
print("min of S : ", train_df.S.min())
print("min of A : ", train_df.A.min())
print("*"*50)
print("mean of S : ", train_df.S.values.mean())
print("mean of A : ", train_df.A.values.mean())

In [None]:
plt.figure(figsize=(16,6))
plt.subplot(121)
sns.distplot(train_df.S)
plt.vlines(train_df.S.values.mean(), plt.ylim()[0], plt.ylim()[1], color='r', linestyles='--');
plt.text(train_df.S.values.mean(), plt.ylim()[1]-0.01, "Mean of S", size=15, color='r')
plt.title("Speed('S') Distribution")
plt.subplot(122)
sns.distplot(train_df.A)
plt.vlines(train_df.A.values.mean(), plt.ylim()[0], plt.ylim()[1], color='r', linestyles='--');
plt.text(train_df.A.values.mean(), plt.ylim()[1]-0.02, "Mean of A", size=15, color='r')
plt.title("Acceleration('A') Distribution")
plt.show()

- We observe that most players have a speed < 6 yards per second when the handoff happens.
- We observe that most players have an acceleration < 4 yards per second^2 when the handoff happens.

## 8.Dis
- distance traveled from prior time point, in yards

- `Dis` measures the distance covered in the most recent window of player tracking data. Given that tracking data roughly covers 10 frames per second, `Dis` corresponds to the distance traveled in the most recent 0.1 seconds. Note that speed and acceleration are directly calculated using `Dis` (this is done in the data pre-processing) [link](https://www.kaggle.com/c/nfl-big-data-bowl-2020/discussion/112246)

In [None]:
print("Total number of  Dis : ", train_df.Dis.shape[0])
print("*"*50)
print("max of Dis : ", train_df.Dis.max())
print("*"*50)
print("min of Dis : ", train_df.Dis.min())
print("*"*50)
print("mean of Dis : ", train_df.Dis.values.mean())

In [None]:
plt.figure(figsize=(16,6))
sns.distplot(train_df.Dis)
plt.vlines(train_df.Dis.values.mean(), plt.ylim()[0], plt.ylim()[1], color='r', linestyles='--');
plt.text(train_df.Dis.values.mean(), plt.ylim()[1]-0.01, "Mean of Dis", size=15, color='r')
plt.title("Distance(Dis) distribution")
plt.show()

## 9.Orientation

- Orientation of player (deg)
- this angle of player, direction of player

In [None]:
print("Total number of  Orientation : ", train_df.Orientation.shape[0])
print("*"*50)
print("max of Orientation : ", train_df.Orientation.max())
print("*"*50)
print("min of Orientation : ", train_df.Orientation.min())
print("*"*50)
print("Number of missing values : ", train_df.Orientation.isna().sum())

In [None]:
drop_na_Orientation = train_df.Orientation.dropna()

In [None]:
plt.figure(figsize=(16,6))
sns.distplot(drop_na_Orientation)
plt.vlines(drop_na_Orientation.values.mean(), plt.ylim()[0], plt.ylim()[1], color='r', linestyles='--');
plt.text(drop_na_Orientation.values.mean(), plt.ylim()[1]-0.0002, "Mean of Orienataion", size=15, color='r')
plt.title("Orientation distribution")
plt.show()

## 10.Dir
- angle of player motion (deg)
- the angle of player moving when the HandOff made

In [None]:
print("Total number of  Dir : ", train_df.Dir.shape[0])
print("*"*50)
print("max of Dir : ", train_df.Dir.max())
print("*"*50)
print("min of Dir : ", train_df.Dir.min())
print("*"*50)
print("Number of missing values : ", train_df.Dir.isna().sum())

In [None]:
drop_na_Dir = train_df.Dir.dropna()

In [None]:
plt.figure(figsize=(16,6))
sns.distplot(drop_na_Dir)
plt.vlines(drop_na_Dir.values.mean(), plt.ylim()[0], plt.ylim()[1], color='r', linestyles='--');
plt.text(drop_na_Dir.values.mean(), plt.ylim()[1]-0.0002, "Mean of Dir", size=15, color='r')
plt.title("Direction(Dir) distribution")
plt.show()

## 11.NflId
- a unique identifier of the player
- train data we want to identify player uniquely using `NflId` feature
- During the regular season, each team is allowed a maximum of 53 players on its roster; only 46 of these may be active (eligible to play) on game days.
- Each team play with 11 players at that time of play.

In [None]:
print("Total number unique players : ", train_df.NflId.nunique())
print("*"*50)
print("max number of times PlayId player id is : ", train_df.NflId.value_counts().index[0] , "number of HandOffs is : ", train_df.NflId.value_counts().values[0])
print("*"*50)
print("min number of times PlayId player id is : ", train_df.NflId.value_counts().index[-1] , "number of HandOffs is : ", train_df.NflId.value_counts().values[-1])
print("*"*50)
print("Number of missing values : ", train_df.NflId.isna().sum())

## 12.DisplayName && 13.JerseyNumber

- DisplayName : player's name for each player
- JerseyNumber : jersey number for each player

In [None]:
print("Total number unique player names : ", train_df.DisplayName.nunique())
print("*"*50)
print("max of number of times PlayId player name is : ", train_df.DisplayName.value_counts().index[0] , "number of HandOffs is : ", train_df.DisplayName.value_counts().values[0])
print("*"*50)
print("min of number of times PlayId player name is : ", train_df.DisplayName.value_counts().index[-1] , "number of HandOffs is : ", train_df.DisplayName.value_counts().values[-1])
print("*"*50)
print("Number of missing values : ", train_df.DisplayName.isna().sum())

In [None]:
print("Total number unique player numbers : ", train_df.JerseyNumber.nunique())
print("*"*50)
print("max of number of times PlayId player number is : ", train_df.JerseyNumber.value_counts().index[0] , "number of HandOffs is : ", train_df.JerseyNumber.value_counts().values[0])
print("*"*50)
print("min of number of times PlayId player number is : ", train_df.JerseyNumber.value_counts().index[-1] , "number of HandOffs is : ", train_df.JerseyNumber.value_counts().values[-1])
print("*"*50)
print("Number of missing values : ", train_df.JerseyNumber.isna().sum())

## 14.Season

- year of the season
- this is time series data

In [None]:
print("Total number unique seasons : ", train_df.Season.nunique())

print("Those are : \n", train_df.Season.value_counts())

## 15.YardLine

- the yard line of the line of scrimmage
- a line of scrimmage is an imaginary transverse line (across the width of the football field) beyond which a team cannot cross until the next play has begun. Its location is based on the spot where the ball is placed after the end of the most recent play and following the assessment of any penalty yards.

![Imgur](https://i.imgur.com/Jxpxcuz.jpg)

- Blue line is line of scrimmage
- On a kickoff, the ball is placed at the 35-yard line of the kicking team in professional and college play and at the 40-yard line in high school play. The ball may be drop-kicked or place-kicked. If a place kick is chosen, the ball can be placed on the ground or a tee; a holder may be used in either case. On a safety kick, the kicking team kicks the ball from their own 20-yard line. They can punt, drop-kick or place-kick the ball, but a tee may not be used in professional play. Any member of the receiving team may catch or advance the ball. The ball may be recovered by the kicking team once it has gone at least ten yards and has touched the ground or has been touched by any member of the receiving team

In [None]:
train_dff = train_df[::22]
print("Total number of  YardLine : ", train_dff.YardLine.shape[0])
print("*"*50)
print("max of YardLine : ", train_dff.YardLine.max())
print("*"*50)
print("min of YardLine : ", train_dff.YardLine.min())
print("*"*50)
print("Number of missing values : ", train_dff.YardLine.isna().sum())

In [None]:
plt.figure()
sns.distplot(train_dff.YardLine)
plt.title("Distribution of YardLine")
plt.show()

## 15.Quarter
- game quarter (1-5, 5 == overtime)
- Football games last for a total of 60 minutes in professional and college play and are divided into two halves of 30 minutes and four quarters of 15 minutes. High school football games are 48 minutes in length with two halves of 24 minutes and four quarters of 12 minutes. The two halves are separated by a halftime period, and the first and third quarters are followed by a short break.

- If a game is tied at the end of four quarters, overtime is played. In overtime, the coin is tossed to determine which team will possess the ball first. The winner of the coin toss can choose to give the ball or receive the ball. If the first possession results in a field goal, the other team is given possession to match or better the field goal, therefore continuing the game. 


In [None]:
Quarter = train_df.Quarter[: : 22]

In [None]:
# Pass the vector by keyword: recent seaborn releases no longer accept it positionally.
sns.countplot(x=Quarter)

- We observe that most of HandOff's are done in 1,2,3,4 Quarters
- In 5th Quarter very less HandOff's

## 16.GameClock
- time on the game clock
- this feature tells us when the HandOff started that particular time

In [None]:
GameClock = train_df.GameClock[::22]

In [None]:
GameClock.value_counts()[:5]

## 17.PossessionTeam

- team with possession
- Both teams' captains run out to midfield for the coin toss. One team wins and the announcer bellows that that team will start with "possession."
- In football, each team goes back and forth with 'possessions.' All that means is that each team's offense gets a chance to control the football. When a team's offense has the ball, they are considered to have 'possession' because they are dictating the scoring. Now, if that team turns the ball over, scores, or punts it away and suddenly the other team's offense comes on the field, that team now has the 'possession.'

**Note:** In the training data the same team can have multiple different abbreviations. There are a few columns like `HomeTeamAbbr` or `PossessionTeam` and e.g. Baltimore has the BAL abbreviation in the former and BLT in the latter. [link](https://www.kaggle.com/statsbymichaellopez/nfl-tracking-initial-wrangling-voronoi-areas)

In [None]:
# Rewrite the four PossessionTeam abbreviations to the replacement codes below.
abbreviation_fixes = {'ARZ': 'ARI', 'BLT': 'BAL', 'CLV': 'CLE', 'HST': 'HOU'}
for old_abbr, new_abbr in abbreviation_fixes.items():
    train_df.loc[train_df['PossessionTeam'] == old_abbr, 'PossessionTeam'] = new_abbr

In [None]:
PossessionTeam = train_df.PossessionTeam[::22]

In [None]:
plt.figure(figsize=(15,10))
sns.countplot(y=PossessionTeam)
plt.title("PossessionTeam countplot")
plt.show()

## 18.Down
- `Down` - the down(1-4)
- Downs are the most fundamental, and confusing, part of the NFL rulebook. The attacking team, or offence, needs to move the ball forward in chunks of at least 10 yards, which is why the pitch has yardage markings. They have four chances, or downs, to gain those 10 yards

In [None]:
train_dff = train_df[::22]
print("Total number of  Downs : ", train_dff.Down.shape[0])
print("*"*50)
print("max of HandOffs done on Down : ", train_dff.Down.value_counts().keys()[0], " are : ",train_dff.Down.value_counts().values[0] )
print("*"*50)
print("min of HandOffs done on Down : ", train_dff.Down.value_counts().keys()[-1], " are : ",train_dff.Down.value_counts().values[-1] )
print("*"*50)
print("Number of missing values : ", train_dff.Down.isna().sum())

In [None]:
plt.figure()
sns.countplot(x=train_dff.Down)
plt.title("Down countplot")
plt.show()

- Down 1 has the most handoffs
- Down 4 has the fewest handoffs

## 19.Distance
- `Distance`- yards needed for a first down
- Distance is the yards the team needs to make a first down
- Distance is how far away the team is from getting a first down.

In [None]:
# One row per play: every 22nd row (relies on the 22-rows-per-play layout).
train_dff = train_df[::22]
print("Total number of  Distances : ", train_dff.Distance.shape[0])
print("*"*50)
print("max of Distance : ", train_dff.Distance.max())
print("*"*50)
print("min of Distance : ", train_dff.Distance.min())
print("*"*50)
# Count missing values of Distance itself (previously counted YardLine).
print("Number of missing values : ", train_dff.Distance.isna().sum())

In [None]:
plt.figure(figsize=(15,10))
sns.countplot(y=train_dff.Distance)
# The title now names the plotted column (was a leftover "PossessionTeam" title).
plt.title("Distance countplot")
plt.show()

## 20.FieldPosition
- `FieldPosition`- which side of the field the play is happening on


In [None]:
FieldPosition = train_df.FieldPosition[::22]

In [None]:
plt.figure(figsize=(15,10))
sns.countplot(y=FieldPosition)
plt.title("FieldPosition countplot")
plt.show()

## 21.HomeScoreBeforePlay
- home team score before play started
- score before receiving that particular handoff

In [None]:
HomeScoreBeforePlay = train_df["HomeScoreBeforePlay"][::22]

In [None]:
print("max of HomeScoreBeforePlay : ", HomeScoreBeforePlay.max())
print("*"*50)
print("min of HomeScoreBeforePlay : ", HomeScoreBeforePlay.min())
print("*"*50)
print("Number of missing values : ", HomeScoreBeforePlay.isna().sum())

In [None]:
plt.figure()
sns.distplot(HomeScoreBeforePlay)
plt.title("Distribution of HomeScoreBeforePlay")
plt.show()

## 22.VisitorScoreBeforePlay
- `VisitorScoreBeforePlay`-visitor team score before play started 

In [None]:
# One value per play: every 22nd row (relies on the 22-rows-per-play layout).
VisitorScoreBeforePlay = train_df["VisitorScoreBeforePlay"][::22]

# Labels now name the visitor score (they previously said HomeScoreBeforePlay).
print("max of VisitorScoreBeforePlay : ", VisitorScoreBeforePlay.max())
print("*"*50)
print("min of VisitorScoreBeforePlay : ", VisitorScoreBeforePlay.min())
print("*"*50)
print("Number of missing values : ", VisitorScoreBeforePlay.isna().sum())

In [None]:
plt.figure()
sns.distplot(VisitorScoreBeforePlay)
plt.title("Distribution of VisitorScoreBeforePlay")
plt.show()

## 23.NflIdRusher
- the `NflId` of the rushing player
- Rushing is an action taken by the offense that means to advance the ball by running with it, as opposed to passing or kicking
- Any rushing player is called a rusher.
- PlayerId, who is carrying ball

In [None]:
NflIdRusher = train_df.NflIdRusher[::22]
print("Total number of unique NflIdRyshers : ", NflIdRusher.nunique())
print("*"*50)
print("max times HandOff for NflIdRusher ID is : ", NflIdRusher.value_counts().keys()[0] , "Number of times is :", NflIdRusher.value_counts().values[0])
print("*"*50)
print("min times HandOff for NflIdRusher ID is : ", NflIdRusher.value_counts().keys()[-1] , "Number of times is :", NflIdRusher.value_counts().values[-1])
print("*"*50)
print("Number of missing values : ", NflIdRusher.isna().sum())

## 24.OffenseFormation
- `OffenseFormation`- offense formation
- A formation in football refers to the position players line up in before the start of a down. There are both offensive and defensive formations and there are many formations in both categories. Sometimes, formations are referred to as packages.
- video [link](https://www.youtube.com/watch?v=PSP42z4yHOQ)

In [None]:
OffenseFormation = train_df.OffenseFormation[::22] 

In [None]:
print("Number of missing values : ", OffenseFormation.isna().sum())

In [None]:
plt.figure()
sns.countplot(y=OffenseFormation)
plt.title("OffenseFormation countplot")
plt.show()

## 25.OffensePersonnel
`OffensePersonnel`- Offensive team positional grouping

In [None]:
import tqdm as tqdm

In [None]:
# One personnel grouping per play (every 22nd row).
OffensePersonnel = train_df["OffensePersonnel"].iloc[::22]

In [None]:
# Distinct position codes in first-seen order: drop commas and digits from each
# grouping, split what remains on whitespace, and de-duplicate while keeping order.
unique_OffensePersonnel = list(dict.fromkeys(
    position
    for grouping in OffensePersonnel
    for position in ''.join(ch for ch in grouping.replace(',', '') if not ch.isdigit()).split()
))
unique_OffensePersonnel

In [None]:
# Peek at the raw personnel strings.
train_df["OffensePersonnel"].head(5)

In [None]:
# Parse every "<count> <position>" pair with a regex and build the whole frame in
# one constructor call. The previous row-by-row `df.loc[ind][col] = ...` was a
# chained assignment (pandas does not guarantee that it writes into the frame),
# read only a single digit of the count, and looped over every row in Python.
offense_counts = [
    {position: int(count) for count, position in re.findall(r'(\d+)\s*([A-Za-z]+)', personnel)}
    for personnel in train_df.OffensePersonnel
]
OffensePersonnel_df = (
    pd.DataFrame(offense_counts, index=train_df.index)
    # Same columns, in the same order, as before; a position absent from a row counts as 0.
    .reindex(columns=unique_OffensePersonnel)
    .fillna(0)
    .astype(int)
)

In [None]:
# Prefix every position column with "offense_".
# Note: re-running this cell on its own would add the prefix a second time.
OffensePersonnel_df = OffensePersonnel_df.add_prefix("offense_")

In [None]:
# Preview the first rows of the per-position count columns.
OffensePersonnel_df.head()

In [None]:
# Attach the offense position counts to the main frame, aligning rows on the index.
train_df = train_df.merge(
    OffensePersonnel_df,
    how="left",
    left_index=True,
    right_index=True,
)

## 26.DefendersInTheBox

- `DefendersInTheBox`- number of defenders lined up near the line of scrimmage, spanning the width of the offensive line
- In American football, an eight-in-the-box defense is a defensive alignment in which 8 of the 11 defensive players are close to the line of scrimmage. 


In [None]:
# One value per play (every 22nd row).
DefendersInTheBox = train_df.DefendersInTheBox[::22]

print("Total number of  DefendersInTheBox : ", DefendersInTheBox.shape[0])
print("*"*50)
# The labels used to read "max of Dir" / "min of Dir", copied from another section.
print("max of DefendersInTheBox : ", DefendersInTheBox.max())
print("*"*50)
print("min of DefendersInTheBox : ", DefendersInTheBox.min())
print("*"*50)
print("Number of missing values : ", DefendersInTheBox.isna().sum())

In [None]:
# Distribution of defenders in the box, with missing values dropped first.
fig, ax = plt.subplots()
sns.distplot(DefendersInTheBox.dropna(), ax=ax)
ax.set_title("Distribution of DefendersInTheBox")
plt.show()

In [None]:
# Count of plays per number of defenders in the box (missing values dropped).
fig, ax = plt.subplots()
sns.countplot(y=DefendersInTheBox.dropna(), ax=ax)
ax.set_title("DefendersInTheBox countplot")
plt.show()

## 27.DefensePersonnel

- `DefensePersonnel`- defensive team positional grouping

In [None]:
# One defensive grouping per play (every 22nd row).
DefensePersonnel = train_df["DefensePersonnel"].iloc[::22]

In [None]:
# Distinct position codes in first-seen order: drop commas and digits from each
# grouping, split what remains on whitespace, and de-duplicate while keeping order.
unique_DefensePersonnel = list(dict.fromkeys(
    position
    for grouping in DefensePersonnel
    for position in ''.join(ch for ch in grouping.replace(',', '') if not ch.isdigit()).split()
))
unique_DefensePersonnel

In [None]:
# Peek at the raw personnel strings.
train_df["DefensePersonnel"].head(5)

In [None]:
# Parse every "<count> <position>" pair with a regex and build the whole frame in
# one constructor call. The previous row-by-row `df.loc[ind][col] = ...` was a
# chained assignment (pandas does not guarantee that it writes into the frame),
# read only a single digit of the count, and looped over every row in Python.
defense_counts = [
    {position: int(count) for count, position in re.findall(r'(\d+)\s*([A-Za-z]+)', personnel)}
    for personnel in train_df.DefensePersonnel
]
DefensePersonnel_df = (
    pd.DataFrame(defense_counts, index=train_df.index)
    # Same columns, in the same order, as before; a position absent from a row counts as 0.
    .reindex(columns=unique_DefensePersonnel)
    .fillna(0)
    .astype(int)
)

In [None]:
# Prefix every position column with "defense_".
# Note: re-running this cell on its own would add the prefix a second time.
DefensePersonnel_df = DefensePersonnel_df.add_prefix("defense_")

In [None]:
# Attach the defense position counts to the main frame, aligning rows on the index.
train_df = train_df.merge(
    DefensePersonnel_df,
    how="left",
    left_index=True,
    right_index=True,
)

## 28.PlayDirection

- `PlayDirection`- direction the play is headed
- It has two categories: left and right.
- If `PlayDirection` is right, the offense is moving from left to right.
- [link](https://www.kaggle.com/c/nfl-big-data-bowl-2020/discussion/112997#latest-650782)

In [None]:
# Tally of the play directions. This counts every row of train_df
# (no [::22] per-play slice is applied here).
train_df.PlayDirection.value_counts()

## 29.TimeHandoff

- `TimeHandoff`-UTC time of the handoff
- The time at which the handoff was made

In [None]:
# Parse the timestamp strings in one vectorized call using the same format as before.
# Unlike a per-row strptime, this can be re-run on an already-converted column.
train_df["TimeHandoff"] = pd.to_datetime(train_df["TimeHandoff"], format="%Y-%m-%dT%H:%M:%S.%fZ")

In [None]:
# First five per-play handoff times.
train_df["TimeHandoff"].iloc[::22].head(5)

## 30.TimeSnap
- `TimeSnap` - UTC time of the snap
- A snap (colloquially called a "hike", "snapback", or "pass from center") is the backwards passing of the ball in American and Canadian football at the start of play from scrimmage.

In [None]:
# Parse the timestamp strings in one vectorized call using the same format as before.
# Unlike a per-row strptime, this can be re-run on an already-converted column.
train_df['TimeSnap'] = pd.to_datetime(train_df['TimeSnap'], format="%Y-%m-%dT%H:%M:%S.%fZ")

In [None]:
# First five per-play snap times.
train_df["TimeSnap"].iloc[::22].head(5)

## 31.Yards
- `Yards` - the yardage gained on the play (you are predicting this)
- the number of yards gained or lost on every play
- every 22 lines in the train data stand for one play. the format of the submission is a little different than just the number of yards, for every possible outcome of the play - you need to provide the probability of this outcome.

In [None]:
# One target value per play (every 22nd row).
Yards = train_df["Yards"].iloc[::22]

In [None]:
# Range and missing-value count of the target, with a row of asterisks between lines.
yards_summary = (
    ("max Yards : ", Yards.max()),
    ("min Yards : ", Yards.min()),
    ("Number of missing values : ", Yards.isna().sum()),
)
for position, (label, value) in enumerate(yards_summary):
    if position:
        print("*"*50)
    print(label, value)

In [None]:
# Distribution of yards gained per play.
fig, ax = plt.subplots()
sns.distplot(Yards, ax=ax)
ax.set_title("Distribution of Yards")
plt.show()

- Most plays gain between -5 and 20 yards.

## 32.PlayerHeight
- `PlayerHeight` - player height (ft-in)

In [None]:
# Raw height strings before conversion.
train_df["PlayerHeight"].head(5)

In [None]:
# https://www.kaggle.com/bgmello/neural-networks-feature-engineering-for-the-win

def _height_in_inches(height):
    """Convert a 'feet-inches' string to total inches (12 * feet + inches)."""
    parts = height.split('-')
    return 12 * int(parts[0]) + int(parts[1])

train_df['PlayerHeight'] = train_df['PlayerHeight'].apply(_height_in_inches)

In [None]:
# Range and missing-value count of the converted heights.
height_summary = (
    ("max PlayerHeight : ", train_df.PlayerHeight.max()),
    ("min PlayerHeight : ", train_df.PlayerHeight.min()),
    ("Number of missing values : ", train_df.PlayerHeight.isna().sum()),
)
for position, (label, value) in enumerate(height_summary):
    if position:
        print("*"*50)
    print(label, value)

In [None]:
# Distribution of player heights over all rows.
fig, ax = plt.subplots()
sns.distplot(train_df.PlayerHeight, ax=ax)
ax.set_title("Distribution of player height")
plt.show()

## 34.PlayerWeight
- `PlayerWeight` - player weight (lbs)

In [None]:
# Range and missing-value count of player weights.
weight_summary = (
    ("max PlayerWeight : ", train_df.PlayerWeight.max()),
    ("min PlayerWeight : ", train_df.PlayerWeight.min()),
    ("Number of missing values : ", train_df.PlayerWeight.isna().sum()),
)
for position, (label, value) in enumerate(weight_summary):
    if position:
        print("*"*50)
    print(label, value)

In [None]:
# Distribution of player weights over all rows.
fig, ax = plt.subplots()
sns.distplot(train_df.PlayerWeight, ax=ax)
ax.set_title("Distribution of player weight")
plt.show()

## 35.PlayerBirthDate

- `PlayerBirthDate` - birth date (mm/dd/yyyy)

In [None]:
# Parse the mm/dd/yyyy strings in one vectorized call using the same format as before.
# Unlike a per-row strptime, this can be re-run on an already-converted column.
train_df["PlayerBirthDate"] = pd.to_datetime(train_df["PlayerBirthDate"], format="%m/%d/%Y")

In [None]:
# Parsed birth dates for the first five rows.
train_df["PlayerBirthDate"].head(5)

## 36.PlayerCollegeName
- `PlayerCollegeName` - where the player attended college

In [None]:
# Frequency table of colleges over all rows, computed once and reused below.
college_counts = train_df.PlayerCollegeName.value_counts()

print("Total number of unique PlayerCollegeName : ", train_df.PlayerCollegeName.nunique())
print("*"*50)
print("max number of players from : ", college_counts.index[0] , " and Number of players :", college_counts.iloc[0])
print("*"*50)
print("min number of players from : ", college_counts.index[-1] , " and Number of players :", college_counts.iloc[-1])
print("*"*50)
print("Number of missing values : ", train_df.PlayerCollegeName.isna().sum())

## 37.HomeTeamAbbr

- `HomeTeamAbbr` - home team abbreviation

In [None]:
# Plays per home team (one row per play).
HomeTeamAbbr = train_df["HomeTeamAbbr"].iloc[::22]
fig, ax = plt.subplots(figsize=(15,10))
sns.countplot(y=HomeTeamAbbr, ax=ax)
ax.set_title("HomeTeamAbbr countplot")
plt.show()

## 38.VisitorTeamAbbr
- `VisitorTeamAbbr` - visitor team abbreviation

In [None]:
# Plays per visiting team (one row per play).
VisitorTeamAbbr = train_df["VisitorTeamAbbr"].iloc[::22]
fig, ax = plt.subplots(figsize=(15,10))
sns.countplot(y=VisitorTeamAbbr, ax=ax)
ax.set_title("VisitorTeamAbbr countplot")
plt.show()

## 39.Week
- `Week` - week into the season
- The National Football League (NFL) regular season begins on the weekend following the first Monday of September (i.e, the weekend following the Labor Day holiday) and ends in December or early January. It consists of 256 games, where each team (32 total) plays 16 games during a 17-week period.

In [None]:
# One week number per play (every 22nd row).
Week = train_df["Week"].iloc[::22]

In [None]:
# Plays per week of the season.
fig, ax = plt.subplots(figsize=(15,10))
sns.countplot(y=Week, ax=ax)
ax.set_title("Week countplot")
plt.show()

## 40.Stadium
- stadium where the game is being played

In [None]:
# One lower-cased stadium name per play (every 22nd row).
Stadium = train_df["Stadium"].iloc[::22].str.lower()

In [None]:
# Plays per stadium, computed once and reused below.
stadium_counts = Stadium.value_counts()

print("Total number of unique Stadiums : ", Stadium.nunique())
print("*"*50)
print("max number of plays in Stadium is : ", stadium_counts.index[0] , " and Number of played :", stadium_counts.iloc[0])
print("*"*50)
print("min number of plays in Stadium is : ", stadium_counts.index[-1] , " and Number of played :", stadium_counts.iloc[-1])
print("*"*50)
print("Number of missing values : ", Stadium.isna().sum())

## 41.Location

- `Location` - city where the game is being played

In [None]:
# One lower-cased location per play (every 22nd row).
Location = train_df["Location"].iloc[::22].str.lower()

In [None]:
# Plays per location, computed once and reused below.
location_counts = Location.value_counts()

print("Total number of unique Location : ", Location.nunique())
print("*"*50)
print("max number of plays in Location is : ", location_counts.index[0] , " and Number of played :", location_counts.iloc[0])
print("*"*50)
print("min number of plays in Location is : ", location_counts.index[-1] , " and Number of played :", location_counts.iloc[-1])
print("*"*50)
print("Number of missing values : ", Location.isna().sum())

## 42.StadiumType

- `StadiumType` - description of the stadium environment

In [None]:
# Raw stadium-type labels and how many plays carry each one.
train_df["StadiumType"].iloc[::22].value_counts()

In [None]:
def clean_StadiumType(txt):
    """Normalise one free-text stadium-type label.

    Missing values come back as NaN. Otherwise the text is lower-cased,
    stripped of punctuation, has runs of spaces collapsed, and is then passed
    through a fixed sequence of substring replacements that fix misspellings
    and unify wording. The replacements are order-dependent.
    """
    if pd.isna(txt):
        return np.nan

    cleaned = ''.join(ch for ch in txt.lower() if ch not in punctuation)
    cleaned = re.sub(' +', ' ', cleaned)

    # Applied strictly in this order; later rules tidy up what earlier ones
    # produce (e.g. 'dome' -> 'domed' turns 'domed' into 'domedd').
    replacements = (
        ('outside', 'outdoor'),
        ('outdor', 'outdoor'),
        ('outddors', 'outdoor'),
        ('outdoors', 'outdoor'),
        ('oudoor', 'outdoor'),
        ('indoors', 'indoor'),
        ('ourdoor', 'outdoor'),
        ('retractable', 'rtr.'),
        ('dome', 'domed'),
        ('retr roofclosed', 'retr roof closed'),
        ('retr roofopen', 'retr roof open'),
        ('domeddd', 'domed'),
        ('domedd closed', 'domed closed'),
        ('closed domed', 'domed closed'),
        ('domed closedd', 'domed closed'),
        ('domedd', 'domed'),
    )
    for old, new in replacements:
        cleaned = cleaned.replace(old, new)
    return cleaned

In [None]:
# Replace each raw stadium-type label with its normalised form.
train_df['StadiumType'] = train_df['StadiumType'].map(clean_StadiumType)

## 43.Turf
- `Turf`- description of the field surface

In [None]:
# Raw field-surface labels and how many rows carry each one.
train_df.Turf.value_counts()

In [None]:
#from https://www.kaggle.com/c/nfl-big-data-bowl-2020/discussion/112681#latest-649087
# 1 when the lower-cased turf label is one of the grass spellings below, else 0.
grass_labels = ['grass', 'natural grass', 'natural', 'naturall grass']
train_df['Grass'] = train_df.Turf.str.lower().isin(grass_labels).astype(int)

## 44.GameWeather
- `GameWeather` - description of the game weather

In [None]:
# Distinct raw weather descriptions, inspected before any cleaning is applied.
train_df.GameWeather.unique()

We are going to apply the following preprocessing:

- Lower case
- N/A Indoor, N/A (Indoors) and Indoor => indoor Let's try to cluster those together.
- coudy and clouidy => cloudy
- party => partly
- clear and sunny => sunny and clear
- skies and mostly => ""

In [None]:
indoor = "indoor"

def _tidy_weather(description):
    """Apply the weather clean-up rules to one lower-cased description.

    Missing values are returned unchanged.
    """
    if pd.isna(description):
        return description
    if indoor in description:
        # Any mention of "indoor" collapses to that single label.
        description = indoor
    # Fix misspellings, then unify word order, then drop filler words.
    description = description.replace('coudy', 'cloudy').replace('clouidy', 'cloudy').replace('party', 'partly')
    description = description.replace('clear and sunny', 'sunny and clear')
    return description.replace('skies', '').replace("mostly", "").strip()

train_df['GameWeather'] = train_df['GameWeather'].str.lower().apply(_tidy_weather)

In [None]:
# Distinct weather descriptions remaining in the column at this point.
train_df['GameWeather'].unique()

- Let's now look at the most common words we have in the weather description

In [None]:
from collections import Counter

# Tally every whitespace-separated word across the non-missing weather descriptions.
weather_count = Counter(
    word
    for description in train_df['GameWeather']
    if not pd.isna(description)
    for word in description.split()
)

weather_count.most_common()[:15]

## 45.Temperature
- `Temperature` - temperature (deg F)

In [None]:
# One temperature reading per play (every 22nd row).
Temperature = train_df["Temperature"].iloc[::22]

temperature_summary = (
    ("max of Temperature : ", Temperature.max()),
    ("min of Temperature : ", Temperature.min()),
    ("Number of missing values : ", Temperature.isna().sum()),
)
for position, (label, value) in enumerate(temperature_summary):
    if position:
        print("*"*50)
    print(label, value)

In [None]:
# Distribution of temperature, with missing values dropped first.
fig, ax = plt.subplots()
sns.distplot(Temperature.dropna(), ax=ax)
ax.set_title("Distribution of Temperature")
plt.show()

## 46.Humidity

- `Humidity` - humidity

In [None]:
# One humidity reading per play (every 22nd row).
Humidity = train_df["Humidity"].iloc[::22]

humidity_summary = (
    ("max of Humidity : ", Humidity.max()),
    ("min of Humidity : ", Humidity.min()),
    ("Number of missing values : ", Humidity.isna().sum()),
)
for position, (label, value) in enumerate(humidity_summary):
    if position:
        print("*"*50)
    print(label, value)

In [None]:
# Distribution of humidity, with missing values dropped first.
fig, ax = plt.subplots()
sns.distplot(Humidity.dropna(), ax=ax)
ax.set_title("Distribution of Humidity")
plt.show()

## 48.WindSpeed
- `WindSpeed` - wind speed in miles/hour

In [None]:
# Raw wind-speed entries and how many rows carry each one, inspected before cleaning.
train_df['WindSpeed'].value_counts()

- We can see there are some values that are not standardized(e.g. 12mph), we are going to remove mph from all our values.

In [None]:
def _strip_mph(speed):
    """Lower-case one wind-speed entry and remove the 'mph' unit; missing values pass through."""
    if pd.isna(speed):
        return speed
    return speed.lower().replace('mph', '').strip()

train_df['WindSpeed'] = train_df['WindSpeed'].apply(_strip_mph)

In [None]:
# Entries written as "x-y" or "x gusts up to y" are replaced by (x + y) / 2.

def _midpoint_of_range(speed):
    """'x-y' -> (x + y) / 2; anything else is returned untouched."""
    if pd.isna(speed) or '-' not in speed:
        return speed
    bounds = speed.split('-')
    return (int(bounds[0]) + int(bounds[1])) / 2

def _midpoint_of_gusts(speed):
    """'x gusts up to y' -> (x + y) / 2; floats produced by the range step pass through."""
    if pd.isna(speed) or type(speed) == float or 'gusts up to' not in speed:
        return speed
    words = speed.split()
    return (int(words[0]) + int(words[-1])) / 2

train_df['WindSpeed'] = train_df['WindSpeed'].apply(_midpoint_of_range)
train_df['WindSpeed'] = train_df['WindSpeed'].apply(_midpoint_of_gusts)

In [None]:
def str_to_float(txt):
    """Convert `txt` to a float, returning -1 when it cannot be parsed.

    Only the failures that float() raises for bad input (TypeError,
    ValueError) are mapped to the -1 sentinel. The previous bare `except:`
    also swallowed KeyboardInterrupt and SystemExit.
    """
    try:
        return float(txt)
    except (TypeError, ValueError):
        return -1

In [None]:
# Coerce every wind-speed entry to a float (unparseable entries become the helper's sentinel).
train_df['WindSpeed'] = train_df['WindSpeed'].map(str_to_float)

In [None]:
# Frequency of each wind-speed value now stored in the column.
train_df.WindSpeed.value_counts()

## 49.WindDirection

- `WindDirection` - wind direction


In [None]:
# Raw wind-direction labels and how many rows carry each one.
train_df['WindDirection'].value_counts()

In [None]:
def clean_WindDirection(txt):
    """Reduce one free-text wind direction to a compact lower-case code.

    Missing values come back as NaN. Otherwise the text is lower-cased and
    stripped of punctuation, the word 'from' and all spaces are removed, and
    the words north/south/west/east are shortened to n/s/w/e, in that order.
    """
    if pd.isna(txt):
        return np.nan

    cleaned = ''.join(ch for ch in txt.lower() if ch not in punctuation)
    substitutions = (
        ('from', ''),
        (' ', ''),
        ('north', 'n'),
        ('south', 's'),
        ('west', 'w'),
        ('east', 'e'),
    )
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned

In [None]:
# Replace each raw wind-direction label with its compact code.
train_df['WindDirection'] = train_df['WindDirection'].map(clean_WindDirection)

In [None]:
# Wind-direction values currently in the column and how many rows carry each one.
train_df['WindDirection'].value_counts()

In [None]:
def transform_WindDirection(txt):
    """Map a cleaned compass code to a number in steps of 1/8.

    'n' maps to 0 and each further point going n -> e -> s -> w adds 1/8, up
    to 15/8 for 'nnw'. Some alternative letter orders (e.g. 'nen' for 'nne')
    are accepted. Missing values and unrecognised codes yield NaN.
    """
    if pd.isna(txt):
        return np.nan

    compass = {
        'n': 0,
        'nne': 1/8, 'nen': 1/8,
        'ne': 2/8,
        'ene': 3/8, 'nee': 3/8,
        'e': 4/8,
        'ese': 5/8, 'see': 5/8,
        'se': 6/8,
        'ses': 7/8, 'sse': 7/8,
        's': 8/8,
        'ssw': 9/8, 'sws': 9/8,
        'sw': 10/8,
        'sww': 11/8, 'wsw': 11/8,
        'w': 12/8,
        'wnw': 13/8, 'nww': 13/8,
        'nw': 14/8,
        'nwn': 15/8, 'nnw': 15/8,
    }
    return compass.get(txt, np.nan)

In [None]:
# Convert each compact wind-direction code to its numeric value.
train_df['WindDirection'] = train_df['WindDirection'].map(transform_WindDirection)

### This is my first kernel :) Please give me a boost with your valuable upvote

## Thankyou :)