<h1><center>NFL Big Data Bowl Basic EDA for beginner</center></h1>

<center><img src="https://deadline.com/wp-content/uploads/2021/01/NFL-ball.jpg?crop=0px%2C33px%2C1226px%2C687px&resize=681%2C383"></center>

### This is a very simple EDA notebook. There is still a lot left to analyze, so I'll keep updating it.

# Upvote is Free 🤗
### PLEASE UPVOTE if you like this notebook. It will keep me motivated to update my notebook.

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

# Use matplotlib's 'fivethirtyeight' style sheet for the figures created after this point.
plt.style.use('fivethirtyeight')

<a id="1"></a>
<h2 style='background:transparent; color:black'><center>1. Game Data</center></h2>

### **Game data:** The games.csv contains the teams playing in each game. The key variable is gameId.

* **gameId:** Game identifier, unique (numeric)

* **gameDate:** Game Date (time, mm/dd/yyyy)

* **gameTimeEastern:** Start time of game (time, HH:MM:SS, EST)

* **homeTeamAbbr:** Home team three-letter code (text)

* **visitorTeamAbbr:** Visiting team three-letter code (text)

* **week:** Week of game (numeric)

In [None]:
# Load the per-game table; the bare `games` expression makes the notebook display it.
games = pd.read_csv('../input/nfl-big-data-bowl-2022/games.csv')
games

## Function for Downcast

Downcasting converts each numeric column to the smallest dtype that can still hold its values, which reduces the memory used by the DataFrame.

In [None]:
def downcast(df, verbose=True):
    """Shrink the numeric columns of ``df`` to smaller dtypes.

    The frame is modified in place and the same object is returned.

    * ``bool`` columns are converted to ``int8``.
    * Integer columns, and float columns whose values are all whole
      numbers, go through ``pd.to_numeric(..., downcast='integer')``.
    * All other float columns go through
      ``pd.to_numeric(..., downcast='float')``.
    * Every other dtype (object/string, datetime, categorical, ...) is left
      untouched instead of being pushed through ``round()``, which those
      dtypes do not support.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are downcast.
    verbose : bool, default True
        When true, print the percentage of memory saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        column = df[col]
        dtype = column.dtype
        if dtype.name == 'bool':
            df[col] = column.astype('int8')
        elif pd.api.types.is_integer_dtype(dtype):
            df[col] = pd.to_numeric(column, downcast='integer')
        elif pd.api.types.is_float_dtype(dtype):
            # NaN never equals itself, so a float column containing NaN
            # stays a float column.
            is_whole = (column.round() == column).all()
            target = 'integer' if is_whole else 'float'
            df[col] = pd.to_numeric(column, downcast=target)
        # Anything else is not numeric and is deliberately left as is.
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a frame reporting zero bytes prints 0.0%.
        saved = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('{:.1f}% Compressed'.format(saved))

    return df

In [None]:
# Downcast the games table; downcast() prints the percentage of memory saved.
games = downcast(games)

## Function for making feature summary 

In [None]:
def resumetable(df):
    """Print the shape of ``df`` and return a one-row-per-column summary.

    The returned frame has the columns ``Feature`` (column name),
    ``Data Type``, ``Num of null``, ``Num of unique`` and the values found
    in the first three rows (``First value``, ``Second value``,
    ``Third value``).

    Sample rows are taken by position, so the function also works when the
    index is not 0, 1, 2, ... (for example after rows were dropped). When
    ``df`` has fewer than three rows the missing sample columns are NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        The summary table described above.
    """
    print(f'Shape : {df.shape}')
    summary = pd.DataFrame(df.dtypes, columns=['Data Type'])
    summary = summary.reset_index()
    summary = summary.rename(columns={'index': 'Feature'})
    summary['Num of null'] = df.isnull().sum().values
    summary['Num of unique'] = df.nunique().values

    sample_labels = ('First value', 'Second value', 'Third value')
    for position, label in enumerate(sample_labels):
        if position < len(df):
            # iloc is positional; loc would look up the *label* 0/1/2.
            summary[label] = df.iloc[position].values
        else:
            summary[label] = float('nan')
    return summary

# Show the per-column summary (dtype, nulls, uniques, sample values) of the games table.
resumetable(games)

## Function for writing percent at the top of the bar graph

In [None]:
def write_percent(ax, total_size):
    '''Annotate each bar in ``ax`` with its height as a percentage of ``total_size``.

    ax: axes-like object; its ``patches`` are the bars to label and its
        ``text`` method is used to draw the labels.
    total_size: denominator of the percentage (e.g. the number of rows plotted).
    '''
    # Small gap, proportional to the total, that lifts the label off the bar top.
    label_gap = total_size*0.001

    for bar in ax.patches:
        bar_height = bar.get_height()  # bar value (number of rows in this bar)
        bar_width = bar.get_width()
        bar_left = bar.get_x()  # x coordinate of the bar's left edge
        share = bar_height/total_size*100

        # Centre the label horizontally over the bar.
        ax.text(x=bar_left + bar_width/2.0,
                y=bar_height + label_gap,
                s=f'{share:1.1f}%',
                ha='center')

## Make derivative features (month, day, hour)

In [None]:
# gameDate is split on '/' (month first, then day); gameTimeEastern is split on ':' (hour first).
games['month'] = [int(date_text.split('/')[0]) for date_text in games['gameDate']]
games['day'] = [int(date_text.split('/')[1]) for date_text in games['gameDate']]
games['hour'] = [int(time_text.split(':')[0]) for time_text in games['gameTimeEastern']]

## Data Visualization

In [None]:
# Games per season, each bar labelled with its share of all games.
mpl.rcParams['font.size'] = 15
plt.figure(figsize=(7, 6))

ax = sns.countplot(data=games, x='season')
write_percent(ax, total_size=len(games))
ax.set_title('Number of games for season');

#### As the years go by, the number of games increases

In [None]:
# Games per calendar month, each bar labelled with its share of all games.
mpl.rcParams['font.size'] = 15
plt.figure(figsize=(8, 6))

ax = sns.countplot(data=games, x='month')
write_percent(ax, total_size=len(games))
ax.set_title('Number of games for month');

#### Games were held from September to January. December has especially many games, while January has very few

In [None]:
# Games per day of the month, each bar labelled with its share of all games.
mpl.rcParams['font.size'] = 12
plt.figure(figsize=(15, 7))

ax = sns.countplot(data=games, x='day')
write_percent(ax, total_size=len(games))
ax.set_title('Number of games for day');

In [None]:
# Games per exact kickoff time, each bar labelled with its share of all games.
mpl.rcParams['font.size'] = 12
plt.figure(figsize=(15, 7))

ax = sns.countplot(data=games, x='gameTimeEastern')
write_percent(ax, total_size=len(games))
ax.set_title('Number of games for gameTimeEastern');
ax.tick_params(axis='x', labelrotation=30)  # tilt the x tick labels by 30 degrees

In [None]:
# Games per kickoff hour, each bar labelled with its share of all games.
mpl.rcParams['font.size'] = 12
plt.figure(figsize=(15, 7))

ax = sns.countplot(data=games, x='hour')
write_percent(ax, total_size=len(games))
ax.set_title('Number of games for hour');

#### The most games were held at 1, 4, and 8

In [None]:
# Games per week, each bar labelled with its share of all games.
mpl.rcParams['font.size'] = 12
plt.figure(figsize=(15, 7))

ax = sns.countplot(data=games, x='week')
write_percent(ax, total_size=len(games))
ax.set_title('Number of games for week');

<a id="2"></a>
<h2 style='background:transparent; border:0; color:black'><center>2. Player Data</center></h2>

### **Player data:** The players.csv file contains player-level information from players that participated in any of the tracking data files. The key variable is nflId

* **nflId:** Player identification number, unique across players (numeric)

* **height:** Player height (text)

* **weight:** Player weight (numeric)

* **birthDate:** Date of birth (YYYY-MM-DD)

* **collegeName:** Player college (text)

* **position:** Player position (text)

* **displayName:** Player name (text)

In [None]:
# Load the per-player table; the bare `players` expression makes the notebook display it.
players = pd.read_csv('../input/nfl-big-data-bowl-2022/players.csv')
players

In [None]:
# Downcast the players table; downcast() prints the percentage of memory saved.
players = downcast(players)

In [None]:
# Show the per-column summary (dtype, nulls, uniques, sample values) of the players table.
resumetable(players)

### Convert all heights to feet

In [None]:
# Split each height string on '-': values written with a dash end up with a
# non-null 'second' part, values without one keep everything in 'first'.
check = players['height'].str.split('-', expand=True)

check.columns = ['first', 'second']

# For dashed values, fold the two parts into one number: first * 12 + second.
has_second_part = check['second'].notnull()
check.loc[has_second_part, 'first'] = (
    check.loc[has_second_part, 'first'].astype(np.int16) * 12
    + check.loc[has_second_part, 'second'].astype(np.int16)
)

In [None]:
# Replace the text heights with float32 numbers and divide by 12.
players['height'] = check['first'].astype(np.float32) / 12

players

In [None]:
mpl.rc('font', size=15)
plt.figure(figsize=(10, 6))

# sns.distplot is deprecated; a density-scaled histplot with a KDE overlay
# is the replacement seaborn documents for it.
ax = sns.histplot(players['height'], bins=12, kde=True, stat='density')
ax.set_title('Height Distribution');

In [None]:
mpl.rc('font', size=15)
plt.figure(figsize=(10, 6))

# sns.distplot is deprecated; a density-scaled histplot with a KDE overlay
# is the replacement seaborn documents for it.
ax = sns.histplot(players['weight'], kde=True, stat='density')
ax.set_title('Weight Distribution');

In [None]:
# The 20 most frequent colleges with their player counts, as a two-column table.
college_counts = players['collegeName'].value_counts()
top_players_colleage = college_counts.head(20).reset_index()
top_players_colleage.columns = ['collageName', 'numberOfPlayers']

In [None]:
mpl.rc('font', size=10)
plt.figure(figsize=(15, 12))

ax = sns.barplot(x='numberOfPlayers', y='collageName', data=top_players_colleage)
# The table's column is spelled 'collageName'; label the axis with the
# correct spelling so the typo does not appear on the chart.
ax.set_ylabel('collegeName')
ax.set_title('Number of players for collegeName');

### Create birth year feature

In [None]:
# Start every player at 0; the extraction step further down fills in the real year.
players['birthYear'] = 0

There are NA values in birthDate, so we should drop those rows

In [None]:
# Remove, in place, every player row whose birthDate is missing.
players.dropna(subset=['birthDate'], inplace=True)

Extract birth year

In [None]:
def _birth_year(birth_date):
    """Return the year of a 'MM/DD/YYYY' or 'YYYY-MM-DD' string as an int.

    Returns 0 when the text has neither of the two shapes.
    """
    slash_parts = birth_date.split('/')
    if len(slash_parts) == 3:  # ex) 05/17/1994
        return int(slash_parts[2])

    dash_parts = birth_date.split('-')
    if len(dash_parts) == 3:  # ex) 1995-05-05
        return int(dash_parts[0])

    return 0


# Store the year as an integer rather than the raw substring, so the column
# stays numeric (min/max and histograms then work on numbers, not text).
players['birthYear'] = players['birthDate'].map(_birth_year).astype('int64')

In [None]:
mpl.rc('font', size=15)
plt.figure(figsize=(10, 5))

# sns.distplot is deprecated; a density-scaled histplot with a KDE overlay
# is the replacement seaborn documents for it. birthYear may hold the year
# as text, so convert it first to make sure it is binned numerically.
birth_years = pd.to_numeric(players['birthYear'])
ax = sns.histplot(birth_years, bins=25, kde=True, stat='density')
ax.set_title('Players birth year Distribution');

#### Those born in 1995 are the most common

In [None]:
# Smallest and largest birthYear value, displayed as a (min, max) tuple.
players['birthYear'].min(), players['birthYear'].max()

#### The oldest player was born in 1972, and the youngest player was born in 1999

<a id="3"></a>
<h2 style='background:transparent; border:0; color:black'><center>3. Play Data</center></h2>

### **Play data:** The plays.csv file contains play-level information from each game. The key variables are gameId and playId
- gameId: Game identifier, unique (numeric)
- playId: Play identifier, not unique across games (numeric)
- playDescription: Description of play (text)
- quarter: Game quarter (numeric)
- down: Down (numeric)
- yardsToGo: Distance needed for a first down (numeric)
- possessionTeam: Team punting, placekicking or kicking off the ball (text)
- specialTeamsPlayType: Formation of play: Extra Point, Field Goal, Kickoff or Punt (text)
- specialTeamsPlayResult: Special Teams outcome of play dependent on play type: Blocked Kick Attempt, Blocked Punt, Downed, Fair Catch, Kick Attempt Good, Kick Attempt No Good, Kickoff Team Recovery, Muffed, Non-Special Teams Result, Out of Bounds, Return or Touchback (text)
- kickerId: nflId of placekicker, punter or kickoff specialist on play (numeric)
- returnerId: nflId(s) of returner(s) on play if there was a special teams return. Multiple returners on a play are separated by a ; (text)
- kickBlockerId: nflId of blocker of kick on play if there was a blocked field goal or blocked punt (numeric)
- yardlineSide: 3-letter team code corresponding to line-of-scrimmage (text)
- yardlineNumber: Yard line at line-of-scrimmage (numeric) 
- gameClock: Time on clock of play (MM:SS)
- penaltyCodes: NFL categorization of the penalties that occurred on the play. Multiple penalties on a play are separated by a ; (text)
- penaltyJerseyNumber: Jersey number and team code of the player committing each penalty. Multiple penalties on a play are separated by a ; (text)
- penaltyYards: yards gained by possessionTeam by penalty (numeric)
- preSnapHomeScore: Home score prior to the play (numeric)
- preSnapVisitorScore: Visiting team score prior to the play (numeric)
- passResult: Scrimmage outcome of the play if specialTeamsPlayResult is "Non-Special Teams Result" (C: Complete pass, I: Incomplete pass, S: Quarterback sack, IN: Intercepted pass, R: Scramble, ' ': Designed Rush, text)
- kickLength: Kick length in air of kickoff, field goal or punt (numeric)
- kickReturnYardage: Yards gained by return team if there was a return on a kickoff or punt (numeric)
- playResult: Net yards gained by the kicking team, including penalty yardage (numeric)
- absoluteYardlineNumber: Location of ball downfield in tracking data coordinates (numeric)

In [None]:
# Load the per-play table; the bare `plays` expression makes the notebook display it.
plays = pd.read_csv('../input/nfl-big-data-bowl-2022/plays.csv')

plays

In [None]:
# Downcast the plays table; downcast() prints the percentage of memory saved.
plays = downcast(plays)

In [None]:
# Show the per-column summary (dtype, nulls, uniques, sample values) of the plays table.
resumetable(plays)

#### There are lots of null values in the `returnerId`, `kickBlockerId`, `penaltyCodes`, `penaltyJerseyNumber`, `penaltyYards`, `passResult`, and `kickReturnYardage` features

In [None]:
# Plays per quarter, each bar labelled with its share of all plays.
mpl.rcParams['font.size'] = 12
plt.figure(figsize=(12, 6))

ax = sns.countplot(data=plays, x='quarter')
write_percent(ax, total_size=len(plays))
ax.set_title('Number of plays of every quarter');

In [None]:
# Plays per down, each bar labelled with its share of all plays.
mpl.rcParams['font.size'] = 12
plt.figure(figsize=(12, 6))

ax = sns.countplot(data=plays, x='down')
write_percent(ax, total_size=len(plays))
ax.set_title('Number of plays of every down');

In [None]:
# Plays per yardsToGo value (no percentage labels on this chart).
mpl.rcParams['font.size'] = 12
plt.figure(figsize=(12, 6))

ax = sns.countplot(data=plays, x='yardsToGo')
ax.set_title('Number of plays for every yards to go category');

In [None]:
mpl.rc('font', size=15)
plt.figure(figsize=(10, 5))

# sns.distplot is deprecated; a density-scaled histplot with a KDE overlay
# is the replacement seaborn documents for it.
ax = sns.histplot(plays['playResult'], bins=25, kde=True, stat='density');
ax.set_title('playResult Distribution');

playResult: Net yards gained by the kicking team, including penalty yardage (numeric)

In [None]:
mpl.rc('font', size=15)
plt.figure(figsize=(10, 5))

# sns.distplot is deprecated; a density-scaled histplot with a KDE overlay
# is the replacement seaborn documents for it.
ax = sns.histplot(plays['preSnapHomeScore'], bins=12, kde=True, stat='density');
ax.set_title('preSnapHomeScore Distribution');

preSnapHomeScore: Home score prior to the play (numeric)

In [None]:
mpl.rc('font', size=15)
plt.figure(figsize=(10, 5))

# sns.distplot is deprecated; a density-scaled histplot with a KDE overlay
# is the replacement seaborn documents for it.
ax = sns.histplot(plays['preSnapVisitorScore'], bins=12, kde=True, stat='density');
ax.set_title('preSnapVisitorScore Distribution');

preSnapVisitorScore: Visiting team score prior to the play (numeric)

<a id="4"></a>
<h2 style='background:transparent; border:0; color:black'><center>4. Tracking Data</center></h2>

In [None]:
# Load the 2018 tracking table and display its first rows.
tracking2018 = pd.read_csv('../input/nfl-big-data-bowl-2022/tracking2018.csv')
tracking2018.head()

In [None]:
# Downcast the tracking table; downcast() prints the percentage of memory saved.
tracking2018 = downcast(tracking2018)

The plots below follow the approach from a post created by ROB MULLA. See the post [here](https://www.kaggle.com/robikscube/nfl-big-data-bowl-2022-twitch-stream-eda). Thank you ROB MULLA :)

#### gameId == 2018123000 and playId == 36

In [None]:
# Scatter the tracked (x, y) positions of one play, one plot call per `team` value.
fig, ax = plt.subplots(figsize=(12, 8))
selected_rows = tracking2018.query('gameId == 2018123000 and playId == 36')
for team_name, team_rows in selected_rows.groupby('team'):
    team_rows.plot(x='x', y='y', ax=ax, style='.')
plt.legend().remove();

#### gameId == 2018091001 and playId == 4033

In [None]:
# Scatter the tracked (x, y) positions of one play, one plot call per `team` value.
fig, ax = plt.subplots(figsize=(12, 8))
selected_rows = tracking2018.query('gameId == 2018091001 and playId == 4033')
for team_name, team_rows in selected_rows.groupby('team'):
    team_rows.plot(x='x', y='y', ax=ax, style='.')
plt.legend().remove();

#### gameId == 2018091609 and position == "CB"

In [None]:
# Scatter every tracked (x, y) position of the "CB" rows of one game, one plot call per `team` value.
fig, ax = plt.subplots(figsize=(12, 8))
selected_rows = tracking2018.query('gameId == 2018091609 and position == "CB"')
for team_name, team_rows in selected_rows.groupby('team'):
    team_rows.plot(x='x', y='y', ax=ax, style='.')
plt.legend().remove();

#### gameId == 2018091609 and position == "LB"

In [None]:
# Scatter every tracked (x, y) position of the "LB" rows of one game, one plot call per `team` value.
fig, ax = plt.subplots(figsize=(12, 8))
selected_rows = tracking2018.query('gameId == 2018091609 and position == "LB"')
for team_name, team_rows in selected_rows.groupby('team'):
    team_rows.plot(x='x', y='y', ax=ax, style='.')
plt.legend().remove();

#### gameId == 2018091609 and position == "RB"

In [None]:
# Scatter every tracked (x, y) position of the "RB" rows of one game, one plot call per `team` value.
fig, ax = plt.subplots(figsize=(12, 8))
selected_rows = tracking2018.query('gameId == 2018091609 and position == "RB"')
for team_name, team_rows in selected_rows.groupby('team'):
    team_rows.plot(x='x', y='y', ax=ax, style='.')
plt.legend().remove();

## Function to create football field


This function is taken from the post created by JARON_MICHAL. See the post [here](https://www.kaggle.com/jaronmichal/tracking-data-visualization)

In [None]:
import matplotlib.patches as patches
from matplotlib.patches import Arc
from matplotlib import pyplot as plt
import matplotlib.patches as mpatches

# Change size of the figure
plt.rcParams['figure.figsize'] = [12, 8]
def drawPitch(width, height, color="w"):
    """Draw a football field with matplotlib and return ``(fig, ax)``.

    The field is drawn with its lower-left corner at (0, 0): a 10-unit end
    zone on each side plus ``width`` units between them, so the outlined
    rectangle is ``width + 20`` long and ``height`` tall. Every marking is
    positioned from ``width`` and ``height``; with ``width=100`` the drawing
    is the same as before this function was parameterised.

    Parameters
    ----------
    width : int
        Length between the two end zones. It is passed to ``range``, so it
        must be an integer.
    height : float
        Height of the field.
    color : str, default "w"
        Edge colour of the field outline and of the two end-zone rectangles.

    Returns
    -------
    tuple
        The new figure and its axes.
    """
    fig = plt.figure()
    ax = plt.axes(xlim=(-10, width + 30), ylim=(-15, height + 5))
    plt.axis('off')

    # Grass around pitch
    rect = patches.Rectangle((-10, -5), width + 40, height + 10, linewidth=1, facecolor='#3f995b', capstyle='round')
    ax.add_patch(rect)

    # Pitch boundaries
    rect = plt.Rectangle((0, 0), width + 20, height, ec=color, fc="None", lw=2)
    ax.add_patch(rect)

    # Number of 5-unit segments between the end zones (20 when width is 100).
    segments = width // 5

    # vertical lines - every 5 yards
    for i in range(segments + 1):
        line_x = 10 + 5 * i
        plt.plot([line_x, line_x], [0, height], c="w", lw=2)

    # distance markers - every 10 yards, counting up to midfield and back down
    for yards in range(10, width, 10):
        yards_text = yards if yards <= width / 2 else width - yards
        # top markers
        plt.text(10 + yards - 2, height - 7.5, yards_text, size=15, c="w", weight="bold")
        # bottom markers
        plt.text(10 + yards - 2, 7.5, yards_text, size=15, c="w", weight="bold", rotation=180)

    # yards markers - one short tick per yard, in four horizontal rows
    middle_y = (height - 18.5) / 2
    tick_rows = [
        (1, 3),                                      # bottom markers
        (height - 1, height - 3),                    # top markers
        (middle_y, middle_y + 2),                    # middle bottom markers
        (height - middle_y, height - middle_y - 2),  # middle top markers
    ]
    for tick_start, tick_end in tick_rows:
        for segment in range(segments):
            # j == 0 is skipped: that position already has a 5-yard line.
            for j in range(1, 5):
                tick_x = 10 + segment * 5 + j
                plt.plot([tick_x, tick_x], [tick_start, tick_end], color="w", lw=2)

    # draw home end zone
    plt.text(2.5, (height - 15) / 2, "HOME", size=30, c="w", weight="bold", rotation=90)
    rect = plt.Rectangle((0, 0), 10, height, ec=color, fc="#0064dc", lw=2)
    ax.add_patch(rect)

    # draw away end zone (label placed relative to the right goal line)
    plt.text(width + 11, (height - 15) / 2, "AWAY", size=30, c="w", weight="bold", rotation=-90)
    rect = plt.Rectangle((width + 10, 0), 10, height, ec=color, fc="#c80014", lw=2)
    ax.add_patch(rect)

    # draw extra spot point
    spot_y = (height - 3) / 2
    # left
    plt.plot([10 + 2, 10 + 2], [spot_y, spot_y + 3], c="w", lw=2)
    # right
    plt.plot([width + 10 - 2, width + 10 - 2], [spot_y, spot_y + 3], c="w", lw=2)

    # draw goalpost
    goal_width = 6 # yards
    goal_y = (height - goal_width) / 2
    # left
    plt.plot([0, 0], [goal_y, goal_y + goal_width], "-", c="y", lw=10, ms=20)
    # right
    plt.plot([width + 20, width + 20], [goal_y, goal_y + goal_width], "-", c="y", lw=10, ms=20)

    return fig, ax

In [None]:
 # Draw the empty field once to check how it looks.
 # NOTE(review): this cell is indented by one space; a notebook accepts that,
 # but it would be an IndentationError if the line were run as a plain script.
 fig, ax = drawPitch(100, 53.3)

## Function to create animation

In [None]:
# Map every gameId to the list of distinct playIds tracked for that game.
# Grouping by the column name itself (not a one-element list) keeps the
# dictionary keys as plain gameId values; with a list, pandas 2.x hands back
# 1-tuples as group keys. Play ids are listed in order of first appearance.
games_tracking2018 = tracking2018.groupby("gameId")
games_ids = {
    game: data.playId.unique().tolist()
    for game, data in games_tracking2018
}

In [None]:
def extract_one_game(game_id, play_id, df):
    """Collect the frame-ordered (x, y) tracks of one play.

    Rows of ``df`` matching ``game_id`` and ``play_id`` are ordered by
    ``frameId`` and grouped by ``nflId``. Each group whose first ``team``
    value is "home" or "away" is stored under its (integer) jersey number.

    Returns
    -------
    tuple
        ``(home, away, balls)``: two dicts mapping jersey number to a list
        of ``(x, y)`` tuples, and the list of ``(x, y)`` tuples of the rows
        whose ``team`` is "football".
    """
    in_play = (df.gameId == game_id) & (df.playId == play_id)
    ordered = df[in_play].sort_values(['frameId'], ascending=True)

    tracks_by_side = {"home": {}, "away": {}}
    for _, track in ordered.groupby('nflId'):
        number = int(track.jerseyNumber.iloc[0])
        side = track.team.iloc[0]
        if side in tracks_by_side:
            tracks_by_side[side][number] = list(zip(track.x.tolist(), track.y.tolist()))

    ball_rows = ordered[ordered.team == "football"]
    balls = list(zip(ball_rows.x.tolist(), ball_rows.y.tolist()))
    return tracks_by_side["home"], tracks_by_side["away"], balls

In [None]:
from matplotlib import animation
from IPython.display import HTML
def animate_one_play(game_id, play_id, df):
    """Animate one play on a drawn field and return it as an HTML video.

    The field comes from ``drawPitch(100, 53.3)`` and the per-frame positions
    from ``extract_one_game``. Home players are drawn as red markers, away
    players as blue markers and the ball as a smaller black marker. One
    animation frame is produced per ball position.

    Parameters
    ----------
    game_id, play_id
        Identify the play inside ``df``.
    df : pandas.DataFrame
        Tracking data handed to ``extract_one_game``.

    Returns
    -------
    IPython.display.HTML
        The animation rendered with ``to_html5_video()``.
    """
    fig, ax = drawPitch(100, 53.3)

    home, away, balls = extract_one_game(game_id, play_id, df)

    marker_style = dict(markeredgewidth=2, markeredgecolor="white", zorder=7)
    team_left, = ax.plot([], [], 'o', markersize=20, markerfacecolor="r", **marker_style)
    team_right, = ax.plot([], [], 'o', markersize=20, markerfacecolor="b", **marker_style)
    ball, = ax.plot([], [], 'o', markersize=10, markerfacecolor="black", **marker_style)
    drawings = [team_left, team_right, ball]

    def init():
        for artist in drawings:
            artist.set_data([], [])
        return drawings

    def positions_at(tracks, i):
        """Return the x list and y list of frame ``i``, skipping tracks that have no such frame."""
        points = [track[i] for track in tracks.values() if i < len(track)]
        return [x for x, _ in points], [y for _, y in points]

    def draw_teams(i):
        team_left.set_data(*positions_at(home, i))
        team_right.set_data(*positions_at(away, i))

    def animate(i):
        draw_teams(i)

        x, y = balls[i]
        # set_data expects sequences for x and y; wrap the single point.
        ball.set_data([x], [y])
        return drawings

    # !May take a while!
    anim = animation.FuncAnimation(fig, animate, init_func=init,
                                   frames=len(balls), interval=100, blit=True)

    video = HTML(anim.to_html5_video())
    # The video now holds the frames; close the figure so it is not kept
    # open (and shown a second time as a static image).
    plt.close(fig)
    return video

In [None]:
# Animate play 36 of game 2018123000; the returned HTML video is the cell output.
animate_one_play(2018123000, 36, tracking2018)