In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found below the Kaggle input directory
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# <p style="text-align:center;"> ✨🏈 NFL Big Data Bowl - EDA 🏈✨  </p>

<div style="width:100%;text-align: center;"> <img align=middle src="https://cloudfront-us-east-1.images.arcpublishing.com/gray/42TKT722XJLFPFT45YL2PPSFS4.jpg" alt="Heat beating" style="height:300px;margin-top:3rem;">!</div>

**<p style="text-align:center;"> Beginner-friendly EDA notebook. More analyses and visualizations are on the way for future updates !! </p>**

# Upvote is Free 😇

**PLEASE UPVOTE if you like this EDA notebook.  It will keep me inspired to update my notebook.**

# IMPORTING LIBRARIES

In [None]:
import pandas as pd
from matplotlib import pyplot as plt 
import matplotlib.patches as patches
import plotly.express as px
import plotly.graph_objects as pg
from plotly import tools as tls

# <p style="text-align:center;"> ✨1. Player Data✨  </p>

**Player data:** The players.csv file contains player-level information from players that participated in any of the tracking data files. The key variable is nflId.

* nflId: Player identification number, unique across players (numeric)
* Height: Player height (text)
* Weight: Player weight (numeric)
* birthDate: Date of birth (YYYY-MM-DD)
* collegeName: Player college (text)
* Position: Player position (text)
* displayName: Player name (text)

# Loading Data

In [None]:
# Read the players table and report its size before previewing the first rows
player = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2022/players.csv')
player_row_count, player_column_count = player.shape
print("Number of data points:", player_row_count)
print("Number of columns:", player_column_count)
player.head()

# Data Cleaning

**We want to examine the dataset to know the steps taken to clean the dataset.**

In [None]:
# Column names, non-null counts and dtypes of the players table
player.info()

**From the above output, we can conclude that**

1. The birthDate and collegeName columns have missing values.

2. Heights are stored in two formats (feet-inches or inches only) and should all be converted to feet.

3. The birthDate column should be parsed as DateTime instead of Object.

4. Splitting YYYY (Year) from YYYY-MM-DD date format to calculate birth year of players.

5. Replacing abbreviated words with full names in the Position column.

# Eliminate Null Values

In [None]:
# Keep only the rows in which every column is populated
player = player.dropna(axis=0, how="any")
print("The shape of the dataset after removing null values is : ", player.shape)

# Listing out unique values from height column

In [None]:
# Frequency of each distinct raw height string
height_counts = player["height"].value_counts()
print("Unique height values and their counts :")
print(height_counts)

**From the above output, we can conclude that there are two formats in height values.**

1. Feet - Inch ( Ex: 6-3, 6-4 )
2. Inches ( Ex : 72, 78 )

# Feature Engineering - Converting all heights to feet

**Parsing Object as String and splitting height values (Ex: 6-3) into two (Ex: 6 Feet and 3 Inches)**

In [None]:
# Split each height on the first "-" into a feet part and an (optional) inch part
new_height = (
    player["height"]
    .str.split("-", n = 1, expand = True)
    .set_axis(['Feet', 'Inch'], axis=1)
)

# Store both parts on the player table as numeric columns
for height_part, target_column in (("Feet", "height_feet"), ("Inch", "height_inch")):
    player[target_column] = pd.to_numeric(new_height[height_part])

print("The shape of the dataset after feature engineering is : ", player.shape)
print(player)

**Converting inches to feet (1 inch = 1/12 ≈ 0.0833333 feet)**

In [None]:
# Heights that were recorded as plain inches (e.g. 72) have no "-" part, so the
# whole value landed in height_feet; any value above 7 is treated as inches.
# Dividing by 12 (instead of multiplying by the rounded 0.0833333) keeps equal
# heights equal: 75 inches and 6 ft 3 in both total 6.25 ft once the parts are added.
INCHES_PER_FOOT = 12

# Column - height_feet
# A whole-column assignment replaces the per-value loop around the chained
# `Series.replace(..., inplace=True)` call, which does not update the frame
# when pandas Copy-on-Write is enabled.
recorded_in_inches = player["height_feet"] > 7
player["height_feet"] = player["height_feet"].mask(
    recorded_in_inches, player["height_feet"] / INCHES_PER_FOOT
)

# Column - height_inch
player["height_inch"] = player["height_inch"] / INCHES_PER_FOOT
print(player.tail())

**Converting NaN values to zero in the height_inch column so that arithmetic operations can be performed**

In [None]:
# Rows without an inch part hold NaN in height_inch; treat them as 0 so the
# column can be added to height_feet. A single vectorized fillna is enough -
# repeating it once per row gives the same result at quadratic cost.
player["height_inch"] = player["height_inch"].fillna(0)
print(player["height_inch"])

**Now inch values are converted into feet values.**
**The next step will be getting total feet values by adding height_inch & height_feet column.**

In [None]:
# Total height in feet = whole-feet part + inch part (already expressed in feet)
total_height_feet = player["height_feet"] + player["height_inch"]
player["height_feet"] = total_height_feet
print(player["height_feet"])

# Exploring height_feet Column

In [None]:
# Tallest and shortest recorded heights, in feet
tallest_feet = max(player["height_feet"])
shortest_feet = min(player["height_feet"])
print("Maximum height of the player in feet : ", tallest_feet)
print("Minimum height of the player in feet : ", shortest_feet)

# Visualizing height_feet column by Histogram

In [None]:
# Histogram of player heights (feet), at most 10 bins
fig = px.histogram(
    data_frame=player,
    x="height_feet",
    nbins=10,
    color_discrete_sequence=["royalblue"],
    title="Player's Height In Feet",
)

# Render the figure
fig.show()

# Listing out unique values from weight column

In [None]:
# Frequency of each distinct weight value
weight_counts = player["weight"].value_counts()
print("Unique weight values and their counts :")
print(weight_counts)

**From the above output we can conclude that there is no anomaly (something that deviates from what is standard, normal, or expected)  and all weights are in lbs (pounds).**

# Exploring weight Column

In [None]:
# Heaviest and lightest recorded weights, in lbs (pounds)
heaviest_lbs = max(player["weight"])
lightest_lbs = min(player["weight"])
print("Maximum weight of the player in lbs (pounds) : ", heaviest_lbs)
print("Minimum weight of the player in lbs (pounds) : ", lightest_lbs)

# Visualizing weight column by Histogram

In [None]:
# Histogram of player weights (lbs)
fig = px.histogram(
    data_frame=player,
    x="weight",
    color_discrete_sequence=["darkblue"],
    title="Player's Weight In lbs (pounds)",
)

# Render the figure
fig.show()

# Feature Engineering - Converting YYYY-MM-DD date format to YYYY (year) format

**Parsing Object as DateTime**

In [None]:
# Convert birthDate to datetime, reporting the dtype before and after
birth_date_raw = player["birthDate"]
print("Data type of birthDate column before parsing : ", birth_date_raw.dtypes)
player["birthDate"] = pd.to_datetime(birth_date_raw)
print("Data type of birthDate column after parsing : ", player["birthDate"].dtypes)
print(player["birthDate"].head())

**Splitting YYYY (Year) from YYYY-MM-DD date format**

In [None]:
# Year component of the parsed birth date
player['birthYear'] = player['birthDate'].dt.year
print(player["birthYear"])

# Listing out unique values from birthYear column

In [None]:
# Frequency of each distinct birth year
birth_year_counts = player["birthYear"].value_counts()
print("Unique birth year values and their counts :")
print(birth_year_counts)

**From the above output we can conclude that there is no anomaly (something that deviates from what is standard, normal, or expected)  and all date values are in YYYY format.**

# Exploring birthYear Column

In [None]:
# Latest and earliest birth years in the players table
latest_birth_year = max(player["birthYear"])
earliest_birth_year = min(player["birthYear"])
print("Newest player birth year : ", latest_birth_year)
print("Oldest player birth year : ", earliest_birth_year)

# Visualizing birthYear column by Histogram

In [None]:
# Histogram of player birth years
fig = px.histogram(
    data_frame=player,
    x="birthYear",
    color_discrete_sequence=["red"],
    title="Birth Year Of Players",
)

# Render the figure
fig.show()

# Listing out unique values from collegeName column

In [None]:
print("Unique college names and their counts : ")
# Players per college, most common college first
college_names = (
    player.pivot_table(index = ['collegeName'], aggfunc = 'size')
    .reset_index()
    .set_axis(["College Names", "Counts"], axis=1)
    .sort_values("Counts", ascending = False)
)
print(college_names)

# Visualizing arrangement of colleges from highest to lowest number of players  by Treemap plot

In [None]:
# Treemap with one tile per college, sized and coloured by player count
fig = px.treemap(
    data_frame=college_names,
    path=['College Names'],
    values='Counts',
    color='Counts',
    title="Arrangement Of Colleges From Highest To Lowest Number Of Players",
)

# Render the figure
fig.show()

# Displaying the top 10 colleges with the highest number of players

In [None]:
# First ten rows of the count-sorted college table
top_colleges = college_names.head(10)
print(top_colleges)

# Visualizing the top 10 colleges with the highest number of players by Donut plot

In [None]:
# Pie chart of the top colleges on a fresh figure
fig = plt.figure(figsize=(12, 13))
plt.pie(top_colleges["Counts"], labels=top_colleges["College Names"])

# A white disc drawn over the pie centre turns the pie into a donut
circle = plt.Circle((0, 0), 0.5, color='white')
p = plt.gcf()
p.gca().add_artist(circle)

# Title, plus a legend whose entries are the player counts
plt.title("Top 10 Colleges Having Higher Number Of Players", fontsize=25)
plt.legend(top_colleges["Counts"])

# Render the figure
plt.show()

# Exploring Position column

**Replacing abbreviated words with full names.**

In [None]:
# Position abbreviation -> full position name; values that are not a key stay unchanged.
position_names = {
    "WR": "Wide Receiver", "CB": "Corner Back", "DE": "Defensive End", "OLB": "Outside Linebacker",
    "TE": "Tight End", "T": "Tackle", "RB": "Running Back", "G": "Guard", "DT": "Defensive Tackle",
    "ILB": "Inside Linebacker", "FS": "Free Safety", "SS": "Strong Safety", "C": "Center", "K": "Kicker",
    "P": "Punter", "NT": "Nose Tackle", "LS": "Long Snapper", "LB": "Linebacker", "DB": "Defensive Back",
    "MLB": "Middle Linebacker", "FB": "Full Back", "OT": "Offensive Tackle", "QB": "Quarter Back",
    "OG": "Offensive guard", "S": "Safety", "HB": "Half Back",
}

# Assign the result back to the column: calling replace(..., inplace=True) on
# `player["Position"]` is a chained assignment that does not update the frame
# when pandas Copy-on-Write is enabled.
player["Position"] = player["Position"].replace(position_names)
print(player["Position"])

# Listing out unique values from position column

In [None]:
print("Unique position values and their counts :")
# Players per position, most common position first
pos_val = (
    player.pivot_table(index = ['Position'], aggfunc = 'size')
    .reset_index()
    .set_axis(["Positions", "Counts"], axis=1)
    .sort_values("Counts", ascending = False)
)
print(pos_val)

# Visualizing frequently used to least used positions by Bar plot

In [None]:
# One bar per position, coloured by its player count
fig = px.bar(
    data_frame=pos_val,
    x="Positions",
    y="Counts",
    color=pos_val["Counts"],
    title="Arrangement Of Positions From Frequently Used To Least Used",
)

# Render the figure
fig.show()

# Tabular Visualization

# Player data having highest height in feet

In [None]:
# Rows whose height equals the maximum height in the table
max_height_feet = max(player["height_feet"])
is_tallest = player['height_feet'] == max_height_feet
height = player[is_tallest]
height

# Player data having lowest height in feet

In [None]:
# Rows whose height equals the minimum height in the table
min_height_feet = min(player["height_feet"])
is_shortest = player['height_feet'] == min_height_feet
low_height = player[is_shortest]
low_height

# Player data having highest weight in lbs

In [None]:
# Rows whose weight equals the maximum weight in the table
max_weight_lbs = max(player["weight"])
is_heaviest = player['weight'] == max_weight_lbs
weight = player[is_heaviest]
weight

# Player data having lowest weight in lbs

In [None]:
# Rows whose weight equals the minimum weight in the table
min_weight_lbs = min(player["weight"])
is_lightest = player['weight'] == min_weight_lbs
low_weight = player[is_lightest]
low_weight

# Earliest player's data

In [None]:
# Rows whose birth year equals the earliest birth year in the table
first_birth_year = min(player["birthYear"])
born_first = player['birthYear'] == first_birth_year
earliest = player[born_first]
earliest

# Newest player's data

In [None]:
# Rows whose birth year equals the latest birth year in the table
last_birth_year = max(player["birthYear"])
born_last = player['birthYear'] == last_birth_year
newest = player[born_last]
newest

# <p style="text-align:center;"> ✨2. Games Data✨  </p>

**Game data:** The games.csv contains the teams playing in each game. The key variable is gameId.
* gameId: Game identifier, unique (numeric)
* season: Season of game
* week: Week of game
* gameDate: Game Date (time, mm/dd/yyyy)
* gameTimeEastern: Start time of game (time, HH:MM:SS, EST)
* homeTeamAbbr: Home team three-letter code (text)
* visitorTeamAbbr: Visiting team three-letter code (text)

# Loading Data

In [None]:
# Read the games table and report its size before displaying it
games = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2022/games.csv')
games_row_count, games_column_count = games.shape
print("Number of data points:", games_row_count)
print("Number of columns:", games_column_count)
games

# Data Cleaning

**We want to examine the dataset to know the steps taken to clean the dataset.**

In [None]:
# Column names, non-null counts and dtypes of the games table
games.info()

**From the above output, we can conclude that**

1. The gameDate column should be parsed as DateTime instead of Object.

2. Splitting MM (Month), DD (Date) & YYYY (Year) from MM/DD/YYYY date format.

3. Converting MM (Month) into Month name.

4. Splitting HH (Hour) from HH:MM:SS time format.

5. Replacing abbreviated words with full names in the homeTeamAbbr and visitorTeamAbbr columns.

# Listing out unique values from season column

In [None]:
print("Unique NFL seasons and their counts :")
# Games per season, largest count first
g_season = (
    games.pivot_table(index = ['season'], aggfunc = 'size')
    .reset_index()
    .set_axis(["Seasons", "Counts"], axis=1)
    .sort_values("Counts", ascending = False)
)
print(g_season)

# Visualizing NFL seasons by Bar plot

In [None]:
# One bar per season, coloured by season
fig = px.bar(
    data_frame=g_season,
    x="Seasons",
    y="Counts",
    color="Seasons",
    title="NFL Seasons",
)

# Render the figure
fig.show()

# Listing out unique values from week column

In [None]:
print("Unique NFL weeks and their counts :")
# Games per week, largest count first
g_week = (
    games.pivot_table(index = ['week'], aggfunc = 'size')
    .reset_index()
    .set_axis(["Weeks", "Counts"], axis=1)
    .sort_values("Counts", ascending = False)
)
print(g_week)

# Visualizing NFL weeks by Bar plot

In [None]:
# One bar per week, coloured by week
fig = px.bar(
    data_frame=g_week,
    x="Weeks",
    y="Counts",
    color="Weeks",
    title="NFL Weeks",
)

# Render the figure
fig.show()

# Listing out unique values from gameDate column

In [None]:
print("Unique NFL dates and their counts :")
# Games per game date, largest count first
g_date = (
    games.pivot_table(index = ['gameDate'], aggfunc = 'size')
    .reset_index()
    .set_axis(["Date", "Counts"], axis=1)
    .sort_values("Counts", ascending = False)
)
print(g_date)

# Visualizing NFL dates by Bar plot

In [None]:
# One bar per game date, coloured by the number of games
fig = px.bar(
    data_frame=g_date,
    x="Date",
    y="Counts",
    color="Counts",
    title='Dates On Which NFL Events Occurred',
)

# Render the figure
fig.show()

# Feature Engineering - Splitting MM (Month), DD (Date) & YYYY (Year) from MM/DD/YYYY date format

**Parsing Object as DateTime**

In [None]:
# Convert gameDate to datetime, reporting the dtype before and after
game_date_raw = games["gameDate"]
print("Data type of gameDate column before parsing : ", game_date_raw.dtypes)
games["gameDate"] = pd.to_datetime(game_date_raw)
print("Data type of gameDate column after parsing : ", games["gameDate"].dtypes)
print(games["gameDate"].head())

**Splitting YYYY (Year) from MM/DD/YYYY date format**

In [None]:
# Year component of the parsed game date
games['gameYear'] = games['gameDate'].dt.year
print(games["gameYear"])

# Listing out unique values from year column

In [None]:
print("Unique NFL years and their counts :")
# Games per calendar year, largest count first
g_year = (
    games.pivot_table(index = ['gameYear'], aggfunc = 'size')
    .reset_index()
    .set_axis(["Year", "Counts"], axis=1)
    .sort_values("Counts", ascending = False)
)
print(g_year)

# Visualizing NFL years by Bar plot

In [None]:
# One bar per calendar year, coloured by year
fig = px.bar(
    data_frame=g_year,
    x="Year",
    y="Counts",
    color="Year",
    title="Distinct Count Of NFL Event Years",
)

# Render the figure
fig.show()

**Splitting MM (Month) from MM/DD/YYYY date format and converting it into Month name**

In [None]:
# Month name of each parsed game date
game_month_names = games["gameDate"].dt.month_name()
games["gameMonthName"] = game_month_names
print(games["gameMonthName"])

# Listing out unique values from month name column

In [None]:
print("Unique NFL months and their counts :")
# Games per month name, largest count first
g_month = (
    games.pivot_table(index = ['gameMonthName'], aggfunc = 'size')
    .reset_index()
    .set_axis(["Month", "Counts"], axis=1)
    .sort_values("Counts", ascending = False)
)
print(g_month)

# Visualizing NFL months by Bar plot

In [None]:
# One bar per month, coloured by month
fig = px.bar(
    data_frame=g_month,
    x="Month",
    y="Counts",
    color="Month",
    title="Distinct Count Of NFL Event Months",
)

# Render the figure
fig.show()

**Splitting DD (Date) from MM/DD/YYYY date format**

In [None]:
# Day-of-month component of the parsed game date
games['gameDay'] = games['gameDate'].dt.day
print(games["gameDay"])

# Listing out unique values from day column

In [None]:
print("Unique NFL days and their counts :")
# Games per day of month, largest count first
g_days = (
    games.pivot_table(index = ['gameDay'], aggfunc = 'size')
    .reset_index()
    .set_axis(["Day", "Counts"], axis=1)
    .sort_values("Counts", ascending = False)
)
print(g_days)

# Visualizing NFL days by Bar plot

In [None]:
# One bar per day of month, coloured by the number of games
fig = px.bar(
    data_frame=g_days,
    x="Day",
    y="Counts",
    color="Counts",
    title="Distinct Count Of NFL Event Days",
)

# Render the figure
fig.show()

# Listing out unique values from gameTimeEastern column

In [None]:
print("Unique NFL timings and their counts :")
# Games per kickoff time, largest count first
g_time = (
    games.pivot_table(index = ['gameTimeEastern'], aggfunc = 'size')
    .reset_index()
    .set_axis(["Time", "Counts"], axis=1)
    .sort_values("Counts", ascending = False)
)
print(g_time)

# Visualizing NFL timings by Bar plot

In [None]:
# One bar per kickoff time, coloured by the number of games
fig = px.bar(
    data_frame=g_time,
    x="Time",
    y="Counts",
    color="Counts",
    title="Distinct Count Of NFL Event Timings",
)

# Render the figure
fig.show()

**Splitting HH (Hour) from HH:MM:SS time format**

In [None]:
# Parse the HH:MM:SS kickoff time and keep only its hour component
kickoff_time = pd.to_datetime(games["gameTimeEastern"], format="%H:%M:%S")
games["hour"] = kickoff_time.dt.hour
print(games["hour"])

# Listing out unique values from hour column

In [None]:
print("Unique NFL hours and their counts :")
# Games per kickoff hour, largest count first
g_hour = (
    games.pivot_table(index = ['hour'], aggfunc = 'size')
    .reset_index()
    .set_axis(["Hour", "Counts"], axis=1)
    .sort_values("Counts", ascending = False)
)
print(g_hour)

# Visualizing NFL hours by Bar plot

In [None]:
# One bar per kickoff hour, coloured by the number of games
fig = px.bar(
    data_frame=g_hour,
    x="Hour",
    y="Counts",
    color="Counts",
    title="Distinct Count Of NFL Event Hours",
)

# Render the figure
fig.show()

# Exploring homeTeamAbbr column

**Replacing abbreviated words with full names.**

In [None]:
# Team code -> full team name; values that are not a key stay unchanged.
home_team_names = {
    "ARI": "Arizona Cardinals", "KC": "Kansas City Chiefs", "TEN": "Tennessee Titans", "TB": "Tampa Bay Buccaneers",
    "SF": "San Francisco 49ers", "SEA": "Seattle Seahawks", "PIT": "Pittsburgh Steelers", "PHI": "Philadelphia Eagles",
    "NYJ": "New York Jets", "NYG": "New York Giants", "NO": "New Orleans Saints", "NE": "New England Patriots",
    "MIA": "Miami Dolphins", "ATL": "Atlanta Falcons", "LA": "Los Angeles Rams", "JAX": "Jacksonville Jaguars",
    "IND": "Indianapolis Colts", "BAL": "Baltimore Ravens", "BUF": "Buffalo Bills", "CAR": "Carolina Panthers",
    "CIN": "Cincinnati Bengals", "CLE": "Cleveland Browns", "DAL": "Dallas Cowboys", "DET": "Detroit Lions",
    "GB": "Green Bay Packers", "HOU": "Houston Texans", "WAS": "Washington Football Team", "DEN": "Denver Broncos",
    "MIN": "Minnesota Vikings", "CHI": "Chicago Bears", "LAC": "Los Angeles Chargers", "OAK": "Oakland Raiders",
    "LV": "Las Vegas Raiders",
}

# Assign the result back to the column: calling replace(..., inplace=True) on
# `games["homeTeamAbbr"]` is a chained assignment that does not update the
# frame when pandas Copy-on-Write is enabled.
games["homeTeamAbbr"] = games["homeTeamAbbr"].replace(home_team_names)
print(games["homeTeamAbbr"])

# Listing out unique values from homeTeamAbbr column

In [None]:
print("Unique NFL home teams and their counts :")
# Games per home team, largest count first
g_home = (
    games.pivot_table(index = ['homeTeamAbbr'], aggfunc = 'size')
    .reset_index()
    .set_axis(["Home Team", "Counts"], axis=1)
    .sort_values("Counts", ascending = False)
)
print(g_home)

# Visualizing NFL Home Teams by Bar plot

In [None]:
# One bar per home team, coloured by the number of games
fig = px.bar(
    data_frame=g_home,
    x="Home Team",
    y="Counts",
    color="Counts",
    title="Distinct Count Of NFL Home Teams",
)

# Render the figure
fig.show()

# Exploring visitorTeamAbbr column

**Replacing abbreviated words with full names.**


In [None]:
# Team code -> full team name; values that are not a key stay unchanged.
visitor_team_names = {
    "ARI": "Arizona Cardinals", "KC": "Kansas City Chiefs", "TEN": "Tennessee Titans", "TB": "Tampa Bay Buccaneers",
    "SF": "San Francisco 49ers", "SEA": "Seattle Seahawks", "PIT": "Pittsburgh Steelers", "PHI": "Philadelphia Eagles",
    "NYJ": "New York Jets", "NYG": "New York Giants", "NO": "New Orleans Saints", "NE": "New England Patriots",
    "MIA": "Miami Dolphins", "ATL": "Atlanta Falcons", "LA": "Los Angeles Rams", "JAX": "Jacksonville Jaguars",
    "IND": "Indianapolis Colts", "BAL": "Baltimore Ravens", "BUF": "Buffalo Bills", "CAR": "Carolina Panthers",
    "CIN": "Cincinnati Bengals", "CLE": "Cleveland Browns", "DAL": "Dallas Cowboys", "DET": "Detroit Lions",
    "GB": "Green Bay Packers", "HOU": "Houston Texans", "WAS": "Washington Football Team", "DEN": "Denver Broncos",
    "MIN": "Minnesota Vikings", "CHI": "Chicago Bears", "LAC": "Los Angeles Chargers", "OAK": "Oakland Raiders",
    "LV": "Las Vegas Raiders",
}

# Assign the result back to the column: calling replace(..., inplace=True) on
# `games["visitorTeamAbbr"]` is a chained assignment that does not update the
# frame when pandas Copy-on-Write is enabled.
games["visitorTeamAbbr"] = games["visitorTeamAbbr"].replace(visitor_team_names)
print(games["visitorTeamAbbr"])

# Listing out unique values from visitorTeamAbbr column

In [None]:
print("Unique NFL visitor teams and their counts :")
# Games per visiting team, largest count first
g_visit = (
    games.pivot_table(index = ['visitorTeamAbbr'], aggfunc = 'size')
    .reset_index()
    .set_axis(["Visitor Team", "Counts"], axis=1)
    .sort_values("Counts", ascending = False)
)
print(g_visit)

# Visualizing NFL Visitor Teams by Bar plot

In [None]:
# One bar per visiting team, coloured by the number of games
fig = px.bar(
    data_frame=g_visit,
    x="Visitor Team",
    y="Counts",
    color="Counts",
    title="Distinct Count Of NFL Visitor Teams",
)

# Render the figure
fig.show()

# <p style="text-align:center;"> ✨3. Play Data✨  </p>

**Play data:** The plays.csv file contains play-level information from each game. The key variables are gameId and playId.
* gameId: Game identifier, unique (numeric)
* playId: Play identifier, not unique across games (numeric)
* playDescription: Description of play (text)
* quarter: Game quarter (numeric)
* down: Down (numeric)
* yardsToGo: Distance needed for a first down (numeric)
* possessionTeam: Team punting, placekicking or kicking off the ball (text)
* specialTeamsPlayType: Formation of play: Extra Point, Field Goal, Kickoff or Punt (text)
* specialTeamsResult: Special Teams outcome of play dependent on play type: Blocked Kick Attempt, Blocked Punt, Downed, Fair Catch, Kick Attempt Good, Kick Attempt No Good, Kickoff Team Recovery, Muffed, Non-Special Teams Result, Out of Bounds, Return or Touchback (text)
* kickerId: nflId of placekicker, punter or kickoff specialist on play (numeric)
* returnerId: nflId(s) of returner(s) on play if there was a special teams return. Multiple returners on a play are separated by a ; (text)
* kickBlockerId: nflId of blocker of kick on play if there was a blocked field goal or blocked punt (numeric)
* yardlineSide: 3-letter team code corresponding to line-of-scrimmage (text)
* yardlineNumber: Yard line at line-of-scrimmage (numeric)
* gameClock: Time on clock of play (MM:SS)
* penaltyCodes: NFL categorization of the penalties that occurred on the play. A standard penalty code followed by a d means the penalty was on the defense. Multiple penalties on a play are separated by a ; (text)
* penaltyJerseyNumber: Jersey number and team code of the player committing each penalty. Multiple penalties on a play are separated by a ; (text)
* penaltyYards: yards gained by possessionTeam by penalty (numeric)
* preSnapHomeScore: Home score prior to the play (numeric)
* preSnapVisitorScore: Visiting team score prior to the play (numeric)
* passResult: Scrimmage outcome of the play if specialTeamsPlayResult is "Non-Special Teams Result" (C: Complete pass, I: Incomplete pass, S: Quarterback sack, IN: Intercepted pass, R: Scramble, ' ': Designed Rush, text)
* kickLength: Kick length in air of kickoff, field goal or punt (numeric)
* kickReturnYardage: Yards gained by return team if there was a return on a kickoff or punt (numeric)
* playResult: Net yards gained by the kicking team, including penalty yardage (numeric)
* absoluteYardlineNumber: Location of ball downfield in tracking data coordinates (numeric)

# Loading Data

In [None]:
# Read the plays table and report its size before previewing the first rows
play = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2022/plays.csv')
play_row_count, play_column_count = play.shape
print("Number of data points:", play_row_count)
print("Number of columns:", play_column_count)
play.head()

# Data Cleaning

**We want to examine the dataset to know the steps taken to clean the dataset.**


In [None]:
# Column names, non-null counts and dtypes of the plays table
play.info()

**From the above output, we can conclude that**

1. The kickerId, returnerId, kickBlockerId, yardlineSide, penaltyCodes, penaltyJerseyNumbers, penaltyYards, passResult, kickLength and kickReturnYardage columns have missing values.

2. Replacing abbreviated words with full names in the possessionTeam and yardlineSide columns.

3. Splitting HH (Hour) from HH:MM:SS time format.

# Eliminate Null Values

In [None]:
# Separate copy with every incomplete row removed; `play` itself is left untouched
play_null = play.dropna(axis=0, how="any")
print("The shape of the dataset after removing null values is : ", play_null.shape)

**From the above output, we can conclude that removing every row containing a null value would leave no data. So we keep those rows and perform the EDA on the full plays dataset.**

# Listing out unique values from quarter column

In [None]:
print("Unique NFL game quarter values and their counts :")
# Plays per quarter, largest count first
play_qrtr = (
    play.pivot_table(index = ['quarter'], aggfunc = 'size')
    .reset_index()
    .set_axis(["Quarter", "Counts"], axis=1)
    .sort_values("Counts", ascending = False)
)
print(play_qrtr)

# Visualizing NFL game quarters by Bar plot

In [None]:
# One bar per quarter, coloured by quarter
fig = px.bar(
    data_frame=play_qrtr,
    x="Quarter",
    y="Counts",
    color="Quarter",
    title="Distinct Count Of NFL Game Quarters",
)

# Render the figure
fig.show()

# Listing out unique values from down column

In [None]:
print("Unique NFL game downs and their counts :")
# Plays per down, largest count first
g_down = (
    play.pivot_table(index = ['down'], aggfunc = 'size')
    .reset_index()
    .set_axis(["Down", "Counts"], axis=1)
    .sort_values("Counts", ascending = False)
)
print(g_down)

# Visualizing NFL game downs by Bar plot

In [None]:
# One bar per down, coloured by down
fig = px.bar(
    data_frame=g_down,
    x="Down",
    y="Counts",
    color="Down",
    title="Distinct Count Of NFL Game Downs",
)

# Render the figure
fig.show()

# Listing out unique values from yardsToGo column

In [None]:
print("Unique NFL yards to go and their counts :")
# Plays per yards-to-go value, largest count first
g_yards = (
    play.pivot_table(index = ['yardsToGo'], aggfunc = 'size')
    .reset_index()
    .set_axis(["Yards To Go", "Counts"], axis=1)
    .sort_values("Counts", ascending = False)
)
print(g_yards)

# Visualizing NFL yards to go by Bar plot

In [None]:
# One bar per yards-to-go value, coloured by that value
fig = px.bar(
    data_frame=g_yards,
    x="Yards To Go",
    y="Counts",
    color="Yards To Go",
    title="Distinct Count Of Yards To Go In NFL Game",
)

# Render the figure
fig.show()

# Exploring possessionTeam column

**Replacing abbreviated words with full names.**

In [None]:
# Team code -> full team name; values that are not a key stay unchanged.
possession_team_names = {
    "ARI": "Arizona Cardinals", "KC": "Kansas City Chiefs", "TEN": "Tennessee Titans", "TB": "Tampa Bay Buccaneers",
    "SF": "San Francisco 49ers", "SEA": "Seattle Seahawks", "PIT": "Pittsburgh Steelers", "PHI": "Philadelphia Eagles",
    "NYJ": "New York Jets", "NYG": "New York Giants", "NO": "New Orleans Saints", "NE": "New England Patriots",
    "MIA": "Miami Dolphins", "ATL": "Atlanta Falcons", "LA": "Los Angeles Rams", "JAX": "Jacksonville Jaguars",
    "IND": "Indianapolis Colts", "BAL": "Baltimore Ravens", "BUF": "Buffalo Bills", "CAR": "Carolina Panthers",
    "CIN": "Cincinnati Bengals", "CLE": "Cleveland Browns", "DAL": "Dallas Cowboys", "DET": "Detroit Lions",
    "GB": "Green Bay Packers", "HOU": "Houston Texans", "WAS": "Washington Football Team", "DEN": "Denver Broncos",
    "MIN": "Minnesota Vikings", "CHI": "Chicago Bears", "LAC": "Los Angeles Chargers", "OAK": "Oakland Raiders",
    "LV": "Las Vegas Raiders",
}

# Assign the result back to the column: calling replace(..., inplace=True) on
# `play["possessionTeam"]` is a chained assignment that does not update the
# frame when pandas Copy-on-Write is enabled.
play["possessionTeam"] = play["possessionTeam"].replace(possession_team_names)
print(play["possessionTeam"])

# Listing out unique values from possessionTeam column

In [None]:
print("Unique NFL possession teams and their counts :")
# Plays per possession team, largest count first
p_team = (
    play.pivot_table(index = ['possessionTeam'], aggfunc = 'size')
    .reset_index()
    .set_axis(["Possession Team", "Counts"], axis=1)
    .sort_values("Counts", ascending = False)
)
print(p_team)

# Visualizing NFL Possession Teams by Bar plot

In [None]:
# One bar per possession team, coloured by the number of plays
fig = px.bar(
    data_frame=p_team,
    x="Possession Team",
    y="Counts",
    color="Counts",
    title="Distinct Count Of NFL Possession Teams",
)

# Render the figure
fig.show()


# Listing out unique values from specialTeamsPlayType column

In [None]:
print("Unique NFL special teams play type and their counts :")
# Plays per special-teams play type, largest count first
gp_type = (
    play.pivot_table(index = ['specialTeamsPlayType'], aggfunc = 'size')
    .reset_index()
    .set_axis(["Special Teams Play Type", "Counts"], axis=1)
    .sort_values("Counts", ascending = False)
)
print(gp_type)

# Visualizing NFL Special Teams play type by Bar plot

In [None]:
# One bar per special-teams play type, coloured by the number of plays
fig = px.bar(
    data_frame=gp_type,
    x="Special Teams Play Type",
    y="Counts",
    color="Counts",
    title="Distinct Count Of NFL Special Teams Play Type",
)

# Render the figure
fig.show()

# Listing out unique values from specialTeamsResult column

In [None]:
print("Unique NFL special teams result and their counts :")
# Plays per special-teams result, largest count first
gp_res = (
    play.pivot_table(index = ['specialTeamsResult'], aggfunc = 'size')
    .reset_index()
    .set_axis(["Special Teams Result", "Counts"], axis=1)
    .sort_values("Counts", ascending = False)
)
print(gp_res)

# Visualizing NFL Special Teams result by Bar plot

In [None]:
# One bar per special-teams result, coloured by the number of plays
fig = px.bar(
    data_frame=gp_res,
    x="Special Teams Result",
    y="Counts",
    color="Counts",
    title="Distinct Count Of NFL Special Teams Result",
)

# Render the figure
fig.show()

# Exploring yardline side column

**Replacing abbreviated words with full names.**

In [None]:
# Team code -> full team name; values that are not a key stay unchanged.
yardline_side_names = {
    "ARI": "Arizona Cardinals", "KC": "Kansas City Chiefs", "TEN": "Tennessee Titans", "TB": "Tampa Bay Buccaneers",
    "SF": "San Francisco 49ers", "SEA": "Seattle Seahawks", "PIT": "Pittsburgh Steelers", "PHI": "Philadelphia Eagles",
    "NYJ": "New York Jets", "NYG": "New York Giants", "NO": "New Orleans Saints", "NE": "New England Patriots",
    "MIA": "Miami Dolphins", "ATL": "Atlanta Falcons", "LA": "Los Angeles Rams", "JAX": "Jacksonville Jaguars",
    "IND": "Indianapolis Colts", "BAL": "Baltimore Ravens", "BUF": "Buffalo Bills", "CAR": "Carolina Panthers",
    "CIN": "Cincinnati Bengals", "CLE": "Cleveland Browns", "DAL": "Dallas Cowboys", "DET": "Detroit Lions",
    "GB": "Green Bay Packers", "HOU": "Houston Texans", "WAS": "Washington Football Team", "DEN": "Denver Broncos",
    "MIN": "Minnesota Vikings", "CHI": "Chicago Bears", "LAC": "Los Angeles Chargers", "OAK": "Oakland Raiders",
    "LV": "Las Vegas Raiders",
}

# Assign the result back to the column: calling replace(..., inplace=True) on
# `play["yardlineSide"]` is a chained assignment that does not update the
# frame when pandas Copy-on-Write is enabled.
play["yardlineSide"] = play["yardlineSide"].replace(yardline_side_names)
print(play["yardlineSide"])

# Listing out unique values from NFL yardline side column

In [None]:
print("Unique NFL yardline sides and their counts :")
# Plays per yardline side, largest count first
g_yard = (
    play.pivot_table(index = ['yardlineSide'], aggfunc = 'size')
    .reset_index()
    .set_axis(["Yard Line Side", "Counts"], axis=1)
    .sort_values("Counts", ascending = False)
)
print(g_yard)

# Visualizing NFL yardline sides by Bar plot

In [None]:
# One bar per yardline side, coloured by the number of plays
fig = px.bar(
    data_frame=g_yard,
    x="Yard Line Side",
    y="Counts",
    color="Counts",
    title="Distinct Count Of NFL Yardline Sides",
)

# Render the figure
fig.show()

# Listing out unique values from yardline number column

In [None]:
print("Unique NFL yardline numbers and their counts :")
# Plays per yardline number, largest count first (reuses the g_yard name)
g_yard = (
    play.pivot_table(index = ['yardlineNumber'], aggfunc = 'size')
    .reset_index()
    .set_axis(["Yard Line Number", "Counts"], axis=1)
    .sort_values("Counts", ascending = False)
)
print(g_yard)

# Visualizing NFL yardline numbers by Bar plot

In [None]:
# One bar per yardline number, coloured by yardline number
fig = px.bar(
    data_frame=g_yard,
    x="Yard Line Number",
    y="Counts",
    color="Yard Line Number",
    title="Distinct Count Of NFL Yard Line Numbers",
)

# Render the figure
fig.show()

# Feature Engineering - gameClockhour column

**Splitting HH (Hour) from HH:MM:SS time format**

In [None]:
# NOTE(review): the data dictionary in this notebook describes gameClock as MM:SS,
# yet it is parsed here with an HH:MM:SS format; the extracted "hour" may really
# be the minutes on the game clock - confirm against the raw values.
game_clock = pd.to_datetime(play["gameClock"], format="%H:%M:%S")
play["gameClockhour"] = game_clock.dt.hour
print(play["gameClockhour"])

# Listing out unique values from game clock hour column

In [None]:
# Count the rows of `play` for each gameClockhour value and list them
# from most to least frequent.
print("Unique NFL game hours and their counts :")
g_hour = (
    play.pivot_table(index=["gameClockhour"], aggfunc="size")
    .reset_index()
    .set_axis(["Game Hour", "Counts"], axis="columns")
    .sort_values("Counts", ascending=False)
)
print(g_hour)

# Visualizing NFL game hours by Bar plot

In [None]:
# Bar chart of the per-hour counts, with bars shaded by their count
fig = px.bar(
    g_hour,
    x="Game Hour",
    y="Counts",
    color="Counts",
    title="Distinct Count Of NFL Game Hours",
)
fig.show()

# Listing out unique values from penalty codes column

In [None]:
# Count the rows of `play` for each distinct penaltyCodes value and list
# them from most to least frequent.
print("Unique NFL penalty codes and their counts :")
g_code = (
    play.pivot_table(index=["penaltyCodes"], aggfunc="size")
    .reset_index()
    .set_axis(["Penalty Codes", "Counts"], axis="columns")
    .sort_values("Counts", ascending=False)
)
print(g_code)

# Visualizing NFL penalty codes by Bar plot

In [None]:
# Bar chart of the per-penalty-code counts, with bars shaded by their count
fig = px.bar(
    g_code,
    x="Penalty Codes",
    y="Counts",
    color="Counts",
    title="Distinct Count Of NFL Penalty Codes",
)
fig.show()

# Listing out unique values from penalty yards column

In [None]:
# Count the rows of `play` for each distinct penaltyYards value and list
# them from most to least frequent.
print("Unique NFL penalty yards and their counts :")
gp_yard = (
    play.pivot_table(index=["penaltyYards"], aggfunc="size")
    .reset_index()
    .set_axis(["Penalty Yards", "Counts"], axis="columns")
    .sort_values("Counts", ascending=False)
)
print(gp_yard)

# Visualizing NFL penalty yards by Histogram plot

In [None]:
# Histogram of penaltyYards straight from the `play` frame (15 bins).
# The title previously read "Distinct Count Of NFL Visitor Teams", a
# copy-paste leftover that mislabelled this penalty-yards figure.
fig = px.histogram(
    play,
    x="penaltyYards",
    nbins=15,
    title="Distinct Count Of NFL Penalty Yards",
    color_discrete_sequence=["royalblue"],
)

# Displaying Histogram plot
fig.show()

# Listing out unique values from pre-snap home score column

In [None]:
# Count the rows of `play` for each distinct preSnapHomeScore value and
# list them from most to least frequent.
print("Unique NFL pre-snap home scores and their counts :")
g_home = (
    play.pivot_table(index=["preSnapHomeScore"], aggfunc="size")
    .reset_index()
    .set_axis(["Pre-Snap Home Score", "Counts"], axis="columns")
    .sort_values("Counts", ascending=False)
)
print(g_home)

# Visualizing NFL pre-snap home scores by Histogram plot

In [None]:
# Histogram of preSnapHomeScore straight from the `play` frame (25 bins)
fig = px.histogram(
    play,
    x="preSnapHomeScore",
    nbins=25,
    color_discrete_sequence=["red"],
    title="Distinct Count Of NFL Pre-Snap Home Scores",
)
fig.show()

# Listing out unique values from pre-snap visitor score column

In [None]:
# Count the rows of `play` for each distinct preSnapVisitorScore value and
# list them from most to least frequent.
print("Unique NFL pre-snap visitor scores and their counts :")
g_visitor = (
    play.pivot_table(index=["preSnapVisitorScore"], aggfunc="size")
    .reset_index()
    .set_axis(["Pre-Snap Visitor Score", "Counts"], axis="columns")
    .sort_values("Counts", ascending=False)
)
print(g_visitor)

# Visualizing NFL pre-snap visitor scores by Histogram plot

In [None]:
# Histogram of preSnapVisitorScore straight from the `play` frame (25 bins)
fig = px.histogram(
    play,
    x="preSnapVisitorScore",
    nbins=25,
    color_discrete_sequence=["blueviolet"],
    title="Distinct Count Of NFL Pre-Snap Visitor Scores",
)
fig.show()

# Listing out unique values from pass result column

In [None]:
# Count the rows of `play` for each distinct passResult value and list
# them from most to least frequent.
print("Unique NFL pass results and their counts :")
g_res = (
    play.pivot_table(index=["passResult"], aggfunc="size")
    .reset_index()
    .set_axis(["Pass Results", "Counts"], axis="columns")
    .sort_values("Counts", ascending=False)
)
print(g_res)

# Visualizing NFL pass results by Bar plot

In [None]:
# Bar chart of the per-pass-result counts, with bars shaded by their count
fig = px.bar(
    g_res,
    x="Pass Results",
    y="Counts",
    color="Counts",
    title="Distinct Count Of NFL Pass Results",
)
fig.show()

# Listing out unique values from kick-length column

In [None]:
# Count the rows of `play` for each distinct kickLength value and list
# them from most to least frequent.
print("Unique NFL kick lengths and their counts :")
g_kick = (
    play.pivot_table(index=["kickLength"], aggfunc="size")
    .reset_index()
    .set_axis(["Kick Length", "Counts"], axis="columns")
    .sort_values("Counts", ascending=False)
)
print(g_kick)

# Visualizing NFL kick lengths by Bar plot

In [None]:
# Bar chart of the per-kick-length counts, coloured by the kick length
fig = px.bar(
    g_kick,
    x="Kick Length",
    y="Counts",
    color="Kick Length",
    title="Distinct Count Of NFL Kick lengths",
)
fig.show()

# Listing out unique values from kick return yardage column

In [None]:
# Count the rows of `play` for each distinct kickReturnYardage value and
# list them from most to least frequent.
# Note: this rebinds `g_yard`, replacing the yardline table built earlier.
print("Unique NFL kick return yardages and their counts :")
g_yard = (
    play.pivot_table(index=["kickReturnYardage"], aggfunc="size")
    .reset_index()
    .set_axis(["Kick Return Yardage", "Counts"], axis="columns")
    .sort_values("Counts", ascending=False)
)
print(g_yard)

# Visualizing NFL kick return yardages by Histogram plot

In [None]:
# Histogram of kickReturnYardage straight from the `play` frame (25 bins)
fig = px.histogram(
    play,
    x="kickReturnYardage",
    nbins=25,
    color_discrete_sequence=["mediumvioletred"],
    title="Distinct Count Of NFL Kick Return Yardages",
)
fig.show()

# Listing out unique values from play results column

In [None]:
# Count the rows of `play` for each distinct playResult value and list
# them from most to least frequent.
print("Unique NFL play results and their counts :")
gp_res = (
    play.pivot_table(index=["playResult"], aggfunc="size")
    .reset_index()
    .set_axis(["Play Result", "Counts"], axis="columns")
    .sort_values("Counts", ascending=False)
)
print(gp_res)

# Visualizing NFL play results by Histogram plot

In [None]:
# Histogram of playResult straight from the `play` frame (10 bins)
fig = px.histogram(
    play,
    x="playResult",
    nbins=10,
    color_discrete_sequence=["blue"],
    title="Distinct Count Of NFL Play Results",
)
fig.show()

# Listing out unique values from absolute yardline number column

In [None]:
# Count the rows of `play` for each distinct absoluteYardlineNumber value
# and list them from most to least frequent.
print("Unique NFL absolute yardline numbers and their counts :")
g_abyl = (
    play.pivot_table(index=["absoluteYardlineNumber"], aggfunc="size")
    .reset_index()
    .set_axis(["Absolute YardLine Number", "Counts"], axis="columns")
    .sort_values("Counts", ascending=False)
)
print(g_abyl)

# Visualizing NFL absolute yardline numbers by Bar plot

In [None]:
# Bar chart of the per-absolute-yardline counts, coloured by the yardline number
fig = px.bar(
    g_abyl,
    x="Absolute YardLine Number",
    y="Counts",
    color="Absolute YardLine Number",
    title="Distinct Count Of NFL Absolute Yard Line Number",
)
fig.show()

# <p style="text-align:center;"> ✨4. Tracking data - 2018✨  </p>

**Tracking data - 2018:** The file tracking2018.csv contains player tracking data from the 2018 season. The key variables are gameId, playId, and nflId.

* time: Time stamp of play (time, yyyy-mm-dd, hh:mm:ss)
* x: Player position along the long axis of the field, 0 - 120 yards. See Figure 1 below. (numeric)
* y: Player position along the short axis of the field, 0 - 53.3 yards. See Figure 1 below. (numeric)
* s: Speed in yards/second (numeric)
* a: Acceleration in yards/second^2 (numeric)
* dis: Distance traveled from prior time point, in yards (numeric)
* o: Player orientation (deg), 0 - 360 degrees (numeric)
* dir: Angle of player motion (deg), 0 - 360 degrees (numeric)
* event: Tagged play details, including moment of ball snap, pass release, pass catch, tackle, etc (text)
* nflId: Player identification number, unique across players (numeric)
* displayName: Player name (text)
* jerseyNumber: Jersey number of player (numeric)
* position: Player position group (text)
* team: Team (away or home) of corresponding player (text)
* frameId: Frame identifier for each play, starting at 1 (numeric)
* gameId: Game identifier, unique (numeric)
* playId: Play identifier, not unique across games (numeric)
* playDirection: Direction that the offense is moving (left or right)

# Loading Data

In [None]:
# Read the 2018 tracking file, report its dimensions, then preview it
tr18 = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2022/tracking2018.csv')
print("Number of data points:", tr18.shape[0])
print("Number of columns:", len(tr18.columns))
tr18.head()

# Dataset Summary

In [None]:
# Print a concise summary of the tracking frame (its columns and their dtypes)
tr18.info()

**From the above output, we can conclude that the date should be split out of the datetime column.**

# Feature Engineering - Splitting date from datetime

In [None]:
# Parse the `time` column as datetimes and keep only the calendar-date part
# (stored as Python `date` objects) in a new `date` column.
tr18['date'] = pd.to_datetime(tr18['time']).dt.date
print(tr18["date"])

# Listing out unique values from date column

In [None]:
# Count the tracking rows recorded on each date and list the dates from
# most to least frequent.
print("Unique NFL dates and their counts :")
tr_date = (
    tr18.pivot_table(index=["date"], aggfunc="size")
    .reset_index()
    .set_axis(["Date", "Counts"], axis="columns")
    .sort_values("Counts", ascending=False)
)
print(tr_date)

# Visualizing NFL dates by Bar plot

In [None]:
# Bar chart of the per-date row counts, with bars shaded by their count
fig = px.bar(
    tr_date,
    x="Date",
    y="Counts",
    color="Counts",
    title="Distinct Count Of NFL Dates",
)
fig.show()

# Creating new dataset using playId, position and gameId

**playId == 36 and gameId == 2018123000**

In [None]:
# Keep only the tracking rows of play 36 in game 2018123000
data = tr18[(tr18["playId"] == 36) & (tr18["gameId"] == 2018123000)]
print(data.loc[:, ["x", "y", "team"]])

# Visualizing data regarding playId = 36 and gameId = 2018123000

In [None]:
# Scatter the selected rows' (x, y) positions, one colour per team value
fig = px.scatter(
    data,
    x="x",
    y="y",
    color="team",
)
fig.show()

**playId == 36 and gameId == 2018102107**

In [None]:
# Keep only the tracking rows of play 36 in game 2018102107
data = tr18[(tr18["playId"] == 36) & (tr18["gameId"] == 2018102107)]
print(data.loc[:, ["x", "y", "team"]])

# Visualizing data regarding playId = 36 and gameId = 2018102107

In [None]:
# Scatter the selected rows' (x, y) positions, one colour per team value
fig = px.scatter(
    data,
    x="x",
    y="y",
    color="team",
)
fig.show()

**position == "CB" and gameId == 2018111900**

In [None]:
# Keep only the tracking rows whose position is "CB" in game 2018111900
data = tr18[(tr18["position"] == "CB") & (tr18["gameId"] == 2018111900)]
print(data.loc[:, ["x", "y", "team"]])

# Visualizing data regarding position = CB and gameId = 2018111900

In [None]:
# Scatter the selected rows' (x, y) positions, one colour per team value
fig = px.scatter(
    data,
    x="x",
    y="y",
    color="team",
)
fig.show()

**position == "RB" and gameId == 2018093009**

In [None]:
# Keep only the tracking rows whose position is "RB" in game 2018093009
data = tr18[(tr18["position"] == "RB") & (tr18["gameId"] == 2018093009)]
print(data.loc[:, ["x", "y", "team"]])

# Visualizing data regarding position = RB and gameId = 2018093009

In [None]:
# Scatter the selected rows' (x, y) positions, one colour per team value
fig = px.scatter(
    data,
    x="x",
    y="y",
    color="team",
)
fig.show()

**position == "LB" and gameId == 2018111900**

In [None]:
# Keep only the tracking rows whose position is "LB" in game 2018111900
data = tr18[(tr18["position"] == "LB") & (tr18["gameId"] == 2018111900)]
print(data.loc[:, ["x", "y", "team"]])

# Visualizing data regarding position = LB and gameId = 2018111900

In [None]:
# Scatter the selected rows' (x, y) positions, one colour per team value
fig = px.scatter(
    data,
    x="x",
    y="y",
    color="team",
)
fig.show()

# Filtering out 2018-10-09 from dataset

In [None]:
# Convert the `date` column to pandas datetimes so it can be compared with
# a date string, then keep the rows recorded on 2018-10-09.
# Note: this overwrites `tr18['date']` in place; later cells that filter
# on `date` rely on this conversion having run.
tr18['date'] = pd.to_datetime(tr18['date'])
rslt_df = tr18.loc[tr18['date'] == '2018-10-09']
print(rslt_df.head())

# Animation play for the date - 2018-10-09

In [None]:
# Animated scatter of (x, y) positions: one frame per distinct `time`
# value, one colour per team value
fig = px.scatter(
    rslt_df,
    x="x",
    y="y",
    color="team",
    animation_frame="time",
)
fig.show()

# Filtering out 2018-11-30 from dataset

In [None]:
# Keep the tracking rows whose `date` equals 2018-11-30
# (rebinds `rslt_df`, replacing the previous day's subset)
rslt_df = tr18.loc[tr18['date'] == '2018-11-30']
print(rslt_df.head())

# Animation play for the date - 2018-11-30

In [None]:
# Animated scatter of (x, y) positions: one frame per distinct `time`
# value, one colour per team value
fig = px.scatter(
    rslt_df,
    x="x",
    y="y",
    color="team",
    animation_frame="time",
)
fig.show()

# Visualizing football pitch using matplotlib

**Creating function to draw football field**

In [None]:
def drawfield():
    """Draw a stylised football field with matplotlib and display it.

    Layout along x (data units): a blue "HOME" end zone over 0-2, a red
    "AWAY" end zone over 22-24, and white vertical lines at every integer
    x from 2 to 22. Yard numbers 10, 20, ..., 50, ..., 20, 10 are written
    at the even x positions 4 to 20, once near the top and once (rotated
    180 degrees) near the bottom.

    The function takes no arguments, returns None, and calls ``plt.show()``.
    """
    # Creating a new figure
    plt.figure(figsize=(24, 10))

    # Getting current axis
    ax = plt.gca()

    # NOTE(review): no axis limits are set explicitly. The y positions used
    # below (0.025-0.95) and the ymin/ymax fractions look like they assume a
    # 0-1 y-range — confirm before changing patch sizes or adding set_ylim.

    # Creating a rectangle for the entire figure size and filling it with grass colour
    ax.add_patch(patches.Rectangle((0, 0), 24, 10, facecolor='#3f995b'))

    # Creating vertical lines at x = 2, 3, ..., 22
    for i in range(1, 22):
        plt.axvline(x=i + 1, color='white')

    # Adding yard numbers at even x positions: they rise by 10 per step up
    # to 50 at x = 12 and fall symmetrically afterwards. Each position is
    # visited exactly once; the two overlapping `if` branches used before
    # both fired at x = 12 and drew the "50" labels twice on top of each other.
    for j in range(4, 22, 2):
        k = 50 - 5 * abs(j - 12)
        plt.text(x=j - 0.25, y=0.85, s=k, color="white", size=20, weight="bold")
        plt.text(x=j - 0.25, y=0.125, s=k, color="white", size=20, weight="bold", rotation=180)

    # Adding yard markers in between the vertical lines (four rows of ticks)
    for l in range(2, 22):
        plt.text(x=l, y=0.95, s=" |||| ", color="white", size=25, weight="bold")
        plt.text(x=l, y=0.65, s=" |||| ", color="white", size=25, weight="bold")
        plt.text(x=l, y=0.325, s=" |||| ", color="white", size=25, weight="bold")
        plt.text(x=l, y=0.025, s=" |||| ", color="white", size=25, weight="bold")

    # Adding home end zone
    ax.add_patch(patches.Rectangle((0, 0), 2, 10, facecolor='#0064dc'))
    plt.text(x=0.5, y=0.37, s="HOME", color="white", size=50, weight="bold", rotation=90)

    # Adding away end zone
    ax.add_patch(patches.Rectangle((22, 0), 2, 10, facecolor='#c80014'))
    plt.text(x=22.5, y=0.37, s="AWAY", color="white", size=50, weight="bold", rotation=270)

    # Adding spot points (short vertical ticks just inside each goal line)
    plt.axvline(x=2.4, ymin=0.45, ymax=0.55, color="white")
    plt.axvline(x=21.6, ymin=0.45, ymax=0.55, color="white")

    # Adding goalposts (thick yellow segments at both ends of the field)
    plt.axvline(x=0, ymin=0.425, ymax=0.575, color="yellow", linewidth=10)
    plt.axvline(x=24, ymin=0.425, ymax=0.575, color="yellow", linewidth=10)

    # Removing X and Y axes from the plot
    plt.axis('off')

    # Displaying plot
    plt.show()

drawfield()