In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.colors import rgb2hex
%matplotlib inline
import seaborn as sns

#(Credit for the below code goes to @ANZ check out his notebook as well)
# Build the shared plot palette: 12 hex colours sampled from the 'GnBu' colormap.
# plt.get_cmap is used because matplotlib.cm.get_cmap was deprecated in
# Matplotlib 3.7 and removed in 3.9.
cmap = plt.get_cmap('GnBu', 12)  # colormap name and number of discrete colours
col_def = [rgb2hex(cmap(i)[:3]) for i in range(cmap.N)]  # drop alpha, keep RGB

# Echo the palette so the hex codes are visible in the cell output.
for hex_colour in col_def:
    print(hex_colour)

### Players Data

In [None]:
# Load the players table from the competition's input directory.
players = pd.read_csv('../input/nfl-big-data-bowl-2022/players.csv')
# Bare last expression: the notebook renders the frame as this cell's output.
players

## Data Cleaning

In [None]:
# Column dtypes and non-null counts, used to spot columns with missing values.
players.info()

**There seem to be NaN values in birthDate and collegeName. Let's drop the rows with a missing birthDate, and create columns for birth year and birth month.**

In [None]:
# Number of missing values in each column.
players.isnull().sum()

In [None]:
# Derive integer birthYear / birthMonth columns from the birthDate strings.
#
# Rows without a birthDate cannot be parsed, so they are dropped first.
players = players.dropna(subset=["birthDate"])

# birthDate comes in two layouts: "MM/DD/YYYY" (e.g. 05/17/1994) and
# "YYYY-MM-DD" (e.g. 1995-05-05). Extract year and month for each layout with
# vectorised string operations instead of a row-by-row iterrows() loop.
birth_dates = players["birthDate"].str.strip()
slash_layout = birth_dates.str.extract(r"^(?P<month>\d{1,2})/\d{1,2}/(?P<year>\d{4})$")
dash_layout = birth_dates.str.extract(r"^(?P<year>\d{4})-(?P<month>\d{1,2})-\d{1,2}")
birth_parts = slash_layout.combine_first(dash_layout)  # aligned on index and column name

# Store real integers rather than strings such as "05": writing strings into
# a column initialised with the integer 0 is an incompatible-dtype assignment.
# Dates in neither layout keep the placeholder value 0, as before.
players["birthYear"] = pd.to_numeric(birth_parts["year"], errors="coerce").fillna(0).astype(int)
players["birthMonth"] = pd.to_numeric(birth_parts["month"], errors="coerce").fillna(0).astype(int)

In [None]:
# Re-check the per-column missing-value counts after the birthDate clean-up.
players.isnull().sum()

**Converting heights to CM and weights to Kg**

In [None]:
def _inches_from_parts(parts):
    """Return total inches for a split height: [feet, inches] or [inches]."""
    leading = int(parts[0])
    if len(parts) != 2:
        # A single value is already expressed in inches.
        return leading
    return leading * 12 + int(parts[1])


# Heights are strings split on "-" (feet-inches); keep the split form around.
players_heights = players["height"].map(lambda raw: raw.split("-"))

# Inches -> centimetres (1 in = 2.54 cm), written back over the height column.
players["height"] = players_heights.map(_inches_from_parts) * 2.54

# Pounds -> kilograms (1 lb = 0.453592 kg), rounded to two decimals.
players["weight"] = (players.weight * 0.453592).round(2)

players

In [None]:
# Re-inspect dtypes and non-null counts after the cleaning steps.
players.info()

**Now the data is ready to work with. Everything is neat and tidy🥳🥳**

## Time For Some EDA on Players DataSet

In [None]:
# Count distinct display names (a missing value, if any, counts as one entry).
players['displayName'].nunique(dropna=False)

**Let's get some data on the colleges**

In [None]:
# Players per college; the ten most common are shown as a bar chart and a donut.
college_df = players['collegeName'].value_counts()
top_colleges = college_df.head(10)

sns.set_style('darkgrid')
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
bar_ax, pie_ax = axes

# Left panel: horizontal bars of the raw counts.
sns.barplot(
    x=top_colleges.values,
    y=top_colleges.index,
    edgecolor="black",
    palette=col_def,
    ax=bar_ax,
)
bar_ax.set_title("Top 10 College player counts", fontsize=20)

# Right panel: pie with a white circle on top to give a donut look.
# Percentages are relative to these ten colleges only.
pie_ax.pie(
    x=top_colleges,
    labels=top_colleges.index,
    colors=col_def,
    autopct='%.0f%%',
    explode=[0.03] * len(top_colleges),
)
pie_ax.add_artist(plt.Circle((0, 0), 0.4, fc='white'))
plt.show()

**From the above graph, we can see that Alabama is at the top with approximately 68 players.**

**Positions played by players**

In [None]:
# Players per position; the ten most common are shown as a bar chart and a donut.
pos_df = players['Position'].value_counts()
top_positions = pos_df[:10]

sns.set_style('darkgrid')
fig, axes = plt.subplots(1,2,figsize=(12,6))

# Left panel: horizontal bars of the raw counts.
axes[0] = sns.barplot(x=top_positions.values, y=top_positions.index, edgecolor="black",palette=col_def, ax=axes[0])
# Title spelling corrected ("Postions" -> "Positions").
axes[0].set_title("Top 10 Positions played by player (By Count)", fontsize=20)

# Right panel: pie with a white circle on top to give a donut look.
# Percentages are relative to these ten positions only.
axes[1].pie(x=top_positions, labels=top_positions.index, colors=col_def, autopct='%.0f%%',
           explode=[0.03 for _ in top_positions.index])
axes[1].add_artist(plt.Circle((0,0),0.4,fc='white'))
plt.show()

**'WR' is the most common position, with approximately 320 players, i.e. 16% of the players in the ten positions shown.**

In [None]:
# Scatter of weight against height with a fitted regression line drawn in red.
reg_fig, reg_ax = plt.subplots(figsize=(10, 6), dpi=100)
sns.regplot(
    x=players.weight,
    y=players.height,
    line_kws={"color": "red"},
    ax=reg_ax,
)
reg_ax.set_title("Player Weight(Kg) vs Player Height(cm)");

**It looks like the taller the player is the heavier he is.**

**Weight and Height Distribution.**

In [None]:
# Side-by-side histograms of player weight and height.
# A 1x2 grid replaces subplots 223/224 of a 2x2 grid, which left the top half
# of the figure blank; the figure height is halved so each panel keeps its size.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 7.5), dpi=80)

sns.histplot(players.weight, ax=ax1)
ax1.set_title("Weight(Kg) Distribution")

sns.histplot(players.height, ax=ax2, bins=10)
ax2.set_title("Height(cm) Distribution");

**From the above distributions, we can see that most of the players are between 190cm and 195cm tall (390 + 310 = 700, approximately). Players on the shorter side, i.e. < 170cm, are very few: roughly 30 in count. The same is true on the taller side, i.e. 200cm - 205cm. Most players weigh between 80kg and 100kg, with very few players at either extreme.**

**Player birthyear and birthmonth Distribution**

In [None]:
# Bar charts of how many players were born in each year and in each month.
#
# value_counts() orders by frequency, which scrambles the time axis, so the
# counts are re-sorted by year / month. pd.to_numeric lets this work whether
# the columns hold digit strings or integers.
birthyear = pd.to_numeric(players['birthYear']).value_counts().sort_index()
birthmonth = pd.to_numeric(players['birthMonth']).value_counts().sort_index()

# A 1x2 grid replaces subplots 223/224 of a 2x2 grid, which left the top half
# of the figure blank; the figure height is halved so each panel keeps its size.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 7.5), dpi=80)

sns.barplot(x=birthyear.index, y=birthyear.values, ci=None, ax=ax1)
ax1.tick_params(axis='x', rotation=45)
ax1.set_title("BirthYear Distribution",size=20)
# Label each axes explicitly instead of relying on pyplot's "current axes".
ax1.set_xlabel("Year", size=15)

sns.barplot(x=birthmonth.index, y=birthmonth.values, ci=None, ax=ax2)
ax2.set_title("BirthMonth Distribution",size=20)
ax2.set_xlabel("Month", size=15);

**From the above charts, it can be determined that most players are born in the year 1995. The most frequent birth month is September.**

### Game data

In [None]:
# Load the games table from the competition's input directory.
games = pd.read_csv("../input/nfl-big-data-bowl-2022/games.csv")
# Preview the first five rows.
games.head()

In [None]:
# Column dtypes and non-null counts for the games table.
games.info()

**Making sure that the gameDate doesn't have any null values in the form of '0' or '0/0/0'**

In [None]:
# Print any rows whose gameDate is the placeholder string '0'.
zero_date_rows = games.loc[games['gameDate'].eq('0')]
print(zero_date_rows)

In [None]:
# Print any rows whose gameDate is the placeholder string '0/0/0'.
zero_slash_date_rows = games.loc[games['gameDate'].eq('0/0/0')]
print(zero_slash_date_rows)

**Making sure that gameTimeEastern doesn't have any null values in forms like '0' or '0:0:0'**

In [None]:
# Print any rows whose gameTimeEastern is the placeholder string '0'.
zero_time_rows = games.loc[games['gameTimeEastern'].eq('0')]
print(zero_time_rows)

In [None]:
# Print any rows whose gameTimeEastern is the placeholder string '0:0:0'.
zero_colon_time_rows = games.loc[games['gameTimeEastern'].eq('0:0:0')]
print(zero_colon_time_rows)

**Now that we are sure let's get on with some EDA on games data**

In [None]:
# Summary statistics for the numeric columns of the games table.
games.describe()

In [None]:
# Number of games in each season, with one bar colour per week.
fig, season_ax = plt.subplots(figsize=(12, 6))
sns.countplot(x=games['season'], hue=games['week'], ax=season_ax)
season_ax.set_title('Game count per Season');

In [None]:
# How many games each team played at home.
home = games['homeTeamAbbr'].value_counts()

fig, home_ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=home.index, y=home.values, ci=None, ax=home_ax)
home_ax.set_xlabel("Home Team")
home_ax.set_ylabel("Count")
# Rotate the team abbreviations so they do not overlap.
plt.xticks(rotation=90);

**It seems like there are 33 teams, but after some research and asking around I learned that OAK and LV are the same team: the franchise was originally in Oakland (OAK) and moved to Las Vegas (LV). This is not mentioned in the dataset, but I am happy that I figured it out. If we add up the counts of OAK and LV, I think they got the same opportunity as the other teams.**

In [None]:
# How many games each team played as the visitor.
visitor = games['visitorTeamAbbr'].value_counts()

fig, visitor_ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=visitor.index, y=visitor.values, ci=None, ax=visitor_ax)
visitor_ax.set_xlabel("Visitor Team")
visitor_ax.set_ylabel("Count")
# Rotate the team abbreviations so they do not overlap.
plt.xticks(rotation=90);

**The same applies here: if we add up the counts of OAK and LV, the total matches the rest. So every team got (almost) equal chances.**

### Plays Data

In [None]:
# Load the plays table from the competition's input directory.
plays = pd.read_csv('../input/nfl-big-data-bowl-2022/plays.csv')
# Preview the first five rows.
plays.head()

In [None]:
# Column dtypes and non-null counts for the plays table.
plays.info()

In [None]:
# Which (quarter, down) combinations occur in the plays table.
fig, quarter_down_ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(data=plays, x='quarter', y='down', ax=quarter_down_ax)
plt.show()

In [None]:
# Bar per quarter for yardsToGo (seaborn's barplot aggregates with the mean
# by default), with the bar values printed on top.
fig, yards_ax = plt.subplots(figsize=(12, 6))
g = sns.barplot(data=plays, x='quarter', y='yardsToGo', ci=None, ax=yards_ax)
g.bar_label(g.containers[0])
g.set_title('Yards to Go in Each Quarter', size=15)
plt.show()

In [None]:
# Bar per quarter for playResult (seaborn's barplot aggregates with the mean
# by default), with the bar values printed on top.
fig, result_ax = plt.subplots(figsize=(12, 6))
g = sns.barplot(data=plays, x='quarter', y='playResult', ci=None, ax=result_ax)
g.bar_label(g.containers[0])
g.set_title("Play result for every quarter", size=15)
plt.show()

In [None]:
# Distribution of kick length: density-normalised histogram with a KDE curve.
# sns.distplot is deprecated; this is the histplot equivalent of its defaults
# (stat="density", kde=True, KDE extended 3 bandwidths past the data).
fig = plt.figure(figsize=(12,6))
sns.histplot(plays['kickLength'], kde=True, stat="density", kde_kws=dict(cut=3))
plt.title("Kick Length Distribution")
plt.show()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for kickLength.
plays['kickLength'].describe()

The mean kick length is 54.744166. The minimum is 2.000 and the max is 90.000.

In [None]:
# Count of plays for each passResult value.
fig, pass_ax = plt.subplots(figsize=(12, 6))
sns.histplot(plays['passResult'], ax=pass_ax)
plt.show()

In [None]:
# Summary of the passResult column.
plays['passResult'].describe()

In [None]:
# Count of plays for each team in possession.
fig, possession_ax = plt.subplots(figsize=(12, 6))
sns.histplot(plays['possessionTeam'], ax=possession_ax)
# Rotate the team abbreviations so they do not overlap.
plt.xticks(rotation=90)
plt.show()

In [None]:
import plotly.express as px
import plotly.graph_objects as pg
from plotly import tools as tl

In [None]:
# Load the 2018 tracking table from the competition's input directory.
# NOTE(review): this file is presumably large; confirm it fits in memory.
tr18 = pd.read_csv("../input/nfl-big-data-bowl-2022/tracking2018.csv")
# Preview the first five rows.
tr18.head()

In [None]:
# Select every tracking row for play 36 of game 2018123000.
is_selected_play = (tr18['playId'] == 36) & (tr18['gameId'] == 2018123000)
data = tr18[is_selected_play]
print(data.loc[:, ["x", "y", "team"]])

In [None]:
# Interactive scatter of the x/y values in `data`, coloured by team.
fig = px.scatter(data, x='x', y='y', color='team')
fig.show()

In [None]:
# Select every tracking row for play 36 of game 2018102107.
is_second_play = (tr18['playId'] == 36) & (tr18['gameId'] == 2018102107)
data = tr18[is_second_play]
print(data.loc[:, ["x", "y", "team"]])

In [None]:
# Interactive scatter of the x/y values in `data`, coloured by team.
fig = px.scatter(data, x="x", y="y", color="team")
fig.show()

In [None]:
# Select every tracking row with position "CB" in game 2018111900.
is_cb_in_game = (tr18['position'] == "CB") & (tr18['gameId'] == 2018111900)
data = tr18[is_cb_in_game]
print(data.loc[:, ["x", "y", "team"]])

In [None]:
# Interactive scatter of the x/y values in `data`, coloured by team.
fig = px.scatter(data, x="x", y="y", color="team")
fig.show()

**Well that's it for the notebook. I hope that you were able to make things out of this. Anyway if you liked this notebook then don't forget to leave an upvote as it is free**😋😉