<iframe src="https://www.kaggle.com/embed/hijest/nfl-big-data-bowl-2022-starters-eda?cellIds=1&kernelSessionId=75487025" height="300" style="margin: 0 auto; width: 100%; max-width: 950px;" frameborder="0" scrolling="auto" title="NFL Big Data Bowl 2022 - Starters EDA 🏈🏈"></iframe>

## NFL Big Data Bowl 2022 DATA ANALYSIS

In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objs as go

# Lift pandas' column display limit so wide tables are shown with every column.
pd.set_option('display.max_columns', None)


## Players Data

In [None]:
# Read the players table from the competition input directory.
players = pd.read_csv(
    '../input/nfl-big-data-bowl-2022/players.csv'
)

#### The players CSV contains player data. The key variable is `nflId`.

* **nflId**       :  NFL ID, numeric identifier (Int 64)
* **height**      :  Height of the player (Object)
* **weight**      :  Weight of the player (Int 64)
* **birthDate**   :  Date of birth of the player (Object)
* **collegeName** :  College name of the player (Object)
* **Position**    :  Playing position of the player (Object)
* **displayName** :  Display name of the player (Object)


In [None]:

# Count the missing entries in each column, then display the table itself.
players_null_counts = players.isnull().sum()
print(players_null_counts)
players

# There are null values in the birth date column, so let's check how to handle the missing values

### Positions bar plot

In [None]:
# Bar chart of how many players are listed for each playing position.

# Apply the seaborn look to matplotlib figures. Matplotlib 3.6 renamed its
# bundled "seaborn" style sheet to "seaborn-v0_8" and later releases dropped
# the old name, so use whichever one this installation actually provides.
seaborn_style = "seaborn" if "seaborn" in plt.style.available else "seaborn-v0_8"
plt.style.use(seaborn_style)

# Four colours sampled from the "flag" colormap.
# NOTE(review): linspace(0, 2, 4) includes sample points above 1.0, outside the
# colormap's nominal 0-1 range - confirm this palette is intended.
color=plt.cm.flag(np.linspace(0,2,4))
players["Position"].value_counts().plot.bar(color=color,figsize=(16,12))

plt.title("number of positions of Players")
plt.xlabel('Position')
plt.ylabel("total count")
plt.show()

In [None]:
# Heatmap of the pairwise correlation between the numeric player columns.
# Select numeric columns before calling corr(): the table also contains text
# columns (height, collegeName, Position, ...), and pandas >= 2.0 raises on
# them instead of silently dropping them.
corr = players.select_dtypes(include="number").corr()
fig, ax = plt.subplots(figsize=(12,8))
# Draw on the axes created above rather than relying on the implicit current axes.
sns.heatmap(corr, ax=ax)

In [None]:

# Tally players per college and keep the 40 largest colleges,
# ordered ascending so the biggest bar ends up at the top of the chart.
check = (
    players['collegeName']
    .value_counts()
    .reset_index()
    .set_axis(['college', 'players'], axis=1)
    .sort_values('players')
    .tail(40)
)

# Horizontal bar chart: one bar per college, bar length = player count.
fig = px.bar(
    check,
    x="players",
    y='college',
    orientation='h',
    title='Top 40 colleges by number of players',
    width=800,
    height=900,
)

fig.show()

## Game data

In [None]:
# Read the games table from the competition input directory and display it.
game = pd.read_csv(
    '../input/nfl-big-data-bowl-2022/games.csv'
)
game

In [None]:
# Per-column count of missing entries in the games table
# (the author observed that there are none).
game_null_counts = game.isnull().sum()
print(game_null_counts)

#### The games CSV contains the game records. The key variable is `gameId`.

* **gameId**      :  Game ID, numeric identifier (Int 64)
* **season**      :  Season (Int 64)
* **week**        :  Week of game (numeric) (Int 64)
* **gameDate**    :  Date of game (Object)
* **gameTimeEastern** :  Start time of game (time, HH:MM:SS, EST)
* **visitorTeamAbbr**    : Visiting team three-letter code (text) (Object)

### Games played per year (season)

In [None]:
# Pie chart of the share of games that belongs to each season.
season_counts = game["season"].value_counts()
# Matplotlib requires exactly one explode offset per wedge. Derive the offsets
# from the data so the cell keeps working when the number of seasons is not three.
explode = [0.1] * len(season_counts)
season_counts.plot.pie(figsize=(12,8),explode=explode,autopct="%1.1f%%")
plt.title("Games player per year (Season)",fontsize=18)
plt.show()

In [None]:
# Tally games per calendar date, ordered from the least to the most busy date.
check = (
    game['gameDate']
    .value_counts()
    .reset_index()
    .set_axis(['date', 'games'], axis=1)
    .sort_values('games')
)

# Horizontal bar chart: one bar per date, bar length = number of games.
fig = px.bar(
    check,
    x="games",
    y='date',
    orientation='h',
    title='Number of games for every date',
    width=800,
    height=900,
)

fig.show()

## Play data

### 
* We have 24 features.
* There are many missing values, so be careful with them.


In [None]:
# Read the plays table from the competition input directory and display it.
play = pd.read_csv(
    "../input/nfl-big-data-bowl-2022/plays.csv"
)
play

In [None]:
# Per-column count of missing entries in the plays table.
play_null_counts = play.isnull().sum()
print(play_null_counts)

In [None]:
# Print pandas' summary of the plays table (columns, non-null counts, dtypes).
play.info()

In [None]:
# Tally plays per quarter, ordered from the fewest to the most plays.
check = (
    play['quarter']
    .value_counts()
    .reset_index()
    .set_axis(['quarter', 'plays'], axis=1)
    .sort_values('plays')
)

# Pie chart: one wedge per quarter, wedge size = number of plays.
fig = px.pie(
    check,
    values="plays",
    names='quarter',
    title='Number of plays of every quarter',
    width=800,
    height=500,
)

fig.show()

### Correlation between features

In [None]:
# Heatmap of the pairwise correlation between the numeric play columns.
# Select numeric columns before calling corr(): pandas >= 2.0 raises on
# non-numeric columns instead of silently dropping them.
corr = play.select_dtypes(include="number").corr()
fig, ax = plt.subplots(figsize=(12,8))
# Draw on the axes created above rather than relying on the implicit current axes.
sns.heatmap(corr, ax=ax)

### Plotting missing values
* These columns have missing values that need to be handled.



In [None]:
# Bar chart of the number of missing values per column, limited to the columns
# that have at least one gap and ordered from fewest to most missing.
fig, ax = plt.subplots(figsize=(8,6))
null_counts = play.isnull().sum()
# Filter and sort in one chained expression instead of sorting in place, so
# re-running the cell never depends on a previously mutated Series.
missing = null_counts[null_counts > 0].sort_values()
missing.plot.bar(ax=ax)
# Label the figure so it can be read on its own.
ax.set_xlabel("column")
ax.set_ylabel("missing values")
ax.set_title("Missing values per column (plays)")

### Heat map for missing values
* Here we can see in this heat map how many values are missing.

In [None]:
# Heatmap of the boolean null mask of the plays table:
# one cell per (row, column) entry, marking which entries are missing.
null_mask = play.isnull()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(null_mask, ax=ax)

### Play result distribution 

In [None]:
# Histogram (15 bins) showing how the playResult values are distributed.
play_result = play["playResult"]
play_result.hist(bins=15, figsize=(12, 8))
plt.show()

In [None]:
# Histogram (15 bins) showing how the preSnapHomeScore values are distributed.
home_score = play["preSnapHomeScore"]
home_score.hist(bins=15, figsize=(12, 8))
plt.show()

In [None]:
# Histogram (15 bins) showing how the preSnapVisitorScore values are distributed.
visitor_score = play["preSnapVisitorScore"]
visitor_score.hist(bins=15, figsize=(12, 8))
plt.show()