# Introduction

A high-level EDA of this NFL dataset is presented below. I will be adding to it in the future but wanted to create a potential starting point for anyone working on this challenge. I performed some feature engineering on the players dataset to derive more information, such as position type (offense, defense, special teams) and age. I plan on adding to this notebook in the future. If you found it helpful, I'd really appreciate a 👍.

![](https://s7d2.scene7.com/is/image/TWCNews/7-1-20_nfl_logo_jpg)

In [None]:
import os

# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

## Libraries 

In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
import plotly
import plotly.express as px
import plotly.figure_factory as ff
from plotly.offline import init_notebook_mode, iplot
from plotly.subplots import make_subplots
import cufflinks
# Apply matplotlib's 'ggplot' style sheet.
plt.style.use('ggplot')
# Put cufflinks in offline mode (it provides the DataFrame.iplot calls used below).
cufflinks.go_offline()
# Store the cufflinks configuration: world-readable plots, 'pearl' theme.
cufflinks.set_config_file(world_readable=True, theme='pearl')

# 1. Players

In [None]:
# Load the players table and replace its header with short column names.
player_columns = ['id', 'height', 'weight', 'dob', 'college', 'pos', 'name']
players = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2021/players.csv')
players.columns = player_columns

In [None]:
# Print the column dtypes and non-null counts of the players table.
players.info()

In [None]:
# Preview the first rows of the players table.
players.head()

In [None]:
# Count players per college and keep the 15 most common schools.
# The column names are pinned explicitly with rename_axis/reset_index(name=...):
# the defaults produced by value_counts().reset_index() differ between pandas
# 1.x ('index', 'college') and pandas 2.x ('college', 'count'), so relying on
# them breaks the plot on one version or the other.
pc = (
    players.college.value_counts()
    .head(15)
    .rename_axis('college')
    .reset_index(name='count')
)
fig = px.bar(pc, x='college', y='count', title='Top 15 NFL Player Producing Schools',
             labels={'college': 'College', 'count': 'Player Count'},
             color_discrete_sequence=px.colors.qualitative.Prism, color='college')
# Each bar is already labelled on the x axis, so the colour legend is redundant.
fig.update_layout(showlegend=False)
fig.show()

In [None]:
# Number of players for every (position, college) pair.
# value_counts() on the grouped column returns a Series indexed by
# (pos, college); naming the values 'c' and the index levels 'pos'/'school'
# lets reset_index() build the table directly. This avoids joining the index
# tuples into a string and splitting on ',', which left a leading space on
# every school name and truncated any college name that itself contains a comma.
col_pos = (
    players.groupby('pos').college.value_counts()
    .rename('c')
    .rename_axis(['pos', 'school'])
    .reset_index()
)
# Keep the column order: count, school, position.
col_pos = col_pos[['c', 'school', 'pos']]

In [None]:
# Total the counts per (position, school) pair, then order from largest to smallest.
school_totals = col_pos.groupby(['pos', 'school']).sum()
col_pos2 = school_totals.reset_index().sort_values(by='c', ascending=False)

In [None]:
# One donut chart per position showing its five largest schools
# (col_pos2 is already sorted by count, so head(5) picks the top five).
for i in col_pos2.pos.unique():
    top_five = col_pos2[col_pos2.pos == i].head(5)
    top_five.iplot(
        kind='pie',
        labels='school',
        values='c',
        hole=.5,
        title="Top School's Producing: " + i,
    )

In [None]:
# Parse birth dates, then derive birth year and an age relative to 2020.
players['dob'] = pd.to_datetime(players['dob'])
players['yr'] = players['dob'].dt.year
players['age'] = 2020 - players['yr']

In [None]:
# Map each position code to its unit: offense, defense, or special teams.
mapdict = {
    **dict.fromkeys(['QB', 'WR', 'TE', 'RB', 'FB', 'HB'], 'Offense'),
    **dict.fromkeys(
        ['CB', 'SS', 'MLB', 'OLB', 'FS', 'DE', 'LB', 'ILB', 'DB', 'S', 'NT', 'DT'],
        'Defense',
    ),
    **dict.fromkeys(['P', 'LS', 'K'], 'Special Teams'),
}

players['ODS'] = players.pos.map(mapdict)

# Per-unit subsets used by the plots below.
players_o = players[players.ODS == 'Offense']
players_d = players[players.ODS == 'Defense']
players_st = players[players.ODS == 'Special Teams']

In [None]:
# Weight distribution of offensive players, one box per position.
box_labels = {'pos': 'Position', 'weight': 'Weight'}
fig = px.box(
    players_o,
    x='pos',
    y='weight',
    color='pos',
    title='Offensive Player Weight Boxplot',
    labels=box_labels,
    color_discrete_sequence=px.colors.qualitative.Prism,
)
fig.update_layout(showlegend=False)
fig.show()

In [None]:
# Height distribution of offensive players, one box per position.
box_labels = {'pos': 'Position', 'height': 'Height'}
fig = px.box(
    players_o,
    x='pos',
    y='height',
    color='pos',
    title='Offensive Player Height Boxplot',
    labels=box_labels,
    color_discrete_sequence=px.colors.qualitative.Prism,
)
fig.update_layout(showlegend=False)
fig.show()

In [None]:
# Weight distribution of defensive players, one box per position.
box_labels = {'pos': 'Position', 'weight': 'Weight'}
fig = px.box(
    players_d,
    x='pos',
    y='weight',
    color='pos',
    title='Defensive Player Weight Boxplot',
    labels=box_labels,
    color_discrete_sequence=px.colors.qualitative.Prism,
)
fig.update_layout(showlegend=False)
fig.show()

In [None]:
# Height distribution of defensive players, one box per position.
# The labels dict must key on 'height' (the plotted y column); keying on
# 'weight' left the y axis with the raw lowercase column name.
fig = px.box(players_d, x='pos', y='height', title='Defensive Player Height Boxplot',
             labels={'pos': 'Position', 'height': 'Height'},
             color_discrete_sequence=px.colors.qualitative.Prism, color='pos')
# Positions are already named on the x axis, so the colour legend is redundant.
fig.update_layout(showlegend=False)
fig.show()

In [None]:
# Weight distribution of special teams players, one box per position.
box_labels = {'pos': 'Position', 'weight': 'Weight'}
fig = px.box(
    players_st,
    x='pos',
    y='weight',
    color='pos',
    title='Special Teams Player Weight Boxplot',
    labels=box_labels,
    color_discrete_sequence=px.colors.qualitative.Prism,
)
fig.update_layout(showlegend=False)
fig.show()

In [None]:
# Height distribution of special teams players, one box per position.
# The labels dict must key on 'height' (the plotted y column); keying on
# 'weight' left the y axis with the raw lowercase column name.
fig = px.box(players_st, x='pos', y='height', title='Special Teams Player Height Boxplot',
             labels={'pos': 'Position', 'height': 'Height'},
             color_discrete_sequence=px.colors.qualitative.Prism, color='pos')
# Positions are already named on the x axis, so the colour legend is redundant.
fig.update_layout(showlegend=False)
fig.show()

In [None]:
# Weight against height for every player, coloured by unit.
scatter_labels = {'weight': 'Weight', 'height': 'Height', 'ODS': 'Position Type'}
px.scatter(players, x='weight', y='height', color='ODS', labels=scatter_labels)

Something interesting to note: the heaviest offensive player weighs 311 pounds. That's a large number considering there are no offensive linemen in this dataset. Let's see who that player is.

In [None]:
# Show the offensive players weighing more than 299.
is_heavy_offense = (players.weight > 299) & (players.ODS == 'Offense')
print(players.loc[is_heavy_offense, :])

In [None]:
# Histogram of the height column across all players.
px.histogram(players['height'], title='Player Height Distribution')

In [None]:
# Histogram of the weight column across all players.
px.histogram(
    players['weight'],
    title='Player Weight Distribution',
    color_discrete_sequence=['indianred'],
)

In [None]:
# Age histogram, coloured by unit (offense / defense / special teams).
px.histogram(
    players,
    x='age',
    color='ODS',
    title='Age Distribution',
    labels={'ODS': 'Position Type'},
)

Interesting to note: offensive players seem to have a higher peak in the 25-30 age range. Special teams and defense seem to follow very similar distributions with regard to player age.

In [None]:
# Age distribution of offensive players, one box per position.
# The title now says "Offensive": the data plotted is players_o, and the
# original title was copied from the defensive plot.
fig = px.box(players_o.sort_values(by='age', ascending=False), x='pos', y='age',
             title='Offensive Player Age Boxplot',
             labels={'pos': 'Position', 'age': 'Age'},
             color_discrete_sequence=px.colors.qualitative.Prism, color='pos')
# Positions are already named on the x axis, so the colour legend is redundant.
fig.update_layout(showlegend=False)
fig.show()

In [None]:
# Age distribution of defensive players, one box per position.
box_labels = {'pos': 'Position', 'age': 'Age'}
fig = px.box(
    players_d,
    x='pos',
    y='age',
    color='pos',
    title='Defensive Player Age Boxplot',
    labels=box_labels,
    color_discrete_sequence=px.colors.qualitative.Prism,
)
fig.update_layout(showlegend=False)
fig.show()

In [None]:
# Age distribution of special teams players, one box per position.
box_labels = {'pos': 'Position', 'age': 'Age'}
fig = px.box(
    players_st,
    x='pos',
    y='age',
    color='pos',
    title='Special Teams Player Age Boxplot',
    labels=box_labels,
    color_discrete_sequence=px.colors.qualitative.Prism,
)
fig.update_layout(showlegend=False)
fig.show()

# 2. Games

In [None]:
# Load the games table from the competition input directory.
games = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2021/games.csv')

In [None]:
# Print the column dtypes and non-null counts of the games table.
games.info()

In [None]:
# Convert the gameDate column to pandas datetimes.
games['gameDate'] = pd.to_datetime(games['gameDate'])

In [None]:
# Number of games in each week, ordered by week number.
# The column names are pinned with rename_axis/reset_index(name=...): the
# defaults from value_counts().reset_index() are ('index', 'week') on pandas
# 1.x but ('week', 'count') on pandas 2.x, so a rename keyed on 'index'
# silently produces the wrong columns on newer versions.
g = (
    games.week.value_counts()
    .rename_axis('week')
    .reset_index(name='games')
    .sort_values(by='week')
)

px.bar(g, x='week', y='games', color_discrete_sequence=px.colors.qualitative.Prism)

Weeks 1, 9, and 11 had the fewest games played, at 13. This could be because of bye weeks (a bye week is when a team has a scheduled week off for rest) or games being played in a different country (the NFL has held some international games). However, that would only explain weeks 9 and 11; week 1 having only 13 games is something to look into.

# 3. Plays

In [None]:
# Load the plays table from the competition input directory.
plays = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2021/plays.csv')

In [None]:
# Print the column dtypes and non-null counts of the plays table.
plays.info()

## Let's take a closer look at certain teams

Here I have chosen the New England Patriots (super bowl winners) and the Arizona Cardinals (worst regular season record) to identify any polarization between play types of the best team and worst team. Let's see if we can find anything.

In [None]:
# Plays where Arizona / New England was the team in possession.
az = plays[plays.possessionTeam == 'ARI']
ne = plays[plays.possessionTeam == 'NE']

In [None]:
# Count plays per play type for each team, as two-column tables (play, count).
az_play = az['playType'].value_counts().reset_index()
ne_play = ne['playType'].value_counts().reset_index()

az_play.columns = ['play', 'count']
ne_play.columns = ['play', 'count']

In [None]:
# Side-by-side donut charts of play-type share: Arizona on the left, New England on the right.
fig = make_subplots(
    rows=1,
    cols=2,
    specs=[[{'type': 'domain'}, {'type': 'domain'}]],
)

arizona_pie = go.Pie(values=az_play['count'], labels=az_play['play'])
new_england_pie = go.Pie(values=ne_play['count'], labels=ne_play['play'])
fig.add_trace(arizona_pie, row=1, col=1)
fig.add_trace(new_england_pie, row=1, col=2)

# A hole turns both pies into donuts, leaving room for the team name in the centre.
fig.update_traces(hole=.5)

team_annotations = [
    {'text': 'Arizona', 'x': 0.185, 'y': 0.5, 'font_size': 9, 'showarrow': False},
    {'text': 'New England', 'x': 0.845, 'y': 0.5, 'font_size': 9, 'showarrow': False},
]
fig.update_layout(
    title_text="Worst Team vs. Best Team Play Type",
    annotations=team_annotations,
)
fig.show()

In [None]:
# Count plays per pass result for each team, as two-column tables (type, count).
ne_result = ne['passResult'].value_counts().reset_index()
az_result = az['passResult'].value_counts().reset_index()

ne_result.columns = ['type', 'count']
az_result.columns = ['type', 'count']

In [None]:
# Side-by-side bar charts of pass-result counts: Arizona on the left, New England on the right.
# The subplot titles say "Pass Result" because the bars are built from the
# passResult counts; the previous "Play Type" titles described a different chart.
fig = make_subplots(rows=1, cols=2,
                    subplot_titles=('Arizona Pass Result Count', 'New England Pass Result Count'))

# Both traces use the same coloraxis so bar colours are comparable across teams.
fig.add_trace(go.Bar(y=az_result['count'], x=az_result['type'],
                     marker=dict(color=az_result['count'], coloraxis="coloraxis")), row=1, col=1)
fig.add_trace(go.Bar(y=ne_result['count'], x=ne_result['type'],
                     marker=dict(color=ne_result['count'], coloraxis="coloraxis")), row=1, col=2)

fig.update_layout(coloraxis=dict(colorscale='emrld'), showlegend=False)
fig.show()

We can see why the New England Patriots are the better team: they incur fewer sacks and have a much higher completed pass count. I plan on adding win totals to this subset of the data to see whether there is any correlation between how often a team passes or throws an incompletion and how often it wins.

## More to come!