# **2023 NFL Big Data Bowl**
### Sol Ben-Ishay

## **EDA**

In [None]:
# Execute the shared extract/wrangle notebook inside this kernel.
# NOTE(review): the cells below use play_level_df, player_level_df, frame_level_df,
# ol_on_play_df, dl_on_play_df, np, plt and sns without defining or importing them,
# so they are presumably all provided by this notebook — confirm.
%run /kaggle/usr/lib/nfl_bdb_2023_extract_wrangle/nfl_bdb_2023_extract_wrangle.ipynb

### **Play-level data**

In [None]:
# Preview the first three play-level rows
play_level_df.head(n=3)

In [None]:
# Column names, followed by a compact summary of the play-level frame
print(f"Play-level df columns:\n{list(play_level_df.columns)}\n")
print("Play-level df info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() emitted a stray "None" after the report.
play_level_df.info(verbose=False)

#### **Univariate analysis**

In [None]:
# Summary statistics for selected play-level columns, one row per column
play_stat_cols = [
    'yardsToGo', 'yardlineNumber', 'penaltyYards', 'prePenaltyPlayResult',
    'playResult', 'absoluteYardlineNumber', 'defendersInBox', 'pff_playAction',
    'snap_to_action_time',
]
play_level_df[play_stat_cols].describe().T

In [None]:
# Play counts per team, first on offense and then on defense (top five rows each)
for team_col, count_col in (
    ('possessionTeam', 'num_plays_on_off'),
    ('defensiveTeam', 'num_plays_on_def'),
):
    team_counts = play_level_df[[team_col]].value_counts()
    display(team_counts.reset_index(name=count_col).head())

In [None]:
# Frequency of each passResult value (top five rows)
(
    play_level_df[['passResult']]
    .value_counts()
    .reset_index(name='passResult_counts')
    .head()
)

In [None]:
# Five most frequent personnel groupings, offense first and then defense
personnel_o_counts = play_level_df[['personnelO']].value_counts().reset_index(name='personnelO_counts')
personnel_d_counts = play_level_df[['personnelD']].value_counts().reset_index(name='personnelD_counts')
display(personnel_o_counts.head(), personnel_d_counts.head())

#### **Bi/Multivariate analysis**

In [None]:
# Summary statistics of selected play-level columns, split by passResult
# Numeric variables with no evident correlation with passResult: 'penaltyYards','prePenaltyPlayResult','playResult','pff_playAction'
by_pass_result_cols = [
    'passResult', 'penaltyYards', 'prePenaltyPlayResult',
    'playResult', 'pff_playAction', 'snap_to_action_time',
]
play_level_df[by_pass_result_cols].groupby("passResult").describe().T

In [None]:
# Distribution of snap_to_action_time for each passResult category
box = sns.boxplot(
    x="passResult",
    y="snap_to_action_time",
    data=play_level_df,
)
plt.show()

In [None]:
# Correlation matrix of numeric play-level data, drawn as a lower-triangle heatmap
coi = ['yardsToGo','yardlineNumber','penaltyYards','prePenaltyPlayResult',
            'playResult','absoluteYardlineNumber','defendersInBox','pff_playAction',
            'snap_to_action_time']
corr = play_level_df[coi].corr()
# True marks cells to hide: the upper triangle including the diagonal, which only
# repeats what the lower triangle already shows.
mask = np.triu(np.ones_like(corr, dtype=bool))
# Explicit figure/axes handles. The previous `f = plt.figure().suptitle(...)` bound
# `f` to the title Text object rather than to the figure, and never used it.
fig, ax = plt.subplots()
fig.suptitle(t="Play-level data correlations", fontsize=14)
cmap = sns.diverging_palette(230, 20, as_cmap=True)
# vmax=.3 caps the upper end of the colour scale, so stronger positive
# correlations all render in the saturated colour.
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
# Render the figure without echoing the Axes repr as the cell output
plt.show()

### **Player-level data**

In [None]:
# Preview the first three player-level rows
player_level_df.head(n=3)

In [None]:
# Column names, followed by a compact summary of the player-level frame
print(f"Player-level df columns:\n{list(player_level_df.columns)}\n")
print("Player-level df info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() emitted a stray "None" after the report.
player_level_df.info(verbose=False)

#### **Univariate analysis**

In [None]:
# Summary statistics of the PFF columns for blockers (OL + TE) and rushers (DL + OLB).
# The two query strings are reused by later cells in this section.
pos_in_ol_w_te_cond = "officialPosition in ['C','G','T','TE']"
pos_in_dl_w_olb_cond = "officialPosition in ['NT','DT','DE','OLB']"

ol_describe_cols = ['pff_beatenByDefender', 'pff_hitAllowed', 'pff_hurryAllowed', 'pff_sackAllowed','pff_backFieldBlock']
dl_describe_cols = ['pff_hit', 'pff_hurry', 'pff_sack', 'pff_pressure']

print("OL Descriptive Statistics:")
ol_rows = player_level_df.query(pos_in_ol_w_te_cond)
display(ol_rows[ol_describe_cols].describe().T)

print("DL Descriptive Statistics:")
dl_rows = player_level_df.query(pos_in_dl_w_olb_cond)
display(dl_rows[dl_describe_cols].describe().T)

In [None]:
# Row counts per officialPosition, for the OL(+TE) group and then the DL(+OLB) group
for position_cond in (pos_in_ol_w_te_cond, pos_in_dl_w_olb_cond):
    position_counts = player_level_df.query(position_cond)['officialPosition'].value_counts()
    display(position_counts.reset_index().head())

In [None]:
# Share of rows per pff_role (normalized counts), for OL and then DL positions
ol_role_share = (
    player_level_df.query("officialPosition in ['C','G','T']")['pff_role']
    .value_counts(normalize=True)
    .reset_index()
)
dl_role_share = (
    player_level_df.query("officialPosition in ['NT','DT','DE']")['pff_role']
    .value_counts(normalize=True)
    .reset_index()
)
display(ol_role_share.head(), dl_role_share.head())

In [None]:
# Row counts per pff_positionLinedUp, for the OL(+TE) group and then the DL(+OLB) group
for position_cond in (pos_in_ol_w_te_cond, pos_in_dl_w_olb_cond):
    lined_up_counts = player_level_df.query(position_cond)['pff_positionLinedUp'].value_counts()
    display(lined_up_counts.reset_index().head())

In [None]:
# Row counts per pff_blockType among the OL(+TE) group (top five)
block_type_counts = (
    player_level_df.query(pos_in_ol_w_te_cond)['pff_blockType']
    .value_counts()
    .reset_index()
)
display(block_type_counts.head())

#### **Bi/Multivariate analysis**

In [None]:
# OL/DL mean play-level stats grouped by player/position.
# Only players with more than MIN_LINEMAN_SNAPS rows in player_level_df are kept,
# so the per-play means are not dominated by players with very few rows.
MIN_LINEMAN_SNAPS = 35

print("Mean Play Level Stats by Player")
print("Offensive Linemen:")
ol_rows = player_level_df.query("officialPosition in ['C','G','T']")
high_snap_ol = ol_rows.nflId.value_counts().loc[lambda c: c > MIN_LINEMAN_SNAPS].index.tolist()
coi = ['displayName', 'officialPosition', 'pff_beatenByDefender', 'pff_hitAllowed', 'pff_hurryAllowed', 'pff_sackAllowed']
# Apply the snap threshold. Previously `high_snap_ol` was computed but never used,
# so every C/G/T player was included regardless of how few rows they had — unlike
# the defensive-line table below. The groupby object is reused by the next cell.
ol_play_stats_by_player = (ol_rows.query(f"nflId in {high_snap_ol}")[coi].groupby(['displayName','officialPosition']))
# Ascending sort: the lowest beaten-by-defender rate comes first
display(ol_play_stats_by_player.mean().reset_index().sort_values(by="pff_beatenByDefender",ascending=True).head())

print("Defensive Linemen:")
high_snap_pass_rushers = player_level_df.query("officialPosition in ['NT','DT','DE']").nflId.value_counts().loc[lambda c: c > MIN_LINEMAN_SNAPS].index.tolist()
coi = ['displayName', 'officialPosition', 'pff_hit', 'pff_hurry', 'pff_sack', 'pff_pressure']
dl_play_stats_by_player = player_level_df.query(f"nflId in {high_snap_pass_rushers}")[coi].groupby(['displayName','officialPosition'])
# Descending sort: the highest sack rate comes first
display(dl_play_stats_by_player.mean().reset_index().sort_values(by="pff_sack",ascending=False).head())

In [None]:
# Per-player totals of the play-level PFF columns, reusing the groupby objects
# built in the previous cell
print("Play Stat Counts by Player")

print("Offensive Linemen:")
ol_play_stat_totals = ol_play_stats_by_player.sum().reset_index()
display(ol_play_stat_totals.sort_values(by="pff_beatenByDefender", ascending=True).head())

print("Defensive Linemen:")
dl_play_stat_totals = dl_play_stats_by_player.sum().reset_index()
display(dl_play_stat_totals.sort_values(by="pff_pressure", ascending=False).head())

In [None]:
# Mean snap_to_action_time and playResult per quarterback, limited to QBs with
# more than 30 rows in player_level_df; highest mean playResult first
coi = ['gameId','playId','nflId','displayName','officialPosition']
qb_snap_counts = player_level_df.query("officialPosition == 'QB'").nflId.value_counts()
high_snap_qbs = qb_snap_counts[qb_snap_counts > 30].index.tolist()
qb_df = player_level_df[coi].query(f"nflId in {high_snap_qbs}")

# Attach the play-level columns to each QB row, then average per player
qb_avg_play_res_df = (
    qb_df.merge(play_level_df, how='left', on=['gameId', 'playId'])
    [["displayName","snap_to_action_time","playResult"]]
    .groupby("displayName")
    .mean()
    .sort_values(by='playResult', ascending=False)
    .reset_index()
)
print("QB Mean Play Stats by Player:")
qb_avg_play_res_df.head()

In [None]:
# QB pass result counts by player
# NOTE(review): every statement in this cell is commented out, so the cell runs
# nothing and displays nothing. The disabled code depends on `qb_df` from the
# previous cell. TODO: either restore it or delete the cell.
# qb_pass_result_counts = (qb_df.merge(play_level_df[['gameId','playId','possessionTeam','passResult']], on = ['gameId', 'playId'], how='left')
#                 [["displayName", "possessionTeam", "passResult"]]
#                 .groupby(["displayName","possessionTeam","passResult"])
#                 .value_counts().reset_index(name="count").sort_values(by=["passResult", "count"], ascending=[True, False])
#                 .rename(columns={'possessionTeam':'team'}))
# print("QB Pass Result Counts:")
# qb_pass_result_counts.head()

In [None]:
# Side-by-side correlation heatmaps of the play-level PFF columns:
# offensive line on the left axes, defensive line on the right axes
f, (ax1, ax2) = plt.subplots(1, 2, figsize=[14, 7])
cmap = sns.diverging_palette(230, 20, as_cmap=True)

heatmap_specs = (
    (ax1, "OL Correlation Heatmap", "officialPosition in ['C','G','T']",
     ['pff_beatenByDefender', 'pff_hitAllowed', 'pff_hurryAllowed', 'pff_sackAllowed','pff_backFieldBlock']),
    (ax2, "DL Correlation Heatmap", "officialPosition in ['NT','DT','DE']",
     ['pff_hit', 'pff_hurry', 'pff_sack', 'pff_pressure']),
)
for ax, title, position_cond, coi in heatmap_specs:
    corr = player_level_df.query(position_cond)[coi].corr()
    # Hide the upper triangle (diagonal included); it only mirrors the lower one
    mask = np.triu(np.ones_like(corr, dtype=bool))
    sns.heatmap(
        corr,
        mask=mask,
        cmap=cmap,
        vmax=.3,
        center=0,
        square=True,
        linewidths=.5,
        cbar_kws={"shrink": .5},
        ax=ax,
    )
    ax.title.set_text(title)

### **Frame-level data**

In [None]:
# Preview the first three frame-level rows
frame_level_df.head(n=3)

In [None]:
# Column names, followed by a compact summary of the frame-level frame
print(f"Frame-level df columns:\n{list(frame_level_df.columns)}\n")
print("Frame-level df info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() emitted a stray "None" after the report.
frame_level_df.info(verbose=False)

#### **Univariate analysis**

In [None]:
# Summary statistics of the frame-level columns joined onto each lineman group
print("Frame-level Descriptive Stats")
coi = ['gameId','playId','nflId', 'x', 'y', 's', 'a', 'dis', 'o', 'dir']
for group_label, linemen_df in (
    ("Offensive Linemen:", ol_on_play_df),
    ("Defensive Linemen:", dl_on_play_df),
):
    print(group_label)
    # Left join keeps every lineman row; the join keys are dropped before describing
    linemen_frames = linemen_df.merge(frame_level_df[coi], on=['gameId','playId','nflId'], how='left')
    display(linemen_frames.drop(columns=["gameId","playId","nflId"]).describe().T)

#### **Bi/Multivariate analysis**

In [None]:
# Per-player means of the frame-level columns, sorted by mean `s` (highest first)
coi = ['gameId','playId','nflId', 'displayName', 'x', 'y', 's', 'a', 'dis', 'o', 'dir']
print("Frame-level Means by Player")
for group_label, linemen_df in (
    ("Offensive Linemen:", ol_on_play_df),
    ("Defensive Linemen:", dl_on_play_df),
):
    print(group_label)
    linemen_frames = linemen_df.merge(frame_level_df[coi], on=['gameId','playId','nflId'], how='left')
    player_means = (
        linemen_frames.drop(columns=["gameId","playId"])
        .groupby(["nflId","displayName"])
        .mean()
        .reset_index()
    )
    display(player_means.sort_values(by="s", ascending=False).head())

In [None]:
# Frame-level stat correlation matrices for OL/DL, drawn side by side
join_keys = ['gameId','playId','nflId']
# Only the join keys and the frame-level measurement columns are pulled in. The
# string column 'displayName' was previously included, and DataFrame.corr() raises
# on non-numeric columns in pandas >= 2.0 (older versions silently dropped them).
coi = join_keys + ['x', 'y', 's', 'a', 'dis', 'o', 'dir']
f, (ax1,ax2) = plt.subplots(1,2,figsize=[14,7])
cmap = sns.diverging_palette(230, 20, as_cmap=True)

for ax, title, linemen_df in (
    (ax1, "OL Correlation Heatmap", ol_on_play_df),
    (ax2, "DL Correlation Heatmap", dl_on_play_df),
):
    corr = (linemen_df.merge(frame_level_df[coi], how='left', on=join_keys)
            # All three identifiers are dropped, nflId included, matching the
            # frame-level describe cell; an id is not a stat to correlate.
            .drop(columns=join_keys)
            # Guard against non-numeric columns carried in by the lineman frame
            # NOTE(review): the schema of ol_on_play_df / dl_on_play_df is not visible here
            .select_dtypes(include=["number", "bool"])
            .corr())
    # Hide the upper triangle (diagonal included); it only mirrors the lower one
    mask = np.triu(np.ones_like(corr, dtype=bool))
    sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
                square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
    ax.title.set_text(title)