# **2023 NFL Big Data Bowl**
### Sol Ben-Ishay

## **Feature Engineering**

In [99]:
%run extract&wrangle.ipynb

### **Pre-Snap Features**

#### **DL Alignments**

*Get the pre-snap alignment (edge, head-up, or in gap) of each defensive lineman (DL) on each play*

In [137]:
# Columns needed to place a player on the field for a given frame
coi = ['gameId', 'playId', 'nflId', 'frameId', 'y']


def _y_two_frames_before_snap(pos_df):
    """Return gameId/playId/nflId/y for the frame two frames ahead of each play's snap frame."""
    with_snap = pos_df[['gameId', 'playId', 'nflId', 'frameId', 'y']].merge(
        frame_of_snap_df, on=['gameId', 'playId'], how='left', suffixes=['', '_snap'])
    is_presnap_frame = with_snap['frameId'] == (with_snap['frameId_snap'] - 2)
    return with_snap[is_presnap_frame].drop(columns=["frameId", "frameId_snap"])


# 1. y position of every DL two frames before the snap
pass_rushers_pos_presnap_df = _y_two_frames_before_snap(dl_pos_df)

# 2. y position of every OL/LOS blocker two frames before the snap
ol_pos_presnap_df = _y_two_frames_before_snap(ol_pos_df)

# 3. One row per (game, play, d-lineman) holding the sorted list of blocker y positions on that play
_dl_vs_ol = (pass_rushers_pos_presnap_df
             .merge(ol_pos_presnap_df, on=['gameId', 'playId'], how='left', suffixes=['_dl', '_ol'])
             .drop(columns=["nflId_ol"]))
dl_alignment_df = (_dl_vs_ol
                   .groupby(["gameId", "playId", "nflId_dl", "y_dl"])['y_ol']
                   .apply(list)
                   .reset_index()
                   .sort_values(by=["gameId", "playId", "y_dl"]))
dl_alignment_df['y_ol'] = dl_alignment_df['y_ol'].apply(sorted)

# 4. Conditions for each alignment (edge, head-up, or in gap)
threshold_for_edge = .5
threshold_for_head_up = .25

# Edge: the DL sits more than `threshold_for_edge` outside the outermost blocker on either side
_outermost_low_y = dl_alignment_df['y_ol'].apply(lambda ys: min(ys))
_outermost_high_y = dl_alignment_df['y_ol'].apply(lambda ys: max(ys))
edge_cond = (((dl_alignment_df['y_dl'] + threshold_for_edge) < _outermost_low_y) |
             ((dl_alignment_df['y_dl'] - threshold_for_edge) > _outermost_high_y))


def head_up_lambda(row):
    """Absolute y offset between the DL and the blocker closest to him in y."""
    offsets = np.abs(np.asarray(row['y_ol']) - row['y_dl'])
    return offsets[offsets.argmin()]


# Head-up: not on the edge and within `threshold_for_head_up` of the closest blocker
head_up_cond = ~edge_cond & (dl_alignment_df.apply(head_up_lambda, axis=1) < threshold_for_head_up)

# Gap: everything that is neither edge nor head-up
gap_cond = ~(edge_cond | head_up_cond)

# 5. Label each DL with his alignment
for _label, _cond in (('edge', edge_cond), ('head_up', head_up_cond), ('gap', gap_cond)):
    dl_alignment_df.loc[_cond, 'dl_alignment'] = _label

dl_alignment_df = dl_alignment_df.rename(columns={"nflId_dl": "nflId"}).drop(columns=['y_dl', 'y_ol'])

### **Intra-Play Features**

*These features can either be used as is for intra-play predictions or can be modified by taking a historical measure for pre-snap predictions*

#### **DL Num of Blockers**

In [138]:
# 1. Filter for pass-blocking players at the positions that can block a rusher
ol = ['C','G','T','TE','FB','RB']
conds = f"pff_role == 'Pass Block' and officialPosition in {ol}"

# 2. Blocking assignment of each of those players (who blocked whom, and whether the blocker was beaten)
coi = ['gameId', 'playId', 'nflId', 'pff_nflIdBlockedPlayer', 'pff_beatenByDefender']
ol_blocks_df = (
    player_level_df
    .query(conds)[coi]
    .rename(columns={
        "nflId": "nflIdBlocker",
        "pff_nflIdBlockedPlayer": "nflIdDefender",
        "pff_beatenByDefender": "beatBlocker",
    })
)

# 3. Count the blockers assigned to each defender on each play
dl_num_blockers_df = (
    ol_blocks_df
    .groupby(['gameId', 'playId', 'nflIdDefender'])['nflIdBlocker']
    .count()
    .reset_index()
    .rename(columns={'nflIdDefender': 'nflId', 'nflIdBlocker': 'numBlockers'})
)

# Flag defenders that drew more than one blocker ("Y") versus exactly one ("N")
_blocker_counts = dl_num_blockers_df['numBlockers']
dl_num_blockers_df.loc[_blocker_counts > 1, 'multBlockers'] = "Y"
dl_num_blockers_df.loc[_blocker_counts == 1, 'multBlockers'] = "N"

#### **DL Metrics at Key Points in Time**

*Get the tracking metrics + depth behind the LOS + dist of each of the DL from the QB for specific points in time after the snap for each play*

In [139]:
def get_dl_metrics_at_time(time_after_snap):
    """
    Returns a dataframe with the metrics of every DL at the selected point in time after the snap

    The tracking and play frames are read from the notebook namespace: qb_pos_df, dl_pos_df,
    frame_of_snap_df, frame_of_action_df and play_level_df.

    Args:
        time_after_snap: Either a number (int or float) giving the time in seconds after the
            snap, or the string "action" to use the frame listed for each play in
            frame_of_action_df. Numeric times are converted to a whole frame offset as
            round(time_after_snap * 10), i.e. frames are treated as 0.1 s apart.
    Returns:
        A dataframe keyed by gameId, playId and nflId with the columns x_at_<t>, y_at_<t>,
        speed_at_<t>, o_at_<t>, dir_at_<t>, depth_at_<t> and dist_from_qb_at_<t>, where <t>
        is str(time_after_snap) for numeric input and "action" otherwise.
    Raises:
        ValueError: If time_after_snap is neither a number nor "action".
    """
    play_keys = ['gameId', 'playId']

    if isinstance(time_after_snap, (int, float)):
        col_str = str(time_after_snap)
        # Snap to a whole frame: 0.7 * 10 evaluates to 7.000000000000001, which never equals an
        # integer frameId and would silently yield an empty result.
        num_frames_after_snap = int(round(time_after_snap * 10))
        at_offset = f"frameId == (frameId_snap+{num_frames_after_snap})"

        # 1. QB tracking rows at the selected time
        qb_rows = (qb_pos_df
                   .merge(frame_of_snap_df, on=play_keys, how='left', suffixes=['', '_snap'])
                   .query(at_offset)
                   .drop(columns=["frameId", "frameId_snap"]))

        # 2. DL tracking rows at the selected time
        dl_rows = (dl_pos_df
                   .merge(frame_of_snap_df, on=play_keys, how='left', suffixes=['', '_snap'])
                   .query(at_offset)
                   .drop(columns=["frameId", "frameId_snap"]))

    elif time_after_snap == "action":
        col_str = "action"
        frame_keys = play_keys + ['frameId']

        # 1. QB tracking rows on the action frame (left join: plays without a match keep NaN values)
        qb_rows = (frame_of_action_df
                   .merge(qb_pos_df, on=frame_keys, how='left')
                   .drop(columns=["frameId"]))

        # 2. DL tracking rows on the action frame
        dl_rows = (frame_of_action_df
                   .merge(dl_pos_df, on=frame_keys, how='left')
                   .drop(columns=["frameId"]))

    else:
        raise ValueError("Invalid input to 'time_after_snap!'")

    # 3. Tag the tracking columns with the selected time; only x/y are kept for the QB
    col_names = {"x": f"x_at_{col_str}", "y": f"y_at_{col_str}", "s": f"speed_at_{col_str}",
                 "o": f"o_at_{col_str}", "dir": f"dir_at_{col_str}"}
    x_col, y_col = col_names["x"], col_names["y"]
    depth_col = f'depth_at_{col_str}'
    dist_col = f'dist_from_qb_at_{col_str}'

    qb_pos_at_key_df = (qb_rows
                        .drop(columns=["nflId", "s", "o", "dir", "playDirection"])
                        .rename(columns=col_names))
    pass_rushers_pos_at_key_df = dl_rows.rename(columns=col_names)

    # 4. Depth of the DL relative to absoluteYardlineNumber; the sign convention flips with
    #    playDirection, and rows with any other playDirection value (e.g. NaN) get NaN.
    pass_rushers_depth_at_key_df = pass_rushers_pos_at_key_df.merge(
        play_level_df[['gameId', 'playId', 'absoluteYardlineNumber']], on=play_keys, how='left')
    los_x = pass_rushers_depth_at_key_df['absoluteYardlineNumber']
    dl_x = pass_rushers_depth_at_key_df[x_col]
    play_dir = pass_rushers_depth_at_key_df['playDirection']
    pass_rushers_depth_at_key_df[depth_col] = np.where(
        play_dir == 'right', los_x - dl_x,
        np.where(play_dir == 'left', dl_x - los_x, np.nan))
    pass_rushers_depth_at_key_df = pass_rushers_depth_at_key_df[
        ['gameId', 'playId', 'nflId', x_col, y_col, f'speed_at_{col_str}',
         f'o_at_{col_str}', f'dir_at_{col_str}', depth_col]]

    # 5. Euclidean distance of each DL from the QB at the selected time
    rusher_dist_at_key_df = pass_rushers_pos_at_key_df.merge(
        qb_pos_at_key_df, how='left', on=play_keys, suffixes=['_pr', '_qb'])
    dx = rusher_dist_at_key_df[f'{x_col}_qb'] - rusher_dist_at_key_df[f'{x_col}_pr']
    dy = rusher_dist_at_key_df[f'{y_col}_qb'] - rusher_dist_at_key_df[f'{y_col}_pr']
    rusher_dist_at_key_df[dist_col] = np.sqrt(dx**2 + dy**2)
    rusher_dist_at_key_df = rusher_dist_at_key_df[['gameId', 'playId', 'nflId', dist_col]]

    # 6. Combine depth/tracking metrics with the distance from the QB
    pass_rushers_at_key_df = pass_rushers_depth_at_key_df.merge(
        rusher_dist_at_key_df, on=['gameId', 'playId', 'nflId'], how='outer')

    return pass_rushers_at_key_df

In [140]:
# DL metrics 1.5 seconds after the snap
pass_rushers_at_key_df = get_dl_metrics_at_time(1.5)

# DL metrics 2 seconds after the snap
pass_rushers_at_key2_df = get_dl_metrics_at_time(2)

# DL metrics at action -- only the distance from the QB is kept for now
_action_dist_cols = ['gameId', 'playId', 'nflId', 'dist_from_qb_at_action']
rusher_dist_at_action_df = get_dl_metrics_at_time("action")[_action_dist_cols]

In [144]:
# Show the largest DL-to-QB distances at action first; rows with a missing distance sort to the end.
rusher_dist_at_action_df.sort_values(by='dist_from_qb_at_action',ascending=False)

Unnamed: 0,gameId,playId,nflId,dist_from_qb_at_action
10673,2021100307,156,53053.0,39.241072
2569,2021091209,243,40051.0,38.487465
9690,2021092612,2662,52556.0,37.402327
11163,2021100308,1400,38542.0,36.732026
9143,2021092609,497,43308.0,36.577869
...,...,...,...,...
3919,2021091600,4489,,
6643,2021091912,4213,,
6704,2021091913,1198,,
6705,2021091913,1220,,


#### **DL Time to Depth**

In [68]:
# 1. Set the depth and the condition for when a player is at depth depending on the play direction
#    (x at least `depth` below absoluteYardlineNumber on 'right' plays, at least `depth` above it on 'left' plays)
depth = 3
right_play_dir_cond = f"(playDirection == 'right') and (x <= absoluteYardlineNumber - {depth})"
left_play_dir_cond = f"(playDirection == 'left') and (x >= absoluteYardlineNumber + {depth})"

# 2. Get all the frames/times at depth for each DL (officialPosition NT, DT or DE)
time_at_depth_df = (play_level_df[['gameId','playId','absoluteYardlineNumber']]
                .merge(frame_level_df, on = ['gameId','playId'], how = 'left')
                .merge(player_level_df[['gameId','playId','nflId', 'officialPosition']], on= ['gameId','playId','nflId'], how='left')
                .query(f"officialPosition in ['NT','DT','DE'] and ({right_play_dir_cond} or {left_play_dir_cond})"))

# 3. Get the minimum frame/time at depth for each DL
coi = ['gameId','playId','nflId', 'displayName', 'time']
min_time_at_depth_df = (time_at_depth_df[coi]
                        .groupby(['gameId', 'playId', 'nflId', 'displayName'])
                        .min()
                        .reset_index()
                        .rename(columns={'time':'depth_time'}))

# 4. Get the time of snap for each play (to calculate the time after snap at depth)
coi = ['gameId','playId','time']
time_of_snap_df = play_event_times_df.query("event == 'ballsnap'")[coi].rename(columns={'time':'snap_time'})

# 5. Get the time after snap to depth for each DL (if they got to depth)
time_to_depth_df = min_time_at_depth_df.merge(time_of_snap_df, on=['gameId','playId'], how='left')
time_to_depth_df["time_to_depth"] = (time_to_depth_df['depth_time'] - time_to_depth_df['snap_time']).dt.total_seconds()
time_to_depth_df = time_to_depth_df[['gameId', 'playId', 'nflId', 'time_to_depth']].sort_values(by='time_to_depth')

# 6. Drop weird time_to_depth values less than zero (means the player was 3 yds behind the LOS before the snap, bad tracking data?)
#    Filter on the value itself rather than on one hard-coded (gameId, playId, nflId), so the rule
#    still holds if the data is refreshed. Written as ~(... < 0) so rows with a missing time are kept.
time_to_depth_df = time_to_depth_df.query("~(time_to_depth < 0)")

#### **QB Lateral Movement**

In [7]:
def _qb_x_on_frame(frames_df):
    """Return gameId/playId plus the QB's x coordinate on the frame listed for each play in `frames_df`."""
    qb_on_frame = frames_df.merge(qb_pos_df, on=['gameId', 'playId', 'frameId'], how='left')
    return qb_on_frame.drop(columns=["nflId", "frameId", "y", "s", "o", "dir", "playDirection"])


# 1. x position of the QB on the snap frame
qb_pos_at_snap_df = _qb_x_on_frame(frame_of_snap_df)

# 2. x position of the QB on the action frame
qb_pos_at_release_df = _qb_x_on_frame(frame_of_action_df)

# 3. Change in the QB's x position between the two frames
# NOTE(review): this is a raw x difference and playDirection is dropped before it is computed,
# so the sign presumably depends on the direction of the play -- confirm before using as a feature.
qb_lateral_movement_df = qb_pos_at_snap_df.merge(
    qb_pos_at_release_df, on=['gameId', 'playId'], how='left', suffixes=['_snap', '_rel'])
qb_lateral_movement_df['qb_x_movement'] = qb_lateral_movement_df['x_rel'].sub(qb_lateral_movement_df['x_snap'])

qb_lateral_movement_df

Unnamed: 0,gameId,playId,x_snap,x_rel,qb_x_movement
0,2021090900,97,37.64,32.99,-4.65
1,2021090900,137,113.20,116.48,3.28
2,2021090900,187,81.26,84.02,2.76
3,2021090900,282,50.72,58.98,8.26
4,2021090900,349,58.79,63.68,4.89
...,...,...,...,...,...
8528,2021110100,4310,22.74,26.40,3.66
8529,2021110100,4363,29.88,25.16,-4.72
8530,2021110100,4392,32.91,29.20,-3.71
8531,2021110100,4411,24.87,21.98,-2.89


#### **Distance of the QB From the Position the Defender Lined Up at Snap**

In [63]:
_frame_keys = ['gameId', 'playId', 'frameId']
_qb_unused_cols = ["nflId", "frameId", "s", "o", "dir", "playDirection"]

# 1. x/y position of the QB on the snap frame
qb_pos_at_snap_df = (
    frame_of_snap_df
    .merge(qb_pos_df, on=_frame_keys, how='left', suffixes=['', '_snap'])
    .drop(columns=_qb_unused_cols)
)

# 2. x/y position of the QB on the action frame
qb_pos_at_action_df = (
    frame_of_action_df
    .merge(qb_pos_df, on=_frame_keys, how='left', suffixes=['', '_snap'])
    .drop(columns=_qb_unused_cols)
)

# 3. x/y position of each DL on the snap frame
pass_rushers_pos_at_snap_df = (
    frame_of_snap_df
    .merge(dl_pos_df, on=_frame_keys, how='left', suffixes=['', '_snap'])
    .drop(columns=["frameId", "s", "o", "dir", "playDirection"])
)

# 4. Euclidean distance between where each DL lined up and the QB, both on the snap frame
_lineup_vs_qb_at_snap = pass_rushers_pos_at_snap_df.merge(
    qb_pos_at_snap_df, how='left', on=['gameId', 'playId'], suffixes=['_pr', '_qb'])
_dx = _lineup_vs_qb_at_snap['x_qb'] - _lineup_vs_qb_at_snap['x_pr']
_dy = _lineup_vs_qb_at_snap['y_qb'] - _lineup_vs_qb_at_snap['y_pr']
_lineup_vs_qb_at_snap['pos_lined_up_dist_from_qb_at_snap'] = np.sqrt(_dx**2 + _dy**2)
rusher_pos_lined_up_dist_at_snap_df = _lineup_vs_qb_at_snap[
    ['gameId', 'playId', 'nflId', 'pos_lined_up_dist_from_qb_at_snap']]

rusher_pos_lined_up_dist_at_snap_df.head()


Unnamed: 0,gameId,playId,nflId,pos_lined_up_dist_from_qb_at_snap
0,2021090900,97,41263.0,7.802083
1,2021090900,97,42403.0,10.452009
2,2021090900,97,44955.0,5.783641
3,2021090900,97,53504.0,6.521633
4,2021090900,137,35441.0,6.466065


#### **Difference in the Distance of the QB From the Position the Defender Lined Up between Snap & Action**

In [64]:
# 1. Euclidean distance between where each DL lined up (snap frame) and the QB on the action frame
col_str = 'action'
_dist_at_action_col = f'pos_lined_up_dist_from_qb_at_{col_str}'

_lineup_vs_qb_at_action = pass_rushers_pos_at_snap_df.merge(
    qb_pos_at_action_df, how='left', on=['gameId', 'playId'], suffixes=['_pr', '_qb'])
_dx = _lineup_vs_qb_at_action['x_qb'] - _lineup_vs_qb_at_action['x_pr']
_dy = _lineup_vs_qb_at_action['y_qb'] - _lineup_vs_qb_at_action['y_pr']
_lineup_vs_qb_at_action[_dist_at_action_col] = np.sqrt(_dx**2 + _dy**2)
rusher_pos_lined_up_dist_at_action_df = _lineup_vs_qb_at_action[
    ['gameId', 'playId', 'nflId', _dist_at_action_col]]

# 2. Change in that distance between snap and action (action minus snap)
rusher_pos_lined_up_dist_diff_df = rusher_pos_lined_up_dist_at_snap_df.merge(
    rusher_pos_lined_up_dist_at_action_df, on=['gameId', 'playId', 'nflId'], how='left')
rusher_pos_lined_up_dist_diff_df["pos_lined_up_diff_dist_from_qb"] = (
    rusher_pos_lined_up_dist_diff_df["pos_lined_up_dist_from_qb_at_action"]
    .sub(rusher_pos_lined_up_dist_diff_df["pos_lined_up_dist_from_qb_at_snap"]))
rusher_pos_lined_up_dist_diff_df = rusher_pos_lined_up_dist_diff_df[
    ["gameId", "playId", "nflId", "pos_lined_up_diff_dist_from_qb"]]
rusher_pos_lined_up_dist_diff_df.head()

Unnamed: 0,gameId,playId,nflId,pos_lined_up_diff_dist_from_qb
0,2021090900,97,41263.0,2.765722
1,2021090900,97,42403.0,5.341803
2,2021090900,97,44955.0,5.310975
3,2021090900,97,53504.0,5.499835
4,2021090900,137,35441.0,3.074737


#### **Closest Distance the Defender got to the QB**

In [205]:
coi = ['gameId','playId','nflId','frameId','x','y']

# Pair every DL tracking row with the QB's position on the same frame
_qb_track = qb_pos_df[coi].drop(columns="nflId")
_dl_with_qb = dl_pos_df[coi].merge(
    _qb_track, on=['gameId', 'playId', 'frameId'], how='left', suffixes=['_dl', '_qb'])

# Keep only the frames after each play's snap frame
_dl_with_qb = _dl_with_qb.merge(frame_of_snap_df, on=['gameId', 'playId'], suffixes=['_pos', '_snap'])
_is_after_snap = _dl_with_qb['frameId_pos'] > _dl_with_qb['frameId_snap']
rusher_min_dist_df = _dl_with_qb[_is_after_snap].drop(columns=['frameId_snap', 'frameId_pos'])

# Frame-by-frame Euclidean distance from the QB
_dx = rusher_min_dist_df['x_qb'] - rusher_min_dist_df['x_dl']
_dy = rusher_min_dist_df['y_qb'] - rusher_min_dist_df['y_dl']
rusher_min_dist_df['dist_from_qb'] = np.sqrt(_dx**2 + _dy**2)
rusher_min_dist_df = rusher_min_dist_df.drop(columns=['x_dl', 'y_dl', 'x_qb', 'y_qb'])

# Smallest distance reached by each DL on each play
rusher_min_dist_df = (
    rusher_min_dist_df
    .groupby(["gameId", "playId", "nflId"])
    .min()
    .reset_index()
    .rename(columns={'dist_from_qb': 'min_dist_from_qb'})
)
rusher_min_dist_df.head()

Unnamed: 0,gameId,playId,nflId,min_dist_from_qb
0,2021090900,97,41263,2.843835
1,2021090900,97,42403,3.153173
2,2021090900,97,44955,5.494879
3,2021090900,97,53504,1.113059
4,2021090900,137,35441,3.469308


In [206]:
# Descriptive stats of the rusher min dist to the QB grouped by the rush result
rusher_min_dist_stats_by_result_df = (rusher_min_dist_df
.merge(player_level_df.query(f"pff_role == 'Pass Rush'")[['gameId','playId','nflId','pff_sack','pff_hit','pff_hurry']],
on=['gameId','playId','nflId'],
how='left'))

rusher_min_dist_stats_by_result_df.loc[(rusher_min_dist_stats_by_result_df.pff_sack + rusher_min_dist_stats_by_result_df.pff_hit + rusher_min_dist_stats_by_result_df.pff_hurry) == 0, 'rush_result'] = 'None'
rusher_min_dist_stats_by_result_df.loc[(rusher_min_dist_stats_by_result_df.pff_sack) == 1, 'rush_result'] = 'Sack'
rusher_min_dist_stats_by_result_df.loc[(rusher_min_dist_stats_by_result_df.pff_hit) == 1, 'rush_result'] = 'Hit'
rusher_min_dist_stats_by_result_df.loc[(rusher_min_dist_stats_by_result_df.pff_hurry) == 1, 'rush_result'] = 'Hurry'

rusher_min_dist_stats_by_result_df[["min_dist_from_qb","rush_result"]].groupby("rush_result").describe().transpose()

Unnamed: 0,rush_result,Hit,Hurry,None,Sack
min_dist_from_qb,count,502.0,1827.0,23172.0,335.0
min_dist_from_qb,mean,0.764778,1.420224,3.568869,0.509455
min_dist_from_qb,std,0.465433,0.758106,1.353846,0.550208
min_dist_from_qb,min,0.05831,0.05099,0.022361,0.0
min_dist_from_qb,25%,0.495353,0.895067,2.550176,0.221472
min_dist_from_qb,50%,0.707107,1.237619,3.481997,0.384838
min_dist_from_qb,75%,0.977084,1.793237,4.598108,0.619273
min_dist_from_qb,max,6.537109,5.796387,9.171096,4.927606


#### **Defender Penalty on Play**

In [269]:
# Flag (1/0) whether each DL on the play is listed as one of the play's fouling players
play_level_coi = ['gameId','playId','foulNFLId1','foulNFLId2','foulNFLId3']
_foul_id_cols = ['foulNFLId1', 'foulNFLId2', 'foulNFLId3']

penalty_df = dl_on_play_df.merge(play_level_df[play_level_coi], on=['gameId', 'playId'], how='left')

# Cast the player id to float before comparing it with the foul id columns
penalty_df['nflId'] = penalty_df['nflId'].astype(float)

# True when the player's id equals any of the three foul ids on the play
_committed_foul = penalty_df[_foul_id_cols].eq(penalty_df['nflId'], axis=0).any(axis=1)
penalty_df.loc[_committed_foul, 'penalty'] = 1
penalty_df.loc[~_committed_foul, 'penalty'] = 0

penalty_df = penalty_df.drop(columns=_foul_id_cols)