# Full Training

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import StackingRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import warnings
from tqdm.notebook import tqdm
from lightgbm import LGBMRegressor
from itertools import product

In [2]:
import dill
import pickle

In [3]:
# Silence known-noisy warnings so that cell outputs stay readable.
# NOTE(review): the category-wide filters below (UserWarning, FutureWarning,
# DeprecationWarning, RuntimeWarning) apply to every library used in this notebook,
# so they can also hide warnings that point at real problems.
warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", message="X does not have valid feature names")
warnings.filterwarnings("ignore", message="DataFrame.fillna with 'method' is deprecated")
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [4]:
# Show every column and every row when a frame is displayed.
# NOTE(review): with max_rows=None, displaying a large frame without .head() renders all rows.
pd.set_option('display.max_columns', None) 
pd.set_option('display.max_rows', None) 

In [5]:
tqdm.pandas()

## Load Data

In [None]:
# Week tag used by the (currently commented-out) weekly file paths in the load cells below;
# the active fold-based loads do not reference it.
week = 'w01'

In [6]:
# Alternative sources (weekly subset / full weekly file), kept for quick switching:
# input_2023 = pd.read_csv(f'../data/train/input_2023_{week}_subset.csv')
# input_2023 = pd.read_csv(f'../data/train/input_2023_{week}.csv')
# Active source: the TRAIN split of fold 1 (the f-prefix is redundant here, there is no placeholder).
input_2023 = pd.read_csv(f'../data/folds/fold1/TRAIN_input.csv')

In [7]:
# Alternative sources (weekly subset / full weekly file), kept for quick switching:
# output_2023 = pd.read_csv(f'../data/train/output_2023_{week}_subset.csv')
# output_2023 = pd.read_csv(f'../data/train/output_2023_{week}.csv')
# Active source: the TRAIN split of fold 1 (the f-prefix is redundant here, there is no placeholder).
output_2023 = pd.read_csv(f'../data/folds/fold1/TRAIN_output.csv')

In [8]:
input_2023.shape

(3905662, 24)

In [9]:
output_2023.shape

(451017, 7)

## Merge Input and Output

In [10]:
def merge_input_output_df(input_df, output_df):
    """Stack input and output tracking rows into one frame, per player and play.

    Columns that exist only in ``input_df`` are created on the output rows and
    forward-filled from the same player's input rows within each
    (game_id, play_id, nfl_id) group. A ``source`` column ('input' / 'output')
    records where each row came from.

    Neither ``input_df`` nor ``output_df`` is modified; the work is done on
    tagged copies, so re-running the calling cell gives the same result.

    Parameters
    ----------
    input_df : pd.DataFrame
        Input rows; must contain game_id, play_id and nfl_id.
    output_df : pd.DataFrame
        Output rows; must contain game_id, play_id and nfl_id.

    Returns
    -------
    pd.DataFrame
        Input rows followed by output rows for every player/play, sorted by
        the keys and ``source``, with a fresh 0..n-1 index.
    """
    keys = ['game_id', 'play_id', 'nfl_id']

    # Tag a copy of the input rows instead of adding `source` to the caller's frame.
    input_tagged = input_df.assign(source='input')

    # Copy the output rows and add every input-only column as NaN so both
    # frames share the same schema before stacking.
    output_tagged = output_df.copy()
    for col in input_df.columns:
        if col not in output_tagged.columns:
            output_tagged[col] = np.nan
    output_tagged['source'] = 'output'

    # Columns to forward-fill: everything the input side carries (incl. `source`,
    # which is never NaN and is therefore left unchanged by the fill).
    fill_cols = list(input_tagged.columns)

    # Stack dataframes vertically
    combined_df = pd.concat([input_tagged, output_tagged], ignore_index=True)

    # Sort by keys and source so that we can forward fill appropriately:
    # 'input' sorts before 'output', so each player's input rows precede the output rows.
    combined_df = combined_df.sort_values(by=keys + ['source'], ascending=[True] * 4)

    # Forward fill the missing fields in the output records using the values from the input.
    combined_df[fill_cols] = combined_df.groupby(keys)[fill_cols].ffill()

    return combined_df.reset_index(drop=True)

In [11]:
combined_df = merge_input_output_df(input_2023, output_2023)

In [12]:
combined_df.shape

(4356679, 25)

## Standardize XY Positions

In [13]:
def standardize_by_play_direction(df):
    """Rotate every 'left' play by 180 degrees so all plays share one orientation.

    For rows whose ``play_direction`` is 'left', x-type columns become
    ``120 - value``, y-type columns become ``53.3 - value`` and the ``dir`` /
    ``o`` angles are shifted by 180 degrees (mod 360). Other rows are untouched.

    The frame is modified in place and also returned, so calling this twice
    on the same frame flips the 'left' plays back.
    """
    moving_left = df['play_direction'] == 'left'

    # Mirror positions (player and ball landing spot) across both field axes.
    mirrored_extents = (
        ('x', 120),
        ('ball_land_x', 120),
        ('y', 53.3),
        ('ball_land_y', 53.3),
    )
    for column, extent in mirrored_extents:
        df.loc[moving_left, column] = extent - df[column]

    # A 180-degree rotation of the field adds 180 degrees to every heading.
    for angle_column in ('dir', 'o'):
        df.loc[moving_left, angle_column] = (df[angle_column] + 180) % 360

    return df

In [14]:
std_df = standardize_by_play_direction(combined_df)

In [15]:
std_df.shape

(4356679, 25)

## Engineer Features

In [16]:
def vectorize_kinematics(df: pd.DataFrame):
    """Add finite-difference velocity and acceleration components per player.

    Within each (game_id, play_id, nfl_id) group, in the frame's current row
    order, this adds ``x_shift`` / ``y_shift`` (previous position),
    ``velocity_x`` / ``velocity_y`` (position delta divided by 0.1) and
    ``acc_x`` / ``acc_y`` (velocity delta divided by 0.1). The first row of a
    group has NaN velocity; the first two have NaN acceleration.

    The frame is modified in place and also returned.
    """
    player_keys = ['game_id', 'play_id', 'nfl_id']
    frame_step = 0.1  # presumably seconds between consecutive frames (10 Hz) - confirm
    axes = ('x', 'y')

    # Previous position of the same player.
    for axis in axes:
        df[f'{axis}_shift'] = df.groupby(player_keys)[axis].shift(1)

    # First difference of position.
    for axis in axes:
        df[f'velocity_{axis}'] = (df[axis] - df[f'{axis}_shift']) / frame_step

    # First difference of velocity.
    for axis in axes:
        df[f'acc_{axis}'] = df.groupby(player_keys)[f'velocity_{axis}'].diff() / frame_step

    return df

In [17]:
def calculate_land_spot_bearing(df: pd.DataFrame):
    """Add direction and distance from each row's (x, y) to the ball landing spot.

    New columns:
      * ``land_spot_dir``  - angle of the vector to (ball_land_x, ball_land_y),
        in degrees within [0, 360), measured with ``arctan2(dy, dx)``.
      * ``land_spot_dirx`` / ``land_spot_diry`` - cosine / sine of that angle.
      * ``land_spot_dist`` - Euclidean distance to the landing spot.

    The frame is modified in place and also returned.
    """
    # Offset from the player to the landing spot.
    dx = df['ball_land_x'] - df['x']
    dy = df['ball_land_y'] - df['y']

    # Bearing in degrees, wrapped into [0, 360).
    df['land_spot_dir'] = np.degrees(np.arctan2(dy, dx)) % 360

    # Unit-vector components of the bearing.
    bearing_rad = np.radians(df['land_spot_dir'])
    df['land_spot_dirx'] = np.cos(bearing_rad)
    df['land_spot_diry'] = np.sin(bearing_rad)

    # Straight-line distance to the landing spot.
    df['land_spot_dist'] = np.sqrt(dx**2 + dy**2)

    return df

In [18]:
def calculate_field_distances(df: pd.DataFrame):
    """Add the remaining distance to the y = 53.3 and x = 120 field boundaries.

    New columns:
      * ``dist_sideline`` = 53.3 - y (distance to one sideline only, not the nearest).
      * ``dist_endzone``  = 120 - x  (distance to the x = 120 end of the field).

    The frame is modified in place and also returned.
    """
    y_limit = 53.3
    x_limit = 120

    df['dist_sideline'] = y_limit - df['y']
    df['dist_endzone'] = x_limit - df['x']

    return df

In [19]:
def get_closest_defender_bearing(df: pd.DataFrame):
    """Attach, to each targeted-receiver row, the nearest defender's id, distance and direction.

    For every (game_id, play_id, frame_id, source) group the first
    'Targeted Receiver' row is compared against all 'Defense' rows of that
    group. The nearest defender's ``nfl_id`` (``closest_def_id``), Euclidean
    distance (``closest_def_dist``) and unit direction from receiver to
    defender (``closest_def_dir_x`` / ``closest_def_dir_y``) are merged back
    onto the receiver's row. Groups without defenders get id -1 and zeros.
    Rows whose ``player_side`` is not 'Offense' get NaN in the new columns.

    Parameters
    ----------
    df : pd.DataFrame
        Needs game_id, play_id, frame_id, source, nfl_id, player_role,
        player_side, x and y. Any index is accepted.

    Returns
    -------
    pd.DataFrame
        ``df`` (left-merged, fresh RangeIndex) plus the new columns and the
        index artefact column(s) produced by ``reset_index()`` (e.g. ``level_4``).

    Raises
    ------
    IndexError
        If a group contains no 'Targeted Receiver' row.
    """
    frame_keys = ['game_id', 'play_id', 'frame_id', 'source']
    merge_keys = frame_keys + ['nfl_id']

    def closest(group: pd.DataFrame):
        # The targeted receiver of this frame (first match is used).
        off = group[group['player_role'] == 'Targeted Receiver'].iloc[0]

        # all defenders
        defs = group[group['player_side'] == 'Defense']

        if defs.empty:
            # Sentinel row: no defender in this frame.
            return pd.DataFrame({
                'nfl_id': off.nfl_id,
                'closest_def_id': [-1],
                'closest_def_dist': [0],
                'closest_def_dir_x': [0],
                'closest_def_dir_y': [0]
            })

        # Get x, y directions from offense to each defender
        closest_def_dir = np.degrees(np.arctan2(defs['y'] - off['y'], defs['x'] - off['x'])) % 360
        closest_def_dir_x = np.cos(np.radians(closest_def_dir))
        closest_def_dir_y = np.sin(np.radians(closest_def_dir))

        # compute Euclidean distances
        closest_def_dist = np.sqrt((defs['x'] - off['x'])**2 + (defs['y'] - off['y'])**2)

        # index of defender with min distance
        idxmin = closest_def_dist.idxmin()

        return pd.DataFrame([{
            'nfl_id': off.nfl_id,
            'closest_def_id': defs.loc[idxmin, 'nfl_id'],
            'closest_def_dist': closest_def_dist.loc[idxmin],
            'closest_def_dir_x': closest_def_dir_x.loc[idxmin],
            'closest_def_dir_y': closest_def_dir_y.loc[idxmin]
        }])

    grouped = df.groupby(frame_keys)
    # `progress_apply` only exists once `tqdm.pandas()` has been called; fall back to
    # the plain `apply` so the function does not depend on that notebook state.
    apply_per_frame = getattr(grouped, 'progress_apply', grouped.apply)
    result = apply_per_frame(closest).reset_index()

    # Merge back to original dataframe
    merged = df.merge(
        result,
        on=merge_keys,
        how='left',
        suffixes=('', '_y')
    )

    # Identify offense rows. The mask is taken from `merged` (not `df`) because
    # `where` aligns on index labels and `merged` always carries a fresh RangeIndex.
    mask = merged['player_side'] == 'Offense'

    # Keep merged values only for offense rows
    for col in result.columns:
        if col not in merge_keys:
            merged[col] = merged[col].where(mask)

    return merged

In [20]:
def get_closest_defender_kinematics(df: pd.DataFrame):
    """Attach, to each targeted-receiver row, the nearest defender's velocity and acceleration.

    For every (game_id, play_id, frame_id, source) group the first
    'Targeted Receiver' row is compared against all 'Defense' rows of that
    group; the defender with the smallest Euclidean distance supplies
    ``closest_def_velo_x/y`` and ``closest_def_acc_x/y`` (copied from its
    ``velocity_x/y`` and ``acc_x/y``). Groups without defenders get id -1 and
    zeros. Rows whose ``player_side`` is not 'Offense' get NaN in the new columns.

    If ``df`` already has a ``closest_def_id`` column, the one computed here
    is added as ``closest_def_id_y`` (merge suffix) and the existing column
    is the one that gets masked.

    Parameters
    ----------
    df : pd.DataFrame
        Needs game_id, play_id, frame_id, source, nfl_id, player_role,
        player_side, x, y, velocity_x, velocity_y, acc_x and acc_y.
        Any index is accepted.

    Returns
    -------
    pd.DataFrame
        ``df`` (left-merged, fresh RangeIndex) plus the new columns and the
        index artefact column(s) produced by ``reset_index()`` (e.g. ``level_4``).

    Raises
    ------
    IndexError
        If a group contains no 'Targeted Receiver' row.
    """
    frame_keys = ['game_id', 'play_id', 'frame_id', 'source']
    merge_keys = frame_keys + ['nfl_id']

    def closest(group: pd.DataFrame):
        # The targeted receiver of this frame (first match is used).
        off = group[group['player_role'] == 'Targeted Receiver'].iloc[0]

        # all defenders
        defs = group[group['player_side'] == 'Defense']

        if defs.empty:
            # Sentinel row: no defender in this frame.
            return pd.DataFrame({
                'nfl_id': off.nfl_id,
                'closest_def_id': [-1],
                'closest_def_velo_x': [0],
                'closest_def_velo_y': [0],
                'closest_def_acc_x': [0],
                'closest_def_acc_y': [0]
            })

        # Get velocities and acceleration
        closest_def_velo_x, closest_def_velo_y = defs['velocity_x'], defs['velocity_y']
        closest_def_acc_x, closest_def_acc_y = defs['acc_x'], defs['acc_y']

        # compute Euclidean distances
        closest_def_dist = np.sqrt((defs['x'] - off['x'])**2 + (defs['y'] - off['y'])**2)

        # index of defender with min distance
        idxmin = closest_def_dist.idxmin()

        return pd.DataFrame([{
            'nfl_id': off.nfl_id,
            'closest_def_id': defs.loc[idxmin, 'nfl_id'],
            'closest_def_velo_x': closest_def_velo_x.loc[idxmin],
            'closest_def_velo_y': closest_def_velo_y.loc[idxmin],
            'closest_def_acc_x': closest_def_acc_x.loc[idxmin],
            'closest_def_acc_y': closest_def_acc_y.loc[idxmin]
        }])

    grouped = df.groupby(frame_keys)
    # `progress_apply` only exists once `tqdm.pandas()` has been called; fall back to
    # the plain `apply` so the function does not depend on that notebook state.
    apply_per_frame = getattr(grouped, 'progress_apply', grouped.apply)
    result = apply_per_frame(closest).reset_index()

    # Merge back to original dataframe
    merged = df.merge(
        result,
        on=merge_keys,
        how='left',
        suffixes=('', '_y')
    )

    # Identify offense rows. The mask is taken from `merged` (not `df`) because
    # `where` aligns on index labels and `merged` always carries a fresh RangeIndex.
    mask = merged['player_side'] == 'Offense'

    # Keep merged values only for offense rows
    for col in result.columns:
        if col not in merge_keys:
            merged[col] = merged[col].where(mask)

    return merged

In [21]:
def get_receiver_bearing(df: pd.DataFrame):
    """Attach, to each defender row, the distance and direction to the targeted receiver.

    Defender rows (``player_side == 'Defense'``) are paired with the
    'Targeted Receiver' row of the same (game_id, play_id, frame_id, source).
    The result gains ``receiver_dist`` (Euclidean distance) and
    ``receiver_dir_x`` / ``receiver_dir_y`` (cosine / sine of the angle from
    defender to receiver). Non-defender rows get NaN in these columns.

    Returns a new, left-merged frame; ``df`` itself is not modified.
    """
    frame_keys = ['game_id', 'play_id', 'frame_id', 'source']

    # Receiver position per frame.
    is_receiver = df['player_role'] == 'Targeted Receiver'
    receivers = df.loc[is_receiver, frame_keys + ['player_side', 'nfl_id', 'x', 'y']]

    # Pair every defender with the receiver of the same frame.
    defenders = df[df['player_side'] == 'Defense']
    paired = defenders.merge(receivers, on=frame_keys, how='left', suffixes=('', '_off'))

    # Vector from defender to receiver.
    dx = paired['x_off'] - paired['x']
    dy = paired['y_off'] - paired['y']
    bearing_rad = np.radians(np.degrees(np.arctan2(dy, dx)) % 360)

    paired['receiver_dist'] = np.sqrt(
        (paired['x'] - paired['x_off'])**2 +
        (paired['y'] - paired['y_off'])**2
    )

    per_defender = paired[frame_keys + ['nfl_id', 'receiver_dist']].copy()
    per_defender['receiver_dir_x'] = np.cos(bearing_rad)
    per_defender['receiver_dir_y'] = np.sin(bearing_rad)

    return df.merge(
        per_defender,
        on=frame_keys + ['nfl_id'],
        how='left',
        suffixes=('', '_y')
    )

In [22]:
def get_receiver_kinematics(df: pd.DataFrame):
    """Attach, to each defender row, the targeted receiver's velocity and acceleration.

    Defender rows (``player_side == 'Defense'``) are paired with the
    'Targeted Receiver' row of the same (game_id, play_id, frame_id, source);
    the receiver's ``velocity_x/y`` and ``acc_x/y`` are added as
    ``velocity_x_off``, ``velocity_y_off``, ``acc_x_off`` and ``acc_y_off``.
    Non-defender rows get NaN in these columns.

    Returns a new, left-merged frame; ``df`` itself is not modified.
    """
    frame_keys = ['game_id', 'play_id', 'frame_id', 'source']
    motion_cols = ['velocity_x', 'velocity_y', 'acc_x', 'acc_y']

    # Receiver motion per frame.
    is_receiver = df['player_role'] == 'Targeted Receiver'
    receivers = df.loc[is_receiver, frame_keys + ['player_side', 'nfl_id'] + motion_cols]

    # Pair every defender with the receiver of the same frame.
    defenders = df[df['player_side'] == 'Defense']
    paired = defenders.merge(receivers, on=frame_keys, how='left', suffixes=('', '_off'))

    receiver_motion_cols = [f'{name}_off' for name in motion_cols]
    per_defender = paired[frame_keys + ['nfl_id'] + receiver_motion_cols]

    return df.merge(
        per_defender,
        on=frame_keys + ['nfl_id'],
        how='left',
        suffixes=('', '_y')
    )

In [23]:
def calculate_throw_stats(df):
    """Build one row of throw distance / throw velocity per (game_id, play_id).

    Uses the last row (in the frame's current order) whose ``player_position``
    is 'QB' for each play:
      * ``throw_distance`` - Euclidean distance from that row's (x, y) to
        (ball_land_x, ball_land_y).
      * ``throw_velocity`` - ``throw_distance / (num_frames_output / 10)``.

    NOTE: plays without any QB row are absent from the result, so a left merge
    onto the feature frame leaves their throw stats as NA (filled with 0 at the
    end of the feature engineering section).

    Returns a frame with columns game_id, play_id, throw_distance, throw_velocity.
    """
    # Last QB row of every play.
    qb_last = (
        df[df['player_position'] == 'QB']
        .drop_duplicates(subset=['game_id', 'play_id'], keep='last')
    )

    # Offset from the QB to the landing spot.
    dx = qb_last['ball_land_x'] - qb_last['x']
    dy = qb_last['ball_land_y'] - qb_last['y']
    throw_distance = np.sqrt(dx**2 + dy**2)

    # presumably converts the number of output frames to seconds at 10 frames/s - confirm
    flight_time = qb_last['num_frames_output'] / 10

    return qb_last[['game_id', 'play_id']].assign(
        throw_distance=throw_distance,
        throw_velocity=throw_distance / flight_time,
    )

Apply the above functions

In [24]:
throw_stats = calculate_throw_stats(std_df)

In [25]:
# Train only on players flagged for prediction, since they are the only ones with rows in the `output` dataset.
std_df_subset = std_df[std_df['player_to_predict'] == True].reset_index(drop=True)

In [26]:
eng_df = std_df_subset.copy()

In [27]:
eng_df = vectorize_kinematics(eng_df)

In [28]:
eng_df = calculate_field_distances(eng_df)

In [29]:
eng_df = calculate_land_spot_bearing(eng_df)

In [30]:
# `level_4` is presumably the unnamed inner index level that `reset_index()` turns into a column
# after the groupby-apply inside the helper. NOTE(review): the origin of the `0` column is not
# visible in this notebook; `errors='ignore'` makes the drop a no-op when it is absent.
eng_df = get_closest_defender_bearing(eng_df).drop(['level_4', 0], axis=1, errors='ignore')  # Where does this 0 column come from???

  0%|          | 0/446176 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [31]:
eng_df.shape

(1496046, 41)

In [32]:
# `level_4` is presumably the unnamed inner index level that `reset_index()` turns into a column
# after the groupby-apply inside the helper. NOTE(review): the origin of the `0` column is not
# visible in this notebook; `errors='ignore'` makes the drop a no-op when it is absent.
eng_df = get_closest_defender_kinematics(eng_df).drop(['level_4', 0], axis=1, errors='ignore')  # Where does this 0 column come from???

  0%|          | 0/446176 [00:00<?, ?it/s]

In [33]:
eng_df.shape

(1496046, 46)

In [34]:
eng_df = get_receiver_bearing(eng_df)

In [35]:
eng_df = get_receiver_kinematics(eng_df)

In [36]:
eng_df = eng_df.merge(throw_stats, on=['game_id', 'play_id'], how='left')

Quick inspect

In [37]:
eng_df.head()

Unnamed: 0,game_id,play_id,player_to_predict,nfl_id,frame_id,play_direction,absolute_yardline_number,player_name,player_height,player_weight,player_birth_date,player_position,player_side,player_role,x,y,s,a,dir,o,num_frames_output,ball_land_x,ball_land_y,week,source,x_shift,y_shift,velocity_x,velocity_y,acc_x,acc_y,dist_sideline,dist_endzone,land_spot_dir,land_spot_dirx,land_spot_diry,land_spot_dist,closest_def_id,closest_def_dist,closest_def_dir_x,closest_def_dir_y,closest_def_id_y,closest_def_velo_x,closest_def_velo_y,closest_def_acc_x,closest_def_acc_y,receiver_dist,receiver_dir_x,receiver_dir_y,velocity_x_off,velocity_y_off,acc_x_off,acc_y_off,throw_distance,throw_velocity
0,2023090700,101,1.0,44930,1,right,42.0,Josh Reynolds,6-3,196.0,1995-02-16,WR,Offense,Targeted Receiver,41.03,12.17,0.0,0.0,156.35,80.97,21.0,63.259998,-0.22,w01,input,,,,,,,41.13,78.97,330.866673,0.873489,-0.486844,25.449655,52546.0,3.423814,0.990124,-0.140195,52546.0,,,,,,,,,,,,41.08852,19.565962
1,2023090700,101,1.0,44930,2,right,42.0,Josh Reynolds,6-3,196.0,1995-02-16,WR,Offense,Targeted Receiver,41.03,12.17,0.0,0.0,119.09,82.26,21.0,63.259998,-0.22,w01,input,41.03,12.17,0.0,0.0,,,41.13,78.97,330.866673,0.873489,-0.486844,25.449655,52546.0,3.511595,0.991003,-0.133842,52546.0,0.9,0.1,,,,,,,,,,41.08852,19.565962
2,2023090700,101,1.0,44930,3,right,42.0,Josh Reynolds,6-3,196.0,1995-02-16,WR,Offense,Targeted Receiver,41.05,12.18,0.02,0.47,65.03,83.33,21.0,63.259998,-0.22,w01,input,41.03,12.17,0.2,0.1,2.0,1.0,41.12,78.95,330.825066,0.873135,-0.487478,25.43706,52546.0,3.571064,0.991301,-0.131613,52546.0,0.8,0.1,-1.0,1.776357e-13,,,,,,,,41.08852,19.565962
3,2023090700,101,1.0,44930,4,right,42.0,Josh Reynolds,6-3,196.0,1995-02-16,WR,Offense,Targeted Receiver,41.07,12.2,0.18,1.54,56.06,84.29,21.0,63.259998,-0.22,w01,input,41.05,12.18,0.2,0.2,7.105427e-13,1.0,41.1,78.93,330.763753,0.872613,-0.488412,25.429361,52546.0,3.620635,0.991539,-0.129811,52546.0,0.7,0.2,-1.0,1.0,,,,,,,,41.08852,19.565962
4,2023090700,101,1.0,44930,5,right,42.0,Josh Reynolds,6-3,196.0,1995-02-16,WR,Offense,Targeted Receiver,41.11,12.22,0.57,3.09,59.41,88.21,21.0,63.259998,-0.22,w01,input,41.07,12.2,0.4,0.2,2.0,1.776357e-13,41.08,78.89,330.68033,0.871901,-0.489682,25.404252,52546.0,3.65903,0.992066,-0.125716,52546.0,0.8,0.3,1.0,1.0,,,,,,,,41.08852,19.565962


In [38]:
# Check if there are NAs in Offense features related to closest defender
eng_df[(eng_df.player_side == 'Offense') & (eng_df[['closest_def_id']].isnull().any(axis=1))].shape

(0, 55)

In [39]:
# Check if there are NAs in Defense features related to receiver
eng_df[(eng_df.player_side == 'Defense') & (eng_df[['receiver_dist']].isnull().any(axis=1))].shape

(0, 55)

In [40]:
# Check if any of throw stats is NA
eng_df[eng_df[['throw_distance']].isnull().any(axis=1)].shape

(934, 55)

Fill NAs. Some plays have no QB row, so their throw stats are missing; we fill these NAs with 0 so that training doesn't crash. In initial experiments on smaller training sets, the model still produced reasonably good predictions with NAs filled as 0.

In [41]:
# Replaces every remaining NaN in every column with 0, in place. Besides the missing throw stats this
# also covers the first-frame velocity/acceleration NaNs and the role-specific feature columns that
# are only populated for one side (closest_def_* / receiver_* / *_off), as seen in the preview above.
eng_df.fillna(0, inplace=True)

Checkpoint after calculation of features

In [6]:
# Checkpoint toggle: uncomment the dump (and comment the load) to save the session after the
# feature cells have actually been run; as committed, this cell only restores a saved session.
# dill.dump_session('training_saved_session/after_calculating_features.db')
# NOTE(review): loading restores variables from the saved session over the current ones, so a fresh
# "Run All" depends on this file existing. Only load session files you created yourself:
# dill/pickle files can execute arbitrary code on load.
dill.load_session('training_saved_session/after_calculating_features.db')

Clear some memory

In [7]:
# Drop large intermediates that training no longer needs.
# Each `del` raises NameError if the name is not defined in the current (restored) session.
del std_df_subset
del input_2023
del output_2023

## Training

In [8]:
def reduce_df(df: pd.DataFrame):
    """Keep only the last input frame and all output frames of ``df``.

    The retained input row(s), i.e. those whose ``frame_id`` equals the
    maximum ``frame_id`` among input rows, get ``frame_id`` 0. Output rows are
    returned unchanged. A filtered frame is returned.
    """
    from_input = df.source == 'input'
    from_output = df.source == 'output'

    # Highest frame number among the input rows (NaN when there are none).
    last_input_frame = df[from_input].frame_id.max()

    df = df[(from_input & (df.frame_id == last_input_frame)) | from_output]

    # Mark the retained input row as frame 0.
    df.loc[df.source == 'input', 'frame_id'] = 0

    return df

In [9]:
def append_values_to_predict(df: pd.DataFrame):
    """Add next-row motion values as prediction targets.

    For each (game_id, play_id, nfl_id) group, the following row's
    ``velocity_x``, ``velocity_y``, ``acc_x`` and ``acc_y`` are stored in
    ``velocity_x_shift``, ``velocity_y_shift``, ``acceleration_x_shift`` and
    ``acceleration_y_shift``. The last row of every group gets NaN.

    The frame is modified in place and also returned.
    """
    player_keys = ['game_id', 'play_id', 'nfl_id']

    # target column -> motion column it looks ahead on
    lookahead = {
        'velocity_x_shift': 'velocity_x',
        'velocity_y_shift': 'velocity_y',
        'acceleration_x_shift': 'acc_x',
        'acceleration_y_shift': 'acc_y',
    }

    for target_col, motion_col in lookahead.items():
        df[target_col] = df.groupby(player_keys)[motion_col].shift(-1)

    return df

In [10]:
def fit_and_scale_features(X: pd.DataFrame, y: pd.DataFrame):
    """Fit two StandardScalers on ``X`` and return the scaled features and targets.

    The motion columns (velocity_x, velocity_y, acc_x, acc_y) get their own
    scaler; all remaining columns of ``X`` share a second one. ``y`` is
    renamed to the motion column names and transformed with the *motion*
    scaler, so targets and motion features live on the same scale.

    Returns a dict with keys ``scaler_motion``, ``scaler_others``,
    ``X_scaled`` (motion columns first, then the others) and ``y_scaled``.
    The scaled frames carry a fresh RangeIndex.
    """
    motion_vars = ['velocity_x', 'velocity_y', 'acc_x', 'acc_y']
    other_vars = [name for name in X.columns if name not in motion_vars]

    # Targets take the motion feature names so the fitted motion scaler accepts them.
    target_to_motion = {
        'velocity_x_shift': 'velocity_x',
        'velocity_y_shift': 'velocity_y',
        'acceleration_x_shift': 'acc_x',
        'acceleration_y_shift': 'acc_y'
    }
    y = y.rename(columns=target_to_motion)

    scaler_motion, scaler_others = StandardScaler(), StandardScaler()

    # Fit on the features, then reuse the motion scaler for the targets.
    motion_part = pd.DataFrame(scaler_motion.fit_transform(X[motion_vars]), columns=motion_vars)
    others_part = pd.DataFrame(scaler_others.fit_transform(X[other_vars]), columns=other_vars)
    y_scaled = pd.DataFrame(scaler_motion.transform(y), columns=y.columns)

    return {
        'scaler_motion': scaler_motion,
        'scaler_others': scaler_others,
        'X_scaled': pd.concat([motion_part, others_part], axis=1),
        'y_scaled': y_scaled
    }

In [11]:
def select_xy_features(df: pd.DataFrame, off_or_def: str):
    """Split ``df`` into the feature frame and the target frame for one model.

    ``off_or_def == 'offense'`` selects the shared features plus the
    closest-defender features; any other value selects the shared features
    plus the receiver-relative features.

    Returns ``(X, y)`` where ``y`` holds the four ``*_shift`` target columns.
    """
    shared = [
        'velocity_x', 'velocity_y',
        'acc_x', 'acc_y',
        'land_spot_dirx', 'land_spot_diry', 'land_spot_dist',
        'dist_sideline', 'dist_endzone',
        'throw_distance', 'throw_velocity',
    ]

    # Role-specific extras: offense looks at its closest defender,
    # everything else looks at the targeted receiver.
    if off_or_def == 'offense':
        role_specific = [
            'closest_def_dist', 'closest_def_dir_x', 'closest_def_dir_y',
            'closest_def_velo_x', 'closest_def_velo_y',
            'closest_def_acc_x', 'closest_def_acc_y'
        ]
    else:
        role_specific = [
            'receiver_dist', 'receiver_dir_x', 'receiver_dir_y',
            'velocity_x_off', 'velocity_y_off',
            'acc_x_off', 'acc_y_off'
        ]

    targets = ['velocity_x_shift', 'velocity_y_shift', 'acceleration_x_shift', 'acceleration_y_shift']

    return df[shared + role_specific], df[targets]

In [12]:
def fit_model(X_train_scaled, y_train_scaled, add_params: dict):
    """Fit one stacked regressor per target column and return the fitted model.

    Each target gets a ``StackingRegressor`` that combines a
    ``LinearRegression`` and an ``LGBMRegressor`` through a ``Ridge`` final
    estimator; ``MultiOutputRegressor`` replicates that stack per output.

    ``add_params`` is merged over the default LightGBM parameters (entries in
    ``add_params`` win) and the resulting parameter dict is printed.
    """
    default_params = {"n_estimators": 1000, "learning_rate": 0.05, "random_state": 123, "num_threads": 8, "verbosity": -1}
    lgbm_params = {**default_params, **add_params}

    print(f'Full LGBM params: {lgbm_params}')

    # NOTE: the first base learner is registered under the name 'rf'
    # although it is a LinearRegression.
    linear_learner = ('rf', LinearRegression())
    boosted_learner = ('lgbm', LGBMRegressor(**lgbm_params))

    per_target_stack = StackingRegressor(
        estimators=[linear_learner, boosted_learner],
        final_estimator=Ridge(alpha=1.0)
    )

    model = MultiOutputRegressor(per_target_stack)
    model.fit(X_train_scaled, y_train_scaled)

    return model

In [49]:
def data_prep_for_training(df: pd.DataFrame, off_or_def='Offense'):
    """Turn the engineered frame into scaled training features and targets for one side.

    Steps: reduce every player's rows to the last input frame plus the output
    frames, append the next-row motion targets, drop rows without a target,
    keep rows whose ``player_side`` equals ``off_or_def``, select that side's
    features and scale them.

    Returns ``(X_train_scaled, y_train_scaled, scaling)`` where ``scaling`` is
    the dict produced by ``fit_and_scale_features`` (it holds the fitted scalers).
    """
    player_keys = ['game_id', 'play_id', 'nfl_id']
    target_cols = ['velocity_x_shift', 'velocity_y_shift', 'acceleration_x_shift', 'acceleration_y_shift']

    # Last input frame + all output frames, per player and play.
    reduced = df.groupby(player_keys).apply(reduce_df).reset_index(drop=True)

    # Next-row motion values become the targets.
    with_targets = append_values_to_predict(reduced)

    # The last frame of every player has no "next" row, so its targets are NA; omit it.
    with_targets = with_targets.dropna(axis=0, how='any', subset=target_cols)

    # Rows of the requested side only.
    side_rows = with_targets[with_targets.player_side == off_or_def].reset_index(drop=True)

    # Features for the matching model, then scaling.
    X_train, y_train = select_xy_features(side_rows, off_or_def.lower())
    scaling = fit_and_scale_features(X_train, y_train)

    return scaling['X_scaled'], scaling['y_scaled'], scaling

In [60]:
def calculate_training_score(X, y, model, scaler_motion):
    """Return the mean squared error of ``model`` on ``X`` in unscaled units.

    Predictions and ``y`` are both passed through
    ``scaler_motion.inverse_transform`` before the squared error is averaged
    with ``np.mean`` over all of its entries.
    """
    # Undo the target scaling on both sides before comparing.
    predicted = scaler_motion.inverse_transform(model.predict(X))
    actual = scaler_motion.inverse_transform(y)

    squared_error = (predicted - actual)**2

    return np.mean(squared_error)

#### Offense training

In [50]:
X_off_train_scaled, y_off_train_scaled, off_scaling = data_prep_for_training(eng_df, 'Offense')

Check for NAs

In [51]:
X_off_train_scaled[X_off_train_scaled.isnull().any(axis=1)].head()

Unnamed: 0,velocity_x,velocity_y,acc_x,acc_y,land_spot_dirx,land_spot_diry,land_spot_dist,dist_sideline,dist_endzone,throw_distance,throw_velocity,closest_def_dist,closest_def_dir_x,closest_def_dir_y,closest_def_velo_x,closest_def_velo_y,closest_def_acc_x,closest_def_acc_y


In [52]:
y_off_train_scaled[y_off_train_scaled.isnull().any(axis=1)].head()

Unnamed: 0,velocity_x,velocity_y,acc_x,acc_y


Train

In [53]:
off_add_params = {'num_leaves': 15, 'min_data_in_leaf': 20, 'max_depth': 4}

In [54]:
off_model = fit_model(X_off_train_scaled, y_off_train_scaled, off_add_params)

Full LGBM params: {'n_estimators': 1000, 'learning_rate': 0.05, 'random_state': 123, 'num_threads': 8, 'verbosity': -1, 'num_leaves': 15, 'min_data_in_leaf': 20, 'max_depth': 4}


Check training score

In [61]:
calculate_training_score(
    X_off_train_scaled, 
    y_off_train_scaled, 
    off_model, 
    off_scaling['scaler_motion']
)

1.1128130507046416

Create session checkpoint and save important variables

In [62]:
dill.dump_session('training_saved_session/after_offense_training.db')

In [63]:
with open('training_saved_session/off_scaling.pkl', 'wb') as f:
    pickle.dump(off_scaling, f)

In [64]:
with open('training_saved_session/off_model.pkl', 'wb') as f:
    pickle.dump(off_model, f)

#### Defense training

In [65]:
X_def_train_scaled, y_def_train_scaled, def_scaling = data_prep_for_training(eng_df , 'Defense')

Check for NAs

In [66]:
X_def_train_scaled[X_def_train_scaled.isnull().any(axis=1)].head()

Unnamed: 0,velocity_x,velocity_y,acc_x,acc_y,land_spot_dirx,land_spot_diry,land_spot_dist,dist_sideline,dist_endzone,throw_distance,throw_velocity,receiver_dist,receiver_dir_x,receiver_dir_y,velocity_x_off,velocity_y_off,acc_x_off,acc_y_off


In [67]:
y_def_train_scaled[y_def_train_scaled.isnull().any(axis=1)].head()

Unnamed: 0,velocity_x,velocity_y,acc_x,acc_y


Train

In [68]:
def_add_params = {'num_leaves': 15, 'min_data_in_leaf': 30, 'max_depth': -1}

In [69]:
def_model = fit_model(X_def_train_scaled, y_def_train_scaled, def_add_params)

Full LGBM params: {'n_estimators': 1000, 'learning_rate': 0.05, 'random_state': 123, 'num_threads': 8, 'verbosity': -1, 'num_leaves': 15, 'min_data_in_leaf': 30, 'max_depth': -1}


Check training score

In [70]:
calculate_training_score(
    X_def_train_scaled, 
    y_def_train_scaled, 
    def_model, 
    def_scaling['scaler_motion']
)

1.2819261220640519

Create session checkpoint and save important variables

In [71]:
dill.dump_session('training_saved_session/after_defense_training.db')

In [72]:
with open('training_saved_session/def_scaling.pkl', 'wb') as f:
    pickle.dump(def_scaling, f)

In [73]:
with open('training_saved_session/def_model.pkl', 'wb') as f:
    pickle.dump(def_model, f)