In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import random
import os

In [4]:
DATA_PATH = "data/"

In [5]:
fp_gc = "GCC Offense.xlsx"
df_gc = pd.read_excel(os.path.join(DATA_PATH,fp_gc))
df_gc.head()

Unnamed: 0,PLAY #,ODK,QTR,TIME TO HALF,DN,DIST,YARD LN,HASH,PLAY TYPE,RESULT,...,STRENGTH,TARGET,B/S ROUTE,DEEP SHOT,PROTECTION,RECEIVER_Jersey,RECEIVER_Name,WR ALIGNMENT,ROUTE,COMMENT
0,14,O,1,24:01,0,10,-27,M,Run,Rush,...,STRONG,,,L,,,,,,
1,15,O,1,23:27,2,4,-33,M,Run,Rush,...,STRONG,,,L,,,,,,
2,16,O,1,22:53,1,10,-45,L,Run,Rush,...,,,,L,,,,,,
3,17,O,1,22:19,1,10,45,L,Pass,Complete,...,,WR,OUT,L,,,,1 STR,STAB,
4,18,O,1,21:45,1,10,33,L,Run,Rush,...,,,,L,,,,,,


In [3]:
df_gc.columns

Index(['PLAY #', 'ODK', 'QTR', 'TIME TO HALF', 'DN', 'DIST', 'YARD LN', 'HASH',
       'PLAY TYPE', 'RESULT', 'OWN SCORE', 'OPP SCORE', 'GN/LS', 'INC10',
       '2 MIN', 'PERSONNEL', 'FSL', 'SHIFT FROM', 'MOTION', 'OFF FORM',
       'VARIATION', 'BACKFIELD SET', 'TREE', 'OFF PLAY', 'TAG', 'STRENGTH',
       'TARGET', 'B/S ROUTE', 'DEEP SHOT', 'PROTECTION', 'RECEIVER_Jersey',
       'RECEIVER_Name', 'WR ALIGNMENT', 'ROUTE', 'COMMENT'],
      dtype='object')

In [4]:
df_gc_trim = df_gc.loc[df_gc['RESULT'] != 'Timeout']
df_gc_trim = df_gc.loc[df_gc['RESULT'] != 'Penalty']
df_gc_trim = df_gc[~df_gc['OFF FORM'].isna()]
df_gc_trim = df_gc[~df_gc['PERSONNEL'].isna()]
df_gc_trim = df_gc_trim.copy()
df_gc_trim.head(5)

Unnamed: 0,PLAY #,ODK,QTR,TIME TO HALF,DN,DIST,YARD LN,HASH,PLAY TYPE,RESULT,...,STRENGTH,TARGET,B/S ROUTE,DEEP SHOT,PROTECTION,RECEIVER_Jersey,RECEIVER_Name,WR ALIGNMENT,ROUTE,COMMENT
0,14,O,1,24:01,0,10,-27,M,Run,Rush,...,STRONG,,,L,,,,,,
1,15,O,1,23:27,2,4,-33,M,Run,Rush,...,STRONG,,,L,,,,,,
2,16,O,1,22:53,1,10,-45,L,Run,Rush,...,,,,L,,,,,,
3,17,O,1,22:19,1,10,45,L,Pass,Complete,...,,WR,OUT,L,,,,1 STR,STAB,
4,18,O,1,21:45,1,10,33,L,Run,Rush,...,,,,L,,,,,,


In [5]:
def df_trim(df):
    df_output = df.loc[df['RESULT'] != 'Timeout']
    df_output = df.loc[df['RESULT'] != 'Penalty']
    df_output = df[~df['OFF FORM'].isna()]
    df_output = df[~df['PERSONNEL'].isna()]
    df_output = df.copy()
    return df_output

In [1]:
def create_lags(df):
    df['GN/LS LAG 1'] = df['GN/LS'].shift(1)
    df['GN/LS LAG 2'] = df['GN/LS'].shift(2)

    df['PLAY# LAG 1'] = df['PLAY#'].shift(1)
    df['PLAY# LAG 2'] = df['PLAY#'].shift(2)

    df['PLAY TYPE LAG 1'] = df['PLAY TYPE'].shift(1)
    df['PLAY TYPE LAG 2'] = df['PLAY TYPE'].shift(2)

    df.loc[df['PLAY #'] - df['PLAY# LAG 1'] != 1, 'GN/LS LAG 1'] = 0
    df.loc[df['PLAY #'] - df['PLAY# LAG 2'] != 2, 'GN/LS LAG 2'] = 0

    df['OFF FORM LAG 1'] = df['OFF FORM'].shift(1)
    df['OFF FORM LAG 2'] = df['OFF FORM'].shift(2)

    df.loc[df['PLAY #'] - df['PLAY# LAG 1'] != 1, 'OFF FORM LAG 1'] = 'NONE'
    df.loc[df['PLAY #'] - df['PLAY# LAG 2'] != 2, 'OFF FORM LAG 2'] = 'NONE'

    df['TIME TO HALF'] = (df['TIME TO HALF'].astype(str).str.split(':').apply(lambda x: int(x[0]) * 60 + int(x[1])))

    df['SCORE DIFF'] = df['OPP SCORE'] - df['OWN SCORE']

    df['HASH'] = df['HASH'].astype(str).str.strip()

    df['MID OR NOT'] = df['HASH'] == 'M'
    df['HASH OR NOT'] = df['HASH'] != 'M'

    df['0-2'] = df['DIST'] <= 2
    df['2-6'] = (df['DIST'] > 2) & (df['DIST'] <= 6)
    df['6+'] = df['DIST'] > 6

    df['PERSONNEL!'] = df['PERSONNEL'].astype(str).str.strip()
    df = pd.get_dummies(df, columns=['PERSONNEL'])

    df = df.copy()
    df['PLAY OF DRIVE NUM'] = -1
    for i in range(len(df)):
        if (df.iloc[i]['DN'] == 0):
            df.iloc[i, df.columns.get_loc('PLAY OF DRIVE NUM')] = 0
        else:
            df.iloc[i, df.columns.get_loc('PLAY OF DRIVE NUM')] = df.iloc[(i - 1), df.columns.get_loc('PLAY OF DRIVE NUM')] + 1


    df['PERSONNEL_10 LAG 1'] = df['PERSONNEL_10'].shift(1)
    df['PERSONNEL_10 LAG 2'] = df['PERSONNEL_10'].shift(2)

    df['PERSONNEL_11 LAG 1'] = df['PERSONNEL_11'].shift(1)
    df['PERSONNEL_11 LAG 2'] = df['PERSONNEL_11'].shift(2)

    df['PERSONNEL_12 LAG 1'] = df['PERSONNEL_12'].shift(1)
    df['PERSONNEL_12 LAG 2'] = df['PERSONNEL_12'].shift(2)

    df['PERSONNEL_12T LAG 1'] = df['PERSONNEL_12T'].shift(1)
    df['PERSONNEL_12T LAG 2'] = df['PERSONNEL_12T'].shift(2)

    df['PERSONNEL_11T LAG 1'] = df['PERSONNEL_11T'].shift(1)
    df['PERSONNEL_11T LAG 2'] = df['PERSONNEL_11T'].shift(2)


    df['2 MIN'] = df['2 MIN'].astype(str).str.strip()
    df['2 MIN OR NOT'] = -1
    df['2 MIN OR NOT'] = df['2 MIN'].apply(lambda x: 1 if x == "Y" else 0)



    df['OWN END'] = -1
    df['OWN END'] = df['YARD LN'].apply(lambda x: 1 if x < 0 else 0)



    df['OPP END'] = -1
    df['OPP END'] = df['YARD LN'].apply(lambda x: 1 if x >= 0 else 0)



    df['RED ZONE'] = -1
    df['RED ZONE'] = df['YARD LN'].apply(lambda x: 1 if (x <= 20 and x > 0) else 0)



    df['HALF #'] = -1
    df['HALF #'] = df['QTR'].apply(lambda x: 1 if x <= 2 else 2)


    df['TIME LEFT'] = -1
    df['TIME LEFT'] = df.apply(lambda row: row['TIME TO HALF'] if row['HALF #'] == 2 else (row['TIME TO HALF'] + 900), axis=1)



    df['PPS NEEDED'] = -1
    df['PPS NEEDED'] = (df['SCORE DIFF']) * -1/df['TIME LEFT']


    df['WINNING'] = -1
    df['WINNING'] = df['SCORE DIFF'].apply(lambda x: 1 if x < 0 else 0)


    df['DN X DIST'] = -1
    df['DN X DIST'] = df['DN'] * df['DIST']


    df['PREV PLAY PASS OR NOT'] = -1
    df['PREV PLAY PASS OR NOT'] = df['PLAY TYPE LAG 1'].apply(lambda x: 1 if x == "Pass" else 0)



    df['SCORE DIFF ^2'] = df['SCORE DIFF'] * df['SCORE DIFF']
    df['SCORE DIFF ^2'] = df['WINNING'].apply(lambda x: -x if x > 0 else x)

    df['SCORE DIFF x TIME LEFT'] = df['SCORE DIFF'] * df['TIME LEFT']

    df['SCORE DIFF x DN'] = df['SCORE DIFF'] * df['DN']

    df['SCORE DIFF / 7'] = df['SCORE DIFF'] / 7
    df['TIME LEFT * SCORE DIFF / 7'] = df['TIME LEFT'] * df['SCORE DIFF'] / 7

    df['SCORE DIFF x QTR'] = df['SCORE DIFF'] * df['QTR']

    df['YARDS TO TD'] = df['YARD LN'].apply(lambda x: x + 100 if x < 0 else x)
    df['YARDS TO TD * SCORE DIFF / 7'] = df['YARDS TO TD'] * df['SCORE DIFF'] / 7

In [7]:
df_gc_trim['GN/LS LAG 1'] = df_gc_trim['GN/LS'].shift(1)
df_gc_trim['GN/LS LAG 2'] = df_gc_trim['GN/LS'].shift(2)

df_gc_trim['PLAY# LAG 1'] = df_gc_trim['PLAY #'].shift(1)
df_gc_trim['PLAY# LAG 2'] = df_gc_trim['PLAY #'].shift(2)

df_gc_trim['PLAY TYPE LAG 1'] = df_gc_trim['PLAY TYPE'].shift(1)
df_gc_trim['PLAY TYPE LAG 2'] = df_gc_trim['PLAY TYPE'].shift(2)

df_gc_trim.loc[df_gc_trim['PLAY #'] - df_gc_trim['PLAY# LAG 1'] != 1, 'GN/LS LAG 1'] = 0
df_gc_trim.loc[df_gc_trim['PLAY #'] - df_gc_trim['PLAY# LAG 2'] != 2, 'GN/LS LAG 2'] = 0

df_gc_trim.drop(columns=['PLAY# LAG 1', 'PLAY# LAG 2'], inplace=True)

In [8]:
df_gc_trim['OFF FORM LAG 1'] = df_gc_trim['OFF FORM'].shift(1)
df_gc_trim['OFF FORM LAG 2'] = df_gc_trim['OFF FORM'].shift(2)

df_gc_trim['PLAY# LAG 1'] = df_gc_trim['PLAY #'].shift(1)
df_gc_trim['PLAY# LAG 2'] = df_gc_trim['PLAY #'].shift(2)

df_gc_trim.loc[df_gc_trim['PLAY #'] - df_gc_trim['PLAY# LAG 1'] != 1, 'OFF FORM LAG 1'] = 'NONE'
df_gc_trim.loc[df_gc_trim['PLAY #'] - df_gc_trim['PLAY# LAG 2'] != 2, 'OFF FORM LAG 2'] = 'NONE'

df_gc_trim.drop(columns=['PLAY# LAG 1', 'PLAY# LAG 2'], inplace=True)

In [9]:
df_gc_trim['TIME TO HALF'] = (
    df_gc_trim['TIME TO HALF']
    .astype(str)
    .str.split(':')
    .apply(lambda x: int(x[0]) * 60 + int(x[1]))
)


In [10]:
df_gc_trim['SCORE DIFF'] = df_gc_trim['OPP SCORE'] - df_gc_trim['OWN SCORE']

In [11]:
df_gc_trim['HASH'].unique()
print(repr(df_gc_trim['HASH'][1]))  # or any index you know should be 'M'
df_gc_trim['HASH'] = df_gc_trim['HASH'].astype(str).str.strip()

'M'


In [12]:
df_gc_trim['MID OR NOT'] = df_gc_trim['HASH'] == 'M'
df_gc_trim['HASH OR NOT'] = df_gc_trim['HASH'] != 'M'


In [13]:
df_gc_trim['0-2'] = df_gc_trim['DIST'] <= 2
df_gc_trim['2-6'] = (df_gc_trim['DIST'] > 2) & (df_gc_trim['DIST'] <= 6)
df_gc_trim['6+'] = df_gc_trim['DIST'] > 6
df_gc_trim.head()

Unnamed: 0,PLAY #,ODK,QTR,TIME TO HALF,DN,DIST,YARD LN,HASH,PLAY TYPE,RESULT,...,PLAY TYPE LAG 1,PLAY TYPE LAG 2,OFF FORM LAG 1,OFF FORM LAG 2,SCORE DIFF,MID OR NOT,HASH OR NOT,0-2,2-6,6+
0,14,O,1,1441,0,10,-27,M,Run,Rush,...,,,NONE,NONE,-7,True,False,False,False,True
1,15,O,1,1407,2,4,-33,M,Run,Rush,...,Run,,TWINS OPEN,NONE,-7,True,False,False,True,False
2,16,O,1,1373,1,10,-45,L,Run,Rush,...,Run,Run,TWINS OPEN,TWINS OPEN,-7,False,True,False,False,True
3,17,O,1,1339,1,10,45,L,Pass,Complete,...,Run,Run,DUTCH Y OFF,TWINS OPEN,-7,False,True,False,False,True
4,18,O,1,1305,1,10,33,L,Run,Rush,...,Pass,Run,TREY Y OFF,DUTCH Y OFF,-7,False,True,False,False,True


In [14]:
df_gc_trim.columns

Index(['PLAY #', 'ODK', 'QTR', 'TIME TO HALF', 'DN', 'DIST', 'YARD LN', 'HASH',
       'PLAY TYPE', 'RESULT', 'OWN SCORE', 'OPP SCORE', 'GN/LS', 'INC10',
       '2 MIN', 'PERSONNEL', 'FSL', 'SHIFT FROM', 'MOTION', 'OFF FORM',
       'VARIATION', 'BACKFIELD SET', 'TREE', 'OFF PLAY', 'TAG', 'STRENGTH',
       'TARGET', 'B/S ROUTE', 'DEEP SHOT', 'PROTECTION', 'RECEIVER_Jersey',
       'RECEIVER_Name', 'WR ALIGNMENT', 'ROUTE', 'COMMENT', 'GN/LS LAG 1',
       'GN/LS LAG 2', 'PLAY TYPE LAG 1', 'PLAY TYPE LAG 2', 'OFF FORM LAG 1',
       'OFF FORM LAG 2', 'SCORE DIFF', 'MID OR NOT', 'HASH OR NOT', '0-2',
       '2-6', '6+'],
      dtype='object')

In [15]:
df_gc_trim['PERSONNEL!'] = df_gc_trim['PERSONNEL'].astype(str).str.strip()
df_gc_trim = pd.get_dummies(df_gc_trim, columns=['PERSONNEL'])
df_gc_trim

Unnamed: 0,PLAY #,ODK,QTR,TIME TO HALF,DN,DIST,YARD LN,HASH,PLAY TYPE,RESULT,...,HASH OR NOT,0-2,2-6,6+,PERSONNEL!,PERSONNEL_10,PERSONNEL_11,PERSONNEL_12,PERSONNEL_11T,PERSONNEL_12T
0,14,O,1,1441,0,10,-27,M,Run,Rush,...,False,False,False,True,11,False,True,False,False,False
1,15,O,1,1407,2,4,-33,M,Run,Rush,...,False,False,True,False,11,False,True,False,False,False
2,16,O,1,1373,1,10,-45,L,Run,Rush,...,True,False,False,True,11,False,True,False,False,False
3,17,O,1,1339,1,10,45,L,Pass,Complete,...,True,False,False,True,11,False,True,False,False,False
4,18,O,1,1305,1,10,33,L,Run,Rush,...,True,False,False,True,11,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,137,O,4,548,2,2,9,R,Run,Rush,...,True,True,False,False,12T,False,False,False,False,True
192,139,O,4,488,3,3,10,R,Pass,Incomplete,...,True,False,True,False,11,False,True,False,False,False
193,151,O,4,217,0,10,-27,L,Run,Rush,...,True,False,False,True,12,False,False,True,False,False
194,152,O,4,206,2,9,-28,M,Run,Rush,...,False,False,False,True,12,False,False,True,False,False


In [16]:
df_gc_trim = df_gc_trim.copy()
df_gc_trim['PLAY OF DRIVE NUM'] = -1
for i in range(len(df_gc_trim)):
    if (df_gc_trim.iloc[i]['DN'] == 0):
        df_gc_trim.iloc[i, df_gc_trim.columns.get_loc('PLAY OF DRIVE NUM')] = 0
    else:
        df_gc_trim.iloc[i, df_gc_trim.columns.get_loc('PLAY OF DRIVE NUM')] = df_gc_trim.iloc[(i - 1), df_gc_trim.columns.get_loc('PLAY OF DRIVE NUM')] + 1

df_gc_trim.head(34)

Unnamed: 0,PLAY #,ODK,QTR,TIME TO HALF,DN,DIST,YARD LN,HASH,PLAY TYPE,RESULT,...,0-2,2-6,6+,PERSONNEL!,PERSONNEL_10,PERSONNEL_11,PERSONNEL_12,PERSONNEL_11T,PERSONNEL_12T,PLAY OF DRIVE NUM
0,14,O,1,1441,0,10,-27,M,Run,Rush,...,False,False,True,11,False,True,False,False,False,0
1,15,O,1,1407,2,4,-33,M,Run,Rush,...,False,True,False,11,False,True,False,False,False,1
2,16,O,1,1373,1,10,-45,L,Run,Rush,...,False,False,True,11,False,True,False,False,False,2
3,17,O,1,1339,1,10,45,L,Pass,Complete,...,False,False,True,11,False,True,False,False,False,3
4,18,O,1,1305,1,10,33,L,Run,Rush,...,False,False,True,11,False,True,False,False,False,4
5,19,O,1,1271,2,3,26,L,Run,Rush,...,False,True,False,12,False,False,True,False,False,5
6,20,O,1,1237,3,1,24,L,Run,Rush,...,True,False,False,12T,False,False,False,False,True,6
8,22,O,1,1169,2,5,8,M,Pass,"Complete, TD",...,False,True,False,12T,False,False,False,False,True,7
9,30,O,1,1046,0,10,-32,L,Pass,Complete,...,False,False,True,11,False,True,False,False,False,0
10,31,O,1,1017,2,2,-40,R,Run,Rush,...,True,False,False,11,False,True,False,False,False,1


In [17]:
df_gc_trim.columns

Index(['PLAY #', 'ODK', 'QTR', 'TIME TO HALF', 'DN', 'DIST', 'YARD LN', 'HASH',
       'PLAY TYPE', 'RESULT', 'OWN SCORE', 'OPP SCORE', 'GN/LS', 'INC10',
       '2 MIN', 'FSL', 'SHIFT FROM', 'MOTION', 'OFF FORM', 'VARIATION',
       'BACKFIELD SET', 'TREE', 'OFF PLAY', 'TAG', 'STRENGTH', 'TARGET',
       'B/S ROUTE', 'DEEP SHOT', 'PROTECTION', 'RECEIVER_Jersey',
       'RECEIVER_Name', 'WR ALIGNMENT', 'ROUTE', 'COMMENT', 'GN/LS LAG 1',
       'GN/LS LAG 2', 'PLAY TYPE LAG 1', 'PLAY TYPE LAG 2', 'OFF FORM LAG 1',
       'OFF FORM LAG 2', 'SCORE DIFF', 'MID OR NOT', 'HASH OR NOT', '0-2',
       '2-6', '6+', 'PERSONNEL!', 'PERSONNEL_10', 'PERSONNEL_11',
       'PERSONNEL_12', 'PERSONNEL_11T', 'PERSONNEL_12T', 'PLAY OF DRIVE NUM'],
      dtype='object')

In [18]:
df_gc_trim['2 MIN'] = df_gc_trim['2 MIN'].astype(str).str.strip()

df_gc_trim['2 MIN OR NOT'] = -1

df_gc_trim['2 MIN OR NOT'] = df_gc_trim['2 MIN'].apply(lambda x: 1 if x == "Y" else 0)

# df_gc_trim['2 MIN'].head(34)

In [19]:
df_gc_trim['OWN END'] = -1
df_gc_trim['OWN END'] = df_gc_trim['YARD LN'].apply(lambda x: 1 if x < 0 else 0)
# df_gc_trim.head()

In [20]:
df_gc_trim['OPP END'] = -1
df_gc_trim['OPP END'] = df_gc_trim['YARD LN'].apply(lambda x: 1 if x >= 0 else 0)
# df_gc_trim.head()

In [21]:
df_gc_trim['RED ZONE'] = -1
df_gc_trim['RED ZONE'] = df_gc_trim['YARD LN'].apply(lambda x: 1 if (x <= 20 and x > 0) else 0)
# df_gc_trim.head(20)

In [22]:
df_gc_trim['HALF #'] = -1
df_gc_trim['HALF #'] = df_gc_trim['QTR'].apply(lambda x: 1 if x <= 2 else 2)
# df_gc_trim.head(100)

In [23]:
df_gc_trim['TIME LEFT'] = -1
df_gc_trim['TIME LEFT'] = df_gc_trim.apply(lambda row: row['TIME TO HALF'] if row['HALF #'] == 2 else (row['TIME TO HALF'] + 900), axis=1)
# df_gc_trim.head()

In [24]:
df_gc_trim['PPS NEEDED'] = -1
df_gc_trim['PPS NEEDED'] = (df_gc_trim['SCORE DIFF']) * -1/df_gc_trim['TIME LEFT']
df_gc_trim.head()

Unnamed: 0,PLAY #,ODK,QTR,TIME TO HALF,DN,DIST,YARD LN,HASH,PLAY TYPE,RESULT,...,PERSONNEL_11T,PERSONNEL_12T,PLAY OF DRIVE NUM,2 MIN OR NOT,OWN END,OPP END,RED ZONE,HALF #,TIME LEFT,PPS NEEDED
0,14,O,1,1441,0,10,-27,M,Run,Rush,...,False,False,0,0,1,0,0,1,2341,0.00299
1,15,O,1,1407,2,4,-33,M,Run,Rush,...,False,False,1,0,1,0,0,1,2307,0.003034
2,16,O,1,1373,1,10,-45,L,Run,Rush,...,False,False,2,0,1,0,0,1,2273,0.00308
3,17,O,1,1339,1,10,45,L,Pass,Complete,...,False,False,3,0,0,1,0,1,2239,0.003126
4,18,O,1,1305,1,10,33,L,Run,Rush,...,False,False,4,0,0,1,0,1,2205,0.003175


In [25]:
df_gc_trim['WINNING'] = -1
df_gc_trim['WINNING'] = df_gc_trim['SCORE DIFF'].apply(lambda x: 1 if x < 0 else 0)
# df_gc_trim['SCORE DIFF'].head(100)


In [26]:
df_gc_trim['PERSONNEL_10 LAG 1'] = df_gc_trim['PERSONNEL_10'].shift(1)
df_gc_trim['PERSONNEL_10 LAG 2'] = df_gc_trim['PERSONNEL_10'].shift(2)

df_gc_trim['PERSONNEL_11 LAG 1'] = df_gc_trim['PERSONNEL_11'].shift(1)
df_gc_trim['PERSONNEL_11 LAG 2'] = df_gc_trim['PERSONNEL_11'].shift(2)

df_gc_trim['PERSONNEL_12 LAG 1'] = df_gc_trim['PERSONNEL_12'].shift(1)
df_gc_trim['PERSONNEL_12 LAG 2'] = df_gc_trim['PERSONNEL_12'].shift(2)

df_gc_trim['PERSONNEL_12T LAG 1'] = df_gc_trim['PERSONNEL_12T'].shift(1)
df_gc_trim['PERSONNEL_12T LAG 2'] = df_gc_trim['PERSONNEL_12T'].shift(2)

df_gc_trim['PERSONNEL_11T LAG 1'] = df_gc_trim['PERSONNEL_11T'].shift(1)
df_gc_trim['PERSONNEL_11T LAG 2'] = df_gc_trim['PERSONNEL_11T'].shift(2)

In [27]:
df_gc_trim['DN X DIST'] = -1
df_gc_trim['DN X DIST'] = df_gc_trim['DN'] * df_gc_trim['DIST']
# df_gc_trim

In [28]:
df_gc_trim['PREV PLAY PASS OR NOT'] = -1
df_gc_trim['PREV PLAY PASS OR NOT'] = df_gc_trim['PLAY TYPE LAG 1'].apply(lambda x: 1 if x == "Pass" else 0)
# df_gc_trim


In [29]:
df_gc_trim['SCORE DIFF ^2'] = df_gc_trim['SCORE DIFF'] * df_gc_trim['SCORE DIFF']
df_gc_trim['SCORE DIFF ^2'] = df_gc_trim['WINNING'].apply(lambda x: -x if x > 0 else x)

df_gc_trim['SCORE DIFF x TIME LEFT'] = df_gc_trim['SCORE DIFF'] * df_gc_trim['TIME LEFT']

df_gc_trim['SCORE DIFF x DN'] = df_gc_trim['SCORE DIFF'] * df_gc_trim['DN']

df_gc_trim['SCORE DIFF / 7'] = df_gc_trim['SCORE DIFF'] / 7
df_gc_trim['TIME LEFT * SCORE DIFF / 7'] = df_gc_trim['TIME LEFT'] * df_gc_trim['SCORE DIFF'] / 7

df_gc_trim['SCORE DIFF x QTR'] = df_gc_trim['SCORE DIFF'] * df_gc_trim['QTR']

df_gc_trim['YARDS TO TD'] = df_gc_trim['YARD LN'].apply(lambda x: x + 100 if x < 0 else x)
df_gc_trim['YARDS TO TD * SCORE DIFF / 7'] = df_gc_trim['YARDS TO TD'] * df_gc_trim['SCORE DIFF'] / 7

In [30]:
df_gc_trim.head(90)

Unnamed: 0,PLAY #,ODK,QTR,TIME TO HALF,DN,DIST,YARD LN,HASH,PLAY TYPE,RESULT,...,DN X DIST,PREV PLAY PASS OR NOT,SCORE DIFF ^2,SCORE DIFF x TIME LEFT,SCORE DIFF x DN,SCORE DIFF / 7,TIME LEFT * SCORE DIFF / 7,SCORE DIFF x QTR,YARDS TO TD,YARDS TO TD * SCORE DIFF / 7
0,14,O,1,1441,0,10,-27,M,Run,Rush,...,0,0,-1,-16387,0,-1.000000,-2341.000000,-7,73,-73.000000
1,15,O,1,1407,2,4,-33,M,Run,Rush,...,8,0,-1,-16149,-14,-1.000000,-2307.000000,-7,67,-67.000000
2,16,O,1,1373,1,10,-45,L,Run,Rush,...,10,0,-1,-15911,-7,-1.000000,-2273.000000,-7,55,-55.000000
3,17,O,1,1339,1,10,45,L,Pass,Complete,...,10,0,-1,-15673,-7,-1.000000,-2239.000000,-7,45,-45.000000
4,18,O,1,1305,1,10,33,L,Run,Rush,...,10,1,-1,-15435,-7,-1.000000,-2205.000000,-7,33,-33.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88,90,O,2,9,2,10,-25,M,Pass,Incomplete,...,20,1,0,3636,8,0.571429,519.428571,8,75,42.857143
90,102,O,3,1556,0,10,-13,M,Run,Rush,...,0,1,0,6224,0,0.571429,889.142857,12,87,49.714286
91,103,O,3,1516,1,10,-23,L,Run,Rush,...,10,0,0,6064,4,0.571429,866.285714,12,77,44.000000
92,104,O,3,1476,2,9,-24,L,Pass,Complete,...,18,0,0,5904,8,0.571429,843.428571,12,76,43.428571


In [31]:
def evaluate_feature_subset(features, X_train_full, y_train, X_test_full, y_test, model_type='top1'):
    selected_features = [f for i, f in enumerate(X_train_full.columns) if features[i] == 1]
    
    if not selected_features:
        return 0.0  # Avoid empty feature sets

    X_train = X_train_full[selected_features]
    X_test = X_test_full[selected_features]

    preprocessor = ColumnTransformer(
        transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), selected_features)]
    )

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=45))
    ])
    
    pipeline.fit(X_train, y_train)

    if model_type == 'top1':
        y_pred = pipeline.predict(X_test)
        return accuracy_score(y_test, y_pred)

    elif model_type == 'top2':
        y_proba = pipeline.predict_proba(X_test)
        classes = pipeline.named_steps['classifier'].classes_
        top2_idx = np.argsort(y_proba, axis=1)[:, -2:]
        top2_preds = np.array([[classes[i] for i in row] for row in top2_idx])
        correct_top2 = [y in preds for y, preds in zip(y_test, top2_preds)]
        return np.mean(correct_top2)

    else:
        raise ValueError("model_type must be 'top1' or 'top2'")

In [32]:
def genetic_algorithm(X_train, y_train, X_test, y_test, n_generations=30, pop_size=50, mutation_rate=0.15, model_type='top1'):
    n_features = X_train.shape[1]
    population = [np.random.randint(0, 2, size=n_features).tolist() for _ in range(pop_size)]

    for generation in range(n_generations):
        scores = [evaluate_feature_subset(ind, X_train, y_train, X_test, y_test, model_type) for ind in population]
        print(f"Generation {generation}: Best score = {max(scores):.4f}")

        # Select top 50%
        sorted_pop = [x for _, x in sorted(zip(scores, population), reverse=True)]
        parents = sorted_pop[:pop_size // 2]

        # Crossover
        offspring = []
        while len(offspring) < pop_size - len(parents):
            p1, p2 = random.sample(parents, 2)
            cut = random.randint(1, n_features - 1)
            child = p1[:cut] + p2[cut:]
            offspring.append(child)

        # Mutation
        for child in offspring:
            if random.random() < mutation_rate:
                idx = random.randint(0, n_features - 1)
                child[idx] = 1 - child[idx]

        population = parents + offspring

    # Return best feature subset
    final_scores = [evaluate_feature_subset(ind, X_train, y_train, X_test, y_test, model_type) for ind in population]
    best_idx = np.argmax(final_scores)
    best_features = [f for i, f in enumerate(X_train.columns) if population[best_idx][i] == 1]
    
    return best_features

In [33]:
# input_cols = ['QTR', 'TIME TO HALF', 'DN', 'DIST', 'YARD LN', 'HASH','OWN SCORE', 
#               'OPP SCORE', 'GN/LS LAG 1', 'GN/LS LAG 2', 'OFF FORM LAG 1', 'OFF FORM LAG 2', 
#               'SCORE DIFF', 'MID OR NOT', 'HASH OR NOT', 'PERSONNEL_10', 'PERSONNEL_11', 
#               'PERSONNEL_12', 'PERSONNEL_11T', 'PERSONNEL_12T', 'PLAY OF DRIVE NUM', 'OWN END', 
#               'RED ZONE', 'SCORE DIFF ^2', 'SCORE DIFF x DN', 'SCORE DIFF / 7', 
#               'TIME LEFT * SCORE DIFF / 7',  'YARDS TO TD']  

# target_col = 'OFF FORM' 


In [34]:
input_cols = ['QTR', 'TIME TO HALF', 'DN', 'DIST', 'YARD LN', 'HASH', 
              'OWN SCORE', 'OPP SCORE', 
              'GN/LS LAG 1', 'GN/LS LAG 2', 
              'OFF FORM LAG 1', 'OFF FORM LAG 2'
              , 'SCORE DIFF'
              , 'MID OR NOT'
              , 'HASH OR NOT'
              , 'PERSONNEL_10'
              , 'PERSONNEL_11'
              , 'PERSONNEL_12'
              , 'PERSONNEL_11T'
              , 'PERSONNEL_12T'
              , 'PLAY OF DRIVE NUM'
              , '0-2'
              , '2-6'
              , '6+'
              , '2 MIN OR NOT'
              , 'OWN END'
              , 'OPP END'
              , 'RED ZONE'
              , 'HALF #'
              , 'TIME LEFT'
              , 'WINNING'
              , 'PPS NEEDED'
              , 'PLAY TYPE LAG 1'
              , 'PLAY TYPE LAG 2'
              , 'PERSONNEL!'
              , 'PERSONNEL_10 LAG 1', 'PERSONNEL_10 LAG 2'
              , 'PERSONNEL_11 LAG 1', 'PERSONNEL_11 LAG 2'
              , 'PERSONNEL_11T LAG 1', 'PERSONNEL_11T LAG 2'
              , 'PERSONNEL_12 LAG 1', 'PERSONNEL_12 LAG 2'
              , 'PERSONNEL_12T LAG 1', 'PERSONNEL_12T LAG 2'
              , 'PREV PLAY PASS OR NOT'
              , 'DN X DIST'
              , 'SCORE DIFF ^2'
              , 'SCORE DIFF x TIME LEFT'
              , 'SCORE DIFF x DN'
              , 'SCORE DIFF / 7'
              , 'SCORE DIFF x QTR'
              , 'TIME LEFT * SCORE DIFF / 7'
              , 'YARDS TO TD'
              , 'YARDS TO TD * SCORE DIFF / 7'
              ]  
target_col = 'OFF FORM' 



In [35]:
def split_train(df, location):
    split_index = int(len(df) * location)
    train = df.iloc[:split_index]
    test = df.iloc[split_index:]
    return train, test

In [36]:
split_index = int(len(df_gc_trim) * 0.8)
train = df_gc_trim.iloc[:split_index]
test = df_gc_trim.iloc[split_index:]

X_train_full = train[input_cols]
X_test_full = test[input_cols]
y_train_full = train[target_col]
y_test_full = test[target_col]

# Run GA
best_features_top1 = genetic_algorithm(X_train_full, y_train_full, X_test_full, y_test_full, 
                                  model_type='top1')

print(best_features_top1)

Generation 0: Best score = 0.6579
Generation 1: Best score = 0.6579
Generation 2: Best score = 0.6579
Generation 3: Best score = 0.6579
Generation 4: Best score = 0.6579
Generation 5: Best score = 0.6579
Generation 6: Best score = 0.6579
Generation 7: Best score = 0.6842
Generation 8: Best score = 0.6842
Generation 9: Best score = 0.6842
Generation 10: Best score = 0.6842
Generation 11: Best score = 0.6842
Generation 12: Best score = 0.6842
Generation 13: Best score = 0.6842
Generation 14: Best score = 0.6842
Generation 15: Best score = 0.6842
Generation 16: Best score = 0.6842
Generation 17: Best score = 0.6842
Generation 18: Best score = 0.6842
Generation 19: Best score = 0.6842
Generation 20: Best score = 0.6842
Generation 21: Best score = 0.6842
Generation 22: Best score = 0.6842
Generation 23: Best score = 0.6842
Generation 24: Best score = 0.6842
Generation 25: Best score = 0.6842
Generation 26: Best score = 0.6842
Generation 27: Best score = 0.6842
Generation 28: Best score = 0.

In [37]:
best_features_top2 = genetic_algorithm(X_train_full, y_train_full, X_test_full, y_test_full, 
                                  model_type='top2')

print(best_features_top2)

Generation 0: Best score = 0.8421
Generation 1: Best score = 0.8421
Generation 2: Best score = 0.8684
Generation 3: Best score = 0.8684
Generation 4: Best score = 0.8684
Generation 5: Best score = 0.8684
Generation 6: Best score = 0.8684
Generation 7: Best score = 0.8684
Generation 8: Best score = 0.8684
Generation 9: Best score = 0.8684
Generation 10: Best score = 0.8684
Generation 11: Best score = 0.8684
Generation 12: Best score = 0.8684
Generation 13: Best score = 0.8684
Generation 14: Best score = 0.8684
Generation 15: Best score = 0.8684
Generation 16: Best score = 0.8684
Generation 17: Best score = 0.8684
Generation 18: Best score = 0.8684
Generation 19: Best score = 0.8684
Generation 20: Best score = 0.8684
Generation 21: Best score = 0.8684
Generation 22: Best score = 0.8684
Generation 23: Best score = 0.8684
Generation 24: Best score = 0.8684
Generation 25: Best score = 0.8684
Generation 26: Best score = 0.8684
Generation 27: Best score = 0.8684
Generation 28: Best score = 0.

In [None]:
# feature sets 

# Accuracy Top 1 = 0.6579
# best_features_top1 = ['TIME TO HALF', 'DN', 'YARD LN', 'HASH', 'GN/LS LAG 2', 'PERSONNEL_10',
#                       'PERSONNEL_11', 'PERSONNEL_12', 'PERSONNEL_11T', '2-6', 'OWN END',
#                       'RED ZONE', 'HALF #', 'WINNING', 'PERSONNEL_10 LAG 1',
#                       'PERSONNEL_11 LAG 1', 'PERSONNEL_11 LAG 2', 'PERSONNEL_12 LAG 1',
#                       'PERSONNEL_12 LAG 2', 'PERSONNEL_12T LAG 2', 'DN X DIST', 'SCORE DIFF ^2',
#                       'SCORE DIFF x TIME LEFT', 'SCORE DIFF x QTR', 'YARDS TO TD']

# best_features_top1 = ['QTR', 'TIME TO HALF', 'DN', 'DIST', 'YARD LN', 'OWN SCORE', 'GN/LS LAG 1',
#                       'GN/LS LAG 2', 'OFF FORM LAG 1', 'SCORE DIFF', 'PERSONNEL_10', 'PERSONNEL_11',
#                       'PERSONNEL_12', 'PERSONNEL_11T', 'PLAY OF DRIVE NUM', 'OPP END', 'RED ZONE',
#                       'TIME LEFT', 'PLAY TYPE LAG 1', 'PLAY TYPE LAG 2', 'PERSONNEL!', 
#                       'PERSONNEL_11 LAG 2', 'PERSONNEL_11T LAG 1', 'PERSONNEL_11T LAG 2',
#                       'PREV PLAY PASS OR NOT', 'DN X DIST', 'SCORE DIFF ^2', 'SCORE DIFF x DN']

# Accuracy Top 2 = 0.8421
# best_features_top2 = ['QTR', 'TIME TO HALF', 'YARD LN', 'HASH', 'OWN SCORE', 'OPP SCORE',
#                       'GN/LS LAG 1', 'OFF FORM LAG 1', 'OFF FORM LAG 2', 'HASH OR NOT',
#                       'PERSONNEL_10', 'PERSONNEL_11', 'PERSONNEL_12', 'PERSONNEL_11T',
#                       'PERSONNEL_12T', 'PLAY OF DRIVE NUM', '6+', '2 MIN OR NOT', 'OWN END',
#                       'OPP END', 'HALF #', 'WINNING', 'PERSONNEL_10 LAG 1', 'PERSONNEL_10 LAG 2',
#                       'PERSONNEL_11 LAG 1', 'PERSONNEL_11 LAG 2', 'PERSONNEL_12 LAG 1',
#                       'DN X DIST', 'SCORE DIFF x DN', 'YARDS TO TD']

# best_features_top2 = ['QTR', 'TIME TO HALF', 'DN', 'DIST', 'YARD LN', 'HASH', 'OWN SCORE',
#                       'GN/LS LAG 1', 'GN/LS LAG 2', 'OFF FORM LAG 1', 'OFF FORM LAG 2',
#                       'MID OR NOT', 'HASH OR NOT', 'PERSONNEL_10', 'PERSONNEL_11', 'PERSONNEL_12',
#                       'PERSONNEL_11T', 'PERSONNEL_12T', 'PLAY OF DRIVE NUM', '2-6', '6+',
#                       '2 MIN OR NOT', 'OPP END', 'RED ZONE', 'TIME LEFT', 'WINNING', 'PPS NEEDED',
#                       'PLAY TYPE LAG 2', 'PERSONNEL!', 'PERSONNEL_10 LAG 1', 'PERSONNEL_10 LAG 2',
#                       'PERSONNEL_11 LAG 1', 'PERSONNEL_11T LAG 1', 'PERSONNEL_12 LAG 1', 
#                       'PREV PLAY PASS OR NOT', 'DN X DIST', 'SCORE DIFF ^2', 'SCORE DIFF x DN', 
#                       'SCORE DIFF x QTR', 'TIME LEFT * SCORE DIFF / 7', 'YARDS TO TD']

In [None]:
# 0.8684 top 2
# ['QTR', 'TIME TO HALF', 'DN', 'DIST', 'YARD LN', 'HASH', 'OWN SCORE', 'GN/LS LAG 2',
#  'OFF FORM LAG 1', 'OFF FORM LAG 2', 'MID OR NOT', 'HASH OR NOT', 'PERSONNEL_10',
#  'PERSONNEL_11', 'PERSONNEL_12', 'PERSONNEL_11T', 'PERSONNEL_12T', 'PLAY OF DRIVE NUM',
#  '0-2', '2 MIN OR NOT', 'RED ZONE', 'HALF #', 'WINNING', 'PPS NEEDED', 'PERSONNEL!',
#  'PERSONNEL_10 LAG 1', 'PERSONNEL_11 LAG 2', 'PERSONNEL_11T LAG 2', 'PERSONNEL_12 LAG 2',
#  'PERSONNEL_12T LAG 2', 'SCORE DIFF ^2', 'SCORE DIFF x DN', 'SCORE DIFF x QTR',
#  'YARDS TO TD * SCORE DIFF / 7']

In [38]:
df_gc_trim.columns

Index(['PLAY #', 'ODK', 'QTR', 'TIME TO HALF', 'DN', 'DIST', 'YARD LN', 'HASH',
       'PLAY TYPE', 'RESULT', 'OWN SCORE', 'OPP SCORE', 'GN/LS', 'INC10',
       '2 MIN', 'FSL', 'SHIFT FROM', 'MOTION', 'OFF FORM', 'VARIATION',
       'BACKFIELD SET', 'TREE', 'OFF PLAY', 'TAG', 'STRENGTH', 'TARGET',
       'B/S ROUTE', 'DEEP SHOT', 'PROTECTION', 'RECEIVER_Jersey',
       'RECEIVER_Name', 'WR ALIGNMENT', 'ROUTE', 'COMMENT', 'GN/LS LAG 1',
       'GN/LS LAG 2', 'PLAY TYPE LAG 1', 'PLAY TYPE LAG 2', 'OFF FORM LAG 1',
       'OFF FORM LAG 2', 'SCORE DIFF', 'MID OR NOT', 'HASH OR NOT', '0-2',
       '2-6', '6+', 'PERSONNEL!', 'PERSONNEL_10', 'PERSONNEL_11',
       'PERSONNEL_12', 'PERSONNEL_11T', 'PERSONNEL_12T', 'PLAY OF DRIVE NUM',
       '2 MIN OR NOT', 'OWN END', 'OPP END', 'RED ZONE', 'HALF #', 'TIME LEFT',
       'PPS NEEDED', 'WINNING', 'PERSONNEL_10 LAG 1', 'PERSONNEL_10 LAG 2',
       'PERSONNEL_11 LAG 1', 'PERSONNEL_11 LAG 2', 'PERSONNEL_12 LAG 1',
       'PERSONNEL_12 LAG 2', 'PERS

In [39]:


# X = df_gc_trim[input_cols]
# y = df_gc_trim[target_col]

# # Split the data
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

split_index = int(len(df_gc_trim) * 0.8)
train = df_gc_trim.iloc[:split_index]
test = df_gc_trim.iloc[split_index:]


In [40]:
train.columns

Index(['PLAY #', 'ODK', 'QTR', 'TIME TO HALF', 'DN', 'DIST', 'YARD LN', 'HASH',
       'PLAY TYPE', 'RESULT', 'OWN SCORE', 'OPP SCORE', 'GN/LS', 'INC10',
       '2 MIN', 'FSL', 'SHIFT FROM', 'MOTION', 'OFF FORM', 'VARIATION',
       'BACKFIELD SET', 'TREE', 'OFF PLAY', 'TAG', 'STRENGTH', 'TARGET',
       'B/S ROUTE', 'DEEP SHOT', 'PROTECTION', 'RECEIVER_Jersey',
       'RECEIVER_Name', 'WR ALIGNMENT', 'ROUTE', 'COMMENT', 'GN/LS LAG 1',
       'GN/LS LAG 2', 'PLAY TYPE LAG 1', 'PLAY TYPE LAG 2', 'OFF FORM LAG 1',
       'OFF FORM LAG 2', 'SCORE DIFF', 'MID OR NOT', 'HASH OR NOT', '0-2',
       '2-6', '6+', 'PERSONNEL!', 'PERSONNEL_10', 'PERSONNEL_11',
       'PERSONNEL_12', 'PERSONNEL_11T', 'PERSONNEL_12T', 'PLAY OF DRIVE NUM',
       '2 MIN OR NOT', 'OWN END', 'OPP END', 'RED ZONE', 'HALF #', 'TIME LEFT',
       'PPS NEEDED', 'WINNING', 'PERSONNEL_10 LAG 1', 'PERSONNEL_10 LAG 2',
       'PERSONNEL_11 LAG 1', 'PERSONNEL_11 LAG 2', 'PERSONNEL_12 LAG 1',
       'PERSONNEL_12 LAG 2', 'PERS

In [41]:

X_train = train[best_features_top1]
X_test = test[best_features_top1]

y_train = train[target_col]
y_test = test[target_col]


In [42]:

# Set up preprocessing for categorical features
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[('cat', categorical_transformer, best_features_top1)]
)

# Create a Random Forest pipeline
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=45))
])

# Train the model
rf_pipeline.fit(X_train, y_train)

# Predict and evaluate
y_pred = rf_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

# model = RandomForestClassifier()
# model.fit(X_train, y_train)
# y_pred = model.fit(X_test)
# print(classification_report(y_test, y_pred))

               precision    recall  f1-score   support

          CON       0.00      0.00      0.00         1
    DEUCE OFF       0.00      0.00      0.00         1
      DOUBLES       0.00      0.00      0.00         1
  DUTCH Y OFF       0.00      0.00      0.00         1
    PRO Y OFF       0.60      1.00      0.75         3
TREY DBL WING       1.00      0.50      0.67         2
    TREY WING       0.00      0.00      0.00         1
   TREY Y OFF       0.00      0.00      0.00         5
         TRIO       0.86      1.00      0.92        12
        TROOP       0.00      0.00      0.00         1
   TROOP WING       0.33      1.00      0.50         1
   TWINS OPEN       0.58      1.00      0.74         7
    WOLVERINE       1.00      1.00      1.00         2

     accuracy                           0.68        38
    macro avg       0.34      0.42      0.35        38
 weighted avg       0.54      0.68      0.59        38



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [43]:
X_train_top2 = train[best_features_top2]
X_test_top2 = test[best_features_top2]

# X_train_top2 = train[input_cols]
# X_test_top2 = test[input_cols]

y_train_top2 = train[target_col]
y_test_top2 = test[target_col]

In [44]:
train.columns

Index(['PLAY #', 'ODK', 'QTR', 'TIME TO HALF', 'DN', 'DIST', 'YARD LN', 'HASH',
       'PLAY TYPE', 'RESULT', 'OWN SCORE', 'OPP SCORE', 'GN/LS', 'INC10',
       '2 MIN', 'FSL', 'SHIFT FROM', 'MOTION', 'OFF FORM', 'VARIATION',
       'BACKFIELD SET', 'TREE', 'OFF PLAY', 'TAG', 'STRENGTH', 'TARGET',
       'B/S ROUTE', 'DEEP SHOT', 'PROTECTION', 'RECEIVER_Jersey',
       'RECEIVER_Name', 'WR ALIGNMENT', 'ROUTE', 'COMMENT', 'GN/LS LAG 1',
       'GN/LS LAG 2', 'PLAY TYPE LAG 1', 'PLAY TYPE LAG 2', 'OFF FORM LAG 1',
       'OFF FORM LAG 2', 'SCORE DIFF', 'MID OR NOT', 'HASH OR NOT', '0-2',
       '2-6', '6+', 'PERSONNEL!', 'PERSONNEL_10', 'PERSONNEL_11',
       'PERSONNEL_12', 'PERSONNEL_11T', 'PERSONNEL_12T', 'PLAY OF DRIVE NUM',
       '2 MIN OR NOT', 'OWN END', 'OPP END', 'RED ZONE', 'HALF #', 'TIME LEFT',
       'PPS NEEDED', 'WINNING', 'PERSONNEL_10 LAG 1', 'PERSONNEL_10 LAG 2',
       'PERSONNEL_11 LAG 1', 'PERSONNEL_11 LAG 2', 'PERSONNEL_12 LAG 1',
       'PERSONNEL_12 LAG 2', 'PERS

In [45]:
# Rebuild preprocessor and pipeline with top-2 selected features
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor_top2 = ColumnTransformer(
    transformers=[('cat', categorical_transformer, best_features_top2)]
)

rf_pipeline_top2 = Pipeline(steps=[
    ('preprocessor', preprocessor_top2),
    ('classifier', RandomForestClassifier(random_state=45))
])

# Fit on top-2 feature subset
rf_pipeline_top2.fit(X_train_top2, y_train_top2)


In [46]:
y_proba = rf_pipeline_top2.predict_proba(X_test_top2)

# Get class labels (in the same order as columns of y_proba)
classes = rf_pipeline.named_steps['classifier'].classes_

# For each sample, get indices of top 2 probs
top2_idx = np.argsort(y_proba, axis=1)[:, -2:]

# Convert indices to class labels
top2_preds = np.array([[classes[i] for i in row] for row in top2_idx])

# y_test must be a numpy array or series of actual labels
correct_top2 = [true in preds for true, preds in zip(y_test_top2, top2_preds)]
top2_accuracy = np.mean(correct_top2)
print(f"Top-2 Accuracy: {top2_accuracy:.2f}")

y_test_array = np.array(y_test_top2)

# Create 'soft' top-2 predictions
soft_top2_preds = []
for i, row in enumerate(top2_preds):
    true = y_test_array[i]
    if true in row:
        soft_top2_preds.append(true)  # Treat as correct if in top-2
    else:
        soft_top2_preds.append(row[1])  # Use 2nd best prediction

# Print full classification report
print(classification_report(y_test_array, soft_top2_preds))






# add play number for how far into a drive the play is
# with play num: accuracy +0.03, macro avg +0.01, weighted avg +0.01, Top-2 Accuracy -0.03

# With all features except PLAY OF DRIVE NUM, 
# exact accuracy tied at .66 with other versions but top-2 acc drops to .74 from .79 with all features
# Highest macro avg of .4, seems this version predicts some of the least common occurences but less accurate for secondary choice

# With all features Top-2 acc is .79 with exact acc at .66
# macro avg .32

# With all features except distance brackets and PLAY OF DRIVE NUM, exact acc is .63, macro is .31
# BEST TOP-2 of .82




Top-2 Accuracy: 0.87
               precision    recall  f1-score   support

          CON       0.00      0.00      0.00         1
    DEUCE OFF       0.00      0.00      0.00         1
      DOUBLES       1.00      1.00      1.00         1
  DUTCH Y OFF       1.00      1.00      1.00         1
    PRO Y OFF       0.75      1.00      0.86         3
TREY DBL WING       1.00      1.00      1.00         2
    TREY WING       0.00      0.00      0.00         1
   TREY Y OFF       1.00      0.80      0.89         5
         TRIO       0.92      1.00      0.96        12
        TROOP       0.00      0.00      0.00         1
   TROOP WING       0.50      1.00      0.67         1
   TWINS OPEN       0.88      1.00      0.93         7
    WOLVERINE       0.67      1.00      0.80         2

     accuracy                           0.87        38
    macro avg       0.59      0.68      0.62        38
 weighted avg       0.80      0.87      0.82        38



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [45]:
df_gc_trim.to_excel('df_gc_trim.xlsx')

In [40]:
input_cols = ['QTR', 'DN', 'DIST', 'YARD LN', 'HASH', 'OWN SCORE', 'OPP SCORE']  
target_col = 'PERSONNEL' 

X = df_gc_trim[input_cols]
y = df_gc_trim[target_col]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Set up preprocessing for categorical features
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[('cat', categorical_transformer, input_cols)]
)

# Create a Random Forest pipeline
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Train the model
rf_pipeline.fit(X_train, y_train)

# Predict and evaluate
y_pred = rf_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

KeyError: 'PERSONNEL'

In [15]:
print((y_test, y_pred))

(194               WOLVERINE
80               TREY Y OFF
56                     TREY
143              TWINS OPEN
169                    TRIO
16            TREY DBL WING
77              DUTCH Y OFF
161              TREY Y OFF
127                    TRIO
183                 DOUBLES
165                    TRIO
43               TWINS OPEN
173              TROOP WING
31     UNBALANCED PRO Y OFF
52               TREY Y OFF
17               TWINS OPEN
174                    TRIO
185              TWINS OPEN
88              DUTCH Y OFF
20               TWINS OPEN
129              TWINS OPEN
123                    TRIO
5                 WOLVERINE
155              TREY Y OFF
25               TWINS OPEN
159              TREY Y OFF
10               TREY Y OFF
144                    TRIO
141              TWINS OPEN
170                    TRIO
19                     TRIO
180                    TRIO
78                PRO Y OFF
67                TREY WING
69                WOLVERINE
36               TW

In [60]:
X_test.dtypes

QTR                 int64
TIME TO HALF        int64
DN                  int64
DIST                int64
YARD LN             int64
HASH               object
OWN SCORE           int64
OPP SCORE           int64
GN/LS LAG 1       float64
GN/LS LAG 2       float64
OFF FORM LAG 1     object
OFF FORM LAG 2     object
SCORE DIFF          int64
MID OR NOT           bool
HASH OR NOT          bool
PERSONNEL_10         bool
PERSONNEL_11         bool
PERSONNEL_12         bool
PERSONNEL_11T        bool
PERSONNEL_12T        bool
dtype: object