In [24]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import random
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import joblib
import os

In [2]:
def df_trim(df):
    out = df.copy()

    # remove rows whose RESULT is in this list
    out = out[~out['RESULT'].isin(['Timeout', 'Penalty', 'Interception'])]

    # drop rows with missing OFF FORM or PERSONNEL
    out = out[out['OFF FORM'].notna() & out['PERSONNEL'].notna()]

    return out

In [None]:
def create_features(df):
    df_out = df.copy()
    df_out['GN/LS LAG 1'] = df_out['GN/LS'].shift(1)
    df_out['GN/LS LAG 2'] = df_out['GN/LS'].shift(2)

    df_out['PLAY # LAG 1'] = df_out['PLAY #'].shift(1)
    df_out['PLAY # LAG 2'] = df_out['PLAY #'].shift(2)

    df_out['PLAY TYPE LAG 1'] = df_out['PLAY TYPE'].shift(1)
    df_out['PLAY TYPE LAG 2'] = df_out['PLAY TYPE'].shift(2)

    df_out.loc[df_out['PLAY #'] - df_out['PLAY # LAG 1'] != 1, 'GN/LS LAG 1'] = 0
    df_out.loc[df_out['PLAY #'] - df_out['PLAY # LAG 2'] != 2, 'GN/LS LAG 2'] = 0

    df_out['OFF FORM LAG 1'] = df_out['OFF FORM'].shift(1)
    df_out['OFF FORM LAG 2'] = df_out['OFF FORM'].shift(2)

    df_out.loc[df_out['PLAY #'] - df_out['PLAY # LAG 1'] != 1, 'OFF FORM LAG 1'] = 'NONE'
    df_out.loc[df_out['PLAY #'] - df_out['PLAY # LAG 2'] != 2, 'OFF FORM LAG 2'] = 'NONE'

    df_out['TIME TO HALF'] = (df_out['TIME TO HALF'].astype(str).str.split(':').apply(lambda x: int(x[0]) * 60 + int(x[1])))

    df_out['SCORE DIFF'] = df_out['OPP SCORE'] - df_out['OWN SCORE']

    df_out['HASH'] = df_out['HASH'].astype(str).str.strip()

    df_out['MID OR NOT'] = df_out['HASH'] == 'M'
    df_out['HASH OR NOT'] = df_out['HASH'] != 'M'

    df_out['0-2'] = df_out['DIST'] <= 2
    df_out['2-6'] = (df_out['DIST'] > 2) & (df_out['DIST'] <= 6)
    df_out['6+'] = df_out['DIST'] > 6

    print("Before OHE:", df_out['PERSONNEL'].unique())

    # Reset index before doing anything
    df_out = df_out.reset_index(drop=True)

    # Create dummy variables in a single clean step
    dummies = pd.get_dummies(
        df_out['PERSONNEL'].astype(str).str.strip(),
        prefix='PERSONNEL'
    )

    # Concatenate and then drop the original column
    df_out = pd.concat([df_out, dummies], axis=1).drop(columns=['PERSONNEL'])
    
    print("After OHE columns:", df_out.filter(like='PERSONNEL_').columns.tolist())
    print(df_out.filter(like='PERSONNEL_').tail(3))

    df_out['PLAY OF DRIVE NUM'] = -1
    for i in range(len(df)):
        if (df_out.iloc[i]['DN'] == 0):
            df_out.iloc[i, df_out.columns.get_loc('PLAY OF DRIVE NUM')] = 0
        else:
            df_out.iloc[i, df_out.columns.get_loc('PLAY OF DRIVE NUM')] = df_out.iloc[(i - 1), df_out.columns.get_loc('PLAY OF DRIVE NUM')] + 1


    # Loop through all columns that start with "PERSONNEL_"
    for col in df_out.columns:
        if col.startswith("PERSONNEL_"):
            df_out[f"{col} LAG 1"] = df_out[col].shift(1)
            df_out[f"{col} LAG 2"] = df_out[col].shift(2)



    df_out['2 MIN'] = df_out['2 MIN'].astype(str).str.strip()
    df_out['2 MIN OR NOT'] = -1
    df_out['2 MIN OR NOT'] = df_out['2 MIN'].apply(lambda x: 1 if x == "Y" else 0)

    df_out['OWN END'] = -1
    df_out['OWN END'] = df_out['YARD LN'].apply(lambda x: 1 if x < 0 else 0)

    df_out['OPP END'] = -1
    df_out['OPP END'] = df_out['YARD LN'].apply(lambda x: 1 if x >= 0 else 0)

    df_out['RED ZONE'] = -1
    df_out['RED ZONE'] = df_out['YARD LN'].apply(lambda x: 1 if (x <= 20 and x > 0) else 0)

    df_out['HALF_NUM'] = -1
    df_out['HALF_NUM'] = df_out['QTR'].apply(lambda x: 1 if x <= 2 else 2)

    df_out['TIME LEFT'] = -1
    df_out['TIME LEFT'] = df_out.apply(lambda row: row['TIME TO HALF'] if row['HALF_NUM'] == 2 else (row['TIME TO HALF'] + 900), axis=1)

    df_out['PPS NEEDED'] = -1
    df_out['PPS NEEDED'] = (df_out['SCORE DIFF']) * -1/df_out['TIME LEFT']

    df_out['WINNING'] = -1
    df_out['WINNING'] = df_out['SCORE DIFF'].apply(lambda x: 1 if x < 0 else 0)

    df_out['DN X DIST'] = -1
    df_out['DN X DIST'] = df_out['DN'] * df_out['DIST']

    df_out['PREV PLAY PASS OR NOT'] = -1
    df_out['PREV PLAY PASS OR NOT'] = df_out['PLAY TYPE LAG 1'].apply(lambda x: 1 if x == "Pass" else 0)

    df_out['SCORE DIFF ^2'] = df_out['SCORE DIFF'] * df_out['SCORE DIFF']
    df_out['SCORE DIFF ^2'] = df_out['WINNING'].apply(lambda x: -x if x > 0 else x)

    df_out['SCORE DIFF x TIME LEFT'] = df_out['SCORE DIFF'] * df_out['TIME LEFT']

    df_out['SCORE DIFF x DN'] = df_out['SCORE DIFF'] * df_out['DN']

    df_out['SCORE DIFF / 7'] = df_out['SCORE DIFF'] / 7
    df_out['TIME LEFT * SCORE DIFF / 7'] = df_out['TIME LEFT'] * df_out['SCORE DIFF'] / 7

    df_out['SCORE DIFF x QTR'] = df_out['SCORE DIFF'] * df_out['QTR']

    df_out['YARDS TO TD'] = df_out['YARD LN'].apply(lambda x: x + 100 if x < 0 else x)
    df_out['YARDS TO TD * SCORE DIFF / 7'] = df_out['YARDS TO TD'] * df_out['SCORE DIFF'] / 7

    # drop the first 2 plays (because of lag features)
    df_out = df_out.iloc[2:].reset_index(drop=True)

    return df_out

In [4]:
def evaluate_feature_subset(features, X_train_full, y_train, X_test_full, y_test, model_type='top1'):
    selected_features = [f for i, f in enumerate(X_train_full.columns) if features[i] == 1]
    
    if not selected_features:
        return 0.0  # Avoid empty feature sets

    X_train = X_train_full[selected_features]
    X_test = X_test_full[selected_features]

    preprocessor = ColumnTransformer(
        transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), selected_features)]
    )

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=45))
    ])
    
    pipeline.fit(X_train, y_train)

    if model_type == 'top1':
        y_pred = pipeline.predict(X_test)
        return accuracy_score(y_test, y_pred)

    elif model_type == 'top2':
        y_proba = pipeline.predict_proba(X_test)
        classes = pipeline.named_steps['classifier'].classes_
        top2_idx = np.argsort(y_proba, axis=1)[:, -2:]
        top2_preds = np.array([[classes[i] for i in row] for row in top2_idx])
        correct_top2 = [y in preds for y, preds in zip(y_test, top2_preds)]
        return np.mean(correct_top2)

    else:
        raise ValueError("model_type must be 'top1' or 'top2'")

In [5]:

def evaluate_feature_subset_single(
    features,                  # list of column NAMES 
    X_train, y_train, 
    X_test,  y_test,
    model_type='top1',         # 'top1' or 'top2'
    random_state=45,
    save_name=None
):
    if (model_type == 'top1'):
        categorical_transformer = OneHotEncoder(handle_unknown='ignore')

        preprocessor = ColumnTransformer(
            transformers=[('cat', categorical_transformer, features)]
        )

        # Create a Random Forest pipeline
        rf_pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', RandomForestClassifier(random_state=45))
        ])

        # Train the model
        rf_pipeline.fit(X_train, y_train)

        # Predict and evaluate
        y_pred = rf_pipeline.predict(X_test)
        print(classification_report(y_test, y_pred))

        if save_name:
            joblib.dump(rf_pipeline, f"{save_name}.pkl")

        return rf_pipeline

    elif (model_type == 'top2'):
        # Rebuild preprocessor and pipeline with top-2 selected features
        categorical_transformer = OneHotEncoder(handle_unknown='ignore')

        preprocessor_top2 = ColumnTransformer(
            transformers=[('cat', categorical_transformer, features)]
        )

        rf_pipeline_top2 = Pipeline(steps=[
            ('preprocessor', preprocessor_top2),
            ('classifier', RandomForestClassifier(random_state=45))
        ])

        # Fit on top-2 feature subset
        rf_pipeline_top2.fit(X_train, y_train)
        
        y_proba = rf_pipeline_top2.predict_proba(X_test)

        # Get class labels (in the same order as columns of y_proba)
        classes = rf_pipeline_top2.named_steps['classifier'].classes_

        # For each sample, get indices of top 2 probs
        top2_idx = np.argsort(y_proba, axis=1)[:, -2:]

        # Convert indices to class labels
        top2_preds = np.array([[classes[i] for i in row] for row in top2_idx])

        # y_test must be a numpy array or series of actual labels
        correct_top2 = [true in preds for true, preds in zip(y_test, top2_preds)]
        top2_accuracy = np.mean(correct_top2)
        print(f"Top-2 Accuracy: {top2_accuracy:.2f}")

        y_test_array = np.array(y_test)

        # Create 'soft' top-2 predictions
        soft_top2_preds = []
        for i, row in enumerate(top2_preds):
            true = y_test_array[i]
            if true in row:
                soft_top2_preds.append(true)  # Treat as correct if in top-2
            else:
                soft_top2_preds.append(row[1])  # Use 2nd best prediction

        # Print full classification report
        print(classification_report(y_test_array, soft_top2_preds))

        if save_name:
            joblib.dump(rf_pipeline_top2, f"{save_name}.pkl")

        return rf_pipeline_top2

In [25]:
rf_param_grid = {
    "max_depth": [10, 20, None],
    "min_samples_split": [2, 5],
    "max_features": ["sqrt", 0.5, 1.0],
    "n_estimators": [300, 600, 900],
}

In [6]:
def genetic_algorithm(X_train, y_train, X_test, y_test, n_generations=30, pop_size=50, mutation_rate=0.15, model_type='top1'):
    n_features = X_train.shape[1]
    population = [np.random.randint(0, 2, size=n_features).tolist() for _ in range(pop_size)]

    for generation in range(n_generations):
        scores = [evaluate_feature_subset(ind, X_train, y_train, X_test, y_test, model_type) for ind in population]
        print(f"Generation {generation}: Best score = {max(scores):.4f}")

        # Select top 50%
        sorted_pop = [x for _, x in sorted(zip(scores, population), reverse=True)]
        parents = sorted_pop[:pop_size // 2]

        # Crossover
        offspring = []
        while len(offspring) < pop_size - len(parents):
            p1, p2 = random.sample(parents, 2)
            cut = random.randint(1, n_features - 1)
            child = p1[:cut] + p2[cut:]
            offspring.append(child)

        # Mutation
        for child in offspring:
            if random.random() < mutation_rate:
                idx = random.randint(0, n_features - 1)
                child[idx] = 1 - child[idx]

        population = parents + offspring

    # Return best feature subset
    final_scores = [evaluate_feature_subset(ind, X_train, y_train, X_test, y_test, model_type) for ind in population]
    best_idx = np.argmax(final_scores)
    best_features = [f for i, f in enumerate(X_train.columns) if population[best_idx][i] == 1]
    
    return best_features

In [7]:
def split_train(df, location, inputs, target):
    split_index = int(len(df) * location)
    train = df.iloc[:split_index]
    test = df.iloc[split_index:]

    X_train = train[inputs]
    X_test = test[inputs]
    y_train = train[target]
    y_test = test[target]

    return X_train, X_test, y_train, y_test

In [8]:
def rf_input(X_train, X_test, y_train, y_test, features):
    categorical_transformer = OneHotEncoder(handle_unknown='ignore')

    preprocessor = ColumnTransformer(
        transformers=[('cat', categorical_transformer, features)]
    )

    # Create a Random Forest pipeline
    rf_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=45))
    ])

    # Train the model
    rf_pipeline.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = rf_pipeline.predict(X_test)
    print(classification_report(y_test, y_pred))

In [9]:
# TEMP

# Copy df, remove weird plays, create features
def pre_split_prep(df):
    df = df.copy()
    df = df_trim(df)
    df = create_features(df)
    return df

# Manually do df.columns and create a list df_input_cols with the appropriate valid columns, create df_target_col = 'OFF FORM'

# 
def get_best_features(df, df_input_cols, df_target_col):
    df = df.copy()
    X_train_temp, X_test_temp, y_train_temp, y_test_temp = split_train(
            df, 0.7, df_input_cols, df_target_col
        )
    best_features_top1 = genetic_algorithm(
        X_train_temp, y_train_temp, X_test_temp, y_test_temp,
        n_generations=10, pop_size=50,
        mutation_rate=0.15, model_type='top1'
    )

    best_features_top2 = genetic_algorithm(
        X_train_temp, y_train_temp, X_test_temp, y_test_temp,
        n_generations=10, pop_size=50,
        mutation_rate=0.15, model_type='top2'
    )

    evaluate_feature_subset_single(best_features_top1, X_train_temp, y_train_temp, 
                        X_test_temp, y_test_temp, model_type='top1')

    evaluate_feature_subset_single(best_features_top2, X_train_temp, y_train_temp, 
                        X_test_temp, y_test_temp, model_type='top2')
    
    return best_features_top1, best_features_top2

In [13]:
def predict_top2(model, X):
    y_proba = model.predict_proba(X)
    top2_idx = np.argsort(y_proba, axis=1)[:, -2:]
    classes = model.named_steps['classifier'].classes_
    return np.array([[classes[i] for i in row] for row in top2_idx])

# For top1 prediction simply do the following: y_pred = model_top1.predict(new_X)

In [15]:
DATA_PATH = "data/"

In [17]:
fp_gc = "GCC Offense 2025.xlsx"
fp_wj = "WJ Offense.xlsx"


df_gc = pd.read_excel(os.path.join(DATA_PATH,fp_gc))
df_wj = pd.read_excel(os.path.join(DATA_PATH,fp_wj))

print("GCC shape:", df_gc.shape)
print("WJ shape:", df_wj.shape)

GCC shape: (187, 35)
WJ shape: (230, 35)


In [15]:
# df_gc = df_wj.copy()
df_gc.columns

Index(['PLAY #', 'ODK', 'QTR', 'TIME TO HALF', 'DN', 'DIST', 'YARD LN', 'HASH',
       'PLAY TYPE', 'RESULT', 'OWN SCORE', 'OPP SCORE', 'GN/LS', 'INC10',
       '2 MIN', 'PERSONNEL', 'FSL', 'SHIFT FROM', 'MOTION', 'OFF FORM',
       'VARIATION', 'BACKFIELD SET', 'TREE', 'OFF PLAY', 'TAG', 'STRENGTH',
       'TARGET', 'B/S ROUTE', 'DEEP SHOT', 'PROTECTION', 'RECEIVER_Jersey',
       'RECEIVER_Name', 'WR ALIGNMENT', 'ROUTE', 'COMMENT'],
      dtype='object')

In [37]:
gc_input_cols = ['QTR', 'TIME TO HALF', 'DN', 'DIST', 'YARD LN', 'HASH', 
              'OWN SCORE', 'OPP SCORE', 
              'GN/LS LAG 1', 'GN/LS LAG 2', 
              'OFF FORM LAG 1', 'OFF FORM LAG 2'
              , 'SCORE DIFF'
              , 'MID OR NOT'
              , 'HASH OR NOT'
              , 'PERSONNEL_10'
              , 'PERSONNEL_11'
              , 'PERSONNEL_12'
              , 'PERSONNEL_12T'
              , 'PERSONNEL_13'
              , 'PERSONNEL_13T'
              , 'PLAY OF DRIVE NUM'
              , '0-2'
              , '2-6'
              , '6+'
              , '2 MIN OR NOT'
              , 'OWN END'
              , 'OPP END'
              , 'RED ZONE'
              , 'HALF_NUM'
              , 'TIME LEFT'
              , 'WINNING'
              , 'PPS NEEDED'
              , 'PLAY TYPE LAG 1'
              , 'PLAY TYPE LAG 2'
            #   , 'PERSONNEL!'
              , 'PERSONNEL_10 LAG 1', 'PERSONNEL_10 LAG 2'
              , 'PERSONNEL_11 LAG 1', 'PERSONNEL_11 LAG 2'
              , 'PERSONNEL_12 LAG 1', 'PERSONNEL_12 LAG 2'
              , 'PERSONNEL_12T LAG 1', 'PERSONNEL_12T LAG 2'
              , 'PERSONNEL_13 LAG 1', 'PERSONNEL_13 LAG 2'
              , 'PERSONNEL_13T LAG 1', 'PERSONNEL_13T LAG 2'
              , 'PREV PLAY PASS OR NOT'
              , 'DN X DIST'
              , 'SCORE DIFF ^2'
              , 'SCORE DIFF x TIME LEFT'
              , 'SCORE DIFF x DN'
              , 'SCORE DIFF / 7'
              , 'SCORE DIFF x QTR'
              , 'TIME LEFT * SCORE DIFF / 7'
              , 'YARDS TO TD'
              , 'YARDS TO TD * SCORE DIFF / 7'
              ]  
gc_target_col = 'OFF FORM' 

wj_input_cols = ['QTR', 'TIME TO HALF', 'DN', 'DIST', 'YARD LN', 'HASH',
       'OWN SCORE', 'OPP SCORE',
        # 'GN/LS', 
       #'INC10',
       '2 MIN', 
       #'FSL', 
      #  'SHIFT FROM', 'MOTION', 'OFF FORM', 'VARIATION',
      #  'BACKFIELD SET', 'TREE', 'OFF PLAY', 'TAG', 'STRENGTH', 'TARGET',
      #  'B/S ROUTE', 'DEEP SHOT', 'PROTECTION', 'RECEIVER_Jersey',
      #  'RECEIVER_Name', 'WR ALIGNMENT', 'ROUTE', 'COMMENT', 
       'GN/LS LAG 1',
       'GN/LS LAG 2', 'PLAY # LAG 1', 'PLAY # LAG 2', 'PLAY TYPE LAG 1',
       'PLAY TYPE LAG 2', 'OFF FORM LAG 1', 'OFF FORM LAG 2', 'SCORE DIFF',
       'MID OR NOT', 'HASH OR NOT', '0-2', '2-6', '6+', 'PERSONNEL_1',
       'PERSONNEL_10', 'PERSONNEL_11', 'PERSONNEL_12', 'PERSONNEL_32',
       'PLAY OF DRIVE NUM', 'PERSONNEL_1 LAG 1', 'PERSONNEL_1 LAG 2',
       'PERSONNEL_10 LAG 1', 'PERSONNEL_10 LAG 2', 'PERSONNEL_11 LAG 1',
       'PERSONNEL_11 LAG 2', 'PERSONNEL_12 LAG 1', 'PERSONNEL_12 LAG 2',
       'PERSONNEL_32 LAG 1', 'PERSONNEL_32 LAG 2', '2 MIN OR NOT', 'OWN END',
       'OPP END', 'RED ZONE', 'HALF_NUM', 'TIME LEFT', 'PPS NEEDED', 'WINNING',
       'DN X DIST', 'PREV PLAY PASS OR NOT', 'SCORE DIFF ^2',
       'SCORE DIFF x TIME LEFT', 'SCORE DIFF x DN', 'SCORE DIFF / 7',
       'TIME LEFT * SCORE DIFF / 7', 'SCORE DIFF x QTR', 'YARDS TO TD',
       'YARDS TO TD * SCORE DIFF / 7']
wj_target_col = 'OFF FORM' 

In [18]:
df_gc = df_gc.copy()
df_gc = df_trim(df_gc)
df_gc = create_features(df_gc)
df_gc.columns

Before OHE: [11 12 10 13 '12T' '13T']
After OHE columns: ['PERSONNEL_10', 'PERSONNEL_11', 'PERSONNEL_12', 'PERSONNEL_12T', 'PERSONNEL_13', 'PERSONNEL_13T']
     PERSONNEL_10  PERSONNEL_11  PERSONNEL_12  PERSONNEL_12T  PERSONNEL_13  \
167         False         False         False          False         False   
168         False         False         False          False          True   
169         False         False         False          False          True   

     PERSONNEL_13T  
167           True  
168          False  
169          False  


Index(['PLAY #', 'ODK', 'QTR', 'TIME TO HALF', 'DN', 'DIST', 'YARD LN', 'HASH',
       'PLAY TYPE', 'RESULT', 'OWN SCORE', 'OPP SCORE', 'GN/LS', 'INC10',
       '2 MIN', 'FSL', 'SHIFT FROM', 'MOTION', 'OFF FORM', 'VARIATION',
       'BACKFIELD SET', 'TREE', 'OFF PLAY', 'TAG', 'STRENGTH', 'TARGET',
       'B/S ROUTE', 'DEEP SHOT', 'PROTECTION', 'RECEIVER_Jersey',
       'RECEIVER_Name', 'WR ALIGNMENT', 'ROUTE', 'COMMENT', 'GN/LS LAG 1',
       'GN/LS LAG 2', 'PLAY # LAG 1', 'PLAY # LAG 2', 'PLAY TYPE LAG 1',
       'PLAY TYPE LAG 2', 'OFF FORM LAG 1', 'OFF FORM LAG 2', 'SCORE DIFF',
       'MID OR NOT', 'HASH OR NOT', '0-2', '2-6', '6+', 'PERSONNEL_10',
       'PERSONNEL_11', 'PERSONNEL_12', 'PERSONNEL_12T', 'PERSONNEL_13',
       'PERSONNEL_13T', 'PLAY OF DRIVE NUM', 'PERSONNEL_10 LAG 1',
       'PERSONNEL_10 LAG 2', 'PERSONNEL_11 LAG 1', 'PERSONNEL_11 LAG 2',
       'PERSONNEL_12 LAG 1', 'PERSONNEL_12 LAG 2', 'PERSONNEL_12T LAG 1',
       'PERSONNEL_12T LAG 2', 'PERSONNEL_13 LAG 1', '

In [19]:
print(df_gc.isna().sum().sort_values(ascending=False).to_string())

ROUTE                           168
WR ALIGNMENT                    168
RECEIVER_Name                   168
RECEIVER_Jersey                 168
PROTECTION                      168
DEEP SHOT                       168
COMMENT                         167
TARGET                          165
SHIFT FROM                      164
INC10                           163
TAG                             159
B/S ROUTE                       154
FSL                             146
MOTION                          129
TREE                            109
VARIATION                       103
STRENGTH                         77
GN/LS                             2
OFF PLAY                          1
RESULT                            1
PERSONNEL_12T LAG 1               0
PERSONNEL_10 LAG 1                0
PERSONNEL_10 LAG 2                0
PERSONNEL_11 LAG 1                0
PERSONNEL_11 LAG 2                0
PERSONNEL_12 LAG 1                0
PERSONNEL_12 LAG 2                0
PERSONNEL_13 LAG 2          

In [20]:
df_gc.loc[df_gc['RESULT'] == 'Interception']

Unnamed: 0,PLAY #,ODK,QTR,TIME TO HALF,DN,DIST,YARD LN,HASH,PLAY TYPE,RESULT,...,DN X DIST,PREV PLAY PASS OR NOT,SCORE DIFF ^2,SCORE DIFF x TIME LEFT,SCORE DIFF x DN,SCORE DIFF / 7,TIME LEFT * SCORE DIFF / 7,SCORE DIFF x QTR,YARDS TO TD,YARDS TO TD * SCORE DIFF / 7


In [38]:
X_train_full, X_test_full, y_train_full, y_test_full = split_train(
            df_gc, 0.7, gc_input_cols, gc_target_col
        )

In [39]:
best_features_top1 = genetic_algorithm(
        X_train_full, y_train_full, X_test_full, y_test_full,
        n_generations=40, pop_size=50,
        mutation_rate=0.15, model_type='top1'
    )


Generation 0: Best score = 0.6078
Generation 1: Best score = 0.6078
Generation 2: Best score = 0.6275
Generation 3: Best score = 0.6275
Generation 4: Best score = 0.6275
Generation 5: Best score = 0.6275
Generation 6: Best score = 0.6275
Generation 7: Best score = 0.6275
Generation 8: Best score = 0.6275
Generation 9: Best score = 0.6275
Generation 10: Best score = 0.6275
Generation 11: Best score = 0.6275
Generation 12: Best score = 0.6275
Generation 13: Best score = 0.6275
Generation 14: Best score = 0.6275
Generation 15: Best score = 0.6275
Generation 16: Best score = 0.6275
Generation 17: Best score = 0.6275
Generation 18: Best score = 0.6275
Generation 19: Best score = 0.6275
Generation 20: Best score = 0.6275
Generation 21: Best score = 0.6275
Generation 22: Best score = 0.6275
Generation 23: Best score = 0.6275
Generation 24: Best score = 0.6275
Generation 25: Best score = 0.6275
Generation 26: Best score = 0.6275
Generation 27: Best score = 0.6275
Generation 28: Best score = 0.

In [43]:

best_features_top2 = genetic_algorithm(
        X_train_full, y_train_full, X_test_full, y_test_full,
        n_generations=30, pop_size=50,
        mutation_rate=.1, model_type='top2'
    )

Generation 0: Best score = 0.6667
Generation 1: Best score = 0.6667
Generation 2: Best score = 0.6667
Generation 3: Best score = 0.6863
Generation 4: Best score = 0.6863
Generation 5: Best score = 0.6863
Generation 6: Best score = 0.6863
Generation 7: Best score = 0.6863
Generation 8: Best score = 0.6863
Generation 9: Best score = 0.6863
Generation 10: Best score = 0.6863
Generation 11: Best score = 0.6863
Generation 12: Best score = 0.6863
Generation 13: Best score = 0.6863
Generation 14: Best score = 0.6863
Generation 15: Best score = 0.6863
Generation 16: Best score = 0.6863
Generation 17: Best score = 0.6863
Generation 18: Best score = 0.6863
Generation 19: Best score = 0.6863
Generation 20: Best score = 0.6863
Generation 21: Best score = 0.6863
Generation 22: Best score = 0.6863
Generation 23: Best score = 0.6863
Generation 24: Best score = 0.6863
Generation 25: Best score = 0.6863
Generation 26: Best score = 0.6863
Generation 27: Best score = 0.6863
Generation 28: Best score = 0.

In [41]:
print(best_features_top1)
print(best_features_top2)

['QTR', 'DN', 'DIST', 'YARD LN', 'GN/LS LAG 1', 'GN/LS LAG 2', 'MID OR NOT', 'HASH OR NOT', 'PERSONNEL_10', 'PERSONNEL_11', 'PERSONNEL_12', 'PERSONNEL_13', 'PLAY OF DRIVE NUM', '0-2', '2-6', '6+', '2 MIN OR NOT', 'PPS NEEDED', 'PLAY TYPE LAG 1', 'PLAY TYPE LAG 2', 'PERSONNEL_10 LAG 1', 'PERSONNEL_10 LAG 2', 'PERSONNEL_11 LAG 1', 'PERSONNEL_11 LAG 2', 'PERSONNEL_13 LAG 2', 'PREV PLAY PASS OR NOT', 'DN X DIST', 'SCORE DIFF x TIME LEFT', 'SCORE DIFF x DN', 'SCORE DIFF x QTR', 'TIME LEFT * SCORE DIFF / 7', 'YARDS TO TD']
['QTR', 'DN', 'HASH', 'OPP SCORE', 'GN/LS LAG 1', 'OFF FORM LAG 1', 'OFF FORM LAG 2', 'HASH OR NOT', 'PERSONNEL_10', 'PERSONNEL_11', 'PERSONNEL_12', 'PERSONNEL_12T', 'PLAY OF DRIVE NUM', '0-2', '6+', '2 MIN OR NOT', 'PPS NEEDED', 'PLAY TYPE LAG 1', 'PLAY TYPE LAG 2', 'PERSONNEL_10 LAG 2', 'PERSONNEL_12 LAG 1', 'PERSONNEL_12 LAG 2', 'PERSONNEL_12T LAG 2', 'PERSONNEL_13 LAG 1', 'PERSONNEL_13 LAG 2', 'PERSONNEL_13T LAG 1', 'DN X DIST', 'SCORE DIFF x TIME LEFT', 'TIME LEFT * S

In [None]:
gc_top1_73 = ['QTR', 'TIME TO HALF', 'DIST', 'HASH', 'OWN SCORE', 'OFF FORM LAG 1', 
              'OFF FORM LAG 2', 'SCORE DIFF', 'PERSONNEL_10', 'PERSONNEL_12', 'PERSONNEL_12T', 
              '2-6', 'OWN END', 'OPP END', 'WINNING', 'PPS NEEDED', 'PERSONNEL_11 LAG 2', 
              'PERSONNEL_11T LAG 1', 'PERSONNEL_11T LAG 2', 'DN X DIST', 'SCORE DIFF x QTR', 
              'YARDS TO TD']

gc_top2_88 = ['DN', 'YARD LN', 'HASH', 'OWN SCORE', 'GN/LS LAG 1', 'GN/LS LAG 2', 
              'OFF FORM LAG 2', 'MID OR NOT', 'HASH OR NOT', 'PERSONNEL_10', 'PERSONNEL_11', 
              'PERSONNEL_12', 'PERSONNEL_11T', 'PERSONNEL_12T', 'PLAY OF DRIVE NUM', '2-6', 
              'HALF_NUM', 'WINNING', 'PPS NEEDED', 'PLAY TYPE LAG 2', 'PERSONNEL_10 LAG 1', 
              'PERSONNEL_11 LAG 2', 'PERSONNEL_12 LAG 1', 'PREV PLAY PASS OR NOT', 
              'SCORE DIFF ^2', 'SCORE DIFF x DN', 'SCORE DIFF / 7', 'SCORE DIFF x QTR', 
              'YARDS TO TD * SCORE DIFF / 7']

gc_top2_90 = ['QTR', 'YARD LN', 'OWN SCORE', 'GN/LS LAG 1', 'OFF FORM LAG 2', 'SCORE DIFF',
              'HASH OR NOT', 'PERSONNEL_10', 'PERSONNEL_11', 'PERSONNEL_12', 'PERSONNEL_12T',
              '0-2', '6+', 'RED ZONE', 'PERSONNEL_10 LAG 2', 'PERSONNEL_11 LAG 1',
              'PERSONNEL_11 LAG 2', 'PERSONNEL_12T LAG 1', 'SCORE DIFF ^2', 'SCORE DIFF x DN',
              'SCORE DIFF / 7', 'SCORE DIFF x QTR']

wj_top1_58 = ['QTR', 'TIME TO HALF', 'DN', 'YARD LN', 'HASH', 'OWN SCORE', 'OPP SCORE', 
              'PLAY # LAG 1', 'PLAY TYPE LAG 1', 'MID OR NOT', '2-6', '6+', 'PERSONNEL_10',
              'PERSONNEL_11', 'PERSONNEL_12', 'PERSONNEL_32', 'PLAY OF DRIVE NUM',
              'PERSONNEL_1 LAG 1', 'PERSONNEL_1 LAG 2', 'PERSONNEL_10 LAG 1', 'PERSONNEL_11 LAG 2',
              'PERSONNEL_12 LAG 2', 'PERSONNEL_32 LAG 1', 'PERSONNEL_32 LAG 2', 'HALF_NUM',
              'PREV PLAY PASS OR NOT', 'SCORE DIFF / 7', 'YARDS TO TD']

wj_top2_82 = ['TIME TO HALF', 'DN', 'DIST', 'YARD LN', 'OPP SCORE', 'GN/LS', '2 MIN', 'PLAY TYPE LAG 1',
              'PLAY TYPE LAG 2', 'OFF FORM LAG 2', 'SCORE DIFF', 'MID OR NOT', 'HASH OR NOT',
              'PERSONNEL_10', 'PERSONNEL_12', 'PERSONNEL_1 LAG 1', 'PERSONNEL_11 LAG 2',
              'PERSONNEL_12 LAG 1', 'PERSONNEL_12 LAG 2', 'PERSONNEL_32 LAG 1', 'PERSONNEL_32 LAG 2',
              '2 MIN OR NOT', 'RED ZONE', 'SCORE DIFF ^2', 'SCORE DIFF / 7',
              'TIME LEFT * SCORE DIFF / 7', 'YARDS TO TD']

wj_top2_81 = ['TIME TO HALF', 'DN', 'OWN SCORE', 'GN/LS LAG 1', 'PLAY # LAG 2', 'PLAY TYPE LAG 2',
              'OFF FORM LAG 1', 'SCORE DIFF', '0-2', 'PERSONNEL_1', 'PERSONNEL_11', 'PERSONNEL_12',
              'PLAY OF DRIVE NUM', 'PERSONNEL_1 LAG 1', 'PERSONNEL_1 LAG 2', 'PERSONNEL_11 LAG 1',
              'PERSONNEL_11 LAG 2', 'PERSONNEL_12 LAG 2', 'PERSONNEL_32 LAG 1', 'PERSONNEL_32 LAG 2',
              '2 MIN OR NOT', 'OWN END', 'OPP END', 'RED ZONE', 'SCORE DIFF ^2', 'SCORE DIFF x QTR',
              'YARDS TO TD', 'YARDS TO TD * SCORE DIFF / 7']

In [18]:
evaluate_feature_subset_single(gc_top2_90, X_train_full, y_train_full, 
                        X_test_full, y_test_full, model_type='top2')

Top-2 Accuracy: 0.91
               precision    recall  f1-score   support

          CON       0.00      0.00      0.00         1
    DEUCE OFF       0.00      0.00      0.00         1
      DOUBLES       1.00      1.00      1.00         1
  DUTCH Y OFF       1.00      1.00      1.00         1
    PRO Y OFF       1.00      1.00      1.00         3
TREY DBL WING       1.00      1.00      1.00         2
    TREY WING       0.00      0.00      0.00         2
   TREY Y OFF       1.00      1.00      1.00         6
         TRIO       0.93      1.00      0.96        13
        TROOP       0.00      0.00      0.00         1
   TROOP WING       0.83      1.00      0.91         5
   TWINS OPEN       1.00      1.00      1.00        13
    WOLVERINE       0.57      1.00      0.73         4

     accuracy                           0.91        53
    macro avg       0.64      0.69      0.66        53
 weighted avg       0.84      0.91      0.87        53



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [52]:
df_wj = pre_split_prep(df_wj)

In [27]:
df_wj.columns

Index(['PLAY #', 'ODK', 'QTR', 'TIME TO HALF', 'DN', 'DIST', 'YARD LN', 'HASH',
       'PLAY TYPE', 'RESULT', 'OWN SCORE', 'OPP SCORE', 'GN/LS', 'INC10',
       '2 MIN', 'FSL', 'SHIFT FROM', 'MOTION', 'OFF FORM', 'VARIATION',
       'BACKFIELD SET', 'TREE', 'OFF PLAY', 'TAG', 'STRENGTH', 'TARGET',
       'B/S ROUTE', 'DEEP SHOT', 'PROTECTION', 'RECEIVER_Jersey',
       'RECEIVER_Name', 'WR ALIGNMENT', 'ROUTE', 'COMMENT', 'GN/LS LAG 1',
       'GN/LS LAG 2', 'PLAY # LAG 1', 'PLAY # LAG 2', 'PLAY TYPE LAG 1',
       'PLAY TYPE LAG 2', 'OFF FORM LAG 1', 'OFF FORM LAG 2', 'SCORE DIFF',
       'MID OR NOT', 'HASH OR NOT', '0-2', '2-6', '6+', 'PERSONNEL_1',
       'PERSONNEL_10', 'PERSONNEL_11', 'PERSONNEL_12', 'PERSONNEL_32',
       'PLAY OF DRIVE NUM', 'PERSONNEL_1 LAG 1', 'PERSONNEL_1 LAG 2',
       'PERSONNEL_10 LAG 1', 'PERSONNEL_10 LAG 2', 'PERSONNEL_11 LAG 1',
       'PERSONNEL_11 LAG 2', 'PERSONNEL_12 LAG 1', 'PERSONNEL_12 LAG 2',
       'PERSONNEL_32 LAG 1', 'PERSONNEL_32 LAG 2', '2

In [46]:
print(wj_input_cols)
wj_target_col

['QTR', 'TIME TO HALF', 'DN', 'DIST', 'YARD LN', 'HASH', 'OWN SCORE', 'OPP SCORE', 'GN/LS', '2 MIN', 'GN/LS LAG 1', 'GN/LS LAG 2', 'PLAY # LAG 1', 'PLAY # LAG 2', 'PLAY TYPE LAG 1', 'PLAY TYPE LAG 2', 'OFF FORM LAG 1', 'OFF FORM LAG 2', 'SCORE DIFF', 'MID OR NOT', 'HASH OR NOT', '0-2', '2-6', '6+', 'PERSONNEL_1', 'PERSONNEL_10', 'PERSONNEL_11', 'PERSONNEL_12', 'PERSONNEL_32', 'PLAY OF DRIVE NUM', 'PERSONNEL_1 LAG 1', 'PERSONNEL_1 LAG 2', 'PERSONNEL_10 LAG 1', 'PERSONNEL_10 LAG 2', 'PERSONNEL_11 LAG 1', 'PERSONNEL_11 LAG 2', 'PERSONNEL_12 LAG 1', 'PERSONNEL_12 LAG 2', 'PERSONNEL_32 LAG 1', 'PERSONNEL_32 LAG 2', '2 MIN OR NOT', 'OWN END', 'OPP END', 'RED ZONE', 'HALF_NUM', 'TIME LEFT', 'PPS NEEDED', 'WINNING', 'DN X DIST', 'PREV PLAY PASS OR NOT', 'SCORE DIFF ^2', 'SCORE DIFF x TIME LEFT', 'SCORE DIFF x DN', 'SCORE DIFF / 7', 'TIME LEFT * SCORE DIFF / 7', 'SCORE DIFF x QTR', 'YARDS TO TD', 'YARDS TO TD * SCORE DIFF / 7']


'OFF FORM'

In [53]:
X_train_full, X_test_full, y_train_full, y_test_full = split_train(
            df_wj, 0.7, wj_input_cols, wj_target_col
        )

In [54]:
wj_top1, wj_top2 = get_best_features(df_wj, wj_input_cols, wj_target_col)

Generation 0: Best score = 0.4848
Generation 1: Best score = 0.4848
Generation 2: Best score = 0.5000
Generation 3: Best score = 0.5606
Generation 4: Best score = 0.5606
Generation 5: Best score = 0.5606
Generation 6: Best score = 0.5606
Generation 7: Best score = 0.5606
Generation 8: Best score = 0.5606
Generation 9: Best score = 0.5758
Generation 0: Best score = 0.8182
Generation 1: Best score = 0.8182
Generation 2: Best score = 0.8182
Generation 3: Best score = 0.8182
Generation 4: Best score = 0.8182
Generation 5: Best score = 0.8182
Generation 6: Best score = 0.8182
Generation 7: Best score = 0.8182
Generation 8: Best score = 0.8182
Generation 9: Best score = 0.8182
              precision    recall  f1-score   support

     DOUBLES       0.62      1.00      0.77         5
   DUECE OFF       0.55      0.92      0.69        13
 DUECE Y OFF       0.00      0.00      0.00         2
  DUTCH WING       0.00      0.00      0.00         5
 DUTCH Y OFF       0.57      1.00      0.73      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [55]:
print(wj_top1)
print(wj_top2)

['DIST', 'YARD LN', '2 MIN', 'OFF FORM LAG 1', 'OFF FORM LAG 2', 'SCORE DIFF', 'MID OR NOT', 'HASH OR NOT', '0-2', '2-6', 'PERSONNEL_1', 'PERSONNEL_10', 'PERSONNEL_11', 'PERSONNEL_12', 'PERSONNEL_32', 'PERSONNEL_1 LAG 1', 'PERSONNEL_1 LAG 2', 'PERSONNEL_10 LAG 1', 'PERSONNEL_11 LAG 2', 'PERSONNEL_12 LAG 2', 'PERSONNEL_32 LAG 1', 'RED ZONE', 'PPS NEEDED', 'YARDS TO TD']
['TIME TO HALF', 'DN', 'OWN SCORE', 'GN/LS LAG 1', 'PLAY # LAG 2', 'PLAY TYPE LAG 2', 'OFF FORM LAG 1', 'SCORE DIFF', '0-2', 'PERSONNEL_1', 'PERSONNEL_11', 'PERSONNEL_12', 'PLAY OF DRIVE NUM', 'PERSONNEL_1 LAG 1', 'PERSONNEL_1 LAG 2', 'PERSONNEL_11 LAG 1', 'PERSONNEL_11 LAG 2', 'PERSONNEL_12 LAG 2', 'PERSONNEL_32 LAG 1', 'PERSONNEL_32 LAG 2', '2 MIN OR NOT', 'OWN END', 'OPP END', 'RED ZONE', 'SCORE DIFF ^2', 'SCORE DIFF x QTR', 'YARDS TO TD', 'YARDS TO TD * SCORE DIFF / 7']


In [42]:
evaluate_feature_subset_single(gc_top2_90, X_train_full, y_train_full, 
                        X_test_full, y_test_full, save_name='gc_top2_90', model_type='top2')

Top-2 Accuracy: 0.91
               precision    recall  f1-score   support

          CON       0.00      0.00      0.00         1
    DEUCE OFF       0.00      0.00      0.00         1
      DOUBLES       1.00      1.00      1.00         1
  DUTCH Y OFF       1.00      1.00      1.00         1
    PRO Y OFF       1.00      1.00      1.00         3
TREY DBL WING       1.00      1.00      1.00         2
    TREY WING       0.00      0.00      0.00         2
   TREY Y OFF       1.00      1.00      1.00         6
         TRIO       0.93      1.00      0.96        13
        TROOP       0.00      0.00      0.00         1
   TROOP WING       0.83      1.00      0.91         5
   TWINS OPEN       1.00      1.00      1.00        13
    WOLVERINE       0.57      1.00      0.73         4

     accuracy                           0.91        53
    macro avg       0.64      0.69      0.66        53
 weighted avg       0.84      0.91      0.87        53



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [44]:
model_gc_top2_90 = joblib.load('gc_top2_90.pkl')
model_wj_top2_82 = joblib.load('wj_top2_82.pkl')

In [18]:
model = joblib.load("gc_top2_90.pkl")
print(model.named_steps['classifier'].get_params())

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 45, 'verbose': 0, 'warm_start': False}
