In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import random
import os

In [2]:
DATA_PATH = "data/"

In [3]:
fp_gc = "GCC Offense.xlsx"
df_gc = pd.read_excel(os.path.join(DATA_PATH,fp_gc))
df_gc.head()

Unnamed: 0,PLAY #,ODK,QTR,TIME TO HALF,DN,DIST,YARD LN,HASH,PLAY TYPE,RESULT,...,STRENGTH,TARGET,B/S ROUTE,DEEP SHOT,PROTECTION,RECEIVER_Jersey,RECEIVER_Name,WR ALIGNMENT,ROUTE,COMMENT
0,14,O,1,24:01,0,10,-27,M,Run,Rush,...,STRONG,,,L,,,,,,
1,15,O,1,23:27,2,4,-33,M,Run,Rush,...,STRONG,,,L,,,,,,
2,16,O,1,22:53,1,10,-45,L,Run,Rush,...,,,,L,,,,,,
3,17,O,1,22:19,1,10,45,L,Pass,Complete,...,,WR,OUT,L,,,,1 STR,STAB,
4,18,O,1,21:45,1,10,33,L,Run,Rush,...,,,,L,,,,,,


In [23]:
def df_trim(df):
    out = df.copy()

    # remove rows whose RESULT is in this list
    out = out[~out['RESULT'].isin(['Timeout', 'Penalty', 'Interception'])]

    # drop rows with missing OFF FORM or PERSONNEL
    out = out[out['OFF FORM'].notna() & out['PERSONNEL'].notna()]

    # drop the first 2 plays (because of lag features)
    out = out.iloc[2:].reset_index(drop=True)

    return out

In [24]:
def create_features(df):
    df_out = df.copy()
    df_out['GN/LS LAG 1'] = df_out['GN/LS'].shift(1)
    df_out['GN/LS LAG 2'] = df_out['GN/LS'].shift(2)

    df_out['PLAY # LAG 1'] = df_out['PLAY #'].shift(1)
    df_out['PLAY # LAG 2'] = df_out['PLAY #'].shift(2)

    df_out['PLAY TYPE LAG 1'] = df_out['PLAY TYPE'].shift(1)
    df_out['PLAY TYPE LAG 2'] = df_out['PLAY TYPE'].shift(2)

    df_out.loc[df_out['PLAY #'] - df_out['PLAY # LAG 1'] != 1, 'GN/LS LAG 1'] = 0
    df_out.loc[df_out['PLAY #'] - df_out['PLAY # LAG 2'] != 2, 'GN/LS LAG 2'] = 0

    df_out['OFF FORM LAG 1'] = df_out['OFF FORM'].shift(1)
    df_out['OFF FORM LAG 2'] = df_out['OFF FORM'].shift(2)

    df_out.loc[df_out['PLAY #'] - df_out['PLAY # LAG 1'] != 1, 'OFF FORM LAG 1'] = 'NONE'
    df_out.loc[df_out['PLAY #'] - df_out['PLAY # LAG 2'] != 2, 'OFF FORM LAG 2'] = 'NONE'

    df_out['TIME TO HALF'] = (df_out['TIME TO HALF'].astype(str).str.split(':').apply(lambda x: int(x[0]) * 60 + int(x[1])))

    df_out['SCORE DIFF'] = df_out['OPP SCORE'] - df_out['OWN SCORE']

    df_out['HASH'] = df_out['HASH'].astype(str).str.strip()

    df_out['MID OR NOT'] = df_out['HASH'] == 'M'
    df_out['HASH OR NOT'] = df_out['HASH'] != 'M'

    df_out['0-2'] = df_out['DIST'] <= 2
    df_out['2-6'] = (df_out['DIST'] > 2) & (df_out['DIST'] <= 6)
    df_out['6+'] = df_out['DIST'] > 6

    # df['PERSONNEL!'] = df['PERSONNEL'].astype(str).str.strip()
    # df = pd.get_dummies(df, columns=['PERSONNEL'])

    df_out = pd.concat(
        [df_out.drop(columns=['PERSONNEL']),
         pd.get_dummies(df_out['PERSONNEL'].astype(str).str.strip(), prefix='PERSONNEL')],
        axis=1
    )
    
    df_out['PLAY OF DRIVE NUM'] = -1
    for i in range(len(df)):
        if (df_out.iloc[i]['DN'] == 0):
            df_out.iloc[i, df_out.columns.get_loc('PLAY OF DRIVE NUM')] = 0
        else:
            df_out.iloc[i, df_out.columns.get_loc('PLAY OF DRIVE NUM')] = df_out.iloc[(i - 1), df_out.columns.get_loc('PLAY OF DRIVE NUM')] + 1


    df_out['PERSONNEL_10 LAG 1'] = df_out['PERSONNEL_10'].shift(1)
    df_out['PERSONNEL_10 LAG 2'] = df_out['PERSONNEL_10'].shift(2)

    df_out['PERSONNEL_11 LAG 1'] = df_out['PERSONNEL_11'].shift(1)
    df_out['PERSONNEL_11 LAG 2'] = df_out['PERSONNEL_11'].shift(2)

    df_out['PERSONNEL_12 LAG 1'] = df_out['PERSONNEL_12'].shift(1)
    df_out['PERSONNEL_12 LAG 2'] = df_out['PERSONNEL_12'].shift(2)

    df_out['PERSONNEL_12T LAG 1'] = df_out['PERSONNEL_12T'].shift(1)
    df_out['PERSONNEL_12T LAG 2'] = df_out['PERSONNEL_12T'].shift(2)

    df_out['PERSONNEL_11T LAG 1'] = df_out['PERSONNEL_11T'].shift(1)
    df_out['PERSONNEL_11T LAG 2'] = df_out['PERSONNEL_11T'].shift(2)


    df_out['2 MIN'] = df_out['2 MIN'].astype(str).str.strip()
    df_out['2 MIN OR NOT'] = -1
    df_out['2 MIN OR NOT'] = df_out['2 MIN'].apply(lambda x: 1 if x == "Y" else 0)



    df_out['OWN END'] = -1
    df_out['OWN END'] = df_out['YARD LN'].apply(lambda x: 1 if x < 0 else 0)



    df_out['OPP END'] = -1
    df_out['OPP END'] = df_out['YARD LN'].apply(lambda x: 1 if x >= 0 else 0)



    df_out['RED ZONE'] = -1
    df_out['RED ZONE'] = df_out['YARD LN'].apply(lambda x: 1 if (x <= 20 and x > 0) else 0)



    df_out['HALF_NUM'] = -1
    df_out['HALF_NUM'] = df_out['QTR'].apply(lambda x: 1 if x <= 2 else 2)


    df_out['TIME LEFT'] = -1
    df_out['TIME LEFT'] = df_out.apply(lambda row: row['TIME TO HALF'] if row['HALF_NUM'] == 2 else (row['TIME TO HALF'] + 900), axis=1)



    df_out['PPS NEEDED'] = -1
    df_out['PPS NEEDED'] = (df_out['SCORE DIFF']) * -1/df_out['TIME LEFT']


    df_out['WINNING'] = -1
    df_out['WINNING'] = df_out['SCORE DIFF'].apply(lambda x: 1 if x < 0 else 0)


    df_out['DN X DIST'] = -1
    df_out['DN X DIST'] = df_out['DN'] * df_out['DIST']


    df_out['PREV PLAY PASS OR NOT'] = -1
    df_out['PREV PLAY PASS OR NOT'] = df_out['PLAY TYPE LAG 1'].apply(lambda x: 1 if x == "Pass" else 0)



    df_out['SCORE DIFF ^2'] = df_out['SCORE DIFF'] * df_out['SCORE DIFF']
    df_out['SCORE DIFF ^2'] = df_out['WINNING'].apply(lambda x: -x if x > 0 else x)

    df_out['SCORE DIFF x TIME LEFT'] = df_out['SCORE DIFF'] * df_out['TIME LEFT']

    df_out['SCORE DIFF x DN'] = df_out['SCORE DIFF'] * df_out['DN']

    df_out['SCORE DIFF / 7'] = df_out['SCORE DIFF'] / 7
    df_out['TIME LEFT * SCORE DIFF / 7'] = df_out['TIME LEFT'] * df_out['SCORE DIFF'] / 7

    df_out['SCORE DIFF x QTR'] = df_out['SCORE DIFF'] * df_out['QTR']

    df_out['YARDS TO TD'] = df_out['YARD LN'].apply(lambda x: x + 100 if x < 0 else x)
    df_out['YARDS TO TD * SCORE DIFF / 7'] = df_out['YARDS TO TD'] * df_out['SCORE DIFF'] / 7

    return df_out

In [25]:
def evaluate_feature_subset(features, X_train_full, y_train, X_test_full, y_test, model_type='top1'):
    selected_features = [f for i, f in enumerate(X_train_full.columns) if features[i] == 1]
    
    if not selected_features:
        return 0.0  # Avoid empty feature sets

    X_train = X_train_full[selected_features]
    X_test = X_test_full[selected_features]

    preprocessor = ColumnTransformer(
        transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), selected_features)]
    )

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=45))
    ])
    
    pipeline.fit(X_train, y_train)

    if model_type == 'top1':
        y_pred = pipeline.predict(X_test)
        return accuracy_score(y_test, y_pred)

    elif model_type == 'top2':
        y_proba = pipeline.predict_proba(X_test)
        classes = pipeline.named_steps['classifier'].classes_
        top2_idx = np.argsort(y_proba, axis=1)[:, -2:]
        top2_preds = np.array([[classes[i] for i in row] for row in top2_idx])
        correct_top2 = [y in preds for y, preds in zip(y_test, top2_preds)]
        return np.mean(correct_top2)

    else:
        raise ValueError("model_type must be 'top1' or 'top2'")

In [26]:
def genetic_algorithm(X_train, y_train, X_test, y_test, n_generations=30, pop_size=50, mutation_rate=0.15, model_type='top1'):
    n_features = X_train.shape[1]
    population = [np.random.randint(0, 2, size=n_features).tolist() for _ in range(pop_size)]

    for generation in range(n_generations):
        scores = [evaluate_feature_subset(ind, X_train, y_train, X_test, y_test, model_type) for ind in population]
        print(f"Generation {generation}: Best score = {max(scores):.4f}")

        # Select top 50%
        sorted_pop = [x for _, x in sorted(zip(scores, population), reverse=True)]
        parents = sorted_pop[:pop_size // 2]

        # Crossover
        offspring = []
        while len(offspring) < pop_size - len(parents):
            p1, p2 = random.sample(parents, 2)
            cut = random.randint(1, n_features - 1)
            child = p1[:cut] + p2[cut:]
            offspring.append(child)

        # Mutation
        for child in offspring:
            if random.random() < mutation_rate:
                idx = random.randint(0, n_features - 1)
                child[idx] = 1 - child[idx]

        population = parents + offspring

    # Return best feature subset
    final_scores = [evaluate_feature_subset(ind, X_train, y_train, X_test, y_test, model_type) for ind in population]
    best_idx = np.argmax(final_scores)
    best_features = [f for i, f in enumerate(X_train.columns) if population[best_idx][i] == 1]
    
    return best_features

In [27]:
input_cols = ['QTR', 'TIME TO HALF', 'DN', 'DIST', 'YARD LN', 'HASH', 
              'OWN SCORE', 'OPP SCORE', 
              'GN/LS LAG 1', 'GN/LS LAG 2', 
              'OFF FORM LAG 1', 'OFF FORM LAG 2'
              , 'SCORE DIFF'
              , 'MID OR NOT'
              , 'HASH OR NOT'
              , 'PERSONNEL_10'
              , 'PERSONNEL_11'
              , 'PERSONNEL_12'
              , 'PERSONNEL_11T'
              , 'PERSONNEL_12T'
              , 'PLAY OF DRIVE NUM'
              , '0-2'
              , '2-6'
              , '6+'
              , '2 MIN OR NOT'
              , 'OWN END'
              , 'OPP END'
              , 'RED ZONE'
              , 'HALF_NUM'
              , 'TIME LEFT'
              , 'WINNING'
              , 'PPS NEEDED'
              , 'PLAY TYPE LAG 1'
              , 'PLAY TYPE LAG 2'
            #   , 'PERSONNEL!'
              , 'PERSONNEL_10 LAG 1', 'PERSONNEL_10 LAG 2'
              , 'PERSONNEL_11 LAG 1', 'PERSONNEL_11 LAG 2'
              , 'PERSONNEL_11T LAG 1', 'PERSONNEL_11T LAG 2'
              , 'PERSONNEL_12 LAG 1', 'PERSONNEL_12 LAG 2'
              , 'PERSONNEL_12T LAG 1', 'PERSONNEL_12T LAG 2'
              , 'PREV PLAY PASS OR NOT'
              , 'DN X DIST'
              , 'SCORE DIFF ^2'
              , 'SCORE DIFF x TIME LEFT'
              , 'SCORE DIFF x DN'
              , 'SCORE DIFF / 7'
              , 'SCORE DIFF x QTR'
              , 'TIME LEFT * SCORE DIFF / 7'
              , 'YARDS TO TD'
              , 'YARDS TO TD * SCORE DIFF / 7'
              ]  
target_col = 'OFF FORM' 

In [28]:
def split_train(df, location, inputs, target):
    split_index = int(len(df) * location)
    train = df.iloc[:split_index]
    test = df.iloc[split_index:]

    X_train = train[inputs]
    X_test = test[inputs]
    y_train = train[target]
    y_test = test[target]

    return X_train, X_test, y_train, y_test

In [29]:
def xgb_input(X_train, X_test, y_train, y_test, features, target):
    categorical_transformer = OneHotEncoder(handle_unknown='ignore')

    preprocessor = ColumnTransformer(
        transformers=[('cat', categorical_transformer, features)]
    )

    # Create a Random Forest pipeline
    rf_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=45))
    ])

    # Train the model
    rf_pipeline.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = rf_pipeline.predict(X_test)
    print(classification_report(y_test, y_pred))

In [30]:
def run_ga_feature_selection(
    df,
    input_cols,
    target_col,
    *,
    split_loc: float = 0.80,
    ga_generations: int = 30,
    ga_pop_size: int = 50,
    ga_mutation_rate: float = 0.15,
    random_state: int = 45,
    do_trim_fn=None,
    gen_features_fn=None,
    split_fn=None
):
    """
    End-to-end:
      1) copy raw df
      2) optional df_trim
      3) optional generate/lag features (can be in-place)
      4) chronological split
      5) GA for top1 and top2
      6) return both selected feature lists + their holdout scores
    """

    # Reproducibility for GA
    np.random.seed(random_state)
    random.seed(random_state)

    # 1) Work on a copy so upstream df is untouched
    df_work = df.copy()

    # 2) Optional trimming that RETURNS a new df
    if do_trim_fn is not None:
        df_work = do_trim_fn(df_work)

    # 3) Generate features in-place
    if gen_features_fn is not None:
        gen_features_fn(df_work)   # no return expected

    # ensure all requested inputs exist post-feature-gen
    missing = [c for c in input_cols if c not in df_work.columns]
    if missing:
        raise ValueError(f"These input columns are missing after feature-gen: {missing}")


    # 4) Chronological split using your helper
    if split_fn is None:
        # fallback if you don't pass split_train
        split_idx = int(len(df_work) * split_loc)
        train = df_work.iloc[:split_idx]
        test  = df_work.iloc[split_idx:]
        X_train_full, X_test_full = train[input_cols], test[input_cols]
        y_train_full, y_test_full = train[target_col], test[target_col]
    else:
        X_train_full, X_test_full, y_train_full, y_test_full = split_fn(
            df_work, split_loc, input_cols, target_col
        )

    # 5) Run GA twice (top1 + top2)
    best_features_top1 = genetic_algorithm(
        X_train_full, y_train_full, X_test_full, y_test_full,
        n_generations=ga_generations, pop_size=ga_pop_size,
        mutation_rate=ga_mutation_rate, model_type='top1'
    )

    best_features_top2 = genetic_algorithm(
        X_train_full, y_train_full, X_test_full, y_test_full,
        n_generations=ga_generations, pop_size=ga_pop_size,
        mutation_rate=ga_mutation_rate, model_type='top2'
    )

    # 6) Score the selected sets on the same holdout (so you can see how they did)
    top1_score = evaluate_feature_subset(
        [1 if c in best_features_top1 else 0 for c in X_train_full.columns],
        X_train_full, y_train_full, X_test_full, y_test_full, model_type='top1'
    )

    top2_score = evaluate_feature_subset(
        [1 if c in best_features_top2 else 0 for c in X_train_full.columns],
        X_train_full, y_train_full, X_test_full, y_test_full, model_type='top2'
    )

    return {
        "best_features_top1": best_features_top1,
        "best_features_top2": best_features_top2,
        "top1_holdout_score": float(top1_score),
        "top2_holdout_score": float(top2_score),
        "split_index": split_idx,
    }


In [None]:

# Copy df, remove weird plays, create features
def pre_split_prep(df):
    df = df.copy()
    df = df_trim(df)
    df = create_features(df)
    return df

# Manually do df.columns and create a list df_input_cols with the appropriate valid columns, create df_target_col = 'OFF FORM'

# 
def get_best_features(df, df_input_cols, df_target_col):
    df = df.copy()
    X_train_temp, X_test_temp, y_train_temp, y_test_temp = split_train(
            df, 0.7, df_input_cols, df_target_col
        )
    best_features_top1 = genetic_algorithm(
        X_train_temp, y_train_temp, X_test_temp, y_test_temp,
        n_generations=10, pop_size=50,
        mutation_rate=0.15, model_type='top1'
    )

    best_features_top2 = genetic_algorithm(
        X_train_temp, y_train_temp, X_test_temp, y_test_temp,
        n_generations=10, pop_size=50,
        mutation_rate=0.15, model_type='top2'
    )

    evaluate_feature_subset_single(best_features_top1, X_train_temp, y_train_temp, 
                        X_test_temp, y_test_temp, model_type='top1')

    evaluate_feature_subset_single(best_features_top2, X_train_temp, y_train_temp, 
                        X_test_temp, y_test_temp, model_type='top2')
    
    return best_features_top1, best_features_top2
    

In [None]:
fp_gc = "GCC Offense.xlsx"
fp_wj = "WJ Offense.xlsx"

# Load data
df_gc = pd.read_excel(os.path.join(DATA_PATH,fp_gc))
df_wj = pd.read_excel(os.path.join(DATA_PATH,fp_wj))

print("GCC shape:", df_gc.shape)
print("WJ shape:", df_wj.shape)

GCC shape: (197, 35)
WJ shape: (230, 35)


In [12]:
df_gc.columns

Index(['PLAY #', 'ODK', 'QTR', 'TIME TO HALF', 'DN', 'DIST', 'YARD LN', 'HASH',
       'PLAY TYPE', 'RESULT', 'OWN SCORE', 'OPP SCORE', 'GN/LS', 'INC10',
       '2 MIN', 'PERSONNEL', 'FSL', 'SHIFT FROM', 'MOTION', 'OFF FORM',
       'VARIATION', 'BACKFIELD SET', 'TREE', 'OFF PLAY', 'TAG', 'STRENGTH',
       'TARGET', 'B/S ROUTE', 'DEEP SHOT', 'PROTECTION', 'RECEIVER_Jersey',
       'RECEIVER_Name', 'WR ALIGNMENT', 'ROUTE', 'COMMENT'],
      dtype='object')

In [13]:
df_gc = df_gc.copy()
df_gc = df_trim(df_gc)
df_gc = create_features(df_gc)
df_gc.columns

Index(['PLAY #', 'ODK', 'QTR', 'TIME TO HALF', 'DN', 'DIST', 'YARD LN', 'HASH',
       'PLAY TYPE', 'RESULT', 'OWN SCORE', 'OPP SCORE', 'GN/LS', 'INC10',
       '2 MIN', 'FSL', 'SHIFT FROM', 'MOTION', 'OFF FORM', 'VARIATION',
       'BACKFIELD SET', 'TREE', 'OFF PLAY', 'TAG', 'STRENGTH', 'TARGET',
       'B/S ROUTE', 'DEEP SHOT', 'PROTECTION', 'RECEIVER_Jersey',
       'RECEIVER_Name', 'WR ALIGNMENT', 'ROUTE', 'COMMENT', 'GN/LS LAG 1',
       'GN/LS LAG 2', 'PLAY # LAG 1', 'PLAY # LAG 2', 'PLAY TYPE LAG 1',
       'PLAY TYPE LAG 2', 'OFF FORM LAG 1', 'OFF FORM LAG 2', 'SCORE DIFF',
       'MID OR NOT', 'HASH OR NOT', '0-2', '2-6', '6+', 'PERSONNEL_10',
       'PERSONNEL_11', 'PERSONNEL_11T', 'PERSONNEL_12', 'PERSONNEL_12T',
       'PLAY OF DRIVE NUM', 'PERSONNEL_10 LAG 1', 'PERSONNEL_10 LAG 2',
       'PERSONNEL_11 LAG 1', 'PERSONNEL_11 LAG 2', 'PERSONNEL_12 LAG 1',
       'PERSONNEL_12 LAG 2', 'PERSONNEL_12T LAG 1', 'PERSONNEL_12T LAG 2',
       'PERSONNEL_11T LAG 1', 'PERSONNEL_11T L

In [14]:
print(df_gc.isna().sum().sort_values(ascending=False).to_string())

COMMENT                         175
RECEIVER_Name                   175
RECEIVER_Jersey                 175
PROTECTION                      175
INC10                           173
TARGET                          169
TAG                             168
FSL                             159
VARIATION                       153
B/S ROUTE                       152
SHIFT FROM                      145
MOTION                          142
ROUTE                           110
WR ALIGNMENT                    109
STRENGTH                         97
TREE                             93
PERSONNEL_12T LAG 2               2
PLAY TYPE LAG 2                   2
PERSONNEL_10 LAG 2                2
PERSONNEL_12 LAG 2                2
PLAY # LAG 2                      2
PERSONNEL_11T LAG 2               2
PERSONNEL_11 LAG 2                2
DEEP SHOT                         1
PERSONNEL_10 LAG 1                1
PERSONNEL_11 LAG 1                1
PERSONNEL_11T LAG 1               1
PLAY # LAG 1                

In [15]:
df_gc.loc[df_gc['RESULT'] == 'Interception']

Unnamed: 0,PLAY #,ODK,QTR,TIME TO HALF,DN,DIST,YARD LN,HASH,PLAY TYPE,RESULT,...,DN X DIST,PREV PLAY PASS OR NOT,SCORE DIFF ^2,SCORE DIFF x TIME LEFT,SCORE DIFF x DN,SCORE DIFF / 7,TIME LEFT * SCORE DIFF / 7,SCORE DIFF x QTR,YARDS TO TD,YARDS TO TD * SCORE DIFF / 7


In [16]:
X_train_full, X_test_full, y_train_full, y_test_full = split_train(
            df_gc, 0.8, input_cols, target_col
        )

In [17]:
best_features_top1 = genetic_algorithm(
        X_train_full, y_train_full, X_test_full, y_test_full,
        n_generations=30, pop_size=50,
        mutation_rate=0.15, model_type='top1'
    )


Generation 0: Best score = 0.5714
Generation 1: Best score = 0.5714
Generation 2: Best score = 0.5714
Generation 3: Best score = 0.6000
Generation 4: Best score = 0.6286
Generation 5: Best score = 0.6286
Generation 6: Best score = 0.6286
Generation 7: Best score = 0.6286
Generation 8: Best score = 0.6286
Generation 9: Best score = 0.6286
Generation 10: Best score = 0.6286
Generation 11: Best score = 0.6571
Generation 12: Best score = 0.6571
Generation 13: Best score = 0.6571
Generation 14: Best score = 0.6571


KeyboardInterrupt: 

In [19]:

best_features_top2 = genetic_algorithm(
        X_train_full, y_train_full, X_test_full, y_test_full,
        n_generations=10, pop_size=50,
        mutation_rate=.15, model_type='top2'
    )

Generation 0: Best score = 0.8286
Generation 1: Best score = 0.8286
Generation 2: Best score = 0.8571
Generation 3: Best score = 0.8571
Generation 4: Best score = 0.8571
Generation 5: Best score = 0.8571
Generation 6: Best score = 0.8571
Generation 7: Best score = 0.8571
Generation 8: Best score = 0.8571
Generation 9: Best score = 0.8571


In [32]:
print(best_features_top2)

['QTR', 'TIME TO HALF', 'DN', 'DIST', 'YARD LN', 'HASH', 'OWN SCORE', 'MID OR NOT', 'HASH OR NOT', 'PERSONNEL_10', 'PERSONNEL_11', 'PERSONNEL_12', 'PERSONNEL_12T', 'PLAY OF DRIVE NUM', '6+', '2 MIN OR NOT', 'OWN END', 'OPP END', 'TIME LEFT', 'WINNING', 'PLAY TYPE LAG 1', 'PLAY TYPE LAG 2', 'PERSONNEL_11 LAG 1', 'PERSONNEL_11T LAG 1', 'PERSONNEL_12 LAG 1', 'PERSONNEL_12 LAG 2', 'PREV PLAY PASS OR NOT', 'DN X DIST', 'SCORE DIFF x DN', 'SCORE DIFF / 7', 'SCORE DIFF x QTR', 'TIME LEFT * SCORE DIFF / 7', 'YARDS TO TD * SCORE DIFF / 7']


In [20]:
evaluate_feature_subset(input_cols, X_train_full, y_train_full, X_test_full, y_test_full, model_type='top2')

0.0

In [49]:
result_gc = run_ga_feature_selection(
    df=df_gc,
    input_cols=input_cols,
    target_col=target_col,
    split_loc=0.80,
    ga_generations=30,
    ga_pop_size=50,
    ga_mutation_rate=0.15,
    random_state=45,
    do_trim_fn=df_trim,              # your prep fn
    gen_features_fn=create_features,     # feature-gen mutates in place
    split_fn=split_train             # helper for chronological split
)

print("ðŸ”µ GCC Results")
print("Top-1 acc:", result_gc["top1_holdout_score"])
print("Top-2 acc:", result_gc["top2_holdout_score"])
print("Top-1 features:", result_gc["best_features_top1"])
print("Top-2 features:", result_gc["best_features_top2"])

KeyError: 'PERSONNEL'

In [None]:
result_wj = run_ga_feature_selection(
    df=df_wj,
    input_cols=input_cols,
    target_col=target_col,
    split_loc=0.80,
    ga_generations=30,
    ga_pop_size=50,
    ga_mutation_rate=0.15,
    random_state=45,
    do_trim_fn=df_trim,
    gen_features_fn=create_features,
    split_fn=split_train
)

print("ðŸŸ¢ WJ Results")
print("Top-1 acc:", result_wj["top1_holdout_score"])
print("Top-2 acc:", result_wj["top2_holdout_score"])
print("Top-1 features:", result_wj["best_features_top1"])
print("Top-2 features:", result_wj["best_features_top2"])