In [2]:
! pip uninstall tensorflow
! pip install tensorflow==2.12.0

Found existing installation: tensorflow 2.14.0
Uninstalling tensorflow-2.14.0:
  Would remove:
    /usr/local/bin/estimator_ckpt_converter
    /usr/local/bin/import_pb_to_tensorboard
    /usr/local/bin/saved_model_cli
    /usr/local/bin/tensorboard
    /usr/local/bin/tf_upgrade_v2
    /usr/local/bin/tflite_convert
    /usr/local/bin/toco
    /usr/local/bin/toco_from_protos
    /usr/local/lib/python3.10/dist-packages/tensorflow-2.14.0.dist-info/*
    /usr/local/lib/python3.10/dist-packages/tensorflow/*
Proceed (Y/n)? Y
  Successfully uninstalled tensorflow-2.14.0
Collecting tensorflow==2.12.0
  Downloading tensorflow-2.12.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (585.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m585.9/585.9 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting gast<=0.4.0,>=0.2.1 (from tensorflow==2.12.0)
  Downloading gast-0.4.0-py3-none-any.whl (9.8 kB)
Collecting keras<2.13,>=2.12.0 (from tensorflow==2.12.0)
  Downloadin

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import KFold

In [5]:
# Load the dataset
data = pd.read_csv('Football 49 Data_Season 2023.csv')

# Selecting relevant features for the LSTM model
selected_features = ['Play Number', 'Series', 'Down', 'Distance', 'Field Position', 'Gain',
                     'Formation', 'Motion', 'Play', 'Run Concept', 'The_Play',
                     'R/P', 'Pass Result']

In [6]:
# Adding lagged features for 'QB Comment'
num_lags = 5
for lag in range(1, num_lags + 1):
    data[f'QB_Comment_lag_{lag}'] = data['QB Comment'].shift(lag)

In [7]:
# Dropping rows with NaN values created due to lagging
data.dropna(inplace=True)

In [8]:
# Encoding categorical variables
categorical_columns = data[selected_features].select_dtypes(include=['object']).columns
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

In [9]:
# Normalizing numerical features
numerical_columns = data[selected_features].select_dtypes(include=['float64', 'int64']).columns
scaler = MinMaxScaler()
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

In [10]:
# Defining the target variable (QB Comment) and encoding it
target_variable = 'QB Comment'
le_target = LabelEncoder()
data[target_variable] = le_target.fit_transform(data[target_variable])

In [11]:
# Split the dataset into main and hold-out sets
main_data, holdout_data = train_test_split(data, test_size=0.2, random_state=42)

In [12]:
# Further split the main dataset into training, validation, and test sets
split_index_1 = int(len(main_data) * 0.6)  # 60% for training
split_index_2 = int(len(main_data) * 0.8)  # Next 20% for validation, remaining for testing

In [13]:
train_data = main_data[:split_index_1]
validation_data = main_data[split_index_1:split_index_2]
test_data = main_data[split_index_2:]


In [14]:
# Extract features and target for each set
X_train, y_train = train_data[selected_features].values, train_data[target_variable].values
X_validation, y_validation = validation_data[selected_features].values, validation_data[target_variable].values
X_test, y_test = test_data[selected_features].values, test_data[target_variable].values
X_holdout, y_holdout = holdout_data[selected_features].values, holdout_data[target_variable].values


In [15]:
# Defining the window size for the LSTM model
window_size = 5  # Using 5 previous time steps to predict the next one

In [16]:
# Create time series generators for each dataset
train_generator = TimeseriesGenerator(X_train, y_train, length=window_size, batch_size=1)
validation_generator = TimeseriesGenerator(X_validation, y_validation, length=window_size, batch_size=1)
test_generator = TimeseriesGenerator(X_test, y_test, length=window_size, batch_size=1)
holdout_generator = TimeseriesGenerator(X_holdout, y_holdout, length=window_size, batch_size=1)

In [17]:
# Function to create the LSTM model
def create_lstm_model(lstm_units=50, dropout_rate=0.2, optimizer='adam'):
    model = Sequential([
        LSTM(lstm_units, activation='relu', input_shape=(window_size, X_train.shape[1])),
        Dropout(dropout_rate),
        Dense(units=len(np.unique(y_train)), activation='softmax')
    ])
    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

In [18]:
# Define hyperparameter options
lstm_units_options = [50, 100]
dropout_rate_options = [0.0, 0.2, 0.5]
optimizer_options = ['adam', 'rmsprop']

best_score = 0
best_params = {}

for lstm_units in lstm_units_options:
    for dropout_rate in dropout_rate_options:
        for optimizer in optimizer_options:
            # K-Fold Cross Validation
            kfold = KFold(n_splits=3, shuffle=True, random_state=42)
            cv_scores = []

            for train, val in kfold.split(X_train, y_train):
                model = create_lstm_model(lstm_units, dropout_rate, optimizer)

                # Create generators for each fold
                train_gen = TimeseriesGenerator(X_train[train], y_train[train], length=window_size, batch_size=1)
                val_gen = TimeseriesGenerator(X_train[val], y_train[val], length=window_size, batch_size=1)

                # Train the model
                model.fit(train_gen, epochs=10)

                # Evaluate the model
                scores = model.evaluate(val_gen, verbose=0)
                cv_scores.append(scores[1])  # Assuming 1 is the index for accuracy

            # Average CV score for this hyperparameter combo
            avg_cv_score = np.mean(cv_scores)

            if avg_cv_score > best_score:
                best_score = avg_cv_score
                best_params = {'lstm_units': lstm_units, 'dropout_rate': dropout_rate, 'optimizer': optimizer}

print(f'Best score: {best_score}, Best parameters: {best_params}')


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
E

In [22]:
# Create and compile the model with the best hyperparameters
best_lstm_units = best_params['lstm_units']
best_dropout_rate = best_params['dropout_rate']
best_optimizer = best_params['optimizer']

In [23]:
model = create_lstm_model(lstm_units=best_lstm_units, dropout_rate=best_dropout_rate, optimizer=best_optimizer)

# Train the model with the best hyperparameters
model.fit(train_generator, epochs=10, validation_data=validation_generator)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7af2c7a6d420>

In [24]:
# Model summary
model.summary()

Model: "sequential_36"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_36 (LSTM)              (None, 50)                12800     
                                                                 
 dropout_36 (Dropout)        (None, 50)                0         
                                                                 
 dense_36 (Dense)            (None, 10)                510       
                                                                 
Total params: 13,310
Trainable params: 13,310
Non-trainable params: 0
_________________________________________________________________


In [25]:
# Evaluate the model on the training, validation, test, and hold-out sets
print('Training Set Evaluation:', model.evaluate(train_generator))
print('Validation Set Evaluation:', model.evaluate(validation_generator))
print('Test Set Evaluation:', model.evaluate(test_generator))
print('Holdout Set Evaluation:', model.evaluate(holdout_generator))

Training Set Evaluation: [1.692723035812378, 0.3978201746940613]
Validation Set Evaluation: [1.936793327331543, 0.3613445460796356]
Test Set Evaluation: [1.9108823537826538, 0.38655462861061096]
Holdout Set Evaluation: [1.827436089515686, 0.41333332657814026]
