In [1]:
# !python3.11 -m pip install seaborn matplotlib tensorflow

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt

In [3]:
def read_csv_file(file_path):
    df = pd.read_csv(file_path)
    return df

# Load the dataset
# Assuming 'chunked_pbp_with_scores_2022.csv' is in the same directory
# If the file is in a different directory, provide the full path to the file
def group_by_game_id_and_chunk(df):
    grouped = df.groupby(['game_id', 'time_chunk'])
    return grouped 

df = read_csv_file('chunked_pbp_with_scores_2022.csv')
grouped_df = group_by_game_id_and_chunk(df)


def clean_scores(chunk_df):
    df = chunk_df.copy()
    
    df = df.sort_values('eventnum').reset_index(drop=True)
        
    # Initialize previous score and scoremargin
    prev_score = None
    prev_scoremargin = None
    for index, row in enumerate(df.itertuples(index=False)):
        # print(row.score, row.scoremargin, pd.isna(row.score), pd.isna(row.scoremargin))
        if row.score == '' or pd.isna(row.score):
            if prev_score is not None:
                # print(f"Using previous score: {prev_score}")
                df.at[index, 'score'] = prev_score
            else:
                # print("Both previous score and current score are None. Setting to default.")
                df.at[index, 'score'] = '0 - 0'
        else:
            # print(f"Current score is valid: {row.score}")
            prev_score = row.score

        if row.scoremargin == '' or pd.isna(row.scoremargin) or row.scoremargin == 'TIE':
            if prev_scoremargin is not None:
                # print(f"Using previous scoremargin: {prev_scoremargin}")
                df.at[index, 'scoremargin'] = prev_scoremargin
            else:
                # print("Both previous scoremargin and current scoremargin are None. Setting to default.")
                df.at[index, 'scoremargin'] = '0'
        else:
            # print(f"Current scoremargin is valid: {row.scoremargin}")
            prev_scoremargin = row.scoremargin
    # print("Final scores and score margins after filling:")
    # print(df[['score', 'scoremargin']])
    return df

        
        
    # # Replace 'TIE' with 0 and clean numeric scoremargin
    # df['scoremargin_clean'] = df['scoremargin'].replace({'TIE': '0', '': np.nan})
    # df['scoremargin_clean'] = pd.to_numeric(df['scoremargin_clean'], errors='coerce')

    # # Forward fill scoremargin
    # df['scoremargin_filled'] = df['scoremargin_clean'].ffill().fillna(0.0)

    # # Forward fill score (like '2 - 0')
    # df['score_filled'] = df['score'].ffill().fillna('0 - 0')

    # # Compute margin from score if score exists
    # def margin_from_score(score_str):
    #     try:
    #         home, away = map(int, score_str.strip().split('-'))
    #         return home - away
    #     except:
    #         return np.nan

    # df['margin_from_score'] = df['score_filled'].apply(margin_from_score)

    # # Final fallback: use margin from score if scoremargin was missing
    # df['scoremargin_final'] = df['scoremargin_clean'].combine_first(df['margin_from_score'])
    # df['scoremargin_final'] = df['scoremargin_final'].ffill().fillna(0.0)

    # return df[['eventnum', 'score', 'score_filled', 'scoremargin', 'scoremargin_filled', 'scoremargin_final']]


In [4]:
# Pick one sample group (first one)
sample_key = list(grouped_df.groups.keys())[0]
print(f"\nSample key: {sample_key}")

# Extract the chunk DataFrame
# For demonstration, we will take the first group as a sample

# Print the number of groups
chunk_df = grouped_df.get_group(sample_key)

cleaned_scores = clean_scores(chunk_df)
print("\nCleaned scores for the sample chunk:")
print(cleaned_scores["eventmsgactiontype"].value_counts())  # Display the count of each event type
print(cleaned_scores[['eventnum', 'score', 'scoremargin']])  # Display first 10 rows for brevity


Sample key: (22100001, 1.0)

Cleaned scores for the sample chunk:
eventmsgactiontype
0      15
1       4
79      4
72      3
2       2
11      2
12      2
3       2
52      1
80      1
47      1
5       1
97      1
103     1
Name: count, dtype: int64
    eventnum   score scoremargin
0          2   0 - 0           0
1          4   0 - 0           0
2          7   0 - 0           0
3          8   0 - 0           0
4          9   0 - 0           0
5         11   0 - 0           0
6         12   0 - 0           0
7         13   0 - 0           0
8         14   0 - 0           0
9         15   0 - 0           0
10        16   0 - 0           0
11        18   0 - 0           0
12        19   0 - 0           0
13        20   2 - 0          -2
14        22   2 - 0          -2
15        23   2 - 0          -2
16        24   2 - 2          -2
17        25   2 - 2          -2
18        26   2 - 2          -2
19        27   2 - 5           3
20        29   4 - 5           1
21        31   4 - 5  

In [5]:
# !python3.11 -m pip install scikit-learn

In [6]:
from sklearn.preprocessing import OneHotEncoder
event_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
event_encoder.fit(df[['eventmsgtype', 'eventmsgactiontype']])

In [7]:
print(event_encoder.categories_)

[array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 18]), array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  25,  26,  27,
        28,  29,  30,  33,  35,  36,  37,  39,  40,  41,  42,  43,  44,
        45,  47,  50,  51,  52,  57,  58,  63,  66,  67,  71,  72,  73,
        74,  75,  76,  78,  79,  80,  86,  87,  93,  96,  97,  98,  99,
       100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110])]


In [8]:
def parse_score(score_str):
    try:
        parts = str(score_str).split(" - ")
        return [int(parts[0]), int(parts[1])]
    except:
        return [0, 0]

def parse_scoremargin(margin):
    if pd.isna(margin) or margin == 'TIE':
        return 0
    try:
        return int(float(margin))
    except:
        return 0

def vectorize_chunk(chunk_df, event_encoder, max_seq_len=100):
    """
    Vectorizes a chunk_df into a fixed-size 2D numpy array of shape (max_seq_len, feature_dim).
    Uses:
      - One-hot encoding for eventmsgtype and eventmsgactiontype
      - Parsed score and scoremargin
      - Numeric columns: period, eventnum, game_seconds, pts_home, pts_away, video_available_flag
    """
    # print(f"Input parameters are: {chunk_df.shape}, {event_encoder}, {max_seq_len}")
    if chunk_df.empty:
        print("Chunk is empty. Returning zero vector.")
        return np.zeros((max_seq_len, event_encoder.transform([[0, 0]]).shape[1] + 9))  # 2 from score, 1 from margin, 6 numerics

    # --- One-hot encoding of eventmsgtype + eventmsgactiontype ---
    # print(f"Before one-hot encoding, chunk_df has shape: {chunk_df.shape} and {chunk_df[['eventmsgtype', 'eventmsgactiontype']].dropna().shape}")
    event_ohe = event_encoder.transform(chunk_df[['eventmsgtype', 'eventmsgactiontype']])
    # print(f"One-hot encoded shape: {event_ohe.shape}")
    # print(f"Event OHE sample:\n{event_ohe[:5]}")  # Display first 5 rows for debugging

    # --- Score: "98 - 97" -> [98, 97] ---
    # print(f"Before parsing scores, chunk_df has shape: {chunk_df.shape} and score column:\n{chunk_df['score']}")
    score_matrix = np.array(chunk_df['score'].apply(parse_score).to_list())  # (N, 2)
    # print(f"After parsing scores, score_matrix shape: {score_matrix.shape}, sample:\n{score_matrix[:5]}")  # Display first 5 rows for debugging

    # --- Scoremargin: 'TIE' or NaN -> 0 ---
    # print(f"Before parsing scoremargin, chunk_df has shape: {chunk_df.shape} and scoremargin column:\n{chunk_df['scoremargin']}")
    scoremargin = chunk_df['scoremargin'].apply(parse_scoremargin).astype(float).values.reshape(-1, 1)
    # print(f"After parsing scoremargin, scoremargin shape: {scoremargin.shape}, sample:\n{scoremargin[:5]}")  # Display first 5 rows for debugging

    # --- Other numerical columns ---
    numeric_cols = ['period', 'eventnum', 'game_seconds']
    # print("Before extracting numeric columns, chunk_df has shape with numeric columns: ", chunk_df[numeric_cols].shape)   
    numeric_data = chunk_df[numeric_cols].fillna(0).astype(float).values  # (N, 6)
    # print("After extracting numeric columns, chunk_df has shape with numeric columns: ", chunk_df[numeric_cols].shape)   
    # --- Combine all features ---
    full_vector = np.concatenate([event_ohe, score_matrix, scoremargin, numeric_data], axis=1)  # (N, F)
    # print(f"Full vector shape after concatenation: {full_vector.shape}")
    # print(f"Full vector sample:\n{full_vector[:5]}")  # Display first 5 rows for debugging
    # --- Pad or truncate to max_seq_len ---
    seq_len, feature_dim = full_vector.shape

    if seq_len < max_seq_len:
        pad = np.zeros((max_seq_len - seq_len, feature_dim))
        full_vector = np.vstack([full_vector, pad])
    elif seq_len > max_seq_len:
        full_vector = full_vector[:max_seq_len]

    return full_vector  




chunk_df = clean_scores(grouped_df.get_group(sample_key))
# print("\nVectorizing chunk..., shape: ", chunk_df.shape)
# print("Columns in chunk_df: ", chunk_df.columns)
vectorized_chunk = vectorize_chunk(chunk_df, event_encoder)
# print("\nVectorized chunk shape: ", vectorized_chunk.shape)
# print("Vectorized chunk sample (first 5 rows):\n", vectorized_chunk[:5])  # Display first 5 rows for debugging
# print("\nVectorized chunk:\n", vectorized_chunk)

flattened_vector = vectorized_chunk.flatten()
print("\nFlattened vector shape: ", flattened_vector.shape)


Flattened vector shape:  (9600,)


In [9]:
X = []
y_cls = []   # For classification (home win/loss)
y_reg = []   # For regression (score difference)
chunk_window = 12
for game_id, game_df in df.groupby('game_id'):
        game_df = game_df.sort_values('time_chunk')
        chunks = sorted(game_df['time_chunk'].unique())
        if len(chunks) != 13:
            # this is for something that can't even get final label and have 3 chunk windows
            continue 
        final_chunk = chunks[-1]
        final_chunk_df = grouped_df.get_group((game_id, final_chunk))
        final_chunk_df = clean_scores(final_chunk_df)
        pts_home = final_chunk_df['pts_home'].iloc[-1]
        pts_away = final_chunk_df['pts_away'].iloc[-1]
        final_score = f"{pts_home} - {pts_away}"
        final_margin = pts_home - pts_away
        
        for i in range(0, len(chunks) - chunk_window):
            try:
                chunk_group = []
                for t in range(i, i + chunk_window):
                    chunk = grouped_df.get_group((game_id, chunks[t]))
                    chunk = clean_scores(chunk)
                    chunk_vector = vectorize_chunk(chunk, event_encoder)
                    chunk_group.append(chunk_vector)
            except KeyError:
                continue
            chunk_stack = np.stack(chunk_group)
            chunk_flat = chunk_stack.flatten()
            X.append(chunk_flat)
            y_cls.append(1 if final_margin > 0 else 0)
            y_reg.append(final_margin)
            
print("\nNumber of samples: ", len(X))
print("Sample X shape: ", np.array(X).shape)
print("Sample y_cls shape: ", np.array(y_cls).shape)
print("Sample y_reg shape: ", np.array(y_reg).shape)


Number of samples:  1138
Sample X shape:  (1138, 115200)
Sample y_cls shape:  (1138,)
Sample y_reg shape:  (1138,)


In [10]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Scale input first (always before PCA)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Reduce to 300 or 100 dims (try tuning)
pca = PCA(n_components=300)
X_reduced = pca.fit_transform(X_scaled)

print("New shape:", X_reduced.shape)  # (10*num_games, 300)

New shape: (1138, 300)


In [11]:
from sklearn.model_selection import train_test_split

X_cls_train, X_cls_val, y_cls_train, y_cls_val = train_test_split(X_reduced, y_cls, test_size=0.2, random_state=42, stratify=y_cls)
print("Train shapes:", X_cls_train.shape, len(y_cls_train))

y_cls_train = np.array(y_cls_train)
y_cls_val = np.array(y_cls_val)

## print the type of the data
print(type(X_cls_train))
print(type(y_cls_train))
print(type(X_cls_val))
print(type(y_cls_val))
# print the shapes of the data
print(X_cls_train.shape)
print(y_cls_train.shape)
print(X_cls_val.shape)
print(y_cls_val.shape)


Train shapes: (910, 300) 910
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(910, 300)
(910,)
(228, 300)
(228,)


In [12]:
X_reg_train, X_reg_val, y_reg_train, y_reg_val = train_test_split(X_reduced, y_reg, test_size=0.2, random_state=42, stratify=y_cls)

In [13]:
# !pip uninstall tensorflow -y

In [14]:
# !pip install tensorflow-macos
# !pip install tensorflow-metal

In [15]:
import platform
print(platform.machine())

x86_64


In [16]:
# !pip uninstall jax jaxlib -y

In [17]:
import tensorflow as tf

2025-03-29 00:50:47.038712: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [18]:
import time
import os
import numpy as np
import gc
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, mean_squared_error
import psutil

In [19]:
tf.keras.backend.clear_session()
gc.collect()

0

In [20]:
## import callback from keras
from tensorflow.keras.callbacks import Callback

# RAM usage function
def print_memory_usage(stage=""):
    process = psutil.Process(os.getpid())
    mem_bytes = process.memory_info().rss
    mem_mb = mem_bytes / (1024 ** 2)
    print(f"[{stage}] RAM usage: {mem_mb:.2f} MB")

# Timing callback to monitor epoch durations
class TimingCallback(Callback):
    def on_train_begin(self, logs=None):
        self.times = []

    def on_epoch_begin(self, epoch, logs=None):
        self.epoch_time_start = time.time()

    def on_epoch_end(self, epoch, logs=None):
        epoch_time = time.time() - self.epoch_time_start
        self.times.append(epoch_time)
        print(f"Epoch {epoch+1} took {epoch_time:.2f} seconds")

print_memory_usage("After imports")

[After imports] RAM usage: 4039.24 MB


In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping

model = Sequential([
    Dense(256, activation='relu', input_shape=(X_cls_train.shape[1],), kernel_regularizer=l2(0.001)),
    Dropout(0.3),
    Dense(128, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.2),
    Dense(1, activation='sigmoid')  # Binary classification
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.AUC()]
)

early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True
)

timing_callback = TimingCallback()

print_memory_usage("After model definition")

2025-03-29 00:50:48.434736: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M4 Max
2025-03-29 00:50:48.434837: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 48.00 GB
2025-03-29 00:50:48.434844: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 18.00 GB
2025-03-29 00:50:48.434901: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-03-29 00:50:48.434928: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


[After model definition] RAM usage: 3754.78 MB


In [22]:
print("\n--- Starting Training ---")
train_start = time.time()

history = model.fit(
    X_cls_train, y_cls_train,
    validation_data=(X_cls_val, y_cls_val),
    epochs=30,
    batch_size=32,
    callbacks=[early_stop, timing_callback],
    verbose=1
)

train_end = time.time()
print(f"✅ Total training time: {train_end - train_start:.2f} seconds")
print_memory_usage("After training")


--- Starting Training ---
Epoch 1/30


2025-03-29 00:50:49.328160: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2025-03-29 00:50:49.376707: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp_10.




2025-03-29 00:50:52.620131: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 1 took 4.15 seconds
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
✅ Total training time: 8.68 seconds
[After training] RAM usage: 3796.31 MB


In [23]:
print("\n--- Evaluating Model ---")
eval_start = time.time()

loss, acc, auc = model.evaluate(X_cls_val, y_cls_val, verbose=0)
print(f"📈 Val Accuracy: {acc:.4f} | Val AUC: {auc:.4f}")

eval_end = time.time()
print(f"✅ Evaluation time: {eval_end - eval_start:.2f} seconds")



--- Evaluating Model ---
📈 Val Accuracy: 0.4868 | Val AUC: 0.4903
✅ Evaluation time: 0.11 seconds


In [24]:
print("\n--- Predicting ---")
pred_start = time.time()

y_pred = (model.predict(X_cls_val, verbose=0) > 0.5).astype(int)

pred_end = time.time()
print(f"✅ Prediction time: {pred_end - pred_start:.2f} seconds")
print_memory_usage("After evaluation")


# === Classification Report ===
print("\n--- Classification Report ---")
report_start = time.time()

print(classification_report(y_cls_val, y_pred))

report_end = time.time()
print(f"✅ Classification report time: {report_end - report_start:.2f} seconds")
print_memory_usage("After classification report")


--- Predicting ---
✅ Prediction time: 0.10 seconds
[After evaluation] RAM usage: 3705.67 MB

--- Classification Report ---
              precision    recall  f1-score   support

           0       0.53      0.56      0.54       124
           1       0.43      0.40      0.42       104

    accuracy                           0.49       228
   macro avg       0.48      0.48      0.48       228
weighted avg       0.48      0.49      0.48       228

✅ Classification report time: 0.00 seconds
[After classification report] RAM usage: 3705.67 MB


2025-03-29 00:50:57.844093: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


In [25]:
weights, biases = model.layers[0].get_weights()
print("First Dense Layer Weights Shape:", weights.shape)
print("First Dense Layer Biases Shape:", biases.shape)


First Dense Layer Weights Shape: (300, 256)
First Dense Layer Biases Shape: (256,)
