In [131]:
import pandas as pd
import numpy as np
import pickle as pkl

import matplotlib.pyplot as plt

from keras.models import Sequential
from keras.layers import Dense, LSTM, Input, Dropout, TimeDistributed, LayerNormalization, GlobalAveragePooling1D
from keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler


In [132]:
# Load the over-by-over dataset; every column is parsed as float.
# The path uses forward slashes: the previous backslash-separated literal
# contained '\p' and '\o', which are invalid escape sequences, and it could
# only resolve on Windows. Forward slashes are accepted on every platform.
df = pd.read_csv(
    'data/premades/over_by_over_data_with_player_data.csv',
    dtype=float,
    low_memory=True
)

In [133]:
# Regression target: the innings' total score.
y = df['innings_total_score']

# Predictors: everything except the target and the 'ball' column, with any
# value that cannot be parsed as a number coerced to NaN.
X = (
    df
    .drop(columns=['innings_total_score', 'ball'])
    .apply(pd.to_numeric, errors='coerce')
)

In [134]:
# Sanitise the predictors: +/-inf is treated as missing, then every missing
# value (including any in the grouping columns) is replaced by 0.
X = X.replace([np.inf, -np.inf], np.nan).fillna(0)

# Columns that identify a row (match / innings / over / season). They are kept
# out of scaling and PCA and re-attached unchanged afterwards.
grouping_cols = ['match_id', 'innings', 'over', 'year']

# Temporal hold-out: seasons from 2024 onwards form the test set.
X_train_val = X[X['year'] < 2024].copy()
X_test = X[X['year'] >= 2024].copy()

X_train_val_features = X_train_val.drop(columns=grouping_cols)
X_test_features = X_test.drop(columns=grouping_cols)

# Standardise using statistics from the pre-2024 rows only, so the test set
# does not influence the fitted transform.
# NOTE(review): the pre-2024 rows also contain the matches that are later
# split off for validation, so validation data does inform the scaler and the
# PCA below — confirm this is acceptable.
scaler = StandardScaler()
X_train_val_scaled = scaler.fit_transform(X_train_val_features)
X_test_scaled = scaler.transform(X_test_features)

# Keep as many principal components as needed to explain 95% of the variance.
pca = PCA(n_components=0.95)
X_train_val_pca = pca.fit_transform(X_train_val_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Wrap the component arrays in DataFrames that carry the original row index,
# so they line up with the grouping columns (and with y) again.
X_train_val_pca_df = pd.DataFrame(X_train_val_pca, index=X_train_val.index)
X_test_pca_df = pd.DataFrame(X_test_pca, index=X_test.index)

X_train_val_processed = pd.concat(
    [X_train_val[grouping_cols], X_train_val_pca_df], axis=1
)
X_test_processed = pd.concat(
    [X_test[grouping_cols], X_test_pca_df], axis=1
)

# NOTE: X is rebound to the processed frame (pre-2024 rows first, then the
# 2024+ rows), so this cell is not idempotent — re-running it without first
# rebuilding X from df would scale and project the PCA components again.
X = pd.concat([X_train_val_processed, X_test_processed])

In [135]:
# Row masks for the temporal split (2024+ seasons are held out for testing).
before_2024 = X['year'] < 2024
from_2024 = X['year'] >= 2024

X_train = X[before_2024].copy()
X_test = X[from_2024].copy()

# Match identifiers present on each side of the split.
train_match_ids = X_train['match_id'].unique().tolist()
test_match_ids = X_test['match_id'].unique().tolist()

# The masks are indexed like X; pandas aligns them to y by index label when
# they are used to select from y.
y_train = y[before_2024].copy()
y_test = y[from_2024].copy()

In [136]:
# Split at match level (90% train / 10% validation, fixed seed) so that all
# rows of a match land on the same side.
unique_match_ids = X_train['match_id'].unique()
train_ids, val_ids = train_test_split(unique_match_ids, test_size=0.10, random_state=42)

rows_in_train = X_train['match_id'].isin(train_ids)
rows_in_val = X_train['match_id'].isin(val_ids)

X_train_seq = X_train[rows_in_train]
X_val_seq = X_train[rows_in_val]

# Pick the targets whose index labels appear in each feature subset.
y_train_seq = y_train[y_train.index.isin(X_train_seq.index)]
y_val_seq = y_train[y_train.index.isin(X_val_seq.index)]

print(X_train_seq.shape, X_val_seq.shape)
print(y_train_seq.shape, y_val_seq.shape)

(98287, 107) (10980, 107)
(98287,) (10980,)


In [137]:
def convert_to_sequences(X_df, y_series, key_cols=('match_id','innings'),
                        time_col='over', max_len=50, feature_cols=None,
                        exclude_cols=('match_id', 'year')):
    """Turn row-per-timestep data into fixed-length, zero-padded sequences.

    Rows of ``X_df`` are grouped by ``key_cols``; each group is sorted by
    ``time_col`` and becomes one sequence of exactly ``max_len`` steps.
    Longer groups are truncated to their first ``max_len`` rows, shorter ones
    are padded at the end with all-zero rows.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Feature rows, including the key and time columns.
    y_series : pandas.Series
        Targets, looked up by the index labels of ``X_df``.
    key_cols : sequence of str
        Columns whose combined value identifies one sequence.
    time_col : str
        Column used to order the rows inside a sequence.
    max_len : int
        Number of timesteps in every returned sequence.
    feature_cols : list of str, optional
        Columns used as features. When ``None``, every column of ``X_df``
        except ``key_cols``, ``time_col`` and ``exclude_cols`` is used.
    exclude_cols : sequence of str
        Extra columns left out when ``feature_cols`` is inferred.

    Returns
    -------
    X_seq : numpy.ndarray, float32, shape (n_groups, max_len, n_features)
    y_seq : numpy.ndarray, float32, shape (n_groups,)
        For each group, the target of its first row after sorting by
        ``time_col``. Groups appear in sorted key order (groupby default).
        Empty input yields arrays with ``n_groups == 0``.
    """
    if feature_cols is None:
        exclude = set(key_cols) | {time_col} | set(exclude_cols)
        feature_cols = [c for c in X_df.columns if c not in exclude]

    seqs, targets = [], []

    for _, grp in X_df.groupby(list(key_cols)):
        grp = grp.sort_values(time_col)
        feats = grp[feature_cols].to_numpy(dtype=np.float32)

        # Start from an all-zero (max_len, n_features) block and copy in at
        # most max_len real rows: this truncates and pads in a single step.
        n_steps = min(feats.shape[0], max_len)
        padded = np.zeros((max_len, feats.shape[1]), dtype=np.float32)
        padded[:n_steps] = feats[:n_steps]

        seqs.append(padded)
        targets.append(y_series.loc[grp.index].iloc[0])

    if not seqs:
        # np.stack raises on an empty list; return correctly shaped empties.
        empty_X = np.zeros((0, max_len, len(feature_cols)), dtype=np.float32)
        empty_y = np.zeros((0,), dtype=np.float32)
        return empty_X, empty_y

    X_seq = np.stack(seqs, axis=0)
    y_seq = np.array(targets, dtype=np.float32)
    return X_seq, y_seq

In [138]:
# Build one fixed-length sequence per (match_id, innings) for each split.
# NOTE: the *_seq names are rebound here from the tabular frames/series to
# NumPy arrays, so this cell cannot be re-run without first re-running the
# cell that creates the tabular train/validation subsets.
X_train_seq, y_train_seq = convert_to_sequences(X_df=X_train_seq, y_series=y_train_seq)
X_val_seq, y_val_seq = convert_to_sequences(X_df=X_val_seq, y_series=y_val_seq)
X_test_seq, y_test_seq = convert_to_sequences(X_df=X_test, y_series=y_test)

# Report (n_sequences, timesteps, features) and (n_sequences,) per split.
print(X_train_seq.shape, y_train_seq.shape)
print(X_val_seq.shape, y_val_seq.shape)
print(X_test_seq.shape, y_test_seq.shape)

(2193, 50, 103) (2193,)
(243, 50, 103) (243,)
(79, 50, 103) (79,)


In [139]:
# Sequence regressor: a per-timestep dense projection, three stacked LSTMs
# (the last one returns only its final state), layer normalisation, and a
# dense head ending in a single linear output.
#
# The input shape (timesteps, features) is read from the training array
# instead of hard-coding the timestep count, so it always matches the
# sequence length actually produced by convert_to_sequences.
# NOTE(review): the sequences are zero-padded and no Masking layer is used,
# so the LSTMs also process the padded steps — confirm this is intended.
model = Sequential()
model.add(Input(shape=X_train_seq.shape[1:]))
model.add(TimeDistributed(Dense(128, activation='relu')))
model.add(LSTM(256, return_sequences=True, dropout=0.1))
model.add(LSTM(128, return_sequences=True, dropout=0.1))
model.add(LSTM(64, return_sequences=False, dropout=0.1))
model.add(LayerNormalization())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='linear'))

# Adam with gradient-norm clipping at 1.0; mean squared error on the
# (unscaled) target.
optimizer = Adam(learning_rate=1e-3, clipnorm=1.0)
model.compile(optimizer=optimizer, loss='mean_squared_error')

In [140]:
# Halve the learning rate after 3 epochs without val_loss improvement,
# never going below 1e-8; verbose=1 logs each reduction.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-8, verbose=1)

# Stop after 10 epochs without val_loss improvement and roll the model back
# to the weights of the best epoch.
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

In [141]:
# Training budget: upper bound on epochs, and mini-batch size.
epochs, batch_size = 100, 128

In [142]:
# Train on the training sequences, monitoring the validation sequences; the
# callbacks adjust the learning rate and stop training early on val_loss.
history = model.fit(
    x=X_train_seq,
    y=y_train_seq,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val_seq, y_val_seq),
    callbacks=[reduce_lr, early_stop],
    verbose=1,
)

Epoch 1/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 366ms/step - loss: 56784.0039 - val_loss: 20594.7285 - learning_rate: 0.0010
Epoch 2/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 292ms/step - loss: 12046.2441 - val_loss: 5685.3374 - learning_rate: 0.0010
Epoch 3/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 283ms/step - loss: 5620.9727 - val_loss: 3366.6887 - learning_rate: 0.0010
Epoch 4/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 284ms/step - loss: 2912.9863 - val_loss: 1824.6439 - learning_rate: 0.0010
Epoch 5/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 294ms/step - loss: 1899.3141 - val_loss: 1712.3573 - learning_rate: 0.0010
Epoch 6/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 312ms/step - loss: 1515.2280 - val_loss: 951.3123 - learning_rate: 0.0010
Epoch 7/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 293ms/step - lo

In [None]:
# Predict on the held-out sequences; flatten the model output to 1-D so it
# lines up with y_test_seq.
y_pred = model.predict(X_test_seq).flatten()

# Root-mean-squared error between true and predicted innings totals.
rmse = np.sqrt(np.mean(np.square(y_test_seq - y_pred)))
print(f'RMSE: {rmse}')

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 492ms/step
RMSE: 19.455677032470703
