In [1]:
import cv2 as cv
import matplotlib.pyplot as plt
import os
import numpy as np
import mediapipe as mp
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Conv1D, MaxPooling1D, Flatten, TimeDistributed, Bidirectional, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
from datetime import datetime
from tqdm import tqdm

2025-07-28 15:26:23.108782: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-28 15:26:23.115680: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753696583.123665  375958 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753696583.126073  375958 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1753696583.132361  375958 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [2]:
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils
mp_pose = mp.solutions.pose

In [3]:
DATA_PATH = '/home/smayan/Desktop/Cricket Pose Estimation /Data'
sequence_length = 30
min_sequences_per_class = 10

In [4]:
actions = np.array(sorted([folder for folder in os.listdir(DATA_PATH) 
                          if os.path.isdir(os.path.join(DATA_PATH, folder))]))
print(f"Detected cricket shots: {actions}")
for action in actions:
    video_files = [f for f in os.listdir(os.path.join(DATA_PATH, action)) if f.endswith(('.mp4', '.avi', '.mov'))]
    video_files = video_files[:25]
    print(f"{action}: {len(video_files)} videos")

Detected cricket shots: ['Backfoot punch' 'Cover drive' 'Cut Shot' 'FBD' 'Flick'
 'Front Foot defence' 'On Drive' 'Pull Shot' 'Reverse Sweep' 'Stance'
 'Straight Drive' 'Sweep' 'Uppercut' 'loft']
Backfoot punch: 19 videos
Cover drive: 25 videos
Cut Shot: 25 videos
FBD: 15 videos
Flick: 22 videos
Front Foot defence: 25 videos
On Drive: 25 videos
Pull Shot: 25 videos
Reverse Sweep: 25 videos
Stance: 25 videos
Straight Drive: 25 videos
Sweep: 25 videos
Uppercut: 25 videos
loft: 25 videos


In [5]:
sequences = []
labels = []
label_map = {label: num for num, label in enumerate(actions)}

In [6]:
X = np.load('training_data_noresize.npy')
y = np.load('labels.npy')
print(f"Dataset shape: {X.shape}")
print(f"Labels shape: {y.shape}")


Dataset shape: (1827, 30, 132)
Labels shape: (1827,)


In [7]:
X

array([[[ 0.5554406 ,  0.24139841, -0.18845586, ...,  0.77534622,
          0.08676209,  0.38804743],
        [ 0.55521965,  0.24002098, -0.25809762, ...,  0.7693364 ,
          0.21733756,  0.39571512],
        [ 0.55163604,  0.23883222, -0.26282305, ...,  0.76868081,
          0.25477239,  0.40266779],
        ...,
        [ 0.37919977,  0.29017907,  0.15896487, ...,  0.72024399,
          0.3753553 ,  0.7028569 ],
        [ 0.37274122,  0.29221627,  0.21446192, ...,  0.72471052,
          0.29324484,  0.72602713],
        [ 0.36856019,  0.29323617,  0.14072758, ...,  0.72836959,
          0.10429919,  0.74803698]],

       [[ 0.50944591,  0.25721657, -0.21083091, ...,  0.76363158,
          0.1139462 ,  0.71500087],
        [ 0.52320641,  0.25505579, -0.24155426, ...,  0.77121109,
          0.11704806,  0.68001729],
        [ 0.51852798,  0.25573382, -0.22674216, ...,  0.77223337,
          0.14097594,  0.6512875 ],
        ...,
        [ 0.34827006,  0.29428247,  0.15070571, ...,  

In [8]:
X = X.reshape(X.shape[0], X.shape[1], X.shape[2], 1)
y_categorical = to_categorical(y, num_classes=len(actions))

In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y_categorical, test_size=0.2, random_state=42, stratify=y
)


In [10]:
class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weight_dict = dict(enumerate(class_weights))

In [11]:
model = Sequential()

# 1. TimeDistributed CNN block with BatchNorm
model.add(TimeDistributed(Conv1D(64, kernel_size=3, padding='same', activation='relu'),
                          input_shape=(sequence_length, X.shape[2], 1)))
model.add(TimeDistributed(BatchNormalization()))
model.add(TimeDistributed(MaxPooling1D(pool_size=2)))

# 2. Second CNN block
model.add(TimeDistributed(Conv1D(128, kernel_size=3, padding='same', activation='relu')))
model.add(TimeDistributed(BatchNormalization()))
model.add(TimeDistributed(MaxPooling1D(pool_size=2)))
model.add(TimeDistributed(Dropout(0.3)))

# 3. Flatten before LSTM
model.add(TimeDistributed(Flatten()))

# 4. Bidirectional LSTM layers
model.add(Bidirectional(LSTM(128, return_sequences=True, dropout=0.4, recurrent_dropout=0.2)))
model.add(Bidirectional(LSTM(64, return_sequences=False, dropout=0.4, recurrent_dropout=0.2)))

# 5. Fully Connected
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(len(actions), activation='softmax'))

  super().__init__(**kwargs)
I0000 00:00:1753696584.508591  375958 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9251 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4070 SUPER, pci bus id: 0000:01:00.0, compute capability: 8.9


In [12]:
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = f'logs/cricket_model_{timestamp}'

In [13]:
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [14]:
callbacks = [
    TensorBoard(log_dir=log_dir, histogram_freq=1, write_graph=True, update_freq='epoch'),
    EarlyStopping(patience=10, restore_best_weights=True),
    ReduceLROnPlateau(factor=0.5, patience=10, min_lr=1e-7)
]

In [15]:
history = model.fit(
    X_train, y_train,
    epochs=50,
    batch_size=16,
    validation_data=(X_test, y_test),
    callbacks=callbacks,
    class_weight=class_weight_dict,
    verbose=1
)

Epoch 1/50


I0000 00:00:1753696596.163880  376250 cuda_dnn.cc:529] Loaded cuDNN version 91100


[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 141ms/step - accuracy: 0.1315 - loss: 2.5680 - val_accuracy: 0.4180 - val_loss: 1.8194 - learning_rate: 0.0010
Epoch 2/50
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 120ms/step - accuracy: 0.3247 - loss: 1.8338 - val_accuracy: 0.5874 - val_loss: 1.1405 - learning_rate: 0.0010
Epoch 3/50
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 119ms/step - accuracy: 0.5081 - loss: 1.3227 - val_accuracy: 0.7678 - val_loss: 0.6720 - learning_rate: 0.0010
Epoch 4/50
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 117ms/step - accuracy: 0.6923 - loss: 0.9466 - val_accuracy: 0.8497 - val_loss: 0.4906 - learning_rate: 0.0010
Epoch 5/50
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 116ms/step - accuracy: 0.7428 - loss: 0.8074 - val_accuracy: 0.9071 - val_loss: 0.3059 - learning_rate: 0.0010
Epoch 6/50
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 

In [16]:
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {test_accuracy:.4f}")

Test Accuracy: 0.9836


In [17]:
model.save('2ndbest_cricket_pose_mode_simple_even.h5')
model.save('2ndbest_cricket_pose_model_simple_even.keras')
np.save('cricket_label_map.npy', label_map)



In [19]:
# Load trained model and label map
model = tf.keras.models.load_model('cricket_pose_mode_simple_even.h5')
label_map = np.load('cricket_label_map.npy', allow_pickle=True).item()
actions = list(label_map.keys())

# Variables for prediction
sequence = []
sequence_length = 30
threshold = 0.9

# Start webcam
cap = cv.VideoCapture('/home/smayan/Desktop/Cricket Pose Estimation /Model Training/Test videos/test3.mp4')

with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Resize for consistent input
        # frame = cv.resize(frame, (640, 480))

        # Detection
        image = cv.cvtColor(frame, cv.COLOR_BGR2RGB)
        image.flags.writeable = False
        results = holistic.process(image)
        image.flags.writeable = True
        image = cv.cvtColor(image, cv.COLOR_RGB2BGR)

        # Draw landmarks
        if results.pose_landmarks:
            mp_drawing.draw_landmarks(
                image, results.pose_landmarks, mp_pose.POSE_CONNECTIONS,
                mp_drawing.DrawingSpec(color=(80,22,10), thickness=2, circle_radius=4),
                mp_drawing.DrawingSpec(color=(80,44,121), thickness=2, circle_radius=2)
            )

        # Extract keypoints
        if results.pose_landmarks:
            keypoints = np.array([[res.x, res.y, res.z, res.visibility]
                                  for res in results.pose_landmarks.landmark]).flatten()
        else:
            keypoints = np.zeros(33*4)

        # Append to sequence
        sequence.append(keypoints)
        sequence = sequence[-sequence_length:]

        if len(sequence) == sequence_length:
            input_seq = np.expand_dims(np.array(sequence), axis=0)
            input_seq = input_seq.reshape(1, sequence_length, -1, 1)

            # Predict
            res = model.predict(input_seq, verbose=0)[0]
            predicted_action = actions[np.argmax(res)]
            confidence = np.max(res)

            # Show prediction
            if confidence > threshold:
                cv.putText(image, f'{predicted_action}: {confidence:.2f}',
                           (10, 50), cv.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

            # Show probabilities
            for i, (action, prob) in enumerate(zip(actions, res)):
                y_pos = 100 + i * 30
                cv.rectangle(image, (10, y_pos), (int(prob * 300) + 10, y_pos + 25), (0, 255, 0), -1)
                cv.putText(image, f'{action}: {prob:.2f}', (15, y_pos + 18),
                           cv.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 1)

        # Show output
        cv.imshow('Cricket Pose Estimation', image)

        # Quit
        if cv.waitKey(10) & 0xFF == ord('q'):
            break

cap.release()
cv.destroyAllWindows()


I0000 00:00:1753697679.468058  375958 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1753697679.511484  510271 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 570.172.08), renderer: NVIDIA GeForce RTX 4070 SUPER/PCIe/SSE2
W0000 00:00:1753697679.542786  510248 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1753697679.560045  510253 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1753697679.561387  510270 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1753697679.561648  510253 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000