In [2]:
import cv2 as cv
import matplotlib.pyplot as plt
import os
import numpy as np
import mediapipe as mp
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Conv1D, MaxPooling1D, Flatten, TimeDistributed
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
from datetime import datetime
from tqdm import tqdm

In [3]:
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils
mp_pose = mp.solutions.pose

In [4]:
DATA_PATH = '/home/smayan/Desktop/Cricket Pose Estimation /Data'
sequence_length = 30
min_sequences_per_class = 10

In [5]:
actions = np.array(sorted([folder for folder in os.listdir(DATA_PATH) 
                          if os.path.isdir(os.path.join(DATA_PATH, folder))]))
print(f"Detected cricket shots: {actions}")
for action in actions:
    video_files = [f for f in os.listdir(os.path.join(DATA_PATH, action)) if f.endswith(('.mp4', '.avi', '.mov'))]
    print(f"{action}: {len(video_files)} videos")


Detected cricket shots: ['Backfoot punch' 'Cover drive' 'Cut Shot' 'FBD' 'Flick'
 'Front Food defence' 'On Drive' 'Pull Shot' 'Reverse Sweep'
 'Straight Drive' 'Sweep' 'Uppercut' 'loft']
Backfoot punch: 19 videos
Cover drive: 29 videos
Cut Shot: 43 videos
FBD: 15 videos
Flick: 22 videos
Front Food defence: 32 videos
On Drive: 40 videos
Pull Shot: 40 videos
Reverse Sweep: 30 videos
Straight Drive: 25 videos
Sweep: 27 videos
Uppercut: 29 videos
loft: 31 videos


In [6]:
sequences = []
labels = []
label_map = {label: num for num, label in enumerate(actions)}

In [7]:
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    for action in actions:
        action_path = os.path.join(DATA_PATH, action)
        video_files = [f for f in os.listdir(action_path) if f.endswith(('.mp4', '.avi', '.mov'))]
        action_sequences = []

        for video_file in tqdm(video_files, desc=f"Processing {action}"):
            video_path = os.path.join(action_path, video_file)
            cap = cv.VideoCapture(video_path)
            total_frames = int(cap.get(cv.CAP_PROP_FRAME_COUNT))
            stride = max(1, sequence_length // 4)

            for start_frame in tqdm(range(0, total_frames - sequence_length + 1, stride), 
                                    desc=f"Frames in {video_file}", leave=False):
                cap.set(cv.CAP_PROP_POS_FRAMES, start_frame)
                sequence = []

                for _ in range(sequence_length):
                    ret, frame = cap.read()
                    if not ret:
                        break
                    frame = cv.resize(frame, (640, 480))
                    image = cv.cvtColor(frame, cv.COLOR_BGR2RGB)
                    image.flags.writeable = False
                    results = holistic.process(image)
                    image.flags.writeable = True
                    image = cv.cvtColor(image, cv.COLOR_RGB2BGR)
                    if results.pose_landmarks:
                        pose = np.array([[res.x, res.y, res.z, res.visibility] 
                                         for res in results.pose_landmarks.landmark]).flatten()
                    else:
                        pose = np.zeros(33*4)
                    sequence.append(pose)

                if len(sequence) == sequence_length:
                    action_sequences.append(sequence)

            cap.release()

        while len(action_sequences) < min_sequences_per_class:
            if action_sequences:
                original_seq = np.array(action_sequences[len(action_sequences) % len(action_sequences)])
                noise = np.random.normal(0, 0.01, original_seq.shape)
                augmented_seq = original_seq + noise
                action_sequences.append(augmented_seq.tolist())

        for seq in action_sequences:
            sequences.append(seq)
            labels.append(label_map[action])
        print(f"Generated {len(action_sequences)} sequences for {action}")

I0000 00:00:1753630544.439201   34338 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1753630544.492223   34584 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 570.172.08), renderer: NVIDIA GeForce RTX 4070 SUPER/PCIe/SSE2
Processing Backfoot punch:   0%|          | 0/19 [00:00<?, ?it/s]INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1753630544.531157   34569 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1753630544.549785   34580 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1753630544.551244   34570 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1753630544.551617   34581 inference_feedback_manager.c

Generated 112 sequences for Backfoot punch


Processing Cover drive: 100%|██████████| 29/29 [01:54<00:00,  3.94s/it]


Generated 152 sequences for Cover drive


Processing Cut Shot: 100%|██████████| 43/43 [02:24<00:00,  3.36s/it]


Generated 193 sequences for Cut Shot


Processing FBD: 100%|██████████| 15/15 [01:08<00:00,  4.59s/it]


Generated 91 sequences for FBD


Processing Flick: 100%|██████████| 22/22 [01:25<00:00,  3.90s/it]


Generated 112 sequences for Flick


Processing Front Food defence: 100%|██████████| 32/32 [02:22<00:00,  4.45s/it]


Generated 186 sequences for Front Food defence


Processing On Drive: 100%|██████████| 40/40 [01:57<00:00,  2.94s/it]


Generated 155 sequences for On Drive


Processing Pull Shot: 100%|██████████| 40/40 [02:38<00:00,  3.97s/it]


Generated 212 sequences for Pull Shot


Processing Reverse Sweep: 100%|██████████| 30/30 [02:32<00:00,  5.07s/it]


Generated 199 sequences for Reverse Sweep


Processing Straight Drive: 100%|██████████| 25/25 [02:23<00:00,  5.76s/it]


Generated 191 sequences for Straight Drive


Processing Sweep: 100%|██████████| 27/27 [02:00<00:00,  4.48s/it]


Generated 159 sequences for Sweep


Processing Uppercut: 100%|██████████| 29/29 [01:49<00:00,  3.79s/it]


Generated 146 sequences for Uppercut


Processing loft: 100%|██████████| 31/31 [02:17<00:00,  4.43s/it]

Generated 184 sequences for loft





In [8]:
X = np.array(sequences)
y = np.array(labels)
print(f"Dataset shape: {X.shape}")
print(f"Labels shape: {y.shape}")

Dataset shape: (2092, 30, 132)
Labels shape: (2092,)


In [9]:
X = X.reshape(X.shape[0], X.shape[1], X.shape[2], 1)
y_categorical = to_categorical(y, num_classes=len(actions))

In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y_categorical, test_size=0.2, random_state=42, stratify=y
)


In [11]:
class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weight_dict = dict(enumerate(class_weights))

In [12]:
model = Sequential()
model.add(TimeDistributed(Conv1D(64, kernel_size=3, activation='relu'), input_shape=(sequence_length, X.shape[2], 1)))
model.add(TimeDistributed(Conv1D(64, kernel_size=3, activation='relu')))
model.add(TimeDistributed(MaxPooling1D(pool_size=2)))
model.add(TimeDistributed(Dropout(0.25)))

model.add(TimeDistributed(Conv1D(128, kernel_size=3, activation='relu')))
model.add(TimeDistributed(Conv1D(128, kernel_size=3, activation='relu')))
model.add(TimeDistributed(MaxPooling1D(pool_size=2)))
model.add(TimeDistributed(Dropout(0.25)))

model.add(TimeDistributed(Flatten()))
model.add(LSTM(128, return_sequences=True, dropout=0.3, recurrent_dropout=0.3))
model.add(LSTM(64, return_sequences=False, dropout=0.3, recurrent_dropout=0.3))

model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(len(actions), activation='softmax'))

  super().__init__(**kwargs)
I0000 00:00:1753632119.587438   34338 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 1154 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4070 SUPER, pci bus id: 0000:01:00.0, compute capability: 8.9


In [13]:
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = f'logs/cricket_model_{timestamp}'

In [14]:
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [15]:
callbacks = [
    TensorBoard(log_dir=log_dir, histogram_freq=1, write_graph=True, update_freq='epoch'),
    EarlyStopping(patience=15, restore_best_weights=True),
    ReduceLROnPlateau(factor=0.5, patience=10, min_lr=1e-7)
]

In [16]:
history = model.fit(
    X_train, y_train,
    epochs=100,
    batch_size=16,
    validation_data=(X_test, y_test),
    callbacks=callbacks,
    class_weight=class_weight_dict,
    verbose=1
)

Epoch 1/100


I0000 00:00:1753632128.534471   47413 cuda_dnn.cc:529] Loaded cuDNN version 91100


[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 85ms/step - accuracy: 0.0784 - loss: 2.6215 - val_accuracy: 0.0979 - val_loss: 2.5623 - learning_rate: 0.0010
Epoch 2/100
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 71ms/step - accuracy: 0.0857 - loss: 2.5333 - val_accuracy: 0.1575 - val_loss: 2.3869 - learning_rate: 0.0010
Epoch 3/100
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 70ms/step - accuracy: 0.1820 - loss: 2.3239 - val_accuracy: 0.2267 - val_loss: 2.1264 - learning_rate: 0.0010
Epoch 4/100
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 70ms/step - accuracy: 0.2062 - loss: 2.1847 - val_accuracy: 0.2912 - val_loss: 2.0094 - learning_rate: 0.0010
Epoch 5/100
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 70ms/step - accuracy: 0.1849 - loss: 2.1185 - val_accuracy: 0.3270 - val_loss: 1.8970 - learning_rate: 0.0010
Epoch 6/100
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

In [17]:
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {test_accuracy:.4f}")

Test Accuracy: 0.9475


In [18]:
model.save('cricket_pose_model.h5')
model.save('cricket_pose_model.keras')
np.save('cricket_label_map.npy', label_map)



In [20]:
# Load trained model and label map
model = tf.keras.models.load_model('cricket_pose_model.h5')
label_map = np.load('cricket_label_map.npy', allow_pickle=True).item()
actions = list(label_map.keys())

# Variables for prediction
sequence = []
sequence_length = 30
threshold = 0.7

# Start webcam
cap = cv.VideoCapture('/home/smayan/Desktop/Cricket Pose Estimation /Model Training/test.mp4')

with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Resize for consistent input
        frame = cv.resize(frame, (640, 480))

        # Detection
        image = cv.cvtColor(frame, cv.COLOR_BGR2RGB)
        image.flags.writeable = False
        results = holistic.process(image)
        image.flags.writeable = True
        image = cv.cvtColor(image, cv.COLOR_RGB2BGR)

        # Draw landmarks
        if results.pose_landmarks:
            mp_drawing.draw_landmarks(
                image, results.pose_landmarks, mp_pose.POSE_CONNECTIONS,
                mp_drawing.DrawingSpec(color=(80,22,10), thickness=2, circle_radius=4),
                mp_drawing.DrawingSpec(color=(80,44,121), thickness=2, circle_radius=2)
            )

        # Extract keypoints
        if results.pose_landmarks:
            keypoints = np.array([[res.x, res.y, res.z, res.visibility]
                                  for res in results.pose_landmarks.landmark]).flatten()
        else:
            keypoints = np.zeros(33*4)

        # Append to sequence
        sequence.append(keypoints)
        sequence = sequence[-sequence_length:]

        if len(sequence) == sequence_length:
            input_seq = np.expand_dims(np.array(sequence), axis=0)
            input_seq = input_seq.reshape(1, sequence_length, -1, 1)

            # Predict
            res = model.predict(input_seq, verbose=0)[0]
            predicted_action = actions[np.argmax(res)]
            confidence = np.max(res)

            # Show prediction
            if confidence > threshold:
                cv.putText(image, f'{predicted_action}: {confidence:.2f}',
                           (10, 50), cv.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

            # Show probabilities
            for i, (action, prob) in enumerate(zip(actions, res)):
                y_pos = 100 + i * 30
                cv.rectangle(image, (10, y_pos), (int(prob * 300) + 10, y_pos + 25), (0, 255, 0), -1)
                cv.putText(image, f'{action}: {prob:.2f}', (15, y_pos + 18),
                           cv.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 1)

        # Show output
        cv.imshow('Cricket Pose Estimation', image)

        # Quit
        if cv.waitKey(10) & 0xFF == ord('q'):
            break

cap.release()
cv.destroyAllWindows()


I0000 00:00:1753632962.569709   34338 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1753632962.616315   93594 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 570.172.08), renderer: NVIDIA GeForce RTX 4070 SUPER/PCIe/SSE2
W0000 00:00:1753632962.648508   93566 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1753632962.666440   93571 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1753632962.667323   93575 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1753632962.667893   93584 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000