In [32]:
import cv2 as cv
import matplotlib.pyplot as plt
import os
import numpy as np
import mediapipe as mp
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Conv1D, MaxPooling1D, Flatten, TimeDistributed
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
from datetime import datetime
from tqdm import tqdm

In [33]:
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils
mp_pose = mp.solutions.pose

In [20]:
def mediapipe_detection(image, model):
    image = cv.cvtColor(image, cv.COLOR_BGR2RGB)
    image.flags.writeable = False
    results = model.process(image)
    image.flags.writeable = True
    image = cv.cvtColor(image, cv.COLOR_RGB2BGR)
    return image, results

In [21]:
def draw_styled_landmarks(image, results):
    # Draw pose connections for cricket analysis
    if results.pose_landmarks:
        mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_pose.POSE_CONNECTIONS,
                                 mp_drawing.DrawingSpec(color=(80,22,10), thickness=2, circle_radius=4), 
                                 mp_drawing.DrawingSpec(color=(80,44,121), thickness=2, circle_radius=2))

In [22]:
def extract_keypoints(results):
    """Extract pose keypoints - focusing on pose for cricket shots"""
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
    return pose

In [23]:
DATA_PATH = '/home/smayan/Desktop/Cricket Pose Estimation /Data'  # Your cricket dataset path
sequence_length = 30  # Frames per sequence
min_sequences_per_class = 10

In [24]:
def load_cricket_data():
    """Load cricket shot data with variable number of videos per class"""
    # Get all cricket shot classes from folder names
    actions = np.array(sorted([folder for folder in os.listdir(DATA_PATH) 
                              if os.path.isdir(os.path.join(DATA_PATH, folder))]))
    print(f"Detected cricket shots: {actions}")
    
    # Count videos per class
    for action in actions:
        action_path = os.path.join(DATA_PATH, action)
        video_files = [f for f in os.listdir(action_path) if f.endswith(('.mp4', '.avi', '.mov'))]
        print(f"{action}: {len(video_files)} videos")
    
    return actions

In [25]:
from tqdm import tqdm

def extract_sequences_from_videos(actions):
    """Extract sequences from videos with data augmentation for classes with fewer videos"""
    sequences = []
    labels = []
    label_map = {label: num for num, label in enumerate(actions)}
    
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        for action in actions:
            action_path = os.path.join(DATA_PATH, action)
            video_files = [f for f in os.listdir(action_path) if f.endswith(('.mp4', '.avi', '.mov'))]
            
            action_sequences = []
            
            for video_file in tqdm(video_files, desc=f"Processing {action}"):
                video_path = os.path.join(action_path, video_file)
                cap = cv.VideoCapture(video_path)
                
                # Get total frames in video
                total_frames = int(cap.get(cv.CAP_PROP_FRAME_COUNT))
                
                # Extract multiple sequences from each video using sliding window
                stride = max(1, sequence_length // 4)  # Overlapping sequences
                
                for start_frame in tqdm(range(0, total_frames - sequence_length + 1, stride), 
                        desc=f"Frames in {video_file}", leave=False):
                    cap.set(cv.CAP_PROP_POS_FRAMES, start_frame)
                    sequence = []
                    
                    for frame_idx in range(sequence_length):
                        ret, frame = cap.read()
                        if not ret:
                            break
                        
                        # Resize frame for consistency
                        frame = cv.resize(frame, (640, 480))
                        
                        _, results = mediapipe_detection(frame, holistic)
                        keypoints = extract_keypoints(results)
                        sequence.append(keypoints)
                    
                    if len(sequence) == sequence_length:
                        action_sequences.append(sequence)
                
                cap.release()
            
            # Data augmentation for classes with fewer sequences
            while len(action_sequences) < min_sequences_per_class:
                # Add augmented versions (you can implement more sophisticated augmentation)
                if action_sequences:
                    # Simple augmentation: add noise
                    original_seq = np.array(action_sequences[len(action_sequences) % len(action_sequences)])
                    noise = np.random.normal(0, 0.01, original_seq.shape)
                    augmented_seq = original_seq + noise
                    action_sequences.append(augmented_seq.tolist())
            
            # Add sequences and labels
            for seq in action_sequences:
                sequences.append(seq)
                labels.append(label_map[action])
            
            print(f"Generated {len(action_sequences)} sequences for {action}")
    
    return np.array(sequences), np.array(labels), label_map

In [26]:
def create_hybrid_cnn_lstm_model(input_shape, num_classes):
    """Create a Hybrid CNN-LSTM model for cricket pose estimation"""
    model = Sequential()
    
    # CNN layers for spatial feature extraction
    model.add(TimeDistributed(Conv1D(64, kernel_size=3, activation='relu'), 
                             input_shape=input_shape))
    model.add(TimeDistributed(Conv1D(64, kernel_size=3, activation='relu')))
    model.add(TimeDistributed(MaxPooling1D(pool_size=2)))
    model.add(TimeDistributed(Dropout(0.25)))
    
    model.add(TimeDistributed(Conv1D(128, kernel_size=3, activation='relu')))
    model.add(TimeDistributed(Conv1D(128, kernel_size=3, activation='relu')))
    model.add(TimeDistributed(MaxPooling1D(pool_size=2)))
    model.add(TimeDistributed(Dropout(0.25)))
    
    # Flatten the CNN output for LSTM
    model.add(TimeDistributed(Flatten()))
    
    # LSTM layers for temporal sequence modeling
    model.add(LSTM(128, return_sequences=True, dropout=0.3, recurrent_dropout=0.3))
    model.add(LSTM(64, return_sequences=False, dropout=0.3, recurrent_dropout=0.3))
    
    # Dense layers for classification
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax'))
    
    return model

In [27]:
def train_cricket_model():
    # Load data
    actions = load_cricket_data()
    print("Extracting sequences from videos...")
    X, y, label_map = extract_sequences_from_videos(actions)
    
    print(f"Dataset shape: {X.shape}")
    print(f"Labels shape: {y.shape}")
    
    # Reshape X for CNN input (samples, timesteps, features, 1)
    X = X.reshape(X.shape[0], X.shape[1], X.shape[2], 1)
    
    # Convert labels to categorical
    y_categorical = to_categorical(y, num_classes=len(actions))
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_categorical, test_size=0.2, random_state=42, stratify=y
    )
    
    print(f"Training set: {X_train.shape}")
    print(f"Test set: {X_test.shape}")
    
    # Handle class imbalance
    class_weights = compute_class_weight(
        'balanced', 
        classes=np.unique(y), 
        y=y
    )
    class_weight_dict = dict(enumerate(class_weights))
    
    # Create model
    model = create_hybrid_cnn_lstm_model(
        input_shape=(sequence_length, X.shape[2], 1), 
        num_classes=len(actions)
    )
# Create timestamp for TensorBoard logs
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    log_dir = f'logs/cricket_model_{timestamp}'
    # Compile model
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    
    # Callbacks
    callbacks = [
            TensorBoard(
                log_dir=log_dir,
                histogram_freq=1,
                write_graph=True,
                update_freq='epoch'
            ),
            EarlyStopping(patience=15, restore_best_weights=True),
            ReduceLROnPlateau(factor=0.5, patience=10, min_lr=1e-7)
        ]
    
    # Train model
    history = model.fit(
        X_train, y_train,
        epochs=100,
        batch_size=16,
        validation_data=(X_test, y_test),
        callbacks=callbacks,
        class_weight=class_weight_dict,
        verbose=1
    )
    
    # Evaluate model
    test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
    print(f"Test Accuracy: {test_accuracy:.4f}")
    
    # Save model
    model.save('cricket_pose_model.h5')
    model.save('cricket_pose_model.keras')
    
    # Save label mapping
    np.save('cricket_label_map.npy', label_map)
    
    return model, history, label_map

In [28]:
def real_time_cricket_prediction():
    """Real-time cricket shot prediction"""
    model = tf.keras.models.load_model('cricket_pose_model.h5')
    label_map = np.load('cricket_label_map.npy', allow_pickle=True).item()
    actions = list(label_map.keys())
    
    # Prediction variables
    sequence = []
    predictions = []
    threshold = 0.7
    
    cap = cv.VideoCapture(0)  # Use webcam
    
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            
            # Make detections
            image, results = mediapipe_detection(frame, holistic)
            draw_styled_landmarks(image, results)
            
            # Extract keypoints
            keypoints = extract_keypoints(results)
            sequence.append(keypoints)
            sequence = sequence[-sequence_length:]
            
            if len(sequence) == sequence_length:
                # Reshape for model input
                input_seq = np.expand_dims(np.array(sequence), axis=0)
                input_seq = input_seq.reshape(1, sequence_length, -1, 1)
                
                # Make prediction
                res = model.predict(input_seq, verbose=0)[0]
                predicted_action = actions[np.argmax(res)]
                confidence = np.max(res)
                
                # Display prediction
                if confidence > threshold:
                    cv.putText(image, f'{predicted_action}: {confidence:.2f}', 
                              (10, 50), cv.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
                
                # Visualization of probabilities
                for i, (action, prob) in enumerate(zip(actions, res)):
                    y_pos = 100 + i * 30
                    cv.rectangle(image, (10, y_pos), (int(prob * 300) + 10, y_pos + 25), 
                               (0, 255, 0), -1)
                    cv.putText(image, f'{action}: {prob:.2f}', (15, y_pos + 18), 
                              cv.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 1)
            
            cv.imshow('Cricket Pose Estimation', image)
            
            if cv.waitKey(10) & 0xFF == ord('q'):
                break
    
    cap.release()
    cv.destroyAllWindows()

In [29]:
if __name__ == "__main__":
    # Train the model
    model, history, label_map = train_cricket_model()

I0000 00:00:1753628289.843167   15138 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1753628289.902044   17549 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 570.172.08), renderer: NVIDIA GeForce RTX 4070 SUPER/PCIe/SSE2


Detected cricket shots: ['Backfoot punch' 'Cover drive' 'Cut Shot' 'FBD' 'Flick'
 'Front Food defence' 'On Drive' 'Pull Shot' 'Reverse Sweep'
 'Straight Drive' 'Sweep' 'Uppercut' 'loft']
Backfoot punch: 19 videos
Cover drive: 29 videos
Cut Shot: 43 videos
FBD: 15 videos
Flick: 22 videos
Front Food defence: 32 videos
On Drive: 40 videos
Pull Shot: 40 videos
Reverse Sweep: 30 videos
Straight Drive: 25 videos
Sweep: 27 videos
Uppercut: 29 videos
loft: 31 videos
Extracting sequences from videos...


Processing Backfoot punch:   0%|          | 0/19 [00:00<?, ?it/s]W0000 00:00:1753628289.934759   17522 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1753628289.951133   17534 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1753628289.952187   17528 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1753628289.952442   17531 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1753628289.952510   17521 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1753628289.95

Generated 112 sequences for Backfoot punch


Processing Cover drive: 100%|██████████| 29/29 [01:50<00:00,  3.80s/it]


Generated 152 sequences for Cover drive


Processing Cut Shot: 100%|██████████| 43/43 [02:19<00:00,  3.23s/it]


Generated 193 sequences for Cut Shot


Processing FBD: 100%|██████████| 15/15 [01:06<00:00,  4.45s/it]


Generated 91 sequences for FBD


Processing Flick: 100%|██████████| 22/22 [01:21<00:00,  3.72s/it]


Generated 112 sequences for Flick


Processing Front Food defence: 100%|██████████| 32/32 [02:17<00:00,  4.29s/it]


Generated 186 sequences for Front Food defence


Processing On Drive: 100%|██████████| 40/40 [01:52<00:00,  2.82s/it]


Generated 155 sequences for On Drive


Processing Pull Shot: 100%|██████████| 40/40 [02:31<00:00,  3.80s/it]


Generated 212 sequences for Pull Shot


Processing Reverse Sweep: 100%|██████████| 30/30 [02:26<00:00,  4.87s/it]


Generated 199 sequences for Reverse Sweep


Processing Straight Drive: 100%|██████████| 25/25 [02:18<00:00,  5.53s/it]


Generated 191 sequences for Straight Drive


Processing Sweep: 100%|██████████| 27/27 [01:56<00:00,  4.33s/it]


Generated 159 sequences for Sweep


Processing Uppercut: 100%|██████████| 29/29 [01:45<00:00,  3.65s/it]


Generated 146 sequences for Uppercut


Processing loft: 100%|██████████| 31/31 [02:12<00:00,  4.27s/it]
  super().__init__(**kwargs)
I0000 00:00:1753629805.539658   15138 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9350 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4070 SUPER, pci bus id: 0000:01:00.0, compute capability: 8.9


Generated 184 sequences for loft
Dataset shape: (2092, 30, 132)
Labels shape: (2092,)
Training set: (1673, 30, 132, 1)
Test set: (419, 30, 132, 1)


AttributeError: module 'datetime' has no attribute 'now'

In [31]:
X

NameError: name 'X' is not defined