In [None]:
import json
import os

import cv2
import numpy as np
import pandas as pd
import tensorflow as tf
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.utils import pad_sequences, to_categorical

In [None]:
# Mount Google Drive; the annotation export and the videos are read from
# paths under /content/drive in the cells below.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the annotation export. The encoding is pinned so the result does not
# depend on the runtime's default locale.
with open("/content/drive/MyDrive/Shot_JSON.json", "r", encoding="utf-8") as f:
    annotations = json.load(f)

In [None]:
# Pretty-print a single entry to inspect the structure of the export.
print(json.dumps(annotations[100], indent=4))

{
    "id": 102,
    "annotations": [
        {
            "id": 65,
            "completed_by": 2,
            "result": [
                {
                    "value": {
                        "framesCount": 80,
                        "duration": 2.670295,
                        "sequence": [
                            {
                                "frame": 15,
                                "enabled": true,
                                "rotation": 0,
                                "x": 41.85185185185185,
                                "y": 39.166666666666664,
                                "width": 23.703703703703706,
                                "height": 31.166666666666664,
                                "time": 0.5
                            },
                            {
                                "x": 39.777777777777615,
                                "y": 39.16666666666663,
                                "width": 48.88888888888902,
              

### EXTRACTING ANNOTATION METADATA (FRAME RANGE, BOUNDING BOX, LABEL) FROM THE JSON FILE FOR THE CNN

In [None]:
cnn_data = []

for entry in annotations:
    # Video reference for this entry; falls back to 'file_upload' when the
    # 'video' key is absent from entry['data']
    video_path = entry['data'].get('video', entry.get('file_upload'))

    for annotation in entry['annotations']:
        for result in annotation['result']:
            # Only bounding-box results are kept
            if result['type'] != 'videorectangle':
                continue

            value = result['value']
            sequence = value['sequence']  # list of bounding-box keyframes

            # Two keyframes are needed to measure the frame gap
            if len(sequence) < 2:
                continue

            first_frame, second_frame = sequence[0], sequence[1]
            frame_gap = second_frame['frame'] - first_frame['frame']

            # One record per result: frame range plus the first keyframe's box
            cnn_data.append({
                "video_path": video_path,
                "frame_start": first_frame['frame'],
                "frame_gap": frame_gap,
                "x": first_frame['x'],
                "y": first_frame['y'],
                "width": first_frame['width'],
                "height": first_frame['height'],
                "label": value['labels'][0],  # first label of the box is used
            })

# Show the first two extracted records
print(json.dumps(cnn_data[:2], indent=2))


[
  {
    "video_path": "/data/upload/1/ea8c80e2-ad2662dc-4a3d-49ab-8bae-d73c7c1fad19.mp4",
    "frame_start": 12,
    "frame_gap": 38,
    "x": 40.97877358490566,
    "y": 26.458333333333332,
    "width": 19.45754716981132,
    "height": 68.54166666666667,
    "label": "DRIVE"
  },
  {
    "video_path": "/data/upload/1/2ec6bc1b-7853f0c2-f84e-4a1a-8fad-5866df67aec8.mp4",
    "frame_start": 9,
    "frame_gap": 47,
    "x": 41.214622641509436,
    "y": 10,
    "width": 20.63679245283019,
    "height": 81.875,
    "label": "DRIVE"
  }
]


### PREPROCESSING

In [None]:
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import preprocess_input

# ImageNet-pretrained ResNet50 used as a fixed feature extractor: the
# classification head is dropped (include_top=False) and the output is
# globally average-pooled (pooling='avg').
cnn_model = ResNet50(weights='imagenet', include_top=False, pooling='avg')

In [None]:
import cv2
import numpy as np

def extract_frames_based_on_json(video_path, json_data, frame_rate=5):
    """
    Extract the consecutive frames covered by one annotation record.

    Reads `frame_gap` frames starting at `frame_start` (both taken from
    `json_data`) and resizes each one to 224x224.

    Args:
        video_path (str): Path to the video file.
        json_data (dict): Annotation record. The keys "frame_start" and
            "frame_gap" are read; each defaults to 0 when absent.
        frame_rate (int): Unused. Kept so existing calls keep working; every
            frame in the requested range is extracted.

    Returns:
        list: The resized frames in frame order. Empty when the frame range is
            invalid or the video cannot be opened; shorter than `frame_gap`
            when a frame cannot be read (extraction stops at that point).
    """
    start_frame = json_data.get("frame_start", 0)
    frame_gap = json_data.get("frame_gap", 0)

    if start_frame < 0 or frame_gap <= 0:
        print("Invalid frame_start or frame_gap: Ensure they are non-negative and greater than 0.")
        return []  # nothing sensible to extract

    print(f"Video path: {video_path}")
    print(f"Starting frame: {start_frame}")
    print(f"Extracting {frame_gap} frames starting from frame {start_frame}")

    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        if not cap.isOpened():
            print(f"Error: Could not open video file {video_path}")
            return frames

        # The range is contiguous, so seek once and then read sequentially
        # instead of seeking before every single frame.
        cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

        for frame_num in range(start_frame, start_frame + frame_gap):
            ret, frame = cap.read()
            if not ret:
                print(f"Warning: Could not read frame {frame_num} from {video_path}")
                break  # stop at the first unreadable frame

            # Resize the frame for CNN input
            frames.append(cv2.resize(frame, (224, 224)))
    finally:
        # Release the capture even if reading or resizing raises
        cap.release()

    return frames


In [None]:
def extract_cnn_features(frames, cnn_model):
    """
    Extract CNN features for a list of frames in one batched prediction.

    Parameters:
    - frames (list of np.array): Frames to process; all must share one shape
      so they can be stacked into a batch.
    - cnn_model (tensorflow.keras.Model): The pre-trained CNN model.

    Returns:
    - features (np.array): One entry per frame with a singleton axis after the
      frame axis, i.e. (num_frames, 1, ...model output dims...). For an empty
      `frames` an empty array of shape (0,) is returned.

    NOTE(review): frames decoded with OpenCV are normally BGR, while the
    ResNet50 `preprocess_input` is documented for RGB input - confirm whether
    a colour conversion is wanted. It must match between training and inference.
    """
    if len(frames) == 0:
        # Nothing to predict; avoids calling the model on an empty batch
        return np.array([])

    # Stack to (num_frames, H, W, C) and apply the CNN-specific preprocessing
    batch = preprocess_input(np.stack(frames, axis=0))

    # One predict call for the whole clip; verbose=0 keeps the per-call
    # progress bar out of the notebook output
    features = cnn_model.predict(batch, verbose=0)

    # Keep the singleton axis that downstream cells remove with squeeze(axis=-2)
    return np.expand_dims(features, axis=1)

In [None]:
def _resolve_video_filename(annotated_path):
    """Turn the path stored in an annotation record into the file name used in the videos folder."""
    filename = annotated_path.split('/')[-1]
    parts = filename.split('-')
    # Names with at most one '-' keep only the part after it (presumably a
    # prefix added on upload - verify); names with more dashes are kept whole.
    if len(parts) <= 2:
        filename = parts[-1]
    # Drop a timestamp suffix that appears in some annotated names
    return filename.replace("_1.39.36AM", "")


def process_for_lstm(cnn_data, cnn_model, video_dir='/content/drive/MyDrive/all_videos/'):
    """
    Preprocess CNN data for LSTM training: extract frames, get CNN features,
    and prepare the data with labels.

    Parameters:
    - cnn_data (list of dict): Contains video metadata and frame info; each
      record needs 'video_path', 'label' and the frame-range keys read by
      `extract_frames_based_on_json`.
    - cnn_model (tensorflow.keras.Model): The pre-trained CNN model.
    - video_dir (str): Folder that holds the video files.

    Returns:
    - X_train (np.array): Features for LSTM input, zero-padded at the end of
      each sequence to the longest sequence.
    - y_train (np.array): One-hot labels, one row per kept record. Class
      indices come from LabelEncoder, i.e. labels in sorted order.

    Raises:
    - ValueError: If no record yields any frames.
    """
    X_train = []
    y_train = []

    for video_metadata in cnn_data:
        video_name = _resolve_video_filename(video_metadata['video_path'])
        print(video_name)

        frames = extract_frames_based_on_json(os.path.join(video_dir, video_name), video_metadata)
        if len(frames) == 0:
            # Without frames the sample would be pure padding carrying a real
            # label, so leave it out of the training set.
            print(f"Warning: no frames extracted for {video_name}; skipping this record")
            continue

        X_train.append(extract_cnn_features(frames, cnn_model))
        y_train.append(video_metadata['label'])

    if not X_train:
        raise ValueError("No frames could be extracted for any record; nothing to train on.")

    # Pad every sequence with zeros at the end so they share one length
    X_train_padded = pad_sequences(X_train, dtype='float32', padding='post', value=0.0)

    # String labels -> integer ids -> one-hot rows
    label_encoder = LabelEncoder()
    y_train_numeric = label_encoder.fit_transform(y_train)
    y_train_encoded = to_categorical(y_train_numeric)

    return X_train_padded, y_train_encoded

In [None]:
# Build the padded feature sequences and one-hot labels for every record in cnn_data.
X_train, y_train = process_for_lstm(cnn_data, cnn_model)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Extracting 32 frames starting from frame 9
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 170ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 203ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 250ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 316ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 371ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 316ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 351ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 301ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 335ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 344ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 320ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 305ms/step
[1m1/1[0m [32

In [None]:
# Hold out 20% of the samples as a test set; random_state fixes the shuffle.
X_train_,X_test,y_train_,y_test=train_test_split(X_train,y_train,test_size=0.2,random_state=42)

In [None]:
# Shape of the training features after the split.
X_train_.shape

(250, 109, 1, 2048)

In [None]:
# Shape of the one-hot training labels; the second axis is the number of classes.
y_train_.shape

(250, 3)

In [None]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, Masking
from tensorflow.keras.optimizers import Adam

# The features carry a singleton axis before the feature axis; drop it so the
# LSTM receives (samples, timesteps, features).
X_train_reshaped = X_train_.squeeze(axis=-2)

num_classes = y_train_.shape[1]

# Take the feature size from the data and leave the time axis open (None), so
# the model accepts the padded training sequences as well as clips of any
# other length at prediction time.
input_shape = (None, X_train_reshaped.shape[-1])

# Build the LSTM model
model = Sequential()
model.add(Input(shape=input_shape))

# Sequences were zero-padded at the end; mask all-zero timesteps so the
# padding does not contribute to the recurrent state.
model.add(Masking(mask_value=0.0))

model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.5))  # Add dropout to prevent overfitting

model.add(LSTM(64))
model.add(Dropout(0.5))

# One softmax output per class
model.add(Dense(num_classes, activation='softmax'))

model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()
model.fit(X_train_reshaped, y_train_, epochs=10, batch_size=32, validation_split=0.2)


  super().__init__(**kwargs)


Epoch 1/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 650ms/step - accuracy: 0.3598 - loss: 1.0944 - val_accuracy: 0.4200 - val_loss: 1.0927
Epoch 2/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 676ms/step - accuracy: 0.3549 - loss: 1.0739 - val_accuracy: 0.6400 - val_loss: 1.0073
Epoch 3/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 751ms/step - accuracy: 0.6014 - loss: 0.8799 - val_accuracy: 0.5800 - val_loss: 1.1162
Epoch 4/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 607ms/step - accuracy: 0.5920 - loss: 1.0008 - val_accuracy: 0.5800 - val_loss: 0.9392
Epoch 5/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 688ms/step - accuracy: 0.3635 - loss: 1.1259 - val_accuracy: 0.6200 - val_loss: 0.8954
Epoch 6/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 553ms/step - accuracy: 0.6661 - loss: 0.7359 - val_accuracy: 0.3600 - val_loss: 1.1982
Epoch 7/10
[1m7/7[0m [32m━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7f136eff5f90>

In [None]:
# Evaluate on the held-out split from train_test_split; the singleton axis is
# dropped in the same way as for the training features.
X_test_reshaped = X_test.squeeze(axis=-2)
test_loss, test_accuracy = model.evaluate(X_test_reshaped, y_test)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 225ms/step - accuracy: 0.5471 - loss: 0.7743
Test Loss: 0.7902085781097412, Test Accuracy: 0.523809552192688


In [None]:
from sklearn.metrics import confusion_matrix
# Class probabilities for every test sample
predictions=model.predict(X_test_reshaped)
y_pred_classes = np.argmax(predictions, axis=1)  # most probable class per sample
y_true_classes = np.argmax(y_test, axis=1)  # y_test is one-hot, so argmax recovers the class index

# Called as confusion_matrix(true, predicted)
cm = confusion_matrix(y_true_classes, y_pred_classes)
print("Confusion Matrix:")
print(cm)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 201ms/step
Confusion Matrix:
[[ 0  0 25]
 [ 0 16  5]
 [ 0  0 17]]


In [None]:
# Persist the trained model.
# NOTE(review): the path is relative, so the file lands in the runtime's
# working directory rather than on the mounted Drive - confirm this is intended.
model.save('lstm_model_cpu.h5')



In [None]:
def extract_and_preprocess_frames(video_path, frame_rate=5):
    """
    Sample frames from a video and turn each one into a CNN feature vector.

    Every `frame_rate`-th frame (0, frame_rate, 2*frame_rate, ...) is read,
    resized to 224x224 and passed to `extract_features`.

    Args:
        video_path (str): Path to the video file.
        frame_rate (int): Step between sampled frame indices (default: 5).
            Despite the name this is a stride in frames, not frames per second.

    Returns:
        np.ndarray or None: One row per sampled frame, i.e. a 2D array
            (timesteps, features). None when the video cannot be opened. When
            no frame could be read the array is empty with shape (0,).

    Raises:
        ValueError: If `frame_rate` is smaller than 1.
    """
    # Reject an unusable stride before any resource is opened
    if frame_rate < 1:
        raise ValueError(f"frame_rate must be a positive integer, got {frame_rate}")

    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        if not cap.isOpened():
            print(f"Error: Could not open video file {video_path}")
            return None

        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        # Extract frames at the specified stride
        for frame_num in range(0, frame_count, frame_rate):
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
            ret, frame = cap.read()
            if ret:
                # Resize and convert the frame to a feature vector
                frame_resized = cv2.resize(frame, (224, 224))
                frames.append(extract_features(frame_resized))
            else:
                print(f"Warning: Could not read frame {frame_num} from {video_path}")
    finally:
        # Release the capture even if feature extraction raises
        cap.release()

    return np.array(frames)



In [None]:

def extract_features(frame):
    """
    Extract features from a single frame using the notebook-level `cnn_model`.

    Args:
        frame (np.array): The input frame (image) to extract features from.

    Returns:
        np.array: The model output flattened to one dimension (the caller in
            this notebook expects 2048 values).

    NOTE(review): frames decoded with OpenCV are normally BGR, while the
    ResNet50 `preprocess_input` is documented for RGB input - confirm whether
    a colour conversion is wanted. It must match the training pipeline.
    """
    img = image.img_to_array(frame)
    img = np.expand_dims(img, axis=0)  # add the batch axis
    img = preprocess_input(img)

    # Called once per frame, so silence the per-call progress bar
    features = cnn_model.predict(img, verbose=0)

    return features.flatten()

In [None]:
video_frames = extract_and_preprocess_frames('/content/drive/MyDrive/all_videos/IMG_3672.mp4')

# The training labels were encoded with LabelEncoder, which numbers classes in
# sorted label order, so output column i belongs to the i-th label when the
# labels are sorted. Derive the list from the data instead of hard-coding it.
pred_labels = sorted({record['label'] for record in cnn_data})

if video_frames is not None:
    if video_frames.ndim != 2:
        # No frame could be read, so there is no (frames, features) matrix
        print("No frames could be extracted from the video.")
    else:
        num_frames, feature_dim = video_frames.shape

        print(f"Number of frames: {num_frames}, Feature dimension: {feature_dim}")

        if feature_dim == 2048:
            # Add the batch axis: (1, timesteps, features)
            video_frames_reshaped = video_frames.reshape(1, num_frames, feature_dim)

            # Make the prediction
            predictions = model.predict(video_frames_reshaped)
            print(f"Predictions for new video: {predictions}")

            # The predicted class is the most probable column
            print(pred_labels[np.argmax(predictions)])
        else:
            print(f"Feature dimension mismatch. Expected 2048 but got {feature_dim}.")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 190ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 175ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 208ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 178ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 176ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 199ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 178ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 172ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 176ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 183ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 214ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 176ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 186ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [None]:
# Drive videos - format: <video id> - <label printed by the cell above> [[class probabilities]]
# 3519 - cut [[0.0357267  0.95555335 0.00872004]]
# 3526 - cut [[0.0357267  0.95555335 0.00872004]]
# 3715 - cut [[0.05750339 0.8998511  0.04264552]]
# 3712 - cut [[0.06390148 0.8905781  0.04552043]]
# 3687 - cut [[0.06368186 0.89665484 0.03966327]]
# 3685 - cut [[0.04314112 0.92388755 0.03297131]]

In [None]:
# Cut videos - format: <video id> - <label printed by the cell above> [[class probabilities]]
# 3760 - cut [[0.16932677 0.59303975 0.23763353]]
# 3750 - cut [[0.25067297 0.60800284 0.14132418]]
# 3732 - cut [[0.1349786 0.7685429 0.0964785]]
# 3610 - cut [[0.12145452 0.8322539  0.04629161]]
# 3603 - cut [[0.1372246  0.8170885  0.04568694]]

In [None]:
# Sweep videos - format: <video id> - <label printed by the cell above> [[class probabilities]]
# 3668 - sweep [[0.00951817 0.00635219 0.98412967]]
# 3636 - sweep [[0.1408576  0.31536686 0.54377556]]
# 3793 - sweep [[0.2691766  0.43742046 0.29340294]]
# 3800 - sweep [[0.14770539 0.25613624 0.5961584 ]]
# 3672 - sweep [[0.01251993 0.0081762  0.97930384]]