In [8]:
import os

import cv2
import numpy as np
import tensorflow as tf
import torch
from sklearn.model_selection import train_test_split
from transformers import ViTFeatureExtractor, ViTModel

# Constants
IMG_SIZE = 224      # side length (pixels) frames are resized to before ViT preprocessing
NUM_FRAMES = 16     # frames sampled per video
BATCH_SIZE = 8
EPOCHS = 10
NUM_CLASSES = 5  # Update based on your dataset

# Single source of truth for the pretrained checkpoint, so the preprocessor and
# the backbone are always loaded from the same weights/config.
VIT_CHECKPOINT = "google/vit-base-patch16-224-in21k"

# Load ViT model and feature extractor
# NOTE(review): recent transformers releases deprecate ViTFeatureExtractor in
# favour of ViTImageProcessor -- confirm against the installed version before
# upgrading the library.
feature_extractor = ViTFeatureExtractor.from_pretrained(VIT_CHECKPOINT)
vit_model = ViTModel.from_pretrained(VIT_CHECKPOINT)

# Function to load and preprocess video frames
def load_video_frames(video_path, num_frames=NUM_FRAMES, img_size=IMG_SIZE):
    """Sample evenly spaced frames from a video and return them as RGB images.

    Frame positions are chosen with ``np.linspace`` over the frame count that
    OpenCV reports. Each selected frame is resized to ``img_size`` x
    ``img_size`` and converted from OpenCV's BGR channel order to RGB.

    Args:
        video_path: Path of the video file to decode.
        num_frames: Number of frames the returned array should contain.
        img_size: Side length, in pixels, of each returned frame.

    Returns:
        Array whose first axis has length ``num_frames``. If fewer distinct
        frames could be sampled (short video, or decoding stopped early), the
        remainder is zero-filled at the end.

    Raises:
        ValueError: If no frame at all could be read from ``video_path``
            (missing file, unsupported codec, empty video).
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total_frames > 0:
            # A set gives O(1) membership tests; linspace may repeat indices
            # when the video has fewer frames than requested, and each distinct
            # index is sampled once (the shortfall is zero-padded below).
            wanted = set(
                np.linspace(0, total_frames - 1, num_frames, dtype=int).tolist()
            )
            for i in range(total_frames):
                ret, frame = cap.read()
                if not ret:
                    # The reported frame count can exceed what is decodable.
                    break
                if i in wanted:
                    frame = cv2.resize(frame, (img_size, img_size))
                    # OpenCV decodes to BGR; the ViT preprocessor expects RGB.
                    frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    finally:
        # Always free the decoder handle, even if resizing/decoding raised.
        cap.release()

    if num_frames > 0 and not frames:
        raise ValueError(f"No frames could be read from video: {video_path}")

    frames = np.array(frames)
    if len(frames) < num_frames:
        padding = num_frames - len(frames)
        # Pad only along the time axis with all-zero frames.
        frames = np.pad(frames, ((0, padding), (0, 0), (0, 0), (0, 0)), mode='constant')
    return frames

# Function to extract features using ViT
def extract_vit_features(frames):
    """Run the frames through the ViT backbone and return one vector per frame.

    The frames are preprocessed by the module-level ``feature_extractor`` and
    encoded by ``vit_model`` with gradient tracking disabled. Only the first
    token of the final hidden state (the [CLS] position) is kept for each
    frame.

    Args:
        frames: Iterable of frame images; it is materialised into a list
            before preprocessing.

    Returns:
        NumPy array holding the [CLS] embedding of every input frame.
    """
    frame_list = list(frames)
    encoded = feature_extractor(images=frame_list, return_tensors="pt")

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        hidden_states = vit_model(**encoded).last_hidden_state

    cls_embeddings = hidden_states[:, 0, :]  # token 0 is the [CLS] token
    return cls_embeddings.numpy()

# Load dataset
def load_dataset(dataset_path):
    """Build the feature/label arrays from a folder-per-class video dataset.

    Every visible sub-directory of ``dataset_path`` is treated as one action
    class; class indices follow the sorted directory names. Each regular file
    inside a class directory is decoded with ``load_video_frames`` and encoded
    with ``extract_vit_features``.

    Args:
        dataset_path: Root directory containing one sub-directory per class.

    Returns:
        Tuple ``(X, y, class_to_idx)`` where ``X`` stacks the per-video feature
        arrays, ``y`` holds the integer class index of each video, and
        ``class_to_idx`` maps class-directory name to that index.
    """
    X, y = [], []

    # Only real, non-hidden directories are classes. This ignores stray files
    # (README, Thumbs.db, ...) and tool folders such as .ipynb_checkpoints that
    # would otherwise crash os.listdir or become a bogus class.
    action_classes = sorted(
        entry
        for entry in os.listdir(dataset_path)
        if not entry.startswith(".")
        and os.path.isdir(os.path.join(dataset_path, entry))
    )
    class_to_idx = {action: idx for idx, action in enumerate(action_classes)}

    for action in action_classes:
        action_folder = os.path.join(dataset_path, action)
        # Sorted so sample order (and therefore any seeded split downstream)
        # does not depend on the filesystem's listing order.
        for video in sorted(os.listdir(action_folder)):
            video_path = os.path.join(action_folder, video)
            if not os.path.isfile(video_path):
                # Nested directories are not videos; skip them.
                continue
            frames = load_video_frames(video_path)
            features = extract_vit_features(frames)
            X.append(features)
            y.append(class_to_idx[action])

    return np.array(X), np.array(y), class_to_idx

# Load data
# The dataset root can be overridden with the HAR_DATASET_PATH environment
# variable so the notebook is not tied to one machine; the default is unchanged.
DATASET_PATH = os.environ.get(
    "HAR_DATASET_PATH",
    r"C:\Users\UTKARSH\Desktop\data science\dl\cvt\data",  # Update with your dataset path
)
X, y, class_to_idx = load_dataset(DATASET_PATH)

# Fail fast with a readable message: label indices >= NUM_CLASSES cannot be
# one-hot encoded below and would otherwise surface as an opaque index error.
if len(class_to_idx) > NUM_CLASSES:
    raise ValueError(
        f"Dataset contains {len(class_to_idx)} classes {sorted(class_to_idx)} "
        f"but NUM_CLASSES is {NUM_CLASSES}; update NUM_CLASSES to match."
    )

# Train-test split (80/20, fixed seed for a repeatable partition)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert labels to one-hot encoding
y_train = tf.keras.utils.to_categorical(y_train, num_classes=NUM_CLASSES)
y_test = tf.keras.utils.to_categorical(y_test, num_classes=NUM_CLASSES)

# Define the model
def create_model(num_frames=None, feature_dim=768, num_classes=None):
    """Build and compile the LSTM classifier that consumes per-frame ViT features.

    The network is: input of shape ``(num_frames, feature_dim)`` -> LSTM(128)
    -> Dense(128, relu) -> Dense(num_classes, softmax), compiled with the Adam
    optimizer, categorical cross-entropy loss and an accuracy metric.

    Args:
        num_frames: Sequence length of each sample. Defaults to the
            module-level ``NUM_FRAMES``.
        feature_dim: Size of each per-frame feature vector. Defaults to 768,
            the ViT hidden size used in this notebook.
        num_classes: Number of output classes. Defaults to the module-level
            ``NUM_CLASSES``.

    Returns:
        A compiled ``tf.keras.Model``.
    """
    # Resolve defaults at call time so later changes to the notebook constants
    # are picked up without redefining this function.
    if num_frames is None:
        num_frames = NUM_FRAMES
    if num_classes is None:
        num_classes = NUM_CLASSES

    input_layer = tf.keras.Input(shape=(num_frames, feature_dim))
    lstm_layer = tf.keras.layers.LSTM(128)(input_layer)
    dense_layer = tf.keras.layers.Dense(128, activation="relu")(lstm_layer)
    output_layer = tf.keras.layers.Dense(num_classes, activation="softmax")(dense_layer)

    model = tf.keras.Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
    return model

# Build the classifier
model = create_model()

# Fit on the training split; the held-out split is scored after every epoch
model.fit(
    x=X_train,
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_test, y_test),
)

# Score the held-out split once training has finished
test_loss, test_accuracy = model.evaluate(x=X_test, y=y_test)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

# Persist the trained model to disk
model.save("vit_har_model")  # Saves in SavedModel format




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 4.013370471511735e-06, Test Accuracy: 1.0




INFO:tensorflow:Assets written to: vit_har_model\assets


INFO:tensorflow:Assets written to: vit_har_model\assets


In [9]:
# Score the trained model on the held-out split (verbose output enabled)
# NOTE(review): a near-zero loss with 100% accuracy is unusual -- confirm the
# test split is large enough and representative before trusting this number.
test_loss, test_accuracy = model.evaluate(x=X_test, y=y_test, verbose=1)

# Report the loss as-is and the accuracy as a percentage with two decimals
print(
    f"Test Loss: {test_loss}",
    f"Test Accuracy: {test_accuracy * 100:.2f}%",
    sep="\n",
)


Test Loss: 4.013370471511735e-06
Test Accuracy: 100.00%
