In [1]:
!pip install transformers decord numpy torch




[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
from transformers import VideoMAEForVideoClassification, VideoMAEImageProcessor
from decord import VideoReader
import numpy as np
import torch

# Load model and feature extractor
model = VideoMAEForVideoClassification.from_pretrained(
    r"F:\quest_digiflex\exp\tf\videomae-finetuned",  # or wherever your config.json and safetensors are stored
    local_files_only=True
)
feature_extractor = VideoMAEImageProcessor.from_pretrained(
    "MCG-NJU/videomae-base"
)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()


VideoMAEForVideoClassification(
  (videomae): VideoMAEModel(
    (embeddings): VideoMAEEmbeddings(
      (patch_embeddings): VideoMAEPatchEmbeddings(
        (projection): Conv3d(3, 768, kernel_size=(2, 16, 16), stride=(2, 16, 16))
      )
    )
    (encoder): VideoMAEEncoder(
      (layer): ModuleList(
        (0-11): 12 x VideoMAELayer(
          (attention): VideoMAESdpaAttention(
            (attention): VideoMAESdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=False)
              (key): Linear(in_features=768, out_features=768, bias=False)
              (value): Linear(in_features=768, out_features=768, bias=False)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): VideoMAESelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): VideoMAEIntermediate(
            (den

In [6]:
def predict_video_class(video_path, model, extractor, num_frames=16, id2label=None):
    vr = VideoReader(video_path)
    total_frames = len(vr)
    frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
    frames = vr.get_batch(frame_indices).asnumpy()  # shape: (num_frames, H, W, 3)

    # Extract features
    inputs = extractor(list(frames), return_tensors="pt")
    pixel_values = inputs["pixel_values"].to(device)

    # Forward pass
    with torch.no_grad():
        outputs = model(pixel_values=pixel_values)
        logits = outputs.logits
        predicted_class = logits.argmax(-1).item()

    # Decode class
    if id2label:
        return id2label[predicted_class]
    return predicted_class


In [7]:
video_path = r"F:\quest_digiflex\exp\tf\videomae-finetuned\Bodyweight Squats.mp4"

# You must define this mapping (same used during training)
id2label = {0: "jumping_jack", 1: "squat", 2: "pushup", ...}  # example

predicted_class = predict_video_class(video_path, model, feature_extractor, id2label=id2label)
print("Predicted class:", predicted_class)


SyntaxError: ':' expected after dictionary key (2496193448.py, line 4)

In [8]:
from transformers import VideoMAEForVideoClassification, VideoMAEImageProcessor, AutoConfig
from decord import VideoReader
import numpy as np
import torch
import os

# --- Load Model, Config, Feature Extractor ---
model_dir = r"F:/quest_digiflex/exp/tf/videomae-finetuned"

# Make sure model_10.safetensor is renamed to model.safetensors before running this
model = VideoMAEForVideoClassification.from_pretrained(model_dir, local_files_only=True)
feature_extractor = VideoMAEImageProcessor.from_pretrained("MCG-NJU/videomae-base")
config = AutoConfig.from_pretrained(model_dir, local_files_only=True)
id2label = config.id2label  # Automatically retrieved from saved config

# Setup device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# --- Inference Function ---
def predict_video_class(video_path, model, extractor, num_frames=16, id2label=None):
    vr = VideoReader(video_path)
    total_frames = len(vr)
    
    if total_frames < num_frames:
        raise ValueError(f"Video has only {total_frames} frames, but {num_frames} are required.")

    frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
    frames = vr.get_batch(frame_indices).asnumpy()  # (num_frames, H, W, 3)

    inputs = extractor(list(frames), return_tensors="pt")
    pixel_values = inputs["pixel_values"].to(device)

    with torch.no_grad():
        outputs = model(pixel_values=pixel_values)
        logits = outputs.logits
        predicted_class = logits.argmax(-1).item()

    return id2label[predicted_class] if id2label else predicted_class

# --- Predict on Unseen Video ---
video_path = r"F:\quest_digiflex\exp\tf\videomae-finetuned\Bodyweight Squats.mp4"  # Update this to your actual video path

predicted_label = predict_video_class(video_path, model, feature_extractor, id2label=id2label)
print("✅ Predicted Exercise Class:", predicted_label)


✅ Predicted Exercise Class: squat


In [9]:
# --- Predict on Unseen Video ---
video_path = r"F:\quest_digiflex\exp\tf\videomae-finetuned\test\pull Up\pull up_1.mp4"  # Update this to your actual video path

predicted_label = predict_video_class(video_path, model, feature_extractor, id2label=id2label)
print("✅ Predicted Exercise Class:", predicted_label)


✅ Predicted Exercise Class: tricep dips


In [10]:
# --- Predict on Unseen Video ---
video_path = r"F:\quest_digiflex\exp\tf\videomae-finetuned\test\pull Up\pull up_2.mp4"  # Update this to your actual video path

predicted_label = predict_video_class(video_path, model, feature_extractor, id2label=id2label)
print("✅ Predicted Exercise Class:", predicted_label)


✅ Predicted Exercise Class: pull Up


In [11]:
# --- Predict on Unseen Video ---
video_path = r"F:\quest_digiflex\exp\tf\videomae-finetuned\test\push-up\push-up_1.mp4"  # Update this to your actual video path

predicted_label = predict_video_class(video_path, model, feature_extractor, id2label=id2label)
print("✅ Predicted Exercise Class:", predicted_label)


✅ Predicted Exercise Class: russian twist


In [12]:
# --- Predict on Unseen Video ---
video_path = r"F:\quest_digiflex\exp\tf\videomae-finetuned\test\push-up\video15.mp4"  # Update this to your actual video path

predicted_label = predict_video_class(video_path, model, feature_extractor, id2label=id2label)
print("✅ Predicted Exercise Class:", predicted_label)


✅ Predicted Exercise Class: push-up


In [15]:
import os
from glob import glob
import pandas as pd
from collections import Counter

# --- Test Directory ---
test_root = r"F:\quest_digiflex\exp\tf\videomae-finetuned\test"

# --- Store results ---
results = []

# --- Process each video ---
for actual_class in os.listdir(test_root):
    class_path = os.path.join(test_root, actual_class)
    if not os.path.isdir(class_path):
        continue

    video_files = glob(os.path.join(class_path, "*.mp4"))

    for video_path in video_files:
        try:
            predicted_label = predict_video_class(video_path, model, feature_extractor, id2label=id2label)
            video_name = os.path.basename(video_path)
            results.append({
                "video": video_name,
                "actual_class": actual_class,
                "predicted_class": predicted_label
            })
            print(f"✅ {video_name} | Actual: {actual_class} --> Predicted: {predicted_label}")
        except Exception as e:
            print(f"⚠️ Error processing {video_path}: {e}")
            continue

# --- Convert to DataFrame ---
df_results = pd.DataFrame(results)

# --- Accuracy Analysis ---
df_results["is_correct"] = df_results["actual_class"] == df_results["predicted_class"]

total_videos = len(df_results)
correct_preds = df_results["is_correct"].sum()
incorrect_preds = total_videos - correct_preds

# --- Most misclassified class ---
incorrect_df = df_results[df_results["is_correct"] == False]
most_misclassified_class = incorrect_df["actual_class"].value_counts().idxmax() if not incorrect_df.empty else None

# --- Output Summary ---
print("\n=== Prediction Summary ===")
print(df_results)

print("\n=== Statistics ===")
print(f"🎞️ Total videos processed     : {total_videos}")
print(f"✅ Correct predictions        : {correct_preds}")
print(f"❌ Incorrect predictions      : {incorrect_preds}")
if most_misclassified_class:
    print(f"⚠️ Most misclassified class   : {most_misclassified_class}")

# --- Optional: Save to CSV ---
# df_results.to_csv("video_predictions_with_accuracy.csv", index=False)


✅ barbell biceps curl_1.mp4 | Actual: barbell biceps curl --> Predicted: barbell biceps curl
✅ video1.mp4 | Actual: barbell biceps curl --> Predicted: barbell biceps curl
✅ bench press_1.mp4 | Actual: bench press --> Predicted: lat pulldown
✅ bench press_2.mp4 | Actual: bench press --> Predicted: bench press
✅ video2.mp4 | Actual: bench press --> Predicted: bench press
✅ chest fly machine_1.mp4 | Actual: chest fly machine --> Predicted: chest fly machine
✅ chest fly machine_2.mp4 | Actual: chest fly machine --> Predicted: barbell biceps curl
✅ chest fly machine_3.mp4 | Actual: chest fly machine --> Predicted: chest fly machine
✅ video3.mp4 | Actual: chest fly machine --> Predicted: chest fly machine
✅ deadlift_1.mp4 | Actual: deadlift --> Predicted: russian twist
✅ video4.mp4 | Actual: deadlift --> Predicted: tricep dips
✅ declince bench press_1.mp4 | Actual: decline bench press --> Predicted: bench press
✅ decline bench press_2.mp4 | Actual: decline bench press --> Predicted: bench pr

In [14]:
import os
from glob import glob
import pandas as pd
from collections import Counter

# --- Test Directory ---
test_root = r"F:\quest_digiflex\exp\data\archive"

# --- Store results ---
results = []

# --- Process each video ---
for actual_class in os.listdir(test_root):
    class_path = os.path.join(test_root, actual_class)
    if not os.path.isdir(class_path):
        continue

    video_files = glob(os.path.join(class_path, "*.mp4"))

    for video_path in video_files:
        try:
            predicted_label = predict_video_class(video_path, model, feature_extractor, id2label=id2label)
            video_name = os.path.basename(video_path)
            results.append({
                "video": video_name,
                "actual_class": actual_class,
                "predicted_class": predicted_label
            })
            print(f"✅ {video_name} | Actual: {actual_class} --> Predicted: {predicted_label}")
        except Exception as e:
            print(f"⚠️ Error processing {video_path}: {e}")
            continue

# --- Convert to DataFrame ---
df_results = pd.DataFrame(results)

# --- Accuracy Analysis ---
df_results["is_correct"] = df_results["actual_class"] == df_results["predicted_class"]

total_videos = len(df_results)
correct_preds = df_results["is_correct"].sum()
incorrect_preds = total_videos - correct_preds

# --- Most misclassified class ---
incorrect_df = df_results[df_results["is_correct"] == False]
most_misclassified_class = incorrect_df["actual_class"].value_counts().idxmax() if not incorrect_df.empty else None

# --- Output Summary ---
print("\n=== Prediction Summary ===")
print(df_results)

print("\n=== Statistics ===")
print(f"🎞️ Total videos processed     : {total_videos}")
print(f"✅ Correct predictions        : {correct_preds}")
print(f"❌ Incorrect predictions      : {incorrect_preds}")
if most_misclassified_class:
    print(f"⚠️ Most misclassified class   : {most_misclassified_class}")

# --- Optional: Save to CSV ---
# df_results.to_csv("video_predictions_with_accuracy.csv", index=False)


✅ barbell biceps curl_1.mp4 | Actual: barbell biceps curl --> Predicted: barbell biceps curl
✅ barbell biceps curl_10.mp4 | Actual: barbell biceps curl --> Predicted: tricep Pushdown
✅ barbell biceps curl_11.mp4 | Actual: barbell biceps curl --> Predicted: barbell biceps curl
✅ barbell biceps curl_12.mp4 | Actual: barbell biceps curl --> Predicted: barbell biceps curl
✅ barbell biceps curl_13.mp4 | Actual: barbell biceps curl --> Predicted: barbell biceps curl
✅ barbell biceps curl_14.mp4 | Actual: barbell biceps curl --> Predicted: barbell biceps curl
✅ barbell biceps curl_15.mp4 | Actual: barbell biceps curl --> Predicted: barbell biceps curl
✅ barbell biceps curl_16.mp4 | Actual: barbell biceps curl --> Predicted: barbell biceps curl
✅ barbell biceps curl_17.mp4 | Actual: barbell biceps curl --> Predicted: barbell biceps curl
✅ barbell biceps curl_18.mp4 | Actual: barbell biceps curl --> Predicted: barbell biceps curl
✅ barbell biceps curl_19.mp4 | Actual: barbell biceps curl --> Pr

Exception ignored in: <function VideoReader.__del__ at 0x00000190096C2D40>
Traceback (most recent call last):
  File "C:\Users\prana\AppData\Local\Programs\Python\Python312\Lib\site-packages\decord\video_reader.py", line 67, in __del__
    _CAPI_VideoReaderFree(self._handle)
  File "C:\Users\prana\AppData\Local\Programs\Python\Python312\Lib\site-packages\decord\_ffi\_ctypes\function.py", line 173, in __call__
    check_call(_LIB.DECORDFuncCall(
               ^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt: 


✅ bench press_16.mp4 | Actual: bench press --> Predicted: bench press
✅ bench press_17.mp4 | Actual: bench press --> Predicted: bench press
✅ bench press_18.mp4 | Actual: bench press --> Predicted: bench press
✅ bench press_19.mp4 | Actual: bench press --> Predicted: bench press
✅ bench press_2.mp4 | Actual: bench press --> Predicted: bench press
✅ bench press_20.mp4 | Actual: bench press --> Predicted: bench press
✅ bench press_21.mp4 | Actual: bench press --> Predicted: bench press
✅ bench press_22.mp4 | Actual: bench press --> Predicted: bench press
✅ bench press_23.mp4 | Actual: bench press --> Predicted: bench press
✅ bench press_24.mp4 | Actual: bench press --> Predicted: bench press
✅ bench press_25.mp4 | Actual: bench press --> Predicted: bench press


KeyboardInterrupt: 