In [None]:
## Project: Action Recognition using CNN + LSTM (PyTorch)
import sys
import warnings
warnings.filterwarnings("ignore")

from pathlib import Path
# Project root: works when run from the repo root, from notebooks/, or from any
# directory nested below the repo root.
def find_project_root(start):
    """Return the closest directory at or above ``start`` containing ``src/vllmd``.

    Args:
        start: Directory (``str`` or ``Path``) where the upward search begins.

    Returns:
        The first directory among ``start`` and its ancestors that has a
        ``src/vllmd`` entry. If none does, ``start.parent`` is returned, which
        matches the previous behaviour of assuming a ``notebooks/`` layout.
    """
    start = Path(start)
    for candidate in (start, *start.parents):
        if (candidate / "src" / "vllmd").exists():
            return candidate
    return start.parent


try:
    PROJECT_ROOT = Path(__file__).resolve().parent.parent
except NameError:
    # __file__ is undefined here (e.g. interactive execution), so locate the
    # root from the working directory instead.
    PROJECT_ROOT = find_project_root(Path.cwd())
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))
# Make the package importable as "vllmd" (package lives under src/vllmd)
_src = PROJECT_ROOT / "src"
if _src.exists() and str(_src) not in sys.path:
    sys.path.insert(0, str(_src))

import pandas as pd
import os
import numpy as np
import cv2
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import torch  # pyright: ignore[reportMissingImports]
import torch.nn as nn  # pyright: ignore[reportMissingImports]
from torch.utils.data import DataLoader  # pyright: ignore[reportMissingImports]
from torch.optim import Adam  # pyright: ignore[reportMissingImports]
import torchvision.models as models  # pyright: ignore[reportMissingImports]
from vllmd.video_processing import VideoDataProcessor, VideoDataset, ActionRecognitionPipeline, describe_frames_after_predict_each_frame

# Use the CUDA device when torch reports one as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Device:', device)

In [None]:
# Dataset, rule-file and checkpoint locations, all anchored at the project root
# (datasets live under data/). Each one is stored as a plain string.
DIRECTORY = str(PROJECT_ROOT.joinpath("data", "abnormal"))
NORMAL_DIRECTORY = str(PROJECT_ROOT.joinpath("data", "normal"))
PATH_VIDEO_TEST = str(PROJECT_ROOT.joinpath("data", "test", "19.mp4"))
RULE_PATH = str(PROJECT_ROOT.joinpath("data", "rules", "entity_rules.txt"))
PATH_MODEL = str(PROJECT_ROOT.joinpath("models", "best_model.pt"))

# Build the data processor over both dataset folders.
processor = VideoDataProcessor(
    DIRECTORY,
    NORMAL_DIRECTORY,
    frame_size=64,
    num_frames=40,
)

# Keep the processor's class list and frame size handy for the cells below.
Frame_Size = processor.frame_size
selected_classes = processor.selected_classes
print(selected_classes)

In [None]:
# Build the dataframe from the processor and preview its first rows.
train_df = processor.build_dataframe()
print(
    train_df.head()
)

In [None]:
# Extract frames without any resizing for a sample of videos per class (nothing is written to disk)
from PIL import Image

# Read frames for a bounded sample of videos from every selected class.
# Nothing is persisted: only the shape of the most recently read clip is kept.
PREVIEW_VIDEOS_PER_CLASS = 30

for cls in selected_classes:
    # "Normal Videos" lives under NORMAL_DIRECTORY; every other class lives
    # under DIRECTORY.
    base_dir = NORMAL_DIRECTORY if cls == "Normal Videos" else DIRECTORY
    class_path = os.path.join(base_dir, cls)
    # os.listdir returns entries in arbitrary order, so sort before slicing to
    # pick the same subset on every run.
    videos = sorted(os.listdir(class_path))[:PREVIEW_VIDEOS_PER_CLASS]
    for video in videos:
        video_path = os.path.join(class_path, video)
        frames = processor.extract_original_frames(video_path, num_frames=40)
        # Reading .shape also fails fast if the result is not array-like.
        original_frame_shape = frames.shape

In [None]:
# Load the arrays and split them into train/test parts with a fixed seed.
X_train, X_test, y_train, y_test = processor.load_training_arrays(
    videos_per_class=30,
    test_size=0.2,
    random_state=42,
)
print(f"Training: {len(X_train)} Testing: {len(X_test)}")

In [None]:
# Wrap the train/test arrays in datasets and batch them for training/validation.
BATCH_SIZE = 16
# Pinned (page-locked) host memory only matters for host-to-GPU copies, so it is
# requested only when running on CUDA, and then for both loaders.
use_pinned_memory = device.type == "cuda"

train_dataset = VideoDataset(X_train, y_train)
val_dataset = VideoDataset(X_test, y_test)
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,  # reshuffle training samples every epoch
    num_workers=0,
    pin_memory=use_pinned_memory,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,  # keep validation order stable
    num_workers=0,
    pin_memory=use_pinned_memory,
)

In [None]:
# One output class per selected category.
num_classes = len(selected_classes)

# Build the recognition pipeline on the chosen device and expose its model.
pipeline = ActionRecognitionPipeline(
    num_classes,
    device,
    frame_size=Frame_Size,
    num_frames=40,
)
model = pipeline.model

In [None]:
# Make sure the checkpoint directory exists before training starts, so a
# missing models/ folder cannot cause the save to fail after the epochs ran.
Path(PATH_MODEL).parent.mkdir(parents=True, exist_ok=True)

# Train for one epoch, passing the checkpoint location as save_path.
pipeline.train(train_loader, val_loader, epochs=1, save_path=str(PATH_MODEL))

In [None]:
# Load the checkpoint stored at PATH_MODEL into the pipeline.
pipeline.load(
    str(PATH_MODEL)
)
# Optional visual check of predictions on the test split:
# pipeline.show_predictions(X_test, y_test, selected_classes, num_samples=5)

In [None]:
# Run the pipeline over the test video; the returned frame_indices, frames and
# frames_original are reused by the description cells below.
(
    frame_indices,
    preds,
    probs,
    frames,
    frames_original,
) = pipeline.predict_each_frame(str(PATH_VIDEO_TEST), processor, selected_classes)

In [None]:
# External API image description (5 LLM models: OpenAI, Anthropic, Google)
from vllmd.llm import ExternalLLMImageDescriber, MODEL_NAMES

# Optional: set API keys here or via env (OPENAI_API_KEY, ANTHROPIC_API_KEY, GOOGLE_API_KEY)
# default_model alternatives include "gemini-2.0-flash", "gpt-4o-mini", etc.
describer = ExternalLLMImageDescriber(default_model="gemini-3-flash-preview")

# Pick up to num_frames_to_describe evenly spaced positions among the predicted
# frames (uses the outputs of the prediction cell above).
num_frames_to_describe = 4
show_indices = np.linspace(
    0,
    len(frame_indices) - 1,
    min(num_frames_to_describe, len(frame_indices)),
    dtype=int,
)
# Use frames_original when it is available, otherwise fall back to frames.
frame_source = frames if frames_original is None else frames_original
frames_to_describe = [frame_source[position] for position in show_indices]

# Ask the external model for a short description of every selected frame.
external_descriptions = []
for position, frame in zip(show_indices, frames_to_describe):
    desc = describer.describe_frame(frame, prompt="Describe this video frame in 2-3 short sentences.")
    external_descriptions.append(desc)
    print(f"Frame {frame_indices[position]} [{describer.default_model}]: {desc}")

In [None]:
# Display each described frame and its description in a two-column table
from vllmd.utils import display_frames_with_descriptions

# Look up the frame_indices entry for each described position, then render the
# frames next to their descriptions.
described_frame_numbers = [frame_indices[position] for position in show_indices[:len(frames_to_describe)]]
display_frames_with_descriptions(
    frames_to_describe,
    external_descriptions,
    frame_indices=described_frame_numbers,
)

In [None]:
# Custom entity extraction on frame descriptions (regex + keywords)
from vllmd.utils import load_rules_from_file
# Load the entity rules and run them over every frame description.
custom = load_rules_from_file(RULE_PATH, merge_overlaps=True)

custom_entities_per_frame = custom.extract_batch(external_descriptions, merge_duplicates=True)

print("Custom entities per frame description (ACTION / OBJECT / SCENE):")
for position, desc, entities in zip(show_indices, external_descriptions, custom_entities_per_frame):
    # Label each entry with frame_indices[position], the same lookup the
    # description and display cells use, instead of the running position 0..N-1.
    print(f"\nFrame {frame_indices[position]}: {desc}")
    if entities:
        print(f"  -> {[str(e) for e in entities]}")
    else:
        print("  -> (none)")