In [None]:
import os
import cv2
import json
from PIL import Image
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig, AutoProcessor, AutoModelForVision2Seq
import torch

In [None]:
def extract_frames(video_path, output_folder):
    """Save the first decoded frame of every second of a video as a JPEG.

    Frames are written to ``output_folder`` (created if missing) as
    ``frame_<sec>s.jpg`` where ``<sec>`` is the zero-padded whole second
    reported by the capture for that frame.

    Args:
        video_path: Path of the video file handed to ``cv2.VideoCapture``.
        output_folder: Directory that receives the JPEG files.

    Returns:
        The number of frames written.

    Raises:
        RuntimeError: If the video cannot be opened or a frame cannot be
            written to disk.
    """
    os.makedirs(output_folder, exist_ok=True)
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise RuntimeError(f"Could not open {video_path}")

    prev_sec = -1
    saved = 0

    # try/finally so the capture handle is released even if decoding or
    # writing fails part-way through the video.
    try:
        while True:
            success, frame = cap.read()
            if not success:
                break

            # get current position in milliseconds
            ms = cap.get(cv2.CAP_PROP_POS_MSEC)
            sec = int(ms // 1000)

            # save first frame of each new second
            if sec > prev_sec:
                prev_sec = sec
                fname = os.path.join(output_folder, f"frame_{sec:03d}s.jpg")
                # imwrite signals failure through its return value rather than
                # an exception; surface it instead of counting a missing file.
                if not cv2.imwrite(fname, frame):
                    raise RuntimeError(f"Could not write {fname}")
                saved += 1
    finally:
        cap.release()

    print(f"Extracted {saved} frames — one per second.")
    return saved

In [None]:
# Abandoned approach, kept for reference only (the code below is disabled by wrapping it in a string literal): this workflow was far too slow, taking 238 minutes to process a 26-second clip.
'''bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

pipe = pipeline(
    "image-text-to-text",
    model="llava-hf/llava-1.5-7b-hf",
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.float16,
    quantization_config=bnb_config
)

def caption_frames(frames_dir="frames", output_json="captions.json"):
    captions = {}
    for fname in sorted(os.listdir(frames_dir)):
        if not fname.lower().endswith(".jpg"):
            continue

        img_path = os.path.join(frames_dir, fname)
        img = Image.open(img_path).convert("RGB")

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": img},
                    {"type": "text",  "text": "Describe the student's facial expression, gaze direction, and body posture in specific terms."}
                ]
            }
        ]

        out = pipe(text=messages, max_new_tokens=50, num_beams=3)[0]
        caption = out["generated_text"]
        assistant_msgs = [m["content"] for m in caption if m.get("role")=="assistant"]

        captions[fname] = assistant_msgs
        print(f"{fname} → {assistant_msgs}")

    # 3️⃣ Save all captions
    with open(output_json, "w") as f:
        json.dump(captions, f, indent=2)

    return captions'''


In [None]:
# Kept in this cell so the cell still runs on its own after a kernel restart.
from contextlib import nullcontext

# Vision-language checkpoint used to caption individual frames.
CAPTION_MODEL_ID = "HuggingFaceTB/SmolVLM-500M-Instruct"

processor = AutoProcessor.from_pretrained(CAPTION_MODEL_ID)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

if DEVICE == "cuda":
    # Half precision halves the VRAM needed for the weights.
    MODEL_DTYPE = torch.float16
    # torch.amp.autocast("cuda", ...) is the current spelling of the
    # deprecated torch.cuda.amp.autocast(...).
    autocast_ctx = torch.amp.autocast("cuda", dtype=torch.float16)
else:
    # CPU fallback: keep full-precision weights (fp16 kernels are slow or
    # missing on CPU) and make the shared context a no-op.
    MODEL_DTYPE = torch.float32
    autocast_ctx = nullcontext()

model = AutoModelForVision2Seq.from_pretrained(
    CAPTION_MODEL_ID,
    torch_dtype=MODEL_DTYPE,
    _attn_implementation="eager",
).to(DEVICE)
def caption_frames(frames_dir="frames", output_json="captions.json"):
    """Caption every ``.jpg`` frame in a directory and save the results as JSON.

    Uses the notebook-level ``processor``, ``model`` and ``DEVICE``. Frames
    are processed in sorted file-name order; files without a ``.jpg``
    extension (case-insensitive) are skipped.

    Args:
        frames_dir: Directory containing the frame images.
        output_json: Path of the JSON file that receives the
            ``{file name: caption}`` mapping.

    Returns:
        dict mapping each frame's file name to its generated caption (the same
        mapping that is written to ``output_json``).
    """
    # NOTE(review): the prompt begins and ends with typographic quote
    # characters that look like a pasted triple-quote. Left untouched here
    # because altering the prompt alters the model's output.
    instruction = "““Describe **all** observable facial muscle states, eye openness, head tilt, torso posture, and any hand/arm positions. Avoid repeating defaults—if a cue isn’t present, say ‘none.’ For each category, use concrete, frame-specific language (e.g., ‘left eyebrow slightly raised,’ ‘right eye half-closed,’ ‘chin tucked,’ ‘leaning forward on elbows,’ ‘right hand supporting chin’).”””"

    captions = {}
    for fname in sorted(os.listdir(frames_dir)):
        if not fname.lower().endswith(".jpg"):
            continue

        img_path = os.path.join(frames_dir, fname)
        # convert() returns an independent image, so the file handle can be
        # closed as soon as the pixels have been copied.
        with Image.open(img_path) as src:
            img = src.convert("RGB")

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": img},
                    {"type": "text",  "text": instruction}
                ]
            }
        ]
        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
        inputs = processor(text=prompt, images=[img], return_tensors="pt")
        inputs = {k: v.to(DEVICE) if isinstance(v, torch.Tensor) else v
                  for k, v in inputs.items()}
        prompt_len = inputs["input_ids"].shape[-1]

        # Autocast with the device's default low-precision dtype
        # (float16 on CUDA, bfloat16 on CPU per the PyTorch defaults).
        with torch.amp.autocast(DEVICE):
            generated_ids = model.generate(**inputs, max_new_tokens=500)

        # generate() echoes the prompt tokens before the reply. Slice them off
        # instead of searching the decoded text for an "Assistant: " marker,
        # which stored the whole prompt whenever the marker was absent.
        reply_ids = generated_ids[:, prompt_len:]
        assistant_reply = processor.batch_decode(
            reply_ids,
            skip_special_tokens=True,
        )[0].strip()

        captions[fname] = assistant_reply
        print(f"{fname} : {assistant_reply}")

    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(captions, f, indent=2)

    return captions

In [None]:
# Authenticate this session with the Hugging Face Hub. login() is called
# without arguments, so no access token appears in the notebook source.
# NOTE(review): presumably needed to download the Gemma checkpoint that
# run_llm_analysis loads by default — confirm that the model is gated.
from huggingface_hub import login
login()

In [None]:
def run_llm_analysis(caption_file, model_id="google/gemma-3-1b-it"):
    """Ask a text LLM whether the captioned student looks focused or distracted.

    Loads the ``{frame name: caption}`` mapping from ``caption_file``, lists
    every entry in a single prompt, generates an answer with beam search and
    prints it.

    Args:
        caption_file: Path of the JSON file holding the per-frame captions.
        model_id: Hugging Face model id of the causal LM to load.

    Returns:
        The generated analysis text (also printed).
    """
    with open(caption_file, "r", encoding="utf-8") as f:
        captions = json.load(f)

    frame_lines = "\n".join(f"{k}: {v}" for k, v in captions.items())
    prompt = f"""You are analyzing student behavior from video frames.
Here are the frame-wise captions:
{frame_lines}

Is the student focused or distracted? Justify your answer in one paragraph.
"""

    # Half precision only where a GPU can use it; fp16 on CPU is slow or
    # unsupported for many ops.
    dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # Named `llm` so it cannot be confused with the notebook-level captioning
    # model, which is also called `model`.
    llm = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype, device_map="auto")
    device = next(llm.parameters()).device

    if getattr(tokenizer, "chat_template", None):
        # Instruction-tuned checkpoints expect their own turn markers; the
        # chat template adds them plus the generation prompt.
        inputs = tokenizer.apply_chat_template(
            [{"role": "user", "content": prompt}],
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
        ).to(device)
    else:
        # Base models without a chat template get the raw prompt.
        inputs = tokenizer(prompt, return_tensors="pt").to(device)

    prompt_len = inputs["input_ids"].shape[-1]
    outputs = llm.generate(
        **inputs,
        max_new_tokens=200,
        num_beams=4,
        early_stopping=True,
        no_repeat_ngram_size=2,
        repetition_penalty=1.1,
    )
    # Drop the echoed prompt tokens and decode only the newly generated ones.
    gen_ids = outputs[0, prompt_len:]
    result = tokenizer.decode(gen_ids, skip_special_tokens=True)

    print("\nLLM Analysis Result:\n", result)
    return result

In [None]:
def full_pipeline(video_path, frames_dir="frames", captions_file="captions.json"):
    """Run frame extraction, captioning and LLM analysis for one video.

    Args:
        video_path: Path of the video to analyse.
        frames_dir: Working directory for the extracted frames.
        captions_file: Path of the JSON file that receives the captions and
            is then read back for the analysis.

    Returns:
        Whatever ``run_llm_analysis`` returns for ``captions_file``.
    """
    # Remove frames left behind by an earlier run. Otherwise frames from a
    # previous, longer video would stay in the directory and be captioned as
    # if they belonged to this one. Only files matching the frame_<sec>s.jpg
    # naming scheme used by extract_frames are deleted.
    if os.path.isdir(frames_dir):
        for name in os.listdir(frames_dir):
            if name.startswith("frame_") and name.endswith("s.jpg"):
                os.remove(os.path.join(frames_dir, name))

    extract_frames(video_path, frames_dir)
    caption_frames(frames_dir, captions_file)
    return run_llm_analysis(captions_file)

In [None]:
# Run extraction, captioning and analysis end to end on testingVideo.mp4.
full_pipeline("testingVideo.mp4")