In [None]:
# Mount Google Drive at /content/drive; the paths used in the following cells
# (project folder, frame folders) all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Work from the project folder on Drive: the relative paths used later
# (classification_summary.csv, processed.json, features/) resolve against it.
# NOTE(review): main_vectorizer is presumably imported from this folder too — confirm.
%cd /content/drive/MyDrive/video_vectorizer/

In [None]:
!pip install ultralytics
!pip install -q insightface onnxruntime-gpu
!pip install easyocr

In [None]:
import os
from google.colab import files
import tqdm
import json
import numpy as np
import pandas as pd
from main_vectorizer import vectorize_video

In [None]:
# Accumulators kept as-is; nothing in the visible cells reads them.
# TODO(review): confirm no other cell uses them, then remove.
X_list = []
y_list = []
feature_names = None

# Constants
FRAMES_ROOT = "/content/drive/MyDrive/all_videos_frames/"  # holds one frame folder per video
CSV_PATH = "classification_summary.csv"                     # metadata table (labels, music, user, text)
MANIFEST_PATH = "processed.json"                            # JSON list of folders already vectorized
OUT_DIR = "features"                                        # destination of the per-video .npz files
os.makedirs(OUT_DIR, exist_ok=True)

# Load Metadata
# Every column is read as a string and missing cells become "".
meta_df = pd.read_csv(CSV_PATH, dtype=str).fillna('')
# Lookup tables keyed by video_id
label_map = dict(zip(meta_df.video_id, meta_df.final_classification))
music_map = dict(zip(meta_df.video_id, meta_df.music_id))
username_map = dict(zip(meta_df.video_id, meta_df.username))
description_map = dict(zip(meta_df.video_id, meta_df.description))

# Video Folders (sub-directories of FRAMES_ROOT only, in sorted order)
video_dirs = sorted(os.listdir(FRAMES_ROOT))
video_dirs = [d for d in video_dirs if os.path.isdir(os.path.join(FRAMES_ROOT, d))]

# Load/Create manifest
# The manifest lets an interrupted run resume without redoing finished folders.
if os.path.exists(MANIFEST_PATH):
    # Context manager closes the file right away instead of leaking the handle.
    with open(MANIFEST_PATH) as manifest_file:
        processed = set(json.load(manifest_file))
    print(f"{len(processed)} videos already done")
else:
    processed = set()

# Process a folder of video frames
def vectorize_vid(folder_name, video_id):
    """Vectorize the frames of one video and pair the result with its label.

    Args:
        folder_name: Name of the video's frame folder inside FRAMES_ROOT.
        video_id: Key used to look the video up in the metadata maps.

    Returns:
        A ``(vector, label, feature_names)`` tuple on success, or ``None``
        when ``video_id`` has no entry in ``label_map`` or when
        ``vectorize_video`` raises (the error is printed, not propagated).
    """
    frames_dir = os.path.join(FRAMES_ROOT, folder_name)

    # Videos without a label cannot be used, so they are skipped up front.
    if video_id not in label_map:
        return None

    # Missing metadata falls back to an empty string. The tuple order
    # (description, username, music id) is the positional order that
    # vectorize_video is called with.
    text_fields = [
        lookup.get(video_id, "")
        for lookup in (description_map, username_map, music_map)
    ]

    outcome = None
    try:
        vector, names = vectorize_video(frames_dir, *text_fields)
        outcome = (vector, label_map[video_id], names)
    except Exception as e:
        # Best effort: one failing video must not abort the whole batch.
        print(f"[{video_id}] Error: {e}")
    return outcome

# Loop over all videos, process and save them
for folder in tqdm.tqdm(video_dirs, desc="vectorizing"):
    if folder in processed:
        continue # skip already processed videos

    video_id = folder.replace("_frames", "")
    result = vectorize_vid(folder, video_id)
    if result is None:
        # Unlabeled video or vectorization error: not added to the manifest,
        # so it is attempted again on the next run.
        continue

    vector, label, feat_names = result
    out_path = f"{OUT_DIR}/{folder}.npz"
    tmp_path = f"{out_path}.tmp"

    # atomic 1-file-per-video save: write under a temporary name, then rename.
    # An interrupted run therefore never leaves a truncated .npz behind (the
    # ".tmp" suffix is also not matched by a "*.npz" glob). A file object is
    # passed because savez_compressed appends ".npz" to plain string paths.
    with open(tmp_path, "wb") as npz_file:
        np.savez_compressed(
            npz_file,
            X=vector,
            y=np.array([label]),
            feature_names=np.array(feat_names),
            video_id=np.array([video_id])
        )
    os.replace(tmp_path, out_path)

    # update manifest on Drive, also via write-then-rename so a crash during
    # the write cannot corrupt the existing manifest
    processed.add(folder)
    manifest_tmp = f"{MANIFEST_PATH}.tmp"
    with open(manifest_tmp, "w") as jf:
        json.dump(list(processed), jf)
    os.replace(manifest_tmp, MANIFEST_PATH)

# Count only folders that exist now; the manifest may also list folders from
# earlier runs that are no longer present under FRAMES_ROOT.
done_count = sum(1 for name in video_dirs if name in processed)
print(f"Done. {done_count}/{len(video_dirs)} videos processed.")


In [None]:
import glob

FEATURE_DIR   = "features"              # per-video .npz files are read from here
OUT_NPZ       = "all_features.npz"      # merged arrays
OUT_PARQUET   = "all_features.parquet"  # merged table

Xs, ys, ids = [], [], []

# Gather all per-video files.
# glob returns paths in arbitrary order; sorting makes the row order of the
# merged table reproducible between runs.
feature_paths = sorted(glob.glob(f"{FEATURE_DIR}/*.npz"))
if not feature_paths:
    # Fail with a clear message instead of an opaque error from np.vstack([]).
    raise FileNotFoundError(f"No .npz feature files found in {FEATURE_DIR!r}")

for path in feature_paths:
    # np.load keeps the archive open; the context manager closes it once the
    # arrays have been read, so file handles do not pile up.
    with np.load(path) as d:
        Xs.append(d["X"])          # expected shape (feature_dim,) — TODO confirm
        ys.append(d["y"])          # shape (1,)
        ids.append(d["video_id"])  # shape (1,)

# Merge into one big table
X_all   = np.vstack(Xs)            # (N, feature_dim)
y_all   = np.hstack(ys)            # (N,)
ids_all = np.hstack(ids)           # (N,)

print("Merged:", X_all.shape, y_all.shape, ids_all.shape)

# Save in npz format
np.savez_compressed(OUT_NPZ,
                    X=X_all,
                    y=y_all,
                    video_id=ids_all)

# Save as Parquet table: video_id and label first, then one column per feature.
# NOTE(review): columns are named f0..fN-1; the feature_names array stored in
# each per-video file is not used here.
df = pd.DataFrame(X_all, columns=[f"f{i}" for i in range(X_all.shape[1])])
df.insert(0, "video_id", ids_all)
df.insert(1, "label",    y_all)
df.to_parquet(OUT_PARQUET, compression="zstd")

# Download final tables
files.download(OUT_NPZ)
files.download(OUT_PARQUET)
