# Interpretation

### Load videos + clusters

In [22]:
import pandas as pd 
from pathlib import Path

# Cluster assignments; the rows carry video_id, cluster and url
clusters = pd.read_csv("cluster_links.csv")

# Folder that holds one .mp4 per video_id
videos_dir = Path("..", "Data", "Videos")


def _video_file(video_id):
    """Return the expected .mp4 location for one video_id."""
    return videos_dir / f"{video_id}.mp4"


clusters["video_path"] = clusters["video_id"].map(_video_file)

clusters.head(10)

Unnamed: 0,video_id,cluster,url,video_path
0,challenge Shorts13,0,https://www.youtube.com/shorts/5Ul5w2owi-c,..\Data\Videos\challenge Shorts13.mp4
1,challenge Shorts16,0,https://www.youtube.com/shorts/7Wfqv7v1NUk,..\Data\Videos\challenge Shorts16.mp4
2,challenge Shorts19,0,https://www.youtube.com/shorts/cLhYKrMhOO0,..\Data\Videos\challenge Shorts19.mp4
3,challenge Shorts9,0,https://www.youtube.com/shorts/XN9UWfzYm64,..\Data\Videos\challenge Shorts9.mp4
4,funny Shorts10,0,https://www.youtube.com/shorts/FSHKhK16NH4,..\Data\Videos\funny Shorts10.mp4
5,funny Shorts4,0,https://www.youtube.com/shorts/kKr3dczgIGk,..\Data\Videos\funny Shorts4.mp4
6,meme Shorts7,0,https://www.youtube.com/shorts/0ZzMS-PNKIs,..\Data\Videos\meme Shorts7.mp4
7,recipe Shorts11,0,https://www.youtube.com/shorts/zJ6J0_-FHSY,..\Data\Videos\recipe Shorts11.mp4
8,recipe Shorts15,0,https://www.youtube.com/shorts/18gdBoDT0Rk,..\Data\Videos\recipe Shorts15.mp4
9,recipe Shorts20,0,https://www.youtube.com/shorts/HhHkKyu2xwQ,..\Data\Videos\recipe Shorts20.mp4


### Pacing for every video



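Measure motion as the mean absolute pixel change between consecutive sampled grayscale frames, and cut rate as the number of sampled-frame differences that exceed a threshold, expressed per minute of video.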

In [None]:
import av
import numpy as np

def pacing_metrics(video_path, fps_sample=2, diff_thresh=25):
    """Compute simple pacing statistics for one video file.

    Frames are decoded in order and roughly ``fps_sample`` frames per second
    are kept. Each kept frame is converted to grayscale and compared with the
    previously kept frame using the mean absolute pixel difference.

    Parameters
    ----------
    video_path : str or path-like
        Location of the video; converted with ``str()`` and passed to ``av.open``.
    fps_sample : int or float, default 2
        Target number of sampled frames per second of video.
    diff_thresh : int or float, default 25
        A sampled-frame difference strictly above this value counts as a cut.

    Returns
    -------
    dict
        ``motion_mean``: mean of the sampled-frame differences
        (0.0 when fewer than two frames were sampled).
        ``cut_rate_per_min``: number of cuts divided by the duration in minutes.
        ``duration_s``: decoded frame count divided by the stream's average
        frame rate (30.0 is assumed when the stream reports no rate).
    """
    # The context manager closes the container even if decoding raises;
    # without it every processed video leaves an open file handle behind.
    with av.open(str(video_path)) as container:
        stream = container.streams.video[0]
        src_fps = float(stream.average_rate) if stream.average_rate else 30.0
        # Keep every `step`-th frame to approximate the requested sample rate
        step = max(1, int(src_fps / fps_sample))

        prev = None
        diffs = []
        cuts = 0
        total_frames = 0

        for frame_idx, frame in enumerate(container.decode(video=0)):
            total_frames = frame_idx + 1
            # Skip frames that do not fall on the sampling grid
            if frame_idx % step != 0:
                continue

            # Grayscale is enough to measure change; float32 avoids uint8 wrap-around
            gray = frame.to_ndarray(format="gray").astype(np.float32)
            if prev is not None:
                # Average pixel difference to the previously sampled frame
                diff = np.mean(np.abs(gray - prev))
                diffs.append(diff)
                # A very different frame is counted as a change of shot
                if diff > diff_thresh:
                    cuts += 1
            prev = gray

    duration_s = total_frames / src_fps if src_fps > 0 else 0.0
    return {
        "motion_mean": float(np.mean(diffs)) if diffs else 0.0,
        # The floor on the denominator avoids dividing by zero for empty videos
        "cut_rate_per_min": float(cuts / max(duration_s / 60.0, 1e-6)),
        "duration_s": float(duration_s),
    }

# Collect one pacing record per row of `clusters`
n_videos = len(clusters)
metrics = []
video_columns = zip(clusters["video_id"], clusters["video_path"])
for position, (video_id, video_path) in enumerate(video_columns, start=1):
    print(f"{position}/{n_videos} {video_id}")
    metrics.append({**pacing_metrics(video_path), "video_id": video_id})

metrics_df = pd.DataFrame(metrics)

# Attach the metrics to the cluster table, then drop the helper columns
# (duration and local file path) that are not needed in the saved table
out_df = (
    clusters
    .merge(metrics_df, on="video_id", how="left")
    .drop(columns=["duration_s", "video_path"], errors="ignore")
)

# Save the full table
out_df.to_csv("interpretation.csv", index=False)




Visualize the pacing in each cluster

In [None]:
import matplotlib.pyplot as plt

df = pd.read_csv("interpretation.csv")


def plot_metric_by_cluster(data, column, title, ylabel):
    """Show a boxplot of `column` grouped by the "cluster" column.

    The axes are created first and handed to pandas via ``ax=``. Without it,
    ``DataFrame.boxplot(..., by=...)`` opens its own figure, so a preceding
    ``plt.figure(figsize=...)`` is left empty and its size is never applied.
    """
    fig, ax = plt.subplots(figsize=(10, 4))
    data.boxplot(column=column, by="cluster", ax=ax)
    # pandas adds a "Boxplot grouped by ..." super-title; blank it out
    fig.suptitle("")
    ax.set_title(title)
    ax.set_xlabel("Cluster")
    ax.set_ylabel(ylabel)
    plt.show()


# Boxplot for cut rate
plot_metric_by_cluster(
    df, "cut_rate_per_min", "Cut Rate per Minute by Cluster", "Cut Rate per Min"
)

# Boxplot for motion
plot_metric_by_cluster(df, "motion_mean", "Motion Mean by Cluster", "Motion Mean")

### Audio Energy



Download audio

In [None]:
import yt_dlp
import csv

# CSV listing the Shorts links to download
csv_path = Path("..", "Data", "Links", "shorts_data", "shorts_links_wide.csv")

# Downloaded audio goes into ./audio; the folder is created on first run
out_dir = Path("audio")
out_dir.mkdir(exist_ok=True)

def download_audio(url, out_stem):
    """Download the audio of `url` with yt-dlp and convert it to MP3.

    yt-dlp is asked for the "bestaudio/best" format, writes to the template
    ``<out_stem>.%(ext)s``, and runs its FFmpegExtractAudio post-processor
    with codec "mp3" and quality "192".
    """
    extract_mp3 = {
        "key": "FFmpegExtractAudio",
        "preferredcodec": "mp3",
        "preferredquality": "192",
    }
    options = dict(
        outtmpl=f"{out_stem}.%(ext)s",
        format="bestaudio/best",
        noplaylist=True,
        quiet=True,
        postprocessors=[extract_mp3],
    )
    with yt_dlp.YoutubeDL(options) as downloader:
        downloader.download([url])

# Download one MP3 per CSV row. Files are named by 1-based row number, so
# skipped rows keep their number and the numbering stays aligned with the CSV.
with csv_path.open(newline="", encoding="utf-8") as f:
    reader = csv.DictReader(f)
    for i, row in enumerate(reader, start=1):
        # DictReader yields None for cells missing from a short row
        url = (row["url"] or "").strip()
        out_stem = out_dir / f"audio_{i}"

        if not url:
            print(f"{i}: skipped (empty url)")
            continue

        # Makes the cell safe to re-run: finished downloads are not fetched
        # again. Delete the .mp3 to force a fresh download.
        if out_stem.with_suffix(".mp3").exists():
            print(f"{i}: skipped (already downloaded) {url}")
            continue

        print(f"{i}: {url}")
        download_audio(url, out_stem)

**audio_rms_mean** = overall loudness  
  High → music/shouting/crowd noise  
  Low → quiet narration or calm videos  

**audio_rms_std** = how much loudness changes  
  High → dynamic audio (drops, emphasis, punchlines)  
  Low → steady background or monotone speech  

In [23]:
def audio_energy_metrics(video_path):
    """Summarise the loudness of a file's audio track.

    An RMS value is computed for every decoded audio frame; the mean and the
    standard deviation of those per-frame values are returned.

    Parameters
    ----------
    video_path : str or path-like
        Location of the media file; converted with ``str()`` for ``av.open``.

    Returns
    -------
    dict
        ``audio_rms_mean`` and ``audio_rms_std`` as floats. Both are 0.0 when
        the file has no audio stream or no audio frames could be decoded.

    Notes
    -----
    NOTE(review): RMS is computed on the raw values from ``frame.to_ndarray()``;
    presumably their scale depends on the stream's sample format (integer vs.
    float) - confirm before comparing absolute levels across files.
    """
    silent = {"audio_rms_mean": 0.0, "audio_rms_std": 0.0}

    # The context manager closes the container on every exit path, including
    # the early return for files without audio.
    with av.open(str(video_path)) as container:
        if not any(s.type == "audio" for s in container.streams):
            return silent

        # One RMS energy value per decoded audio frame
        rms_vals = [
            np.sqrt(np.mean(frame.to_ndarray().astype(np.float32) ** 2))
            for frame in container.decode(audio=0)
        ]

    if not rms_vals:
        return silent

    return {
        "audio_rms_mean": float(np.mean(rms_vals)),
        "audio_rms_std": float(np.std(rms_vals)),
    }

# Collect one audio-energy record per row of `clusters`
audio_metrics = []
for i, (_, row) in enumerate(clusters.iterrows(), start=1):
    print(f"{i}/{len(clusters)} {row['video_id']}")
    m = audio_energy_metrics(row["video_path"])
    m["video_id"] = row["video_id"]
    audio_metrics.append(m)

audio_df = pd.DataFrame(audio_metrics)

# Add the audio columns to the saved table.
# This cell reads and rewrites the same CSV, so on a second run the file
# already holds the audio columns; merging again would produce
# "audio_rms_mean_x" / "audio_rms_mean_y" duplicates. Dropping stale copies
# first keeps the cell idempotent.
base = pd.read_csv("interpretation.csv")
audio_cols = [col for col in audio_df.columns if col != "video_id"]
base = base.drop(columns=audio_cols, errors="ignore")
out_df = base.merge(audio_df, on="video_id", how="left")
out_df.to_csv("interpretation.csv", index=False)

1/222 challenge Shorts13
2/222 challenge Shorts16
3/222 challenge Shorts19
4/222 challenge Shorts9
5/222 funny Shorts10
6/222 funny Shorts4
7/222 meme Shorts7
8/222 recipe Shorts11
9/222 recipe Shorts15
10/222 recipe Shorts20
11/222 recipe Shorts3
12/222 recipe Shorts8
13/222 recipe Shorts9
14/222 tech Shorts13
15/222 tech Shorts14
16/222 tech Shorts16
17/222 tech Shorts17
18/222 tech Shorts18
19/222 tech Shorts4
20/222 travel Shorts15
21/222 travel Shorts17
22/222 travel Shorts8
23/222 ai Shorts10
24/222 ai Shorts3
25/222 challenge Shorts5
26/222 educational Shorts8
27/222 fashion Shorts10
28/222 fashion Shorts12
29/222 fashion Shorts14
30/222 fashion Shorts2
31/222 gaming Shorts11
32/222 gaming Shorts14
33/222 gaming Shorts17
34/222 gaming Shorts3
35/222 gaming Shorts5
36/222 motivational Shorts10
37/222 motivational Shorts11
38/222 motivational Shorts15
39/222 motivational Shorts20
40/222 music Shorts14
41/222 music Shorts7
42/222 random Shorts12
43/222 reaction Shorts11
44/222 reac