In [None]:
# Cell 1 — Imports & configuration
import os
import shutil
import subprocess
import time
import pandas as pd
from yt_dlp import YoutubeDL

OUTPUT_DIR   = "data_set_try1/videos"                             # dossier de sortie
CSV_LABELS   = "data_set_try1/dataset/labels_semantic_clusters.csv"
CSV_DATA     = "data_set_try1/dataset/vggsound.csv"
CLUSTER      = '3'                                  # numéro de cluster
DURATION_SEC = 10                                   # durée du clip en secondes

# Vérifions qu’on a bien ffmpeg dispo
if shutil.which("ffmpeg") is None:
    raise RuntimeError("⚠️ ffmpeg n'est pas installé — indispensable pour ce script.")

In [None]:
# Cell 2 — Fonctions utilitaires
def sec_to_timestamp(s: int) -> str:
    """Convertit un entier de secondes en 'HH:MM:SS'."""
    return time.strftime("%H:%M:%S", time.gmtime(s))

def download_segment_fast(video_id: str, start_sec: int, duration: int, dest_folder: str):
    """
    1) Récupère l’URL du flux MP4 (360p, conteneur mp4).
    2) Lance ffmpeg avec -ss et -t pour ne tirer que la portion souhaitée.
    """
    os.makedirs(dest_folder, exist_ok=True)
    url = f"https://www.youtube.com/watch?v={video_id}"

    # 1) Extraction de l’URL directe du flux MP4 (audio+vidéo) H264≤360p
    ydl_opts = {
        'format': 'best[height<=360][ext=mp4]',
        'quiet': True,
        'skip_download': True,
    }
    with YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=False)
        stream_url = info['url']

    # 2) Construction du nom et des timestamps
    t0 = sec_to_timestamp(start_sec)
    output_name = f"{video_id}_{start_sec}s.mp4"
    output_path = os.path.join(dest_folder, output_name)

    # 3) Appel ffmpeg : -ss avant -i pour seek rapide, -t pour durée, -c copy pour pas ré-encoder
    cmd = [
        'ffmpeg',
        '-ss', t0,
        '-i', stream_url,
        '-t', str(duration),
        '-c', 'copy',
        output_path
    ]
    subprocess.run(cmd, check=True)

def download_segment_crop256(video_id: str, start_sec: int, duration: int, dest_folder: str):
    """
    1) Récupère l’URL du flux MP4 (360p).
    2) Coupe le segment [start_sec, start_sec+duration].
    3) Croppe en 256×256 centré.
    4) Sort en MP4 (H.264+AAC).
    """
    os.makedirs(dest_folder, exist_ok=True)
    url = f"https://www.youtube.com/watch?v={video_id}"

    # Extraction de l’URL directe du flux MP4
    ydl_opts = {'format': 'best[height<=360][ext=mp4]', 'quiet': True, 'skip_download': True}
    with YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=False)
        stream_url = info['url']

    # Timestamp de début
    t0 = sec_to_timestamp(start_sec)
    output_name = f"{video_id}_{start_sec}s_256crop.mp4"
    output_path = os.path.join(dest_folder, output_name)

    # ffmpeg : -ss avant -i pour seek rapide, -t pour durée, -vf pour crop, 
    # -c:v libx264 pour encoder, -preset veryfast pour vitesse, -crf 23 qualité raisonnable, -c:a copy
    cmd = [
        'ffmpeg',
        '-ss', t0,
        '-i', stream_url,
        '-t', str(duration),
        '-vf', 'crop=256:256:(in_w-256)/2:(in_h-256)/2',
        '-c:v', 'libx264',
        '-preset', 'veryfast',
        '-crf', '23',
        '-c:a', 'copy',
        output_path
    ]
    subprocess.run(cmd, check=True)



In [None]:
# Cell 3 — Lecture & filtrage du dataset
labels_df = pd.read_csv(CSV_LABELS, header=None,
                        names=['label','count','cluster'])
used_labels = labels_df[labels_df['cluster']==CLUSTER]['label'].values

data_df = pd.read_csv(CSV_DATA, header=None,
                      names=['video_id','start_sec','label','split'])
sub_df = data_df[data_df['label'].isin(used_labels)].sample(10, random_state=42)

In [None]:
labels_df

In [None]:
# Cell 4 — Boucle de téléchargement
for vid, start, label, split in sub_df.itertuples(index=False):
    real_end = min(start + DURATION_SEC, start + DURATION_SEC)  # juste pour l'affichage
    print(f"▶️ {vid} @ {start}s → +{DURATION_SEC}s …")
    try:
        download_segment_crop256(vid, int(start), DURATION_SEC, OUTPUT_DIR)
    except Exception as e:
        print(f"⚠️ Erreur pour {vid}@{start}s : {e}")

print("✅ Tous les segments (10 s) sont prêts en MP4 dans", OUTPUT_DIR)

# THIS PART TRY TO CREATE THE DATASET ON THE FLY WHILE DOWNLOADING THE VIDEO

In [6]:
import os, shutil, subprocess, time, sys
import pandas as pd
import numpy as np
from pydub import AudioSegment
from yt_dlp import YoutubeDL,DownloadError
import cv2
import shutil

OUTPUT_DIR   = "data_set_try1/processed"       # dossier racine de sortie
CSV_LABELS   = "data_set_try1/dataset/labels_semantic_clusters.csv"
CSV_DATA     = "data_set_try1/dataset/vggsound.csv"
CLUSTER      = '3'
DURATION_SEC = 10

# check ffmpeg
if shutil.which("ffmpeg") is None:
    raise RuntimeError("⚠️ ffmpeg n'est pas installé — indispensable.")
os.makedirs(OUTPUT_DIR, exist_ok=True)



In [7]:
#CLEAN PROCESSED FILE (TO REMOVE JUST USEFULL WHILE DEBUGING)
OUTPUT_DIR = "data_set_try1/processed"

# Supprime tous les sous-dossiers de OUTPUT_DIR
for name in os.listdir(OUTPUT_DIR):
    path = os.path.join(OUTPUT_DIR, name)
    if os.path.isdir(path):
        shutil.rmtree(path)

In [8]:
# Cell 2 — Fonctions utilitaires

def sec_to_timestamp(s: int) -> str:
    return time.strftime("%H:%M:%S", time.gmtime(s))

def download_segment(video_id: str, start: int, dur: int, out_dir: str) -> str:
    """Récupère le segment mp4 (audio+vidéo) et renvoie le chemin."""
    url = f"https://www.youtube.com/watch?v={video_id}"
    ydl_opts = {'format': 'best[height<=360][ext=mp4]', 'quiet': True, 'skip_download': True}
    with YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=False)
        stream_url = info['url']
    t0 = sec_to_timestamp(start)
    segment_path = os.path.join(out_dir, "segment.mp4")
    cmd = [
        'ffmpeg', '-ss', t0, '-i', stream_url,
        '-t', str(dur), '-c', 'copy',
        segment_path
    ]
    subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    return segment_path

def download_segment_copy(video_id: str, start: int, dur: int, out_dir: str) -> str:
    """Récupère le segment mp4 (audio+vidéo) en copiant tout (pas de transcodage)."""
    url = f"https://www.youtube.com/watch?v={video_id}"
    ydl_opts = {
        'format': 'best[height<=360][ext=mp4]',
        'quiet': True,
        'skip_download': True,
    }
    with YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=False)
        stream_url = info['url']
    t0 = sec_to_timestamp(start)
    raw_path = os.path.join(out_dir, "raw_segment.mp4")
    cmd = [
        'ffmpeg',
        '-ss', t0,           # SEEK avant l'input → range request
        '-i', stream_url,    # URL YouTube
        '-t', str(dur),
        '-c', 'copy',        # copie brute audio+vidéo
        raw_path
    ]
    subprocess.run(cmd, check=True)
    return raw_path


def download_segment_crop256_macos(video_id: str, start_sec: int, duration: int, dest_folder: str) -> str:
    os.makedirs(dest_folder, exist_ok=True)

    # ⚙️ 1) ON DÉCOUPE d'abord en copy
    raw = download_segment_copy(video_id, start_sec, duration, dest_folder)
    if not os.path.exists(raw):
        return ""

    # ⚙️ 2) puis on recadre localement
    out_path = os.path.join(dest_folder, "segment.mp4")
    cmd = [
        'ffmpeg',
        '-i',      raw,
        '-threads','0',
        '-vf',     'crop=256:256:(in_w-256)/2:(in_h-256)/2',
        '-c:v',    'hevc_videotoolbox',  # HEVC via VideoToolbox
        '-b:v',    '1800k',              # ~1.8 Mbps, ajustez selon la qualité souhaitée
        '-c:a',    'copy',
        out_path
    ]
    subprocess.run(cmd, check=True)
    return out_path


def download_segment_crop256_cluster(video_id: str, start_sec: int, duration: int, dest_folder: str) -> str:
    os.makedirs(dest_folder, exist_ok=True)
    url = f"https://www.youtube.com/watch?v={video_id}"

    try:
        info = ydl.extract_info(url, download=False)
    except DownloadError as e:
        print(f"[Erreur] Impossible de récupérer {video_id} : {e}")
        return ""

    stream_url = info['url']
    t0 = sec_to_timestamp(start_sec)
    out_path = os.path.join(dest_folder, "segment.mp4")

    # GPU accel. avec NVENC + multithreading
    cmd = [
        'ffmpeg',
        '-hwaccel', 'cuda',          # initialise CUDA
        '-i', stream_url,
        '-threads', '0',
        '-ss', t0,
        '-t', str(duration),
        '-vf', 'crop=256:256:(in_w-256)/2:(in_h-256)/2',
        '-c:v', 'h264_nvenc',        # encodeur NVIDIA
        '-preset', 'p1',             # preset NVENC (p1 = fastest)
        '-rc', 'vbr_hq',             # mode rate control haute qualité
        '-cq', '23',                 # qualité visuelle (comme crf)
        '-c:a', 'copy',
        out_path
    ]
    subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    return out_path


def analyze_audio_max_timestamp(video_path: str) -> float:
    """Retourne en secondes le point de volume max dans segment.mp4."""
    audio = AudioSegment.from_file(video_path)
    mono  = audio.set_channels(1)
    arr   = np.array(mono.get_array_of_samples())
    idx   = int(np.argmax(np.abs(arr)))
    return idx / mono.frame_rate

def sample_frames(video_path, num_samples=30):
    cap = cv2.VideoCapture(video_path)
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    indices = np.linspace(0, total - 1, num=num_samples, dtype=int)
    frames = []
    for idx in indices:
        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
        ret, img = cap.read()
        if ret:
            frames.append(img)
    cap.release()
    return frames

def extract_best_thumbnail(video_path: str, ts: float, out_path: str):
    """
    => Prend plusieurs frames autour de ts (±0.5s), calcule la variance
       du Laplacien (proxy pour la netteté), choisit la meilleure,
       fait une égalisation d’histogramme sur la luminance, et sauve.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise RuntimeError(f"Impossible d'ouvrir {video_path}")
    fps      = cap.get(cv2.CAP_PROP_FPS) or 25
    frame_ct = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    duration = frame_ct / fps

    # offsets en secondes autour du pic
    offsets = [-0.5, -0.25, 0, 0.25, 0.5]
    best_score = -1
    best_frame = None

    for off in offsets:
        t = min(max(ts + off, 0), duration)
        cap.set(cv2.CAP_PROP_POS_MSEC, t * 1000)
        ret, frame = cap.read()
        if not ret: 
            continue
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        score = cv2.Laplacian(gray, cv2.CV_64F).var()
        if score > best_score:
            best_score, best_frame = score, frame

    cap.release()

    if best_frame is None:
        raise RuntimeError("Aucune frame extraite pour la thumbnail")

    # égalisation de l'histogramme sur le canal Y de YUV
    yuv = cv2.cvtColor(best_frame, cv2.COLOR_BGR2YUV)
    yuv[:,:,0] = cv2.equalizeHist(yuv[:,:,0])
    enhanced = cv2.cvtColor(yuv, cv2.COLOR_YUV2BGR)

    cv2.imwrite(out_path, enhanced)

def strip_audio(video_path: str, out_path: str):
    """Génère video_no_audio.mp4."""
    cmd = ['ffmpeg', '-i', video_path, '-c', 'copy', '-an', out_path]
    subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

def extract_audio(video_path: str, out_path: str):
    """Génère audio.m4a."""
    cmd = ['ffmpeg', '-i', video_path, '-vn', '-c:a', 'copy', out_path]
    subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

def process_one(vid: str, start: int, label : str):
    """Orchestre tout pour un segment."""
    name = f"{vid}_{start}s"
    folder = os.path.join(OUTPUT_DIR, name)
    os.makedirs(folder, exist_ok=True)

    seg = download_segment_crop256_macos(vid, start, DURATION_SEC, folder)
    ts  = analyze_audio_max_timestamp(seg)
    # extract_best_thumbnail(seg, ts, os.path.join(folder, "thumbnail.jpg"))
    
    extract_best_thumbnail(
        seg,
        start,
        os.path.join(folder, "thumbnail.jpg"),
    )

    strip_audio(seg,  os.path.join(folder, "video_no_audio.mp4"))
    extract_audio(seg, os.path.join(folder, "audio.m4a"))

In [9]:
# Cell 3 — Lecture & filtrage du dataset
labels_df = pd.read_csv(CSV_LABELS, header=None,
                        names=['label','count','cluster'])
used_labels = labels_df[labels_df['cluster']==CLUSTER]['label'].values

data_df = pd.read_csv(CSV_DATA, header=None,
                      names=['video_id','start_sec','label','split'])
sub_df = data_df[data_df['label'].isin(used_labels)].sample(10, random_state=42)


In [10]:
# Cell 4 — Traitement en boucle
for vid, start, label ,*_ in sub_df.itertuples(index=False):
    print(f"🔄 Processing {vid} @ {start}s …")
    try:
        process_one(vid, int(start), label)
    except Exception as e:
        print(f"⚠️ Erreur pour {vid}@{start}s : {e}")

print("✅ Tout est prêt dans", OUTPUT_DIR)

🔄 Processing 5PVKgUGg0LQ @ 450s …


ffmpeg version 6.1.1 Copyright (c) 2000-2023 the FFmpeg developers
  built with clang version 14.0.6
  configuration: --prefix=/opt/anaconda3/envs/dataloader --cc=arm64-apple-darwin20.0.0-clang --ar=arm64-apple-darwin20.0.0-ar --nm=arm64-apple-darwin20.0.0-nm --ranlib=arm64-apple-darwin20.0.0-ranlib --strip=arm64-apple-darwin20.0.0-strip --disable-doc --enable-swresample --enable-swscale --enable-openssl --enable-libxml2 --enable-libtheora --enable-demuxer=dash --enable-postproc --enable-hardcoded-tables --enable-libfreetype --enable-libharfbuzz --enable-libfontconfig --enable-libdav1d --enable-zlib --enable-libaom --enable-pic --enable-shared --disable-static --disable-gpl --enable-version3 --disable-sdl2 --enable-libopenh264 --enable-libopus --enable-libmp3lame --enable-libopenjpeg --enable-libvorbis --enable-pthreads --enable-libtesseract --enable-libvpx --enable-librsvg
  libavutil      58. 29.100 / 58. 29.100
  libavcodec     60. 31.102 / 60. 31.102
  libavformat    60. 16.100 / 6

⚠️ Erreur pour 5PVKgUGg0LQ@450s : Aucune frame extraite pour la thumbnail
🔄 Processing wd2RVda164o @ 30s …


ffmpeg version 6.1.1 Copyright (c) 2000-2023 the FFmpeg developers
  built with clang version 14.0.6
  configuration: --prefix=/opt/anaconda3/envs/dataloader --cc=arm64-apple-darwin20.0.0-clang --ar=arm64-apple-darwin20.0.0-ar --nm=arm64-apple-darwin20.0.0-nm --ranlib=arm64-apple-darwin20.0.0-ranlib --strip=arm64-apple-darwin20.0.0-strip --disable-doc --enable-swresample --enable-swscale --enable-openssl --enable-libxml2 --enable-libtheora --enable-demuxer=dash --enable-postproc --enable-hardcoded-tables --enable-libfreetype --enable-libharfbuzz --enable-libfontconfig --enable-libdav1d --enable-zlib --enable-libaom --enable-pic --enable-shared --disable-static --disable-gpl --enable-version3 --disable-sdl2 --enable-libopenh264 --enable-libopus --enable-libmp3lame --enable-libopenjpeg --enable-libvorbis --enable-pthreads --enable-libtesseract --enable-libvpx --enable-librsvg
  libavutil      58. 29.100 / 58. 29.100
  libavcodec     60. 31.102 / 60. 31.102
  libavformat    60. 16.100 / 6

⚠️ Erreur pour wd2RVda164o@30s : Aucune frame extraite pour la thumbnail
🔄 Processing IpHCwR7ACjI @ 189s …


ffmpeg version 6.1.1 Copyright (c) 2000-2023 the FFmpeg developers
  built with clang version 14.0.6
  configuration: --prefix=/opt/anaconda3/envs/dataloader --cc=arm64-apple-darwin20.0.0-clang --ar=arm64-apple-darwin20.0.0-ar --nm=arm64-apple-darwin20.0.0-nm --ranlib=arm64-apple-darwin20.0.0-ranlib --strip=arm64-apple-darwin20.0.0-strip --disable-doc --enable-swresample --enable-swscale --enable-openssl --enable-libxml2 --enable-libtheora --enable-demuxer=dash --enable-postproc --enable-hardcoded-tables --enable-libfreetype --enable-libharfbuzz --enable-libfontconfig --enable-libdav1d --enable-zlib --enable-libaom --enable-pic --enable-shared --disable-static --disable-gpl --enable-version3 --disable-sdl2 --enable-libopenh264 --enable-libopus --enable-libmp3lame --enable-libopenjpeg --enable-libvorbis --enable-pthreads --enable-libtesseract --enable-libvpx --enable-librsvg
  libavutil      58. 29.100 / 58. 29.100
  libavcodec     60. 31.102 / 60. 31.102
  libavformat    60. 16.100 / 6

⚠️ Erreur pour IpHCwR7ACjI@189s : Aucune frame extraite pour la thumbnail
🔄 Processing 9Qttcz2JRlI @ 282s …


ffmpeg version 6.1.1 Copyright (c) 2000-2023 the FFmpeg developers
  built with clang version 14.0.6
  configuration: --prefix=/opt/anaconda3/envs/dataloader --cc=arm64-apple-darwin20.0.0-clang --ar=arm64-apple-darwin20.0.0-ar --nm=arm64-apple-darwin20.0.0-nm --ranlib=arm64-apple-darwin20.0.0-ranlib --strip=arm64-apple-darwin20.0.0-strip --disable-doc --enable-swresample --enable-swscale --enable-openssl --enable-libxml2 --enable-libtheora --enable-demuxer=dash --enable-postproc --enable-hardcoded-tables --enable-libfreetype --enable-libharfbuzz --enable-libfontconfig --enable-libdav1d --enable-zlib --enable-libaom --enable-pic --enable-shared --disable-static --disable-gpl --enable-version3 --disable-sdl2 --enable-libopenh264 --enable-libopus --enable-libmp3lame --enable-libopenjpeg --enable-libvorbis --enable-pthreads --enable-libtesseract --enable-libvpx --enable-librsvg
  libavutil      58. 29.100 / 58. 29.100
  libavcodec     60. 31.102 / 60. 31.102
  libavformat    60. 16.100 / 6

⚠️ Erreur pour 9Qttcz2JRlI@282s : Aucune frame extraite pour la thumbnail
🔄 Processing wwxXsszinio @ 105s …


ffmpeg version 6.1.1 Copyright (c) 2000-2023 the FFmpeg developers
  built with clang version 14.0.6
  configuration: --prefix=/opt/anaconda3/envs/dataloader --cc=arm64-apple-darwin20.0.0-clang --ar=arm64-apple-darwin20.0.0-ar --nm=arm64-apple-darwin20.0.0-nm --ranlib=arm64-apple-darwin20.0.0-ranlib --strip=arm64-apple-darwin20.0.0-strip --disable-doc --enable-swresample --enable-swscale --enable-openssl --enable-libxml2 --enable-libtheora --enable-demuxer=dash --enable-postproc --enable-hardcoded-tables --enable-libfreetype --enable-libharfbuzz --enable-libfontconfig --enable-libdav1d --enable-zlib --enable-libaom --enable-pic --enable-shared --disable-static --disable-gpl --enable-version3 --disable-sdl2 --enable-libopenh264 --enable-libopus --enable-libmp3lame --enable-libopenjpeg --enable-libvorbis --enable-pthreads --enable-libtesseract --enable-libvpx --enable-librsvg
  libavutil      58. 29.100 / 58. 29.100
  libavcodec     60. 31.102 / 60. 31.102
  libavformat    60. 16.100 / 6

⚠️ Erreur pour wwxXsszinio@105s : Aucune frame extraite pour la thumbnail
🔄 Processing 1f9IgOjZjn4 @ 107s …


ffmpeg version 6.1.1 Copyright (c) 2000-2023 the FFmpeg developers
  built with clang version 14.0.6
  configuration: --prefix=/opt/anaconda3/envs/dataloader --cc=arm64-apple-darwin20.0.0-clang --ar=arm64-apple-darwin20.0.0-ar --nm=arm64-apple-darwin20.0.0-nm --ranlib=arm64-apple-darwin20.0.0-ranlib --strip=arm64-apple-darwin20.0.0-strip --disable-doc --enable-swresample --enable-swscale --enable-openssl --enable-libxml2 --enable-libtheora --enable-demuxer=dash --enable-postproc --enable-hardcoded-tables --enable-libfreetype --enable-libharfbuzz --enable-libfontconfig --enable-libdav1d --enable-zlib --enable-libaom --enable-pic --enable-shared --disable-static --disable-gpl --enable-version3 --disable-sdl2 --enable-libopenh264 --enable-libopus --enable-libmp3lame --enable-libopenjpeg --enable-libvorbis --enable-pthreads --enable-libtesseract --enable-libvpx --enable-librsvg
  libavutil      58. 29.100 / 58. 29.100
  libavcodec     60. 31.102 / 60. 31.102
  libavformat    60. 16.100 / 6

⚠️ Erreur pour 1f9IgOjZjn4@107s : Aucune frame extraite pour la thumbnail
🔄 Processing r3VPp4TGkGQ @ 147s …


ffmpeg version 6.1.1 Copyright (c) 2000-2023 the FFmpeg developers
  built with clang version 14.0.6
  configuration: --prefix=/opt/anaconda3/envs/dataloader --cc=arm64-apple-darwin20.0.0-clang --ar=arm64-apple-darwin20.0.0-ar --nm=arm64-apple-darwin20.0.0-nm --ranlib=arm64-apple-darwin20.0.0-ranlib --strip=arm64-apple-darwin20.0.0-strip --disable-doc --enable-swresample --enable-swscale --enable-openssl --enable-libxml2 --enable-libtheora --enable-demuxer=dash --enable-postproc --enable-hardcoded-tables --enable-libfreetype --enable-libharfbuzz --enable-libfontconfig --enable-libdav1d --enable-zlib --enable-libaom --enable-pic --enable-shared --disable-static --disable-gpl --enable-version3 --disable-sdl2 --enable-libopenh264 --enable-libopus --enable-libmp3lame --enable-libopenjpeg --enable-libvorbis --enable-pthreads --enable-libtesseract --enable-libvpx --enable-librsvg
  libavutil      58. 29.100 / 58. 29.100
  libavcodec     60. 31.102 / 60. 31.102
  libavformat    60. 16.100 / 6

⚠️ Erreur pour r3VPp4TGkGQ@147s : Aucune frame extraite pour la thumbnail
🔄 Processing xkjOXJDOOFo @ 162s …


ffmpeg version 6.1.1 Copyright (c) 2000-2023 the FFmpeg developers
  built with clang version 14.0.6
  configuration: --prefix=/opt/anaconda3/envs/dataloader --cc=arm64-apple-darwin20.0.0-clang --ar=arm64-apple-darwin20.0.0-ar --nm=arm64-apple-darwin20.0.0-nm --ranlib=arm64-apple-darwin20.0.0-ranlib --strip=arm64-apple-darwin20.0.0-strip --disable-doc --enable-swresample --enable-swscale --enable-openssl --enable-libxml2 --enable-libtheora --enable-demuxer=dash --enable-postproc --enable-hardcoded-tables --enable-libfreetype --enable-libharfbuzz --enable-libfontconfig --enable-libdav1d --enable-zlib --enable-libaom --enable-pic --enable-shared --disable-static --disable-gpl --enable-version3 --disable-sdl2 --enable-libopenh264 --enable-libopus --enable-libmp3lame --enable-libopenjpeg --enable-libvorbis --enable-pthreads --enable-libtesseract --enable-libvpx --enable-librsvg
  libavutil      58. 29.100 / 58. 29.100
  libavcodec     60. 31.102 / 60. 31.102
  libavformat    60. 16.100 / 6

⚠️ Erreur pour xkjOXJDOOFo@162s : Aucune frame extraite pour la thumbnail
🔄 Processing OVBQuUJhtfg @ 30s …


ERROR: [youtube] OVBQuUJhtfg: Video unavailable


⚠️ Erreur pour OVBQuUJhtfg@30s : ERROR: [youtube] OVBQuUJhtfg: Video unavailable
🔄 Processing CeTbzVsGfRw @ 30s …


ffmpeg version 6.1.1 Copyright (c) 2000-2023 the FFmpeg developers
  built with clang version 14.0.6
  configuration: --prefix=/opt/anaconda3/envs/dataloader --cc=arm64-apple-darwin20.0.0-clang --ar=arm64-apple-darwin20.0.0-ar --nm=arm64-apple-darwin20.0.0-nm --ranlib=arm64-apple-darwin20.0.0-ranlib --strip=arm64-apple-darwin20.0.0-strip --disable-doc --enable-swresample --enable-swscale --enable-openssl --enable-libxml2 --enable-libtheora --enable-demuxer=dash --enable-postproc --enable-hardcoded-tables --enable-libfreetype --enable-libharfbuzz --enable-libfontconfig --enable-libdav1d --enable-zlib --enable-libaom --enable-pic --enable-shared --disable-static --disable-gpl --enable-version3 --disable-sdl2 --enable-libopenh264 --enable-libopus --enable-libmp3lame --enable-libopenjpeg --enable-libvorbis --enable-pthreads --enable-libtesseract --enable-libvpx --enable-librsvg
  libavutil      58. 29.100 / 58. 29.100
  libavcodec     60. 31.102 / 60. 31.102
  libavformat    60. 16.100 / 6

⚠️ Erreur pour CeTbzVsGfRw@30s : Aucune frame extraite pour la thumbnail
✅ Tout est prêt dans data_set_try1/processed


In [16]:
import json
import csv
from pathlib import Path

def convert_json_to_csv(json_path: str, csv_path: str) -> None:
    """
    Convert a list of annotation records in JSON format to CSV.

    Each record should contain:
      - video clip name (keys: 'video_clip_name', 'video_clip', or 'name')
      - class label (keys: 'class' or 'label')
      - bounding box dict with keys: 'Xmin', 'Ymin', 'Xmax', 'Ymax'
    """
    # Load JSON
    with open(json_path, 'r', encoding='utf-8') as jf:
        records = json.load(jf)

    # Prepare CSV output
    with open(csv_path, 'w', newline='', encoding='utf-8') as cf:
        writer = csv.writer(cf)
        # Header row
        writer.writerow(['video_clip_name', 'class', 'Xmin', 'Ymin', 'Xmax', 'Ymax'])

        for rec in records:
            # Extract fields with flexible key names
            print(rec)
            video_name = rec.get('file')
            class_label = rec.get('class')
            bbox = rec.get('bbox')
            bbox = bbox[0]

            # Skip if no bounding box
            if not bbox:
                continue

            xmin = bbox[0]
            ymin = bbox[1]
            xmax = bbox[2]
            ymax = bbox[3]

            writer.writerow([video_name, class_label, xmin, ymin, xmax, ymax])

    print(f"Converted {len(records)} records from '{json_path}' → '{csv_path}'.")


# Define input and output paths
input_json = 'data_set_try1/dataset/vggss.json'
output_csv = 'data_set_try1/dataset/vggss.csv'

# Run conversion
convert_json_to_csv(input_json, output_csv)

{'file': 'zpWuikVorYg_000032', 'class': 'elk bugling', 'bbox': [[0.403125, 0.253125, 0.6, 0.75]]}
{'file': 'gEvCUcZ6w88_000030', 'class': 'female speech, woman speaking', 'bbox': [[0.45, 0.190625, 0.584375, 0.47812499999999997]]}
{'file': 'JIemsK_0lXc_000364', 'class': 'playing bass drum', 'bbox': [[0.03125, 0.3625, 0.36875, 0.9875]]}
{'file': 'RgyqhpOJFM4_000030', 'class': 'playing cello', 'bbox': [[0.396875, 0.021875, 0.6875, 0.99375]]}
{'file': '3MUeg3nD2OU_000120', 'class': 'playing acoustic guitar', 'bbox': [[0.303125, 0.0, 0.953125, 1.0]]}
{'file': 'iUtE4nRvBsM_000040', 'class': 'people coughing', 'bbox': [[0.29375, 0.03125, 0.73125, 0.578125]]}
{'file': '1cxvg7qu0G0_000070', 'class': 'lighting firecrackers', 'bbox': [[0.025, 0.271875, 0.38125000000000003, 0.85625]]}
{'file': '16CvcIXIjzQ_000332', 'class': 'chinchilla barking', 'bbox': [[0.278125, 0.003125, 0.8500000000000001, 0.59375]]}
{'file': 'gg6yoeBoYxg_000000', 'class': 'children shouting', 'bbox': [[0.13125, 0, 1.0, 0.318

In [None]:
import os
import cv2
import json
import torch
import csv
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import pdb
import time
from PIL import Image
import glob
import sys 
import scipy.io.wavfile as wav
from scipy import signal
import random
import soundfile as sf



class GetAudioVideoDataset(Dataset):

    def __init__(self, args, mode='train', transforms=None):
 
        data = []
        if args.testset == 'flickr':
            testcsv = 'metadata/flickr_test.csv'
        elif args.testset == 'vggss':
            testcsv = 'metadata/vggss_test.csv'

        with open(testcsv) as f:
            csv_reader = csv.reader(f)
            for item in csv_reader:
                data.append(item[0] + '.mp4')
        self.audio_path = args.data_path + 'audio/'
        self.video_path = args.data_path + 'frames/'

        self.imgSize = args.image_size 

        self.mode = mode
        self.transforms = transforms
        # initialize video transform
        self._init_atransform()
        self._init_transform()
        #  Retrieve list of audio and video files
        self.video_files = []
   
        for item in data[:]:
            self.video_files.append(item )
        print(len(self.video_files))
        self.count = 0

    def _init_transform(self):
        mean = [0.485, 0.456, 0.406]
        std = [0.229, 0.224, 0.225]
        if self.mode == 'train':
            self.img_transform = transforms.Compose([
                transforms.Resize(int(self.imgSize * 1.1), Image.BICUBIC),
                transforms.RandomCrop(self.imgSize),
                transforms.RandomHorizontalFlip(),
                transforms.CenterCrop(self.imgSize),
                transforms.ToTensor(),
                transforms.Normalize(mean, std)])
        else:
            self.img_transform = transforms.Compose([
                transforms.Resize(self.imgSize, Image.BICUBIC),
                transforms.CenterCrop(self.imgSize),
                transforms.ToTensor(),
                transforms.Normalize(mean, std)])            

    def _init_atransform(self):
        self.aid_transform = transforms.Compose([transforms.ToTensor(),transforms.Normalize(mean=[0.0], std=[12.0])])
#  

    def _load_frame(self, path):
        img = Image.open(path).convert('RGB')
        return img

    def __len__(self):
        # Consider all positive and negative examples
        return len(self.video_files)  # self.length

    def __getitem__(self, idx):
        file = self.video_files[idx]

        # Image
        frame = self.img_transform(self._load_frame(self.video_path + file[:-3] + 'jpg'))
        frame_ori = np.array(self._load_frame(self.video_path  + file[:-3] + 'jpg'))
        # Audio
        samples, samplerate = sf.read(self.audio_path + file[:-3]+'wav')

        # repeat if audio is too short
        if samples.shape[0] < samplerate * 10:
            n = int(samplerate * 10 / samples.shape[0]) + 1
            samples = np.tile(samples, n)
        resamples = samples[:samplerate*10]

        resamples[resamples > 1.] = 1.
        resamples[resamples < -1.] = -1.
        frequencies, times, spectrogram = signal.spectrogram(resamples,samplerate, nperseg=512,noverlap=274)
        spectrogram = np.log(spectrogram+ 1e-7)
        spectrogram = self.aid_transform(spectrogram)
 

        return frame,spectrogram,resamples,file,torch.tensor(frame_ori)