# Phase2

# Connection

In [15]:

import pprint as pp
import requests

host = 'api.novasearch.org'
port = 443

user = 'user02' # Add your user name here.
password = 'marco.2025+' # Add your user password here. For testing only. Don't store credentials in code. 
index_name = user

In [14]:
import urllib3
urllib3.disable_warnings()  # Suppress self-signed cert warnings

from requests.auth import HTTPBasicAuth
import requests
import pprint as pp

host = 'localhost'
port = 9200
user = 'admin'
password = 'MyStr0ng@Pass'
index_name = user

url = f"http://{host}:{port}"

try:
    res = requests.get(url, auth=HTTPBasicAuth(user, password), verify=False)
    pp.pprint(res.json())
except requests.exceptions.RequestException as e:
    print("Request failed:", e)


Request failed: HTTPConnectionPool(host='localhost', port=9200): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000024ED1764970>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))


# Frame Extraction

In [20]:
import pandas as pd
from pathlib import Path
import json
import yt_dlp
import av # For frame extraction
import os
import time
import math # For ceiling function if needed for frame counts

# --- Provided load_captions function (remains the same) ---
def load_captions(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    result = []
    for vid_key, meta_info in data.items():
        if isinstance(meta_info, dict) and 'timestamps' in meta_info and 'sentences' in meta_info:
            for (start, end), sentence in zip(meta_info['timestamps'], meta_info['sentences']):
                result.append({
                    "video_id": vid_key,
                    "duration": meta_info.get("duration", 0),
                    "start": start,
                    "end": end,
                    "caption": sentence,
                    "video_url": f"https://www.youtube.com/watch?v={vid_key[2:]}" if vid_key.startswith("v_") else f"https://www.youtube.com/watch?v={vid_key}"
                })
        else:
            print(f"Warning: Skipping entry {vid_key} in load_captions due to unexpected structure or missing keys.")
    return result

# --- Setup ---
repo_root = Path().resolve()
json_path = repo_root / "captions" / "train.json"
OUTPUT_DIR = "extracted_frames" # Updated output directory name
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- CONFIGURATION for frame extraction ---
FRAME_EXTRACTION_INTERVAL_SECONDS = 2.0

if not json_path.exists():
    print(f"Error: JSON file not found at {json_path}")
    exit()
print(f"Loading captions from: {json_path}")
data = load_captions(str(json_path))
if not data:
    print("No data loaded from captions file. Exiting.")
    exit()
df = pd.DataFrame(data)
if df.empty:
    print("DataFrame is empty after loading captions. Exiting.")
    exit()

# --- Filtering logic (same as before to get all potentially valid IDs) ---
contains_car = df[df['caption'].str.contains(r'\bcar\b', case=False, na=False)]
car_video_ids = set(contains_car['video_id'])
caption_counts = df['video_id'].value_counts()
videos_with_8plus = set(caption_counts[caption_counts > 8].index)
valid_video_ids = list(car_video_ids & videos_with_8plus)

print(f"Found {len(valid_video_ids)} potentially valid video IDs based on caption criteria.")

video_id_to_url_map = {}
if not df.empty and 'video_id' in df.columns and 'video_url' in df.columns:
    temp_df_for_urls = df[df['video_id'].isin(valid_video_ids)][['video_id', 'video_url']].drop_duplicates()
    video_id_to_url_map = pd.Series(temp_df_for_urls.video_url.values, index=temp_df_for_urls.video_id).to_dict()
else:
    print("DataFrame is missing 'video_id' or 'video_url' columns. Cannot proceed with URL mapping.")
    exit()

# --- Download videos and extract frames using yt-dlp, aiming for 10 successful ones ---
TARGET_SUCCESSFUL_VIDEOS = 10
successfully_processed_count = 0
attempted_video_index = 0
successfully_downloaded_ids_and_framesinfo = [] # Store dicts: {'video_id': id, 'num_frames_extracted': count}


print(f"\nStarting download and frame extraction (every {FRAME_EXTRACTION_INTERVAL_SECONDS}s), aiming for {TARGET_SUCCESSFUL_VIDEOS} successful videos...")

while successfully_processed_count < TARGET_SUCCESSFUL_VIDEOS and attempted_video_index < len(valid_video_ids):
    video_id = valid_video_ids[attempted_video_index]
    attempted_video_index += 1

    url = video_id_to_url_map.get(video_id)
    if not url:
        print(f"Could not find URL for video_id: {video_id}. Skipping.")
        continue

    print(f"\nAttempting to process video {attempted_video_index}/{len(valid_video_ids)} (target {successfully_processed_count+1}/{TARGET_SUCCESSFUL_VIDEOS}): {video_id} from {url}")
    
    download_filename_template = f"{video_id}.mp4"
    # Save downloaded videos temporarily in a subfolder or main output dir
    temp_download_dir = os.path.join(OUTPUT_DIR, "temp_videos")
    os.makedirs(temp_download_dir, exist_ok=True)
    local_video_path_template = os.path.join(temp_download_dir, download_filename_template)
    
    actual_downloaded_video_path = None

    ydl_opts = {
        'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
        'outtmpl': local_video_path_template,
        'quiet': False, # Set to False to see yt-dlp output, True to suppress
        'noplaylist': True,
        'socket_timeout': 30,
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            print(f"Downloading video: {video_id} using yt-dlp...")
            info_dict = ydl.extract_info(url, download=True)
            actual_downloaded_video_path = ydl.prepare_filename(info_dict)

            if not os.path.exists(actual_downloaded_video_path):
                 # If outtmpl was fully specified, actual_downloaded_video_path might be it
                 if os.path.exists(local_video_path_template):
                      actual_downloaded_video_path = local_video_path_template
                 else:
                      print(f"Download failed or file not found at expected path for {video_id}.")
                      continue
        
        print(f"Downloaded to {actual_downloaded_video_path}")

        print(f"Extracting frames for {video_id} at ~{FRAME_EXTRACTION_INTERVAL_SECONDS}s intervals...")
        # Create a sub-directory for frames of this specific video
        video_frames_output_dir = os.path.join(OUTPUT_DIR, video_id)
        os.makedirs(video_frames_output_dir, exist_ok=True)
        
        frames_extracted_count = 0
        with av.open(actual_downloaded_video_path) as container:
            stream = container.streams.video[0]
            
            # Get FPS. stream.average_rate might be Fraction(0, 1) if not available, so try others.
            fps = stream.average_rate
            if not fps or float(fps) == 0:
                fps = stream.guessed_rate
            if not fps or float(fps) == 0:
                fps = stream.r_frame_rate # Often more reliable
            if not fps or float(fps) == 0:
                print(f"Could not determine FPS for {video_id}. Skipping frame extraction.")
                continue
            fps = float(fps)
            if fps <= 0:
                print(f"Invalid FPS ({fps}) for {video_id}. Skipping frame extraction.")
                continue

            frames_to_skip_per_interval = int(round(fps * FRAME_EXTRACTION_INTERVAL_SECONDS))
            if frames_to_skip_per_interval <= 0: # Ensure we always advance
                frames_to_skip_per_interval = 1
            
            print(f"  Video FPS: {fps:.2f}, Extracting 1 frame every {frames_to_skip_per_interval} frames.")

            frame_index_in_video = 0
            next_extraction_frame_index = 0

            for frame_obj in container.decode(stream):
                if frame_index_in_video >= next_extraction_frame_index:
                    # Calculate timestamp for the frame filename
                    timestamp_seconds = frame_obj.time # frame.time is in seconds
                    # Sanitize timestamp for filename (e.g., replace dot with underscore)
                    timestamp_str = f"{timestamp_seconds:.2f}".replace('.', '_')
                    
                    out_frame_path = os.path.join(video_frames_output_dir, f"{video_id}_ts_{timestamp_str}_frame_{frames_extracted_count:04d}.jpg")
                    frame_obj.to_image().save(out_frame_path, quality=80)
                    frames_extracted_count += 1
                    next_extraction_frame_index += frames_to_skip_per_interval
                
                frame_index_in_video += 1
        
        if frames_extracted_count > 0:
            print(f"{frames_extracted_count} frames saved for {video_id} in {video_frames_output_dir}")
            successfully_processed_count += 1
            successfully_downloaded_ids_and_framesinfo.append({'video_id': video_id, 'num_frames_extracted': frames_extracted_count, 'frames_path': video_frames_output_dir})
        else:
            print(f"No frames extracted for {video_id} despite successful download. Check FPS or video length.")

    except yt_dlp.utils.DownloadError as e_dl:
        error_msg = str(e_dl).lower()
        if "video unavailable" in error_msg: print(f"Skipping {video_id}: Video is unavailable (yt-dlp).")
        elif "private video" in error_msg: print(f"Skipping {video_id}: Video is private (yt-dlp).")
        elif "login required" in error_msg or "account action required" in error_msg: print(f"Skipping {video_id}: Video requires login or account action (yt-dlp).")
        elif "http error 400" in error_msg: print(f"yt-dlp HTTP Error 400 for {video_id}. URL: {url}")
        elif "http error 403" in error_msg: print(f"yt-dlp HTTP Error 403 (Forbidden) for {video_id}. URL: {url}")
        elif "socket timeout" in error_msg: print(f"yt-dlp socket timeout for {video_id}. URL: {url}")
        else: print(f"yt-dlp download error for {video_id}: {e_dl}")
    except Exception as e:
        print(f"An unexpected error occurred while processing {video_id}: {e}")
        import traceback
        traceback.print_exc() # Print full traceback for unexpected errors
    finally:
        # Clean up the downloaded video file
        if actual_downloaded_video_path and os.path.exists(actual_downloaded_video_path):
            try:
                print(f"Cleaning up video file: {actual_downloaded_video_path}")
                os.remove(actual_downloaded_video_path)
            except Exception as e_remove:
                print(f"Failed to remove video file {actual_downloaded_video_path}: {e_remove}")
    
    # time.sleep(0.5)

print(f"\nFinished processing loop.")
print(f"{successfully_processed_count} videos had frames successfully extracted.")
if successfully_processed_count < TARGET_SUCCESSFUL_VIDEOS:
    print(f"Could not reach the target of {TARGET_SUCCESSFUL_VIDEOS} videos. Attempted {attempted_video_index} out of {len(valid_video_ids)} potential videos.")

# --- Information about successfully processed videos and their frames ---
if successfully_downloaded_ids_and_framesinfo:
    print("\nSummary of successfully processed videos and extracted frames:")
    total_frames_globally = 0
    for info in successfully_downloaded_ids_and_framesinfo:
        print(f"  Video ID: {info['video_id']}, Frames Extracted: {info['num_frames_extracted']}, Path: {info['frames_path']}")
        total_frames_globally += info['num_frames_extracted']
    print(f"Total frames extracted across all successful videos: {total_frames_globally}")

    # --- Caption confirmation for videos that had frames extracted ---
    video_ids_with_frames = [info['video_id'] for info in successfully_downloaded_ids_and_framesinfo]
    if not df.empty and 'video_id' in df.columns:
        selected_captions_df = df[df['video_id'].isin(video_ids_with_frames)]
        if not selected_captions_df.empty:
            print(f"\nSelected {len(selected_captions_df)} captions from the {len(video_ids_with_frames)} videos that had frames extracted:")
            print(selected_captions_df.groupby("video_id").size())
        else:
            print("\nNo captions found in the dataframe for the videos that had frames extracted.")
else:
    print("\nNo videos were successfully downloaded and/or no frames were extracted.")


Loading captions from: C:\Users\nunoj\Desktop\MPWD\Project\VideoDialog\captions\train.json
Found 12 potentially valid video IDs based on caption criteria.

Starting download and frame extraction (every 2.0s), aiming for 10 successful videos...

Attempting to process video 1/12 (target 1/10): v_MWdPh6J-YXM from https://www.youtube.com/watch?v=MWdPh6J-YXM
Downloading video: v_MWdPh6J-YXM using yt-dlp...
[youtube] Extracting URL: https://www.youtube.com/watch?v=MWdPh6J-YXM
[youtube] MWdPh6J-YXM: Downloading webpage
[youtube] MWdPh6J-YXM: Downloading tv client config
[youtube] MWdPh6J-YXM: Downloading tv player API JSON
[youtube] MWdPh6J-YXM: Downloading ios player API JSON


ERROR: [youtube] MWdPh6J-YXM: Private video. Sign in if you've been granted access to this video. Use --cookies-from-browser or --cookies for the authentication. See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies. Also see  https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies  for tips on effectively exporting YouTube cookies


Skipping v_MWdPh6J-YXM: Video is private (yt-dlp).

Attempting to process video 2/12 (target 1/10): v_1LdbczjQPII from https://www.youtube.com/watch?v=1LdbczjQPII
Downloading video: v_1LdbczjQPII using yt-dlp...
[youtube] Extracting URL: https://www.youtube.com/watch?v=1LdbczjQPII
[youtube] 1LdbczjQPII: Downloading webpage
[youtube] 1LdbczjQPII: Downloading tv client config
[youtube] 1LdbczjQPII: Downloading tv player API JSON
[youtube] 1LdbczjQPII: Downloading ios player API JSON
[youtube] 1LdbczjQPII: Downloading m3u8 information
[info] 1LdbczjQPII: Downloading 1 format(s): 136+140
[download] Destination: extracted_frames\temp_videos\v_1LdbczjQPII.f136.mp4
[download] 100% of   20.39MiB in 00:00:12 at 1.66MiB/s     
[download] Destination: extracted_frames\temp_videos\v_1LdbczjQPII.f140.m4a
[download] 100% of    2.52MiB in 00:00:01 at 1.49MiB/s   
[Merger] Merging formats into "extracted_frames\temp_videos\v_1LdbczjQPII.mp4"
Deleting original file extracted_frames\temp_videos\v_1Ldbcz

ERROR: [youtube] amCD-2TIKw0: Video unavailable


Skipping v_amCD-2TIKw0: Video is unavailable (yt-dlp).

Attempting to process video 6/12 (target 4/10): v_jPLJAYnjsBw from https://www.youtube.com/watch?v=jPLJAYnjsBw
Downloading video: v_jPLJAYnjsBw using yt-dlp...
[youtube] Extracting URL: https://www.youtube.com/watch?v=jPLJAYnjsBw
[youtube] jPLJAYnjsBw: Downloading webpage
[youtube] jPLJAYnjsBw: Downloading tv client config
[youtube] jPLJAYnjsBw: Downloading tv player API JSON
[youtube] jPLJAYnjsBw: Downloading ios player API JSON


ERROR: [youtube] jPLJAYnjsBw: Private video. Sign in if you've been granted access to this video. Use --cookies-from-browser or --cookies for the authentication. See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies. Also see  https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies  for tips on effectively exporting YouTube cookies


Skipping v_jPLJAYnjsBw: Video is private (yt-dlp).

Attempting to process video 7/12 (target 4/10): v_Eilil6FZhK8 from https://www.youtube.com/watch?v=Eilil6FZhK8
Downloading video: v_Eilil6FZhK8 using yt-dlp...
[youtube] Extracting URL: https://www.youtube.com/watch?v=Eilil6FZhK8
[youtube] Eilil6FZhK8: Downloading webpage
[youtube] Eilil6FZhK8: Downloading tv client config
[youtube] Eilil6FZhK8: Downloading tv player API JSON
[youtube] Eilil6FZhK8: Downloading ios player API JSON
[youtube] Eilil6FZhK8: Downloading m3u8 information
[info] Eilil6FZhK8: Downloading 1 format(s): 134+140
[download] Destination: extracted_frames\temp_videos\v_Eilil6FZhK8.f134.mp4
[download] 100% of    7.28MiB in 00:00:04 at 1.75MiB/s   
[download] Destination: extracted_frames\temp_videos\v_Eilil6FZhK8.f140.m4a
[download] 100% of    3.38MiB in 00:00:02 at 1.64MiB/s   
[Merger] Merging formats into "extracted_frames\temp_videos\v_Eilil6FZhK8.mp4"
Deleting original file extracted_frames\temp_videos\v_Eilil6FZ

ERROR: [youtube] S5kuckj4Ud4: Private video. Sign in if you've been granted access to this video. Use --cookies-from-browser or --cookies for the authentication. See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies. Also see  https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies  for tips on effectively exporting YouTube cookies


Skipping v_S5kuckj4Ud4: Video is private (yt-dlp).

Attempting to process video 11/12 (target 7/10): v_9V7cMp_w1_0 from https://www.youtube.com/watch?v=9V7cMp_w1_0
Downloading video: v_9V7cMp_w1_0 using yt-dlp...
[youtube] Extracting URL: https://www.youtube.com/watch?v=9V7cMp_w1_0
[youtube] 9V7cMp_w1_0: Downloading webpage
[youtube] 9V7cMp_w1_0: Downloading tv client config
[youtube] 9V7cMp_w1_0: Downloading tv player API JSON
[youtube] 9V7cMp_w1_0: Downloading ios player API JSON
[youtube] 9V7cMp_w1_0: Downloading m3u8 information
[info] 9V7cMp_w1_0: Downloading 1 format(s): 605+140
[hlsnative] Downloading m3u8 manifest
[hlsnative] Total fragments: 35
[download] Destination: extracted_frames\temp_videos\v_9V7cMp_w1_0.f605.mp4
[download] 100% of    4.42MiB in 00:00:06 at 697.83KiB/s                
[download] Destination: extracted_frames\temp_videos\v_9V7cMp_w1_0.f140.m4a
[download] 100% of    2.73MiB in 00:00:02 at 1.30MiB/s   
[Merger] Merging formats into "extracted_frames\temp_vi