# fine tuning the video load funtions

In [7]:
# set up
import sys, os

# Add project root to sys.path
repo_root = os.path.abspath("..")   # assuming notebook is in /notebooks
sys.path.append(repo_root)

# Verify import works
from src.ingestion import video_loader
from typing import Dict, Any
import yt_dlp



In [2]:
#Imports + Model:Load required libraries and your embedding model.
from sentence_transformers import SentenceTransformer

# Choose embedding model for semantic splitting
model = SentenceTransformer("BAAI/bge-small-en")

In [3]:
#Pick a test video
# Short JASP video for testing
url = "https://www.youtube.com/watch?v=j9w7hEfeIbE"

In [8]:
# ---------------------------
# Video info only (include description as meta data)
# ---------------------------
def fetch_video_info(url: str) -> Dict[str, Any]:
    """
    Fetch basic metadata of a YouTube video using yt-dlp.

    Parameters
    ----------
    url : str
        Full YouTube video URL.

    Returns
    -------
    dict
        A dictionary containing:
        - video_id: str, unique YouTube ID of the video
        - url: str, the video URL
        - title: str, video title
        - description: str, video description text
        - author: str, uploader channel name
        - publish_date: str, upload date (YYYYMMDD)
        - duration: int, video length in seconds
        - chapters: list of dicts, chapter info if available
    """
    ydl_opts = {
        "quiet": True,
        "skip_download": True,
        "noplaylist": True,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=False)
        if not info:
            raise RuntimeError(f"yt-dlp failed for {url}")

        meta = {
            "video_id": info.get("id"),
            "url": url,
            "title": info.get("title"),
            "description": info.get("description") or "",
            "author": info.get("uploader"),
            "publish_date": info.get("upload_date"),
            "duration": info.get("duration"),
            "chapters": info.get("chapters") or []
        }

    return meta

meta= fetch_video_info(url)

print ("fetched meta data:\n")
for k,v in meta.items():
    print(f"{k}:{v}")

fetched meta data:

video_id:j9w7hEfeIbE
url:https://www.youtube.com/watch?v=j9w7hEfeIbE
title:How to do a One-Way Goodness of Fit Chi-Square in JASP (15-10)
description:We learn how to calculate a One-Way Chi-Square goodness of fit test in JASP using the setting for Multinomial Test. For the null hypothesis, we assume that the observed values in one group (the Pigs) do not differ from the values in a comparison population (the Bears) or that there is no difference between the group in their choice of building materials: Straw, Sticks, or Brick and mortar. We create a simple data set in Excel, then open the .CSV dataset in JASP. We conduct the test, interpret the results, and write up the findings in APA style. This is the Pig and Bear Chi Square.

Download the Friendly, Free, Flexible, Functional JASP software from the official JASP statistics website: https://jasp-stats.org

This video teaches the following commands and techniques in JASP:
Importing a CSV into JASP
One-Way Chi-Square

In [9]:
import yt_dlp
import requests
from typing import List, Dict, Any

def fetch_transcript(url: str, lang: str = "en") -> Dict[str, Any]:
    """
    Fetch transcript (manual or auto) for a YouTube video using yt-dlp.
    Returns both raw segments and a flattened text block.

    Parameters
    ----------
    url : str
        YouTube video URL.
    lang : str
        Language code for captions (default: "en").

    Returns
    -------
    dict
        {
          "segments": [ { "text": str, "start": float, "duration": float }, ... ],
          "full_text": str   # flattened transcript text
        }
    """
    ydl_opts = {
        "quiet": True,
        "skip_download": True,
        "writesubtitles": True,
        "subtitleslangs": [lang],
        "subtitlesformat": "json3"
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=False)

        # check manual or auto captions
        subs = info.get("subtitles") or {}
        auto_subs = info.get("automatic_captions") or {}
        tracks = subs.get(lang) or auto_subs.get(lang)

        if not tracks:
            print("⚠️ No transcript available for this video.")
            return {"segments": [], "full_text": ""}

        # pick JSON3 format
        sub_url = next((t["url"] for t in tracks if t["ext"] == "json3"), None)
        if not sub_url:
            print("⚠️ No JSON3 subtitle track available.")
            return {"segments": [], "full_text": ""}

        # fetch JSON3 captions
        resp = requests.get(sub_url)
        resp.raise_for_status()
        data = resp.json()

        segments = []
        for evt in data.get("events", []):
            if "segs" in evt:
                text = "".join(seg.get("utf8", "") for seg in evt["segs"]).strip()
                if text:
                    segments.append({
                        "text": text,
                        "start": evt.get("tStartMs", 0) / 1000.0,
                        "duration": evt.get("dDurationMs", 0) / 1000.0
                    })

        # flatten transcript
        full_text = " ".join(seg["text"] for seg in segments)

        return {"segments": segments, "full_text": full_text}



url = "https://www.youtube.com/watch?v=j9w7hEfeIbE"
transcript_data = fetch_transcript(url)

print("Flattened text preview:")
print(transcript_data["full_text"][:300], "...")
print("\nFirst 3 segments:")
for seg in transcript_data["segments"][:3]:
    print(seg)



Flattened text preview:
We are now going to calculate a one-way chi-square goodness-of-fit test in JASP. For this example, we are going to assume that the observed values in one group (the Pigs) do not differ from the values in a comparison population (the Bears) or that there is "no difference" between the groups. This is ...

First 3 segments:
{'text': 'We are now going to calculate a one-way', 'start': 0.0, 'duration': 5.64}
{'text': 'chi-square goodness-of-fit test in', 'start': 2.85, 'duration': 6.27}
{'text': 'JASP. For this example, we are going to', 'start': 5.64, 'duration': 5.49}


In [13]:
import yt_dlp
import requests
from typing import List, Dict, Any
from sentence_transformers import SentenceTransformer, util
from langchain_text_splitters import RecursiveCharacterTextSplitter

# ---------------------------
# Config
# ---------------------------
MAX_CHARS = 1200        # ~300 tokens
OVERLAP_CHARS = 150
SEMANTIC_THRESHOLD = 0.70
TEXT_MODEL_NAME = "BAAI/bge-small-en"

embedder = SentenceTransformer(TEXT_MODEL_NAME)



# ---------------------------
# Splitting functions
# ---------------------------
def length_split(text: str, meta: Dict[str, Any], uid_prefix: str) -> List[Dict[str, Any]]:
    """
    Hard split text into chunks based on length (chars).
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=MAX_CHARS,
        chunk_overlap=OVERLAP_CHARS
    )

    chunks = []
    for i, split_text in enumerate(splitter.split_text(text)):
        chunks.append({
            "id": f"{uid_prefix}_len{i}",
            "text": split_text,
            "meta": meta
        })
    return chunks


def semantic_split(text: str, meta: Dict[str, Any], uid_prefix: str) -> List[Dict[str, Any]]:
    """
    Semantic split using cosine similarity between sentences.
    Falls back to length split if chunks too large.
    """
    if len(text) <= MAX_CHARS:
        return [{"id": f"{uid_prefix}_sem0", "text": text, "meta": meta}]

    sentences = text.split(". ")
    embeddings = embedder.encode(sentences, convert_to_tensor=True)

    sims = util.pytorch_cos_sim(embeddings[:-1], embeddings[1:]).diagonal()
    sims = sims.cpu().numpy()

    breakpoints = [i+1 for i, score in enumerate(sims) if score < SEMANTIC_THRESHOLD]

    chunks, start = [], 0
    seg_id = 0
    for bp in breakpoints + [len(sentences)]:
        chunk_text = ". ".join(sentences[start:bp]).strip()
        if chunk_text:
            if len(chunk_text) > MAX_CHARS:
                # fallback to length split
                subchunks = length_split(chunk_text, meta, f"{uid_prefix}_sem{seg_id}")
                chunks.extend(subchunks)
            else:
                chunks.append({
                    "id": f"{uid_prefix}_sem{seg_id}",
                    "text": chunk_text,
                    "meta": meta
                })
            seg_id += 1
        start = bp

    return chunks

# ---------------------------
# Hybrid split (calls the others)
# ---------------------------
def hybrid_split(meta: Dict[str, Any], transcript: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Hybrid pipeline:
    1. Split by chapters.
    2. Semantic split for large blocks.
    3. Length fallback for oversized chunks.
    """
    segments = transcript["segments"]
    if not segments:
        return []

    # --- 1. Chapter-based grouping ---
    chapter_blocks = []
    if meta.get("chapters"):
        chapters = meta["chapters"]
        for i, ch in enumerate(chapters):
            start = ch["start_time"]
            end = chapters[i+1]["start_time"] if i + 1 < len(chapters) else float("inf")

            texts = [s["text"] for s in segments if start <= s["start"] < end]
            block_text = " ".join(texts).strip()
            if block_text:
                chapter_blocks.append({
                    "text": block_text,
                    "meta": {
                        "video_id": meta["video_id"],
                        "url": meta["url"],
                        "title": meta["title"],
                        "chapter": ch.get("title"),
                        "start_time": start,
                        "yt_link": f"{meta['url']}&t={int(start)}s",
                        "chapter_index": i
                    }
                })
    else:
        block_text = transcript["full_text"].strip()
        chapter_blocks = [{
            "text": block_text,
            "meta": {
                "video_id": meta["video_id"],
                "url": meta["url"],
                "title": meta["title"],
                "chapter": None,
                "start_time": 0,
                "yt_link": meta["url"],
                "chapter_index": 0
            }
        }]

    # --- 2 & 3. Semantic + fallback length split ---
    final_chunks = []
    for block in chapter_blocks:
        uid_prefix = f"{block['meta']['video_id']}_ch{block['meta']['chapter_index']}"
        chunks = semantic_split(block["text"], block["meta"], uid_prefix)
        final_chunks.extend(chunks)

    return final_chunks

# ---------------------------
# Example usage
# ---------------------------
if __name__ == "__main__":
    url = "https://www.youtube.com/watch?v=j9w7hEfeIbE"
    meta = fetch_video_info(url)
    transcript = fetch_transcript(url, lang="en")
    chunks = hybrid_split(meta, transcript)

    print(f"✅ Produced {len(chunks)} chunks\n")
    for ch in chunks:
        print("ID:", ch["id"])
        print("Chapter:", ch["meta"]["chapter"])
        print("Text:\n", ch["text"])
        print("=" * 80)  # separator


✅ Produced 8 chunks

ID: j9w7hEfeIbE_ch0_sem0
Chapter: Intro
Text:
 We are now going to calculate a one-way chi-square goodness-of-fit test in JASP. For this example, we are going to assume that the observed values in one group (the Pigs) do not differ from the values in a comparison population (the Bears) or that there is "no difference" between the groups. This is the "Pig and Bear Chi-Square"! [Music] A house inspector knows that families of three bears choose building materials in a certain pattern. Out of 70 houses built by bear families, 30 were brick-and-mortar, 20 were sticks, and 20 were straw. The house inspector then randomly samples building permits pulled by 70 little pigs who were building houses to determine whether a pattern exists in the materials that the little pigs chose for housing construction. Our research question is: "Do the pig and bear building permit patterns differ significantly?" We know the Bears pattern... is the pigs pattern different? In the previous v

In [14]:
lengths = [len(ch["text"].split()) for ch in chunks]
print("Min:", min(lengths), "Max:", max(lengths), "Avg:", sum(lengths)/len(lengths))


Min: 39 Max: 232 Avg: 169.0


In [15]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-small-en")
token_counts = [len(tokenizer.encode(ch["text"])) for ch in chunks]
print(token_counts)


[201, 217, 284, 241, 249, 49, 291, 157]


In [13]:
#Parameter tuning experiments
#Wrap tests in loops to experiment systematically.
for threshold in [0.6, 0.7, 0.8]:
    blocks = semantic_split(transcript, model, threshold=threshold)
    sized = enforce_max_chars(blocks, max_chars=1200, overlap_chars=100)
    print(f"Threshold={threshold}: {len(sized)} chunks")



TypeError: enforce_max_chars() got an unexpected keyword argument 'overlap_chars'

In [None]:
#Save experimental outputs (optional):Store results for inspection later.
import json

with open("transcript_blocks.json", "w") as f:
    json.dump(sized_blocks, f, indent=2)

print("Saved experiment results to transcript_blocks.json")
