In [None]:
import chromadb
import webvtt
import requests
import time
from bs4 import BeautifulSoup
import polars as pl
import patito as pt
import json
from chromadb import QueryResult, PersistentClient, Collection, Settings
import numpy as np
import os
from pydantic import BaseModel, Field, TypeAdapter, ValidationError
import pytest
import helpers
from helpers import video_url_pattern, mp4_url_pattern, mp3_url_pattern, vtt_url_pattern
from enum import Enum
from typing import Optional
from tqdm import tqdm
from config import *
import semchunk
from transformers import AutoTokenizer

In [None]:
# Script form of the path lookup, kept for when this moves back into a module
# (`__file__` is normally not defined inside a notebook kernel):
# project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# chroma_db_path = os.path.join(project_root, ".chroma")

chroma_db_path = r"C:\repos\All_The_Preaching_Web_Scraping_Pipeline\.chroma" #temporary while using notebook

# Open (or create) the on-disk Chroma database with telemetry disabled.
client = PersistentClient(
    path=chroma_db_path,
    settings=Settings(
        is_persistent=True,
        persist_directory=chroma_db_path,
        anonymized_telemetry=False,
    ),
)

# Drop the legacy transcript collection only if it is still present.
# An unconditional delete raises once the collection is gone, which breaks
# "Restart Kernel -> Run All" on every run after the first.
# list_collections() yields Collection objects or plain names depending on the
# installed chromadb version, so both shapes are normalised to a name here.
existing_collection_names = {
    entry if isinstance(entry, str) else entry.name
    for entry in client.list_collections()
}
if "atp_transcripts" in existing_collection_names:
    client.delete_collection("atp_transcripts")
# transcript_collection = client.get_or_create_collection(name="atp_transcripts", embedding_function=embedding_function)

# Collection that holds the chunk-level records.
chunk_collection = client.get_or_create_collection(name="atp_chunks")

print(chunk_collection.count())

In [None]:
# Read the scraped transcripts and wrap them in the patito transcript model
# before materialising the frame.
df = helpers.TranscriptDataFrameModel.LazyFrame(
    pl.read_parquet(
        r"C:\repos\All_The_Preaching_Web_Scraping_Pipeline\data\transcript.parquet"
    )
).collect()

# Best-effort schema check: violations are printed, not raised, so the rest of
# the notebook can still be explored.
try:
    df.validate()
except pt.DataFrameValidationError as validation_error:
    print(validation_error)
    # raise validation_error

print(df.columns)

In [None]:
def to_chunked_record_df(
    df_transcript: pt.DataFrame,
    chunk_size: int = 256,
    chunk_overlap: int = 30,
    tokenizer_name: str = "sentence-transformers/all-MiniLM-L6-v2",
) -> pt.DataFrame:
    """Split every transcript into token-bounded chunks, one output row per chunk.

    Each output row carries the chunk text, its 1-based position within the
    transcript (``chunk_number``), its token count, the total number of chunks
    of that transcript (``chunks_count``), the remaining metadata columns of
    the source row, and a ``chunk_id`` of the form ``<video_id>_<chunk_number>``.

    Args:
        df_transcript: Frame with at least ``video_id`` and ``transcript``
            columns, plus the ``vtt``, ``mp3_url``, ``vtt_url`` and
            ``transcript_hash`` columns that are dropped from the output.
        chunk_size: Maximum number of tokens per chunk.
        chunk_overlap: Overlap passed to the semchunk chunker.
        tokenizer_name: Hugging Face tokenizer used both for chunking and for
            the reported ``token_count``.

    Returns:
        A ``helpers.ChunkedRecordDataFrameModel`` frame. It is empty (and has
        no columns) when no transcript produced any chunk.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    chunker = semchunk.chunkerify(tokenizer, chunk_size)
    # Bulky or transcript-level columns that should not be repeated per chunk.
    dropped_columns = ["vtt", "transcript", "mp3_url", "vtt_url", "transcript_hash"]

    per_transcript_frames = []
    for row_index, transcript in enumerate(df_transcript["transcript"]):
        # Null/empty transcripts yield no chunks; an all-empty frame would get
        # Null-typed columns and could not be stacked with the others.
        if not transcript:
            continue
        chunks = chunker(transcript, overlap=chunk_overlap)
        if not chunks:
            continue
        chunk_total = len(chunks)
        chunk_frame = pl.DataFrame(
            {
                "chunk": chunks,
                "chunk_number": list(range(1, chunk_total + 1)),
                "token_count": [len(tokenizer.tokenize(chunk)) for chunk in chunks],
                "chunks_count": [chunk_total] * chunk_total,
            }
        )
        # Take the metadata by row position instead of filtering on video_id:
        # no dependence on column order, no full-frame scan per transcript, and
        # no duplicated chunks when a video_id appears more than once.
        metadata_row = df_transcript.slice(row_index, 1).drop(dropped_columns)
        per_transcript_frames.append(chunk_frame.join(metadata_row, how="cross"))

    df_chunks = helpers.ChunkedRecordDataFrameModel.DataFrame()
    if not per_transcript_frames:
        # Nothing to chunk: there is no video_id column to build chunk_id from.
        return df_chunks

    # Stack everything once onto the (still empty) model frame so the result
    # keeps the patito model attached.
    df_chunks.vstack(pl.concat(per_transcript_frames), in_place=True)
    df_chunks = df_chunks.with_columns(
        (
            pl.col("video_id").cast(pl.String)
            + pl.lit("_")
            + pl.col("chunk_number").cast(pl.String)
        ).alias("chunk_id")
    )
    # Best-effort validation: report schema violations without aborting.
    try:
        df_chunks.validate()
    except pt.DataFrameValidationError as e:
        print(e)
        # raise(e)
    return df_chunks
    # (Pandas version, irrelevant now) took 48.5 minutes to run with 16122 transcripts, 30 overlap, 256 size, resulting in 985480 chunks
    # Projected to take 90 minutes with 16k transcripts, have not tested. Maybe it's slower due to patito validation.
# Smoke-test the chunker on the first 20 transcripts; the notebook displays the result.
to_chunked_record_df(df_transcript=df.head(n=20))