In [1]:
import chromadb
import webvtt
import requests
import time
from bs4 import BeautifulSoup
import polars as pl
import patito as pt
from pydantic import BaseModel, Field, TypeAdapter, ValidationError
import pytest
import helpers
from helpers import video_url_pattern, mp4_url_pattern, mp3_url_pattern, vtt_url_pattern
from enum import Enum
from typing import Optional
from tqdm import tqdm
from config import *

In [15]:
# Load the scraped transcripts, wrap them in the patito model, drop unusable
# rows, de-duplicate, then report (without raising) any remaining schema
# violations and print the cleaned frame with every column visible.
df_transcript_raw = pl.read_parquet(
    r"C:\repos\All_The_Preaching_Web_Scraping_Pipeline\data\transcripts.parquet"
)

df_transcript = (
    helpers.TranscriptDataFrameModel.DataFrame(df_transcript_raw)
    .derive()
    # TODO: shouldn't need this when scraping. I used it because I am adapting
    # the existing dataset to the new format, and it has dupes.
    .unique("video_id")
    # Filters must match TranscriptDataFrameModel field validation definitions
    .filter(pl.col("mp4_url").str.contains(mp4_url_pattern))
    .filter(pl.col("mp3_url").str.contains(mp3_url_pattern))
    .filter(pl.col("vtt_url").str.contains(vtt_url_pattern))
    .filter(pl.col("transcript") != ".")
    .filter(pl.col("vtt").str.len_chars() > 5)
    .filter(pl.col("transcript").str.len_chars() > 5)
    # Helper column used by later cells to spot identical transcripts without
    # comparing the full text. The model reports it as a superfluous column
    # during validation because it is not one of the model's fields.
    .with_columns(pl.col("transcript").hash().alias("transcript_hash"))
    # A multi-column subset is a composite key: a row is dropped only when BOTH
    # mp4_url and transcript match another row. Rows sharing a transcript under
    # different mp4_urls are kept, so validation can still report duplicated
    # "transcript"/"vtt" values. De-duplicating on "transcript" or
    # "transcript_hash" alone (keep="first") was tried and is currently disabled.
    .unique(["mp4_url", "transcript"])
)

# Exploratory run: show validation problems instead of aborting the cell.
try:
    df_transcript.validate()
except pt.DataFrameValidationError as e:
    print(e)
    # raise(e)

# Use ONE Config holding both options. The previous `Config(...) and Config(...)`
# expression evaluated to the second object only, so the first one applied
# tbl_cols=-1 on construction and was never exited, leaking that setting into
# every later cell of the session.
with pl.Config(tbl_cols=-1, tbl_width_chars=200):
    print(df_transcript)

3 validation errors for TranscriptDataFrameModel
transcript_hash
  Superfluous column (type=type_error.superfluouscolumns)
vtt
  18 rows with duplicated values. (type=value_error.rowvalue)
transcript
  34 rows with duplicated values. (type=value_error.rowvalue)
shape: (15_842, 11)
┌──────────┬──────────────────┬──────────────────┬──────────────────┬──────────────────┬──────────────────┬──────────────────┬──────────────────┬──────────────────┬─────────────────┬─────────────────┐
│ video_id ┆ section          ┆ title            ┆ preacher         ┆ video_url        ┆ mp4_url          ┆ mp3_url          ┆ vtt_url          ┆ vtt              ┆ transcript      ┆ transcript_hash │
│ ---      ┆ ---              ┆ ---              ┆ ---              ┆ ---              ┆ ---              ┆ ---              ┆ ---              ┆ ---              ┆ ---             ┆ ---             │
│ i64      ┆ str              ┆ str              ┆ str              ┆ str              ┆ str              ┆ str    

In [21]:
# Export rows whose VTT text appears more than once, for manual inspection.
vtt_is_repeated = pl.col("vtt").is_duplicated()
duplicate_vtt_rows = df_transcript.filter(vtt_is_repeated)
duplicate_vtt_rows.write_csv(
    r"C:\repos\All_The_Preaching_Web_Scraping_Pipeline\data\metadata_check.csv"
)

# Export only the metadata (the long text columns are dropped first) of rows
# whose transcript hash appears more than once.
hash_is_repeated = pl.col("transcript_hash").is_duplicated()
metadata_only = df_transcript.drop(["vtt", "transcript"])
duplicate_hash_rows = metadata_only.filter(hash_is_repeated)
duplicate_hash_rows.write_csv(
    r"C:\repos\All_The_Preaching_Web_Scraping_Pipeline\data\data_check.csv"
)

In [33]:
import polars as pl

# Minimal example of how `DataFrame.unique(subset=...)` treats several columns.
# Both rows share a transcript_hash but differ in preacher.
df = pl.DataFrame(
    [
        pl.Series(
            "title",
            ["can christians drink alcohol", "can christians drink alcohol"],
            dtype=pl.String,
        ),
        pl.Series("preacher", ["unknown", "pastor bruce mejia"], dtype=pl.String),
        pl.Series(
            "transcript_hash", [837119213172560523, 837119213172560523], dtype=pl.UInt64
        ),
    ]
)

print("Original DataFrame:")
print(df)

# A multi-column subset acts as a composite key: a row is a duplicate only when
# ALL listed columns match. The preachers differ, so both rows are kept. This is
# the documented behaviour, not a polars bug.
unique_df = df.unique(subset=["preacher", "transcript_hash"])

print(
    """
unique_df = df.unique(subset=["preacher", "transcript_hash"])"""
)
print(
    "Expected, not a bug. The rows differ in 'preacher', so each "
    "(preacher, transcript_hash) pair is already unique:"
)
print(unique_df)

# The order of the subset columns has no effect on which rows count as duplicates.
unique_df = df.unique(subset=["transcript_hash", "preacher"])

print(
    """
unique_df = df.unique(subset=["transcript_hash", "preacher"])"""
)
print("\nReversed order of columns in subset gives the same result, both rows are kept:")
print(unique_df)

# To drop rows that repeat a value in EITHER column, de-duplicate one column at
# a time. The calls are chained so the second works on the result of the first;
# previously the first result was overwritten before it was ever used.
unique_df = df.unique(subset=["preacher"]).unique(subset=["transcript_hash"])

print(
    """
unique_df = df.unique(subset=["preacher"]).unique(subset=["transcript_hash"])"""
)
print("\nTo drop rows that repeat a value in either column, call .unique() once per column:")
print(unique_df)

Original DataFrame:
shape: (2, 3)
┌──────────────────────────────┬────────────────────┬────────────────────┐
│ title                        ┆ preacher           ┆ transcript_hash    │
│ ---                          ┆ ---                ┆ ---                │
│ str                          ┆ str                ┆ u64                │
╞══════════════════════════════╪════════════════════╪════════════════════╡
│ can christians drink alcohol ┆ unknown            ┆ 837119213172560523 │
│ can christians drink alcohol ┆ pastor bruce mejia ┆ 837119213172560523 │
└──────────────────────────────┴────────────────────┴────────────────────┘

unique_df = df.unique(subset=["preacher", "transcript_hash"])
Bug. Does not remove non-unique values:
shape: (2, 3)
┌──────────────────────────────┬────────────────────┬────────────────────┐
│ title                        ┆ preacher           ┆ transcript_hash    │
│ ---                          ┆ ---                ┆ ---                │
│ str                   

In [28]:
# Print Python source that reconstructs `df`, handy for pasting a reproducible
# example elsewhere.
# NOTE(review): this uses whichever `df` is currently live in the kernel; the
# saved output lists more columns than the demo frame built earlier in this
# notebook, so confirm which frame is intended before relying on a re-run.
init_source = df.to_init_repr()
print(init_source)

pl.DataFrame(
    [
        pl.Series('video_id', [1550162, 8288714], dtype=pl.Int64),
        pl.Series('section', ['other videos', 'sermon clips pastor bruce mejia'], dtype=pl.String),
        pl.Series('title', ['can christians drink alcohol', 'can christians drink alcohol'], dtype=pl.String),
        pl.Series('preacher', ['unknown', 'pastor bruce mejia'], dtype=pl.String),
        pl.Series('video_url', ['https://allthepreaching.com/pages/video.php?id=1550162', 'https://allthepreaching.com/pages/video.php?id=8288714'], dtype=pl.String),
        pl.Series('mp4_url', ['https://www.kjv1611only.com/video/12other/Can_Christians_Drink_Alcohol.mp4', 'https://www.kjv1611only.com/video/02preaching/Sermon_Clips_Pastor_Mejia/Can_Christians_Drink_Alcohol.mp4'], dtype=pl.String),
        pl.Series('mp3_url', ['https://www.kjv1611only.com/video/12other/Can_Christians_Drink_Alcohol.mp3', 'https://www.kjv1611only.com/video/02preaching/Sermon_Clips_Pastor_Mejia/Can_Christians_Drink_Alcohol.mp3'], 

In [None]:
# import json
# from chromadb import QueryResult, PersistentClient, Settings
# import numpy as np
# import os

# # --- ChromaDB Client Setup ---
# # This code runs ONCE when the server starts.
# # Construct the path to the database directory at the project root.
# # This makes the path independent of where the script is run from.

# # project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# # chroma_db_path = os.path.join(project_root, ".chroma")

# chroma_db_path = r"C:\repos\All_The_Preaching_Web_Scraping_Pipeline\.chroma" #temporary while using notebook

# client = PersistentClient(
#     path=chroma_db_path,
#     settings=Settings(
#         is_persistent=True,
#         persist_directory=chroma_db_path,
#         anonymized_telemetry=False,
#     ),
# )

# transcript_collection = client.get_or_create_collection(name="atp_transcripts")
# chunk_collection = client.get_or_create_collection(name="atp_chunks")