In [6]:
# chromadb webvtt-py requests bs4 polars
import time
from enum import Enum
from pathlib import Path
from typing import Optional

import chromadb
import patito as pt
import polars as pl
import pytest
import requests
import webvtt
from bs4 import BeautifulSoup
from pydantic import BaseModel, Field, TypeAdapter, ValidationError

import web_scraper_helpers as wsh

In [None]:
class PreScrapingDataFrameModel(pt.Model):
    """Schema for the link records collected before any video page is scraped.

    One row per video link: a unique positive ``video_id``, ``section`` and
    ``title`` strings of at least three characters, and the unique absolute
    ``video_url`` of the video page.
    """

    video_id: int = pt.Field(unique=True, ge=1)
    section: str = pt.Field(min_length=3)
    title: str = pt.Field(min_length=3)
    # The dots in the host name and in "video.php" are escaped so they only
    # match a literal "."; unescaped, each dot would accept any character.
    # The id must be exactly seven digits.
    # NOTE: this may break when the site gets updated, since the new link
    # format is just "/video/..." instead of "https://.../video/...".
    video_url: str = pt.Field(
        unique=True,
        pattern=r"^https://allthepreaching\.com/pages/video\.php\?id=\d{7}$",
    )
    # allow_missing=True lets a frame validate even when this column is absent.
    fail_reasons: str = pt.Field(allow_missing=True)


class CleanTranscriptDataFrameModel(pt.Model):
    """Schema for records that carry media URLs and a cleaned transcript.

    Extends the pre-scraping columns with ``preacher``, the ``mp4_url`` /
    ``mp3_url`` / ``vtt_url`` media links, and the ``transcript`` text.
    ``mp3_url`` and ``vtt_url`` declare ``derived_from`` expressions that
    rewrite the file extension of ``mp4_url``.
    """

    video_id: int = pt.Field(unique=True, ge=1)
    section: str = pt.Field()
    # pt.Field must be *called*; assigning the bare function would make the
    # function object itself the default value of the field.
    title: str = pt.Field()
    preacher: str = pt.Field()
    # Dots are escaped so they only match a literal ".".
    # NOTE: this may break when the site gets updated, since the new link
    # format is just "/video/..." instead of "https://.../video/...".
    video_url: str = pt.Field(
        unique=True,
        pattern=r"^https://allthepreaching\.com/pages/video\.php\?id=\d{7}$",
    )
    # The media patterns are anchored (like video_url) and have their dots
    # escaped, so the whole value must be a URL ending in the right extension.
    mp4_url: str = pt.Field(
        unique=True,
        pattern=r"^https://www\.kjv1611only\.com/video/\w+/\w+/\w+\.mp4$",
    )
    # Only the trailing ".mp4" extension is rewritten. A plain
    # str.replace("mp4", ...) would instead rewrite the first "mp4" found
    # anywhere in the URL, e.g. inside a directory name.
    mp3_url: str = pt.Field(
        unique=True,
        pattern=r"^https://www\.kjv1611only\.com/video/\w+/\w+/\w+\.mp3$",
        derived_from=(pl.col("mp4_url").str.replace(r"\.mp4$", ".mp3")),
    )
    vtt_url: str = pt.Field(
        unique=True,
        pattern=r"^https://www\.kjv1611only\.com/video/\w+/\w+/\w+\.vtt$",
        derived_from=(pl.col("mp4_url").str.replace(r"\.mp4$", ".vtt")),
    )
    # txt_url: str = pt.Field() # Currently, vtt_to_txt.php is not available on ATP, so this field is unused
    transcript: str = pt.Field(unique=True, min_length=50)
    # NOTE(review): PreScrapingDataFrameModel declares fail_reasons as a plain
    # str with allow_missing=True, while this model uses an optional list of
    # strings with no Field() - confirm which representation is intended.
    fail_reasons: Optional[list[str]]

# class ChunkedRecord(pt.Model):
#     chunk: str = pt.Field(min_length=5)
#     pass

In [8]:
# Directory holding the saved archive-page HTML snapshots. Defined once so a
# different checkout location only needs to be changed in this one place.
DATA_DIR = Path(r"C:\repos\All_The_Preaching_Web_Scraping_Pipeline\data")

# all_links = wsh.get_records_from_archive_url()

# The helper is handed a plain string, exactly as before the refactor.
test_links_1 = wsh.get_records_from_html_file(str(DATA_DIR / "test_archive_1.html"))
test_links_2 = wsh.get_records_from_html_file(str(DATA_DIR / "test_archive_2.html"))
test_links_3 = wsh.get_records_from_html_file(str(DATA_DIR / "test_archive_3.html"))
print(len(test_links_1))
print(len(test_links_2))
print(len(test_links_3))

# Full archive snapshot (file name carries the capture date).
test_links_all = wsh.get_records_from_html_file(
    str(DATA_DIR / "atp_archive_2025-07-07.html")
)
print(len(test_links_all))
# Show one record to eyeball the keys the helper returns.
print(test_links_all[0])

50
149
198
17057
{'section': 'salvation', 'title': '360 video', 'raw_video_url': 'video.php?id=8466772'}


In [24]:
# Build the pre-scraping frame from the first test archive: derive the numeric
# video_id and the absolute video_url from each record's relative
# raw_video_url, then drop the raw column.
df = (
    PreScrapingDataFrameModel.DataFrame(test_links_1)
    .lazy()
    .with_columns(
        # Capture the digits that follow "id=" and store them as an integer.
        pl.col("raw_video_url")
        .str.extract(r"id=(\d+)", 1)
        .cast(int)
        .alias("video_id")
    )
    .with_columns(
        # Separate step because it reads the video_id column created above.
        # A single alias on the whole concatenation names the result.
        (
            pl.lit("https://allthepreaching.com/pages/video.php?id=")
            + pl.col("video_id").cast(str)
        ).alias("video_url")
    )
    .drop("raw_video_url")
    .collect()
)
print(df)
try:
    df.validate()
except pt.DataFrameValidationError as e:
    # Print the validation message on its own, then re-raise the same
    # exception with its original traceback.
    print(e)
    raise

shape: (50, 4)
┌─────────────────────────┬──────────────────────────────┬──────────┬──────────────────────────────┐
│ section                 ┆ title                        ┆ video_id ┆ video_url                    │
│ ---                     ┆ ---                          ┆ ---      ┆ ---                          │
│ str                     ┆ str                          ┆ i64      ┆ str                          │
╞═════════════════════════╪══════════════════════════════╪══════════╪══════════════════════════════╡
│ sermonsbro. dillon awes ┆ 1 samuel 1                   ┆ 6020540  ┆ https://allthepreaching.com/ │
│                         ┆                              ┆          ┆ pa…                          │
│ sermonsbro. dillon awes ┆ 1 samuel 2                   ┆ 5509588  ┆ https://allthepreaching.com/ │
│                         ┆                              ┆          ┆ pa…                          │
│ sermonsbro. dillon awes ┆ 1 samuel 3                   ┆ 8266638  ┆ https: