In [None]:
import os
import time
from enum import Enum
from typing import Optional

import chromadb
import patito as pt
import polars as pl
import pytest
import requests
import webvtt
from bs4 import BeautifulSoup
from pydantic import BaseModel, Field, TypeAdapter, ValidationError
from tqdm import tqdm

import src.helpers as helpers

In [None]:
class PreScrapingDataFrameModel(pt.Model):
    """Schema for archive link records before any video page is scraped.

    Constraints declared below: ``video_id`` and ``video_url`` must be unique,
    ``video_id`` must be at least 1, and the three text fields need at least
    three characters each.
    """

    video_id: int = pt.Field(unique=True, ge=1)
    section: str = pt.Field(min_length=3)
    title: str = pt.Field(min_length=3)
    preacher: str = pt.Field(min_length=3)
    # The dots are escaped so they only match a literal "."; an unescaped "."
    # would accept any character in the host name and in "video.php".
    video_url: str = pt.Field(
        unique=True,
        pattern=r"^https://allthepreaching\.com/pages/video\.php\?id=\d{7}$",
    )  # this may break when the site gets updated, since the new way is just "/video/..." instead of "https:/.../video/..."


class CleanTranscriptDataFrameModel(PreScrapingDataFrameModel):
    """Pre-scraping schema extended with media URLs and transcript text.

    ``mp3_url`` and ``vtt_url`` are declared as derived from ``mp4_url`` by
    swapping the file extension. All added fields must be unique, and the
    ``vtt`` / ``transcript`` texts need at least 50 characters.
    """

    # Dots in the URL patterns are escaped so that "." only matches a literal
    # dot (host name and file extension) instead of any character.
    mp4_url: str = pt.Field(
        unique=True,
        pattern=r"https://www\.kjv1611only\.com/video/\w+/\w+/\w+\.mp4",
    )
    # The derivations replace the literal ".mp4" extension. A bare "mp4"
    # pattern would rewrite the first "mp4" found anywhere in the URL, which
    # is not necessarily the extension.
    mp3_url: str = pt.Field(
        unique=True,
        pattern=r"https://www\.kjv1611only\.com/video/\w+/\w+/\w+\.mp3",
        derived_from=pl.col("mp4_url").str.replace(".mp4", ".mp3", literal=True),
    )
    vtt_url: str = pt.Field(
        unique=True,
        pattern=r"https://www\.kjv1611only\.com/video/\w+/\w+/\w+\.vtt",
        derived_from=pl.col("mp4_url").str.replace(".mp4", ".vtt", literal=True),
    )
    # txt_url: str = pt.Field() # Currently, vtt_to_txt.php is not available on ATP, so this field is unused
    vtt: str = pt.Field(unique=True, min_length=50)
    transcript: str = pt.Field(unique=True, min_length=50)


# class ChunkedRecord(pt.Model):
#     chunk: str = pt.Field(min_length=5)
#     pass

In [None]:
# all_links = helpers.get_records_from_archive_url()

# Directory holding the saved archive HTML snapshots. The default is the
# original development location; set the ATP_DATA_DIR environment variable to
# run the notebook against a checkout that lives somewhere else.
DATA_DIR = os.environ.get(
    "ATP_DATA_DIR",
    r"C:\repos\All_The_Preaching_Web_Scraping_Pipeline\data",
)


def _load_archive_records(file_name):
    """Return the records parsed from one saved archive HTML file in DATA_DIR."""
    return helpers.get_records_from_html_file(os.path.join(DATA_DIR, file_name))


test_links_1 = _load_archive_records("test_archive_1.html")
test_links_2 = _load_archive_records("test_archive_2.html")
test_links_3 = _load_archive_records("test_archive_3.html")
print(len(test_links_1))
print(len(test_links_2))
print(len(test_links_3))

test_links_all = _load_archive_records("atp_archive_2025-07-07.html")
print(len(test_links_all))
print(test_links_all[0])

50
149
198
17057
{'section': 'salvation', 'title': '360 video', 'raw_video_url': 'video.php?id=8466772'}


In [4]:
# config.py

# Archive sections removed before scraping: five fixed names followed by the
# numbered "volume 1" .. "volume 6" sections.
disallowed_sections = [
    "salvation",
    "alexander scourby",
    "hymns",
    "other music",
    "psalms",
    *(f"volume {number}" for number in range(1, 7)),
]

# Video ids to skip; starts out empty.
existing_video_ids = list()  # connect this to DB

# Replacement template shared by both tables: first capture group, a space,
# then the second capture group.
_GROUPS_WITH_SPACE = r"${1} ${2}"

# Word pairs stuck together (lowercase). Every pair turns into a two-group
# regex key so the match can be rewritten with a space between the words.
section_replacements = {
    f"({first})({second})": _GROUPS_WITH_SPACE
    for first, second in (
        ("clips", "pastor"),
        ("clips", "bro"),
        ("sermons", "pastor"),
        ("sermons", "bro"),
        ("seminar", "pastor"),
        ("winning", "motivation"),
        ("winning", "tips"),
        ("tribulation", "moments"),
        ("by", "pastor"),
        ("lord", "conference"),
        ("james", "conference"),
    )
}

# Year suffix: adds a space between a run of lowercase letters and the four
# digits that directly follow it.
title_replacements = {r"([a-z]+)(\d{4})": _GROUPS_WITH_SPACE}

# Canonical preacher name paired with the alternate spellings that should be
# rewritten to it. The pairs are flattened below into the alias -> canonical
# lookup, keeping this top-to-bottom, left-to-right order.
_aliases_by_preacher = [
    ("pastor steven anderson", ("pastor anderson", "steven anderson")),
    ("pastor jonathan shelley", ("pastor shelley", "jonathan shelley")),
    ("pastor roger jimenez", ("pastor jimenez", "roger jimenez")),
    ("pastor aaron thompson", ("pastor thompson", "aaron thompson")),
    ("pastor dillon awes", ("bro dillon awes", "dillon awes", "pastor awes")),
    ("pastor bruce mejia", ("evangelist mejia", "pastor mejia", "bruce mejia")),
    ("pastor matthew stucky", ("bro stucky", "matthew stucky")),
    ("pastor corbin ressl", ("deacon ressl", "corbin ressl", "pastor ressl")),
    ("pastor ian taverner", ("ian taverner",)),
    ("pastor kevin sepulveda", ("kevin sepulveda",)),
    ("pastor enrique reyes", ("pastor reyes",)),
    ("pastor oscar bougardt", ("pastor bougardt",)),
    ("pastor webb", ("pastor webb",)),  # what is his first name?
    ("deacon oliver gonzalez", ("deacon gonzalez",)),
    ("brother chris segura", ("chris segura", "bro segura")),
    ("brother alexander larson", ("bro alex larson", "alexander larson")),
    ("brother raymond cooper", ("raymond cooper",)),
    ("brother ben naim", ("ben naim",)),
]

preacher_names_replacements = {
    alias: canonical
    for canonical, aliases in _aliases_by_preacher
    for alias in aliases
}

# Cleaned section name -> preacher assigned to every video in that section.
# "evaluate" and "unknown" are placeholder values rather than names
# (NOTE(review): presumably "evaluate" means the preacher still has to be
# worked out per video - confirm).
section_preacher_map = {
    "fbbf 2019": "evaluate",
    "fbbf 2021": "evaluate",
    "fbc 2019": "evaluate",
    "fbc 2020": "evaluate",
    "fbc 2021": "evaluate",
    "fwbc missions conference 2023": "evaluate",
    "fwbc missions conference 2024": "evaluate",
    "fwbc preaching class 2022": "evaluate",
    "heritage of the lord conference 2022": "evaluate",
    "king james conference 2023": "evaluate",
    "masa 2019": "evaluate",
    "marching to zion conference 2018": "evaluate",
    "next generation youth rally 2023": "evaluate",
    "prophesy conference 2015": "evaluate",
    "red hot preaching conference 2016": "evaluate",
    "red hot preaching conference 2017": "evaluate",
    "red hot preaching conference 2018": "evaluate",
    "red hot preaching conference 2019": "evaluate",
    "red hot preaching conference 2020": "evaluate",
    "red hot preaching conference 2021": "evaluate",
    "red hot preaching conference 2022": "evaluate",
    "red hot preaching conference 2023": "evaluate",
    "red hot preaching conference 2024": "evaluate",
    "sermon clips bro. dillon awes": "pastor dillon awes",
    "sermon clips bro. chris segura": "brother chris segura",
    "sermon clips pastor matthew stucky": "pastor matthew stucky",
    "sermon clips pastor corbin ressl": "pastor corbin ressl",
    "sermon clips pastor steven anderson": "pastor steven anderson",
    "sermon clips pastor ian taverner": "pastor ian taverner",
    "sermon clips pastor roger jimenez": "pastor roger jimenez",
    "sermon clips pastor bruce mejia": "pastor bruce mejia",
    "sermon clips pastor jonathan shelley": "pastor jonathan shelley",
    "sermon clips pastor aaron thompson": "pastor aaron thompson",
    "sermons pastor dillon awes": "pastor dillon awes",
    "sermons bro. dillon awes": "pastor dillon awes", # TODO need to make this more flexible, perhaps using regex
    # The next three use the "brother ..." form so they agree with the
    # canonical names in preacher_names_replacements and with the
    # "sermon clips bro. chris segura" entry above.
    "sermons bro. raymond cooper": "brother raymond cooper",
    "sermons bro. ben naim": "brother ben naim",
    "sermons bro. chris segura": "brother chris segura",
    "sermons pastor matthew stucky": "pastor matthew stucky",
    "sermons pastor corbin ressl": "pastor corbin ressl",
    "sermons pastor steven anderson": "pastor steven anderson",
    "sermons pastor ian taverner": "pastor ian taverner",
    "sermons pastor roger jimenez": "pastor roger jimenez",
    "sermons pastor bruce mejia": "pastor bruce mejia",
    "sermons pastor kevin sepulveda": "pastor kevin sepulveda",
    "sermons pastor jonathan shelley": "pastor jonathan shelley",
    "sermons pastor aaron thompson": "pastor aaron thompson",
    "soul winning motivation": "unknown",
    "seminar pastor anderson": "pastor steven anderson",
    "seminar pastor jimenez": "pastor roger jimenez",
    "seminar pastor shelley": "pastor jonathan shelley",
    "soul winning tips": "evaluate",
    "creation moments": "unknown",
    "day of the dead moments": "evaluate",
    "topics for dummies": "brother alexander larson",
    "hindu moments": "evaluate",
    "israel moments": "evaluate",
    "kjv minutes": "evaluate",
    "mormon moments": "evaluate",
    "post tribulation moments": "evaluate",
    "short videos by pastor jimenez": "pastor roger jimenez",
    "thessalonian moments": "evaluate",
    "trinity moments": "evaluate",
    "documentaries": "unknown",
    "trailers": "unknown",
    "creation": "unknown",
    "interviews": "unknown",
    "baptist bias": "unknown",
    "olive crown": "unknown",
    "rod of iron": "unknown",
    "sword of the spirit": "unknown",
    "true born sons": "unknown",
    "the sword drill": "unknown",
    "uncensored": "unknown",
    "landmarks": "unknown",
    "other videos": "unknown",
}

In [None]:
# Two-column lookup frame (section, preacher) with one row per mapping entry,
# in dict order. It is built directly from the mapping's keys and values
# instead of handing a single dict to `pl.from_dicts`, transposing the
# one-row result and renaming the auto-generated "column" / "column_0"
# headers.
section_preacher_df = pl.DataFrame(
    {
        "section": list(section_preacher_map.keys()),
        "preacher": list(section_preacher_map.values()),
    }
).lazy()

# Build the pre-scraping frame lazily: drop disallowed sections, pull the
# numeric id out of the raw link, skip ids that are already known, and rebuild
# the absolute video URL from the id.
df = (
    PreScrapingDataFrameModel.DataFrame(test_links_1)
    .lazy()
    .filter(~pl.col("section").is_in(disallowed_sections))
    .with_columns(
        pl.col("raw_video_url")
        .str.extract(
            r"id=(\d+)", 1
        )  # This regex looks for 'id=' followed by one or more digits (\d+)
        .cast(int)
        .alias("video_id")
    )
    .filter(~pl.col("video_id").is_in(existing_video_ids))
    .with_columns(
        (
            pl.lit("https://allthepreaching.com/pages/video.php?id=")
            + pl.col("video_id").cast(str)
        ).alias("video_url")
    )
    .drop("raw_video_url")
)

# Insert the missing spaces into section names and titles.
for pattern, replacement in section_replacements.items():
    df = df.with_columns(pl.col("section").str.replace_all(pattern, replacement))
for pattern, replacement in title_replacements.items():
    df = df.with_columns(pl.col("title").str.replace_all(pattern, replacement))

# The join below is an inner join, so rows whose section has no entry in the
# lookup disappear from the result. Report those sections first so the loss
# is visible instead of silent.
unmapped_sections = (
    df.join(section_preacher_df, on="section", how="anti")
    .select("section")
    .unique()
    .sort("section")
    .collect()
    .get_column("section")
    .to_list()
)
if unmapped_sections:
    print(f"Sections without a preacher mapping (rows dropped): {unmapped_sections}")

df = df.join(section_preacher_df, on="section").collect()
print(df)
try:
    df.validate()
except pt.DataFrameValidationError as e:
    # Validation problems are only reported here; the notebook keeps running.
    print(e)
    # raise(e)

# TODO temporary. remove when ready for production. This limits the amount of web requests to 5
MAX_SCRAPE_REQUESTS = 5

video_urls = df.get_column("video_url")[:MAX_SCRAPE_REQUESTS]

# One entry per scraped video, appended in the same order as video_urls.
mp4_urls = []
mp3_urls = []
vtt_urls = []
vtts = []
transcripts = []
for video_url in tqdm(video_urls, total=len(video_urls)):
    mp4_url = helpers.get_mp4_url_from_video_url(video_url)
    # The audio and subtitle URLs differ from the mp4 URL only by extension.
    mp3_url = mp4_url.replace(".mp4", ".mp3")
    vtt_url = mp4_url.replace(".mp4", ".vtt")
    vtt = helpers.get_html_content_from_url(vtt_url)
    transcript = helpers.vtt_to_text(vtt)

    mp4_urls.append(mp4_url)
    mp3_urls.append(mp3_url)
    vtt_urls.append(vtt_url)
    vtts.append(vtt)
    transcripts.append(transcript)
    time.sleep(1)  # one-second pause per video (presumably to rate-limit requests)

new_rows = pl.DataFrame(
    {
        "video_url": video_urls,
        "mp4_url": mp4_urls,
        "mp3_url": mp3_urls,
        "vtt_url": vtt_urls,
        "vtt": vtts,
        "transcript": transcripts,
    }
)
# The mp3/vtt URLs stored above are the ones the loop actually used, so they
# are kept as-is. Recomputing them with the regex ".mp4" (where "." matches
# any character) could rewrite a different part of the URL. There is also no
# earlier batch to append to, so the new rows are the whole scraping frame.
scraping_df = new_rows

df = CleanTranscriptDataFrameModel.DataFrame(df).join(scraping_df, on="video_url")

print(df)

try:
    df.validate()
except pt.DataFrameValidationError as e:
    # Validation problems are only reported here; the notebook keeps running.
    print(e)
    # raise (e)

shape: (50, 5)
┌──────────────────────┬─────────────────────┬──────────┬─────────────────────┬────────────────────┐
│ section              ┆ title               ┆ video_id ┆ video_url           ┆ preacher           │
│ ---                  ┆ ---                 ┆ ---      ┆ ---                 ┆ ---                │
│ str                  ┆ str                 ┆ i64      ┆ str                 ┆ str                │
╞══════════════════════╪═════════════════════╪══════════╪═════════════════════╪════════════════════╡
│ sermons bro. dillon  ┆ 1 samuel 1          ┆ 6020540  ┆ https://allthepreac ┆ pastor dillon awes │
│ awes                 ┆                     ┆          ┆ hing.com/pa…        ┆                    │
│ sermons bro. dillon  ┆ 1 samuel 2          ┆ 5509588  ┆ https://allthepreac ┆ pastor dillon awes │
│ awes                 ┆                     ┆          ┆ hing.com/pa…        ┆                    │
│ sermons bro. dillon  ┆ 1 samuel 3          ┆ 8266638  ┆ https://alltheprea

100%|██████████| 5/5 [00:14<00:00,  2.92s/it]

shape: (5, 10)
┌───────────┬───────────┬──────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ section   ┆ title     ┆ video_id ┆ video_url ┆ … ┆ mp3_url   ┆ vtt_url   ┆ vtt       ┆ transcrip │
│ ---       ┆ ---       ┆ ---      ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ t         │
│ str       ┆ str       ┆ i64      ┆ str       ┆   ┆ str       ┆ str       ┆ str       ┆ ---       │
│           ┆           ┆          ┆           ┆   ┆           ┆           ┆           ┆ str       │
╞═══════════╪═══════════╪══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ sermons   ┆ 1 samuel  ┆ 6020540  ┆ https://a ┆ … ┆ https://w ┆ https://w ┆ WEBVTT    ┆ In the    │
│ bro.      ┆ 1         ┆          ┆ lltheprea ┆   ┆ ww.kjv161 ┆ ww.kjv161 ┆           ┆ book of   │
│ dillon    ┆           ┆          ┆ ching.com ┆   ┆ 1only.com ┆ 1only.com ┆ 00:00.000 ┆ First     │
│ awes      ┆           ┆          ┆ /pa…      ┆   ┆ /vi…      ┆ /vi…      ┆


