<h3>Load Dataset</h3>

In [1]:
import glob
# glob returns matches in arbitrary, OS-dependent order; sorting makes the
# processing order (and the log output) reproducible from run to run.
files = sorted(glob.glob("dataset/*.mp3"))

<h3>Preprocess the audio</h3>

In [2]:
from pydub import AudioSegment
import os

os.makedirs("processed", exist_ok=True)

# Chunk length is loop-invariant, so define it once. pydub slices in milliseconds.
chunk_len = 5 * 60 * 1000  # 5 min

for f in files:
    # splitext removes the extension whatever its length (safer than [:-4]).
    base = os.path.splitext(os.path.basename(f))[0]

    # Skip only when THIS file's first chunk exists. A plain prefix test
    # (name.startswith(base)) also matches chunks that belong to a different
    # recording whose name merely begins with the same text, which would
    # silently skip an unprocessed file.
    # NOTE: a run interrupted after chunk 0 was written is still treated as
    # processed; delete that file's chunks to force a redo.
    first_chunk = os.path.join("processed", f"{base}_0.wav")
    if os.path.exists(first_chunk):
        print(f"Skipping {base} (already processed)")
        continue

    # Convert to mono, 16 kHz, and normalize the volume before chunking.
    audio = AudioSegment.from_file(f)
    audio = audio.set_channels(1).set_frame_rate(16000).normalize()

    # Slicing with a step yields consecutive chunk_len-millisecond pieces.
    for i, chunk in enumerate(audio[::chunk_len]):
        out = f"processed/{base}_{i}.wav"
        chunk.export(out, format="wav")

print("Audio preprocessing completed (with skipping)")

Skipping 1997-07-23 - Proverbs 1_20-33 The Call To Wisdom - Jackson Boyett (113017214056) (already processed)
Skipping 2015-11-22 - The Pilgrims as Psalm Singers - Rev. James C. Gallagher (113017199280) (already processed)
Skipping 2016-12-04 - Walk in the Spirit - Timothy Guess (113017194316) (already processed)
Skipping 2017-11-15 - The Battle of Jericho 2 - Andy Proctor (1130172238493) (already processed)
Skipping 2017-11-25 - The LORD Shall Be King - Dr. John Whitcomb (1130171910161) (already processed)
Skipping 2017-11-26 - A Way of Escape - Chuck Hunt (1130172035161) (already processed)
Skipping 2017-11-26 - Christmas Prophecy of Immanuel - Rev. Lawrence J. Bilkes (112917147574) (already processed)
Skipping 2017-11-26 - Godly Goals of Raising Children - Steve Walker (1130171925540) (already processed)
Skipping 2017-11-26 - Grace for an Overburdened One - Rev. David Kranendonk (1130172253346) (already processed)
Skipping 2017-11-26 - Guest Speaker - Missionary Steve Douglas - Stev

<h3>Convert Audio into Text</h3>

In [3]:
import whisper
import glob
import os
import json

AUDIO_DIR = "processed"
OUT_DIR = "transcripts"
MODEL_SIZE = "base"

os.makedirs(OUT_DIR, exist_ok=True)

model = whisper.load_model(MODEL_SIZE)

for audio_path in glob.glob(f"{AUDIO_DIR}/*.wav"):
    name = os.path.splitext(os.path.basename(audio_path))[0]

    txt_path = os.path.join(OUT_DIR, f"{name}.txt")
    json_path = os.path.join(OUT_DIR, f"{name}.json")

    # Skip if transcript already exists
    if os.path.exists(txt_path) and os.path.exists(json_path):
        print(f"Skipping {name} (already transcribed)")
        continue

    print(f"Processing {name}")

    result = model.transcribe(audio_path)

    # Write each output to a temporary sibling file and rename it into place.
    # The final .txt/.json names therefore only ever refer to fully written
    # files, so an interrupted run cannot leave a truncated transcript that
    # the exists-check above would later mistake for a finished one.
    txt_tmp = txt_path + ".tmp"
    with open(txt_tmp, "w", encoding="utf-8") as f:
        f.write(result["text"])
    os.replace(txt_tmp, txt_path)

    json_tmp = json_path + ".tmp"
    with open(json_tmp, "w", encoding="utf-8") as f:
        json.dump(result["segments"], f, indent=2)
    os.replace(json_tmp, json_path)

print("Transcription completed (safe restart supported)")


Skipping 1997-07-23 - Proverbs 1_20-33 The Call To Wisdom - Jackson Boyett (113017214056)_0 (already transcribed)
Skipping 1997-07-23 - Proverbs 1_20-33 The Call To Wisdom - Jackson Boyett (113017214056)_1 (already transcribed)
Skipping 1997-07-23 - Proverbs 1_20-33 The Call To Wisdom - Jackson Boyett (113017214056)_10 (already transcribed)
Skipping 1997-07-23 - Proverbs 1_20-33 The Call To Wisdom - Jackson Boyett (113017214056)_11 (already transcribed)
Skipping 1997-07-23 - Proverbs 1_20-33 The Call To Wisdom - Jackson Boyett (113017214056)_12 (already transcribed)
Skipping 1997-07-23 - Proverbs 1_20-33 The Call To Wisdom - Jackson Boyett (113017214056)_2 (already transcribed)
Skipping 1997-07-23 - Proverbs 1_20-33 The Call To Wisdom - Jackson Boyett (113017214056)_3 (already transcribed)
Skipping 1997-07-23 - Proverbs 1_20-33 The Call To Wisdom - Jackson Boyett (113017214056)_4 (already transcribed)
Skipping 1997-07-23 - Proverbs 1_20-33 The Call To Wisdom - Jackson Boyett (113017214

<h3>Split the transcript into topic-wise sections</h3>

In [4]:
import glob, json, re

# Accumulators filled while reading the transcripts: the cleaned segment
# texts and, at the same index, where each one came from.
sentences = []
sentence_meta = []


def clean_text(t):
    """Normalize one transcript segment.

    The text is lower-cased, runs of two or more dots are deleted, every
    character other than a-z, 0-9, whitespace, '?', '.', '!' and the
    apostrophe is replaced by a space, whitespace runs are collapsed to a
    single space, and the result is stripped.
    """
    substitutions = (
        (r"\.{2,}", ""),             # drop ellipses
        (r"[^a-z0-9\s?.!']", " "),   # blank out everything else
        (r"\s+", " "),               # collapse whitespace
    )
    cleaned = t.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def transcript_sort_key(path):
    """Sort key that orders transcript files by recording, then by chunk number.

    Files are named "<recording>_<chunk index>.json". Plain string order (and
    glob's arbitrary order) puts chunk 10 before chunk 2, which would
    concatenate the sentences of a recording out of chronological order.

    Args:
        path: Path of a transcript JSON file.

    Returns:
        A ``(prefix, chunk_index)`` tuple; paths without a trailing
        ``_<digits>.json`` get ``(path, -1)``.
    """
    match = re.search(r"^(.*)_(\d+)\.json$", path)
    if match is None:
        return (path, -1)
    return (match.group(1), int(match.group(2)))


for jp in sorted(glob.glob("transcripts/*.json"), key=transcript_sort_key):
    with open(jp, "r", encoding="utf-8") as f:
        segments = json.load(f)

    for seg in segments:
        text = clean_text(seg["text"])
        # Segments that are empty after cleaning carry no content; skip them so
        # sentences and sentence_meta stay aligned index-for-index.
        if text:
            sentences.append(text)
            sentence_meta.append({
                "file": jp,
                "start": seg["start"],
                "end": seg["end"]
            })

In [5]:
# Sliding-window size: 3 sentences when there are fewer than 300 sentences,
# otherwise 5.
WINDOW = 5 if len(sentences) >= 300 else 3

# window_map[k] is the index of the first sentence of window k; the range is
# empty when there are fewer sentences than WINDOW.
window_map = list(range(len(sentences) - WINDOW + 1))

# Each window is WINDOW consecutive sentences joined with single spaces.
window_texts = [
    " ".join(sentences[first:first + WINDOW])
    for first in window_map
]

<h3>Sentence Embeddings</h3>

In [6]:
import numpy as np
from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Encode the sliding-window texts (not the individual sentences) into
# normalized NumPy vectors.
encode_options = {"convert_to_numpy": True, "normalize_embeddings": True}
embeddings = embedder.encode(window_texts, **encode_options)

# Persist the intermediate results. The sentence list and metadata are stored
# as object arrays; the embeddings file holds one row per window.
artifacts = (
    ("sentences.npy", np.array(sentences, dtype=object)),
    ("sentence_embeddings.npy", embeddings),
    ("sentence_meta.npy", np.array(sentence_meta, dtype=object)),
)
for artifact_path, artifact in artifacts:
    np.save(artifact_path, artifact)

print("Embeddings saved")

  from .autonotebook import tqdm as notebook_tqdm


Embeddings saved


<h3>Cosine Similarity</h3>

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

# sims[k] is the cosine similarity between window k and window k + 1.
sims = np.array([
    cosine_similarity(left.reshape(1, -1), right.reshape(1, -1))[0][0]
    for left, right in zip(embeddings[:-1], embeddings[1:])
])

<h3>Detect topic breaks</h3>

In [8]:
# Similarity below this value between consecutive windows marks a topic break.
THRESHOLD = 0.75

# A drop between window k and window k + 1 is recorded as a break at k + 1.
split_indices = []
for position, similarity in enumerate(sims, start=1):
    if similarity < THRESHOLD:
        split_indices.append(position)

<h3>Split Transcript into Topics</h3>

In [9]:
# Turn the break positions into consecutive [lo, hi) sentence ranges that
# cover the whole transcript, and join each range into one topic string.
boundaries = [0, *split_indices, len(sentences)]
topics = [
    " ".join(sentences[lo:hi])
    for lo, hi in zip(boundaries, boundaries[1:])
]

# Drop fragments of 20 characters or fewer (after stripping).
topics = [t for t in topics if len(t.strip()) > 20]

<h3>Merge very short topics</h3>

In [10]:
# Minimum punctuation-mark count a merged topic must reach before it is
# closed: one fortieth of the sentence count, but never less than 3.
MIN_SENTENCES = max(len(sentences) // 40, 3)

def sentence_count(text):
    """Return how many '.', '?' and '!' characters occur in ``text``."""
    return sum(1 for character in text if character in ".?!")

merged_topics = []
current = ""

for segment in topics:
    if sentence_count(current) >= MIN_SENTENCES:
        # The buffered topic is long enough: emit it and start a new buffer.
        merged_topics.append(current)
        current = segment
        continue
    # Buffer still too short: absorb this segment into it.
    current = f"{current} {segment}".strip()

# Whatever is left in the buffer becomes the final topic, even if short.
if current:
    merged_topics.append(current)

topics = merged_topics

<h3>Save topic-wise transcript</h3>

In [11]:
# Last-pass filter: keep a topic only if it has at least MIN_WORDS
# whitespace-separated tokens and at least one token that is not a lone ".".
MIN_WORDS = 10

clean_topics = [
    candidate
    for candidate in topics
    if len(candidate.split()) >= MIN_WORDS
    and any(token != "." for token in candidate.split())
]

topics = clean_topics

# Write every surviving topic under a numbered header (numbering starts at 1).
with open("topic_segments.txt", "w", encoding="utf-8") as f:
    for number, segment in enumerate(topics, start=1):
        f.write(f"\n--- TOPIC {number} ---\n")
        f.write(segment + "\n")

<h3>Assign basic part labels</h3>

In [12]:
# Label by position: the first topic is the introduction, the last one the
# conclusion, everything in between is main content. With a single topic the
# first rule wins, so it is labeled INTRODUCTION.
last_position = len(topics) - 1

labeled_topics = [
    (
        "INTRODUCTION" if position == 0
        else "CONCLUSION" if position == last_position
        else "MAIN CONTENT",
        topic_text,
    )
    for position, topic_text in enumerate(topics)
]

<h3>Summarization</h3>

In [13]:
import torch
from transformers import pipeline

# Use the first CUDA device when one is available, otherwise the CPU (-1).
device_index = 0 if torch.cuda.is_available() else -1

summarizer = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6",
    device=device_index,
)

Device set to use cpu


In [14]:
def summarize_text(text, max_chunk_words=400):
    """Summarize ``text`` chunk by chunk with the module-level ``summarizer``.

    The text is split on whitespace into consecutive chunks of at most
    ``max_chunk_words`` words. Each chunk of 20 words or more is passed to
    ``summarizer``; shorter chunks (in practice a short tail, or a very short
    input) are skipped.

    Args:
        text: Text to summarize. ``None``, empty and whitespace-only values
            are accepted.
        max_chunk_words: Maximum number of words per chunk.

    Returns:
        The chunk summaries joined with single spaces, or ``""`` when the
        input is empty or no chunk produced a summary.
    """
    if not text or not text.strip():
        return ""

    words = text.split()
    summaries = []

    for i in range(0, len(words), max_chunk_words):
        chunk_words = words[i:i + max_chunk_words]
        if len(chunk_words) < 20:
            continue

        # Cap the summary length by the chunk's word count. Because the chunk
        # has at least 20 words, min_len is always strictly below max_len, so
        # no further length check is needed (the old one was unreachable).
        max_len = min(120, len(chunk_words))
        min_len = min(40, max_len - 1)

        try:
            out = summarizer(
                " ".join(chunk_words),
                max_length=max_len,
                min_length=min_len,
                truncation=True,
                do_sample=False
            )
            summaries.append(out[0]["summary_text"])
        except IndexError:
            # Deliberate best-effort: skip a chunk that raises IndexError
            # rather than aborting the whole summary.
            continue

    return " ".join(summaries)

In [15]:
# Summarize every labeled topic that has non-blank text, keeping its label.
summarized_topics = [
    (label, summarize_text(t))
    for label, t in labeled_topics
    if t and t.strip()
]

In [16]:
# Write the non-empty summaries. PART numbers follow the position in
# summarized_topics, so skipped (empty) entries leave gaps in the numbering.
with open("topic_summaries.txt", "w", encoding="utf-8") as f:
    for part_number, (label, summary) in enumerate(summarized_topics, start=1):
        if summary:
            f.write(f"\n--- {label} SUMMARY (PART {part_number}) ---\n")
            f.write(summary.strip() + "\n")

In [17]:
# TEST TRANSCRIPTION
import whisper

# Path relative to the notebook's working directory, like the "dataset/*.mp3"
# pattern used when loading the dataset, instead of an absolute path that
# only exists on one machine.
TEST_AUDIO = "dataset/2017-11-30 - Trust in God - Rev. Klaas Veldman (11301721260).mp3"

model = whisper.load_model("base")
result = model.transcribe(TEST_AUDIO)
print("RAW TRANSCRIPT:\n", result["text"])


# TEST SUMMARIZATION
test_text = result["text"]

summary = summarize_text(test_text)
print("\nSUMMARY:\n", summary)




RAW TRANSCRIPT:
  De hulp en Only Expectation is in the name of the Lord who made heaven and earth who keep us through forever. Never forsakes the works of his own hands. Grace mercy and peace be unto you from hand at is and at was and at is to come from the seven spirits which are before his throne and from Jesus Christ which is the faithful witness. First, we guard on our death and the prince of all the kings of deep earth. Amen. Congregation, let's sing together for 26, 4 and 9. The Lord preserves the meek most. Tendily, broad night to death in him I found salvation in 9. I am a Lord I serve and bound yet free in what form. I am a Lord I live and I am a Lord I live and I am a Lord. I am a Lord I live and I am a Lord. I am a Lord I live and I am a Lord I live and I am a Lord. The reading of the Holy Scriptures can be found in one-tibleth in 5. The first 16 verses, one-tibleth 5 runs from 16 before 12 articles. I believe in God the Father Almighty, maker of heaven and earth and in Jes

In [22]:
def test_single_audio(audio_path):
    """Run the whole pipeline on one audio file and print a summary per topic.

    Steps: convert the audio to a mono 16 kHz normalized WAV, transcribe it
    with the module-level ``model``, clean the segments, embed sliding windows
    of sentences with the module-level ``embedder``, split where the cosine
    similarity of consecutive windows drops below the threshold, merge short
    topics, label them by position and summarize each with ``summarize_text``.

    Args:
        audio_path: Path of an audio file that pydub can read.

    Returns:
        None. All results are printed. Returns early (after a warning) when
        fewer than 10 non-empty sentences are found.
    """
    import os
    import re
    from pydub import AudioSegment
    from sklearn.metrics.pairwise import cosine_similarity

    print("🔹 Processing:", audio_path)

    # =====================
    # AUDIO PREPROCESS
    # =====================
    audio = AudioSegment.from_file(audio_path)
    audio = audio.set_channels(1).set_frame_rate(16000).normalize()

    tmp_wav = "temp_test_audio.wav"

    # =====================
    # TRANSCRIBE
    # =====================
    # try/finally guarantees the temporary WAV is removed even when the export
    # or the transcription raises.
    try:
        audio.export(tmp_wav, format="wav")
        result = model.transcribe(tmp_wav)
    finally:
        if os.path.exists(tmp_wav):
            os.remove(tmp_wav)

    # =====================
    # CLEAN TEXT
    # =====================
    def clean_text(t):
        """Lower-case, drop ellipses and unsupported characters, collapse whitespace."""
        t = t.lower()
        t = re.sub(r"\.{2,}", "", t)
        t = re.sub(r"[^a-z0-9\s?.!']", " ", t)
        t = re.sub(r"\s+", " ", t)
        return t.strip()

    # Filter on the CLEANED text: a segment can be non-blank before cleaning
    # and empty afterwards (e.g. only an ellipsis or non-ASCII symbols), and
    # empty strings would distort the windows below.
    sentences = [
        cleaned
        for cleaned in (clean_text(seg["text"]) for seg in result["segments"])
        if cleaned
    ]

    if len(sentences) < 10:
        print("⚠ Not enough speech detected")
        return

    # =====================
    # WINDOWING (SMOOTH)
    # =====================
    WINDOW = 7 if len(sentences) < 500 else 10

    windows = [
        " ".join(sentences[i:i + WINDOW])
        for i in range(len(sentences) - WINDOW + 1)
    ]

    embeddings = embedder.encode(
        windows,
        convert_to_numpy=True,
        normalize_embeddings=True
    )

    # =====================
    # TOPIC SPLITTING
    # =====================
    THRESHOLD = 0.68

    # sims[k] compares window k with window k + 1.
    sims = [
        cosine_similarity(
            embeddings[i].reshape(1, -1),
            embeddings[i + 1].reshape(1, -1)
        )[0][0]
        for i in range(len(embeddings) - 1)
    ]

    split_indices = [i + 1 for i, s in enumerate(sims) if s < THRESHOLD]

    topics, start = [], 0
    for idx in split_indices:
        topics.append(" ".join(sentences[start:idx]))
        start = idx
    topics.append(" ".join(sentences[start:]))

    # =====================
    # MERGE SMALL TOPICS
    # =====================
    MIN_WORDS = 180

    merged_topics = []
    current = ""

    for t in topics:
        # Keep absorbing topics until the buffer reaches MIN_WORDS, then emit
        # it. (The former MAX_WORDS branch was identical to the default branch
        # and had no effect, so it was removed.)
        if len(current.split()) < MIN_WORDS:
            current += " " + t
        else:
            merged_topics.append(current.strip())
            current = t

    if current.strip():
        merged_topics.append(current.strip())

    topics = merged_topics

    # =====================
    # LABELING
    # =====================
    labeled_topics = []
    for i, t in enumerate(topics):
        if i == 0:
            label = "INTRODUCTION"
        elif i == len(topics) - 1:
            label = "CONCLUSION"
        else:
            label = f"MAIN PART {i}"
        labeled_topics.append((label, t))

    # =====================
    # SUMMARIZATION
    # =====================
    print("\n========== SUMMARY ==========\n")

    for label, text in labeled_topics:
        summary = summarize_text(text)
        print(f"\n🔸 {label}")
        print(summary)

    print("\n✅ Single-audio pipeline test completed")

In [23]:
# Path relative to the notebook's working directory (same convention as the
# "dataset/*.mp3" pattern used when loading the dataset) rather than an
# absolute path that only exists on one machine.
test_single_audio(
    "dataset/2017-11-30 - Trust in God - Rev. Klaas Veldman (11301721260).mp3"
)

🔹 Processing: C:\Users\User\OneDrive\Desktop\infosys_springboard_project\Final Project\dataset\2017-11-30 - Trust in God - Rev. Klaas Veldman (11301721260).mp3







🔸 INTRODUCTION
 The first we got on of death and the prince of all the kings of the earth . The lord preserves the meek most tenderly brought night to death in him i found salvation and nine i am a lord i serve and bound yet free .  Let not a widow be taken into the number under three score years old having been the wife of one man . Younger widows refuse for when they have begun to wax want and against christ they will marry having damnation because they have cast off their first faith .

🔸 MAIN PART 1
 When we speak words what are words without a heart beware about working in our hearts we sigh and sometimes we can't even say what we sigh about . let's take the fight of lord and prayer. the lord will pray in us with unspeakable words what attend to the end for the comfort of thy people .  In sweden the word in sweden is comfort speak in comfort talk to him out of the gospel . in the early church the widows were deacons not in an office but the older women the older widows ha