<h3>Load Dataset</h3>

In [1]:
import glob
# Collect the path of every .mp3 file that sits directly inside dataset/.
files = list(glob.iglob("dataset/*.mp3"))

<h3>Preprocess the Audio</h3>

In [None]:
from pydub import AudioSegment
import os

# Length of each exported chunk in milliseconds (5 minutes).
# Defined once here instead of being recomputed for every input file.
CHUNK_LEN_MS = 5 * 60 * 1000

# Output folder for the chunked wav files; exist_ok=True makes re-runs safe.
os.makedirs("processed", exist_ok=True)

for src_path in files:
    # Decode the source file, then convert it to mono at a 16000 Hz frame rate
    # and normalize it.
    audio = AudioSegment.from_file(src_path)
    audio = audio.set_channels(1).set_frame_rate(16000).normalize()

    # splitext removes whatever extension the file has, rather than assuming
    # it is exactly four characters long (".mp3"); this also matches how the
    # later cells derive their base names.
    stem = os.path.splitext(os.path.basename(src_path))[0]

    # Slicing an AudioSegment with a step yields consecutive chunks of that
    # many milliseconds; each chunk is exported as <stem>_<index>.wav.
    for i, chunk in enumerate(audio[::CHUNK_LEN_MS]):
        chunk.export(f"processed/{stem}_{i}.wav", format="wav")

<h3>Convert Audio into Text</h3>

In [3]:
import whisper
import glob
import os
import json

# Load the Whisper "base" model once; the same instance transcribes every file.
model = whisper.load_model("base")

# Folder that receives one .txt and one .json file per processed wav.
os.makedirs("transcripts", exist_ok=True)

# Walk over every wav chunk produced by the preprocessing step.
for wav_path in glob.glob("processed/*.wav"):
    transcription = model.transcribe(wav_path)

    # Base name of the wav file without its extension, reused for both outputs.
    wav_filename = os.path.basename(wav_path)
    stem = os.path.splitext(wav_filename)[0]

    # Plain-text transcript.
    with open(f"transcripts/{stem}.txt", "w", encoding="utf-8") as txt_out:
        txt_out.write(transcription["text"])

    # Segment list (the per-segment records that carry the timestamps) as JSON.
    with open(f"transcripts/{stem}.json", "w", encoding="utf-8") as json_out:
        json.dump(transcription["segments"], json_out, indent=2)

print("Transcription completed")



Transcription completed


<h3>Split Transcripts into Paragraphs</h3>

In [4]:
import os
import glob

def split_paragraphs(text, sentences_per_paragraph=5):
    """Split a transcript into a list of paragraph strings.

    Blank lines are honoured first. The transcripts written by the
    transcription cell are a single ``result["text"]`` string, which normally
    contains no blank lines, so a blank-line split alone returns the whole
    transcript as one paragraph. When that happens the text is split into
    sentences (on whitespace following ``.``, ``!`` or ``?``) and the
    sentences are grouped into paragraphs instead.

    Args:
        text: The raw transcript text.
        sentences_per_paragraph: Number of sentences per paragraph when the
            sentence-grouping fallback is used. Must be at least 1.

    Returns:
        A list of non-empty, stripped paragraph strings; empty for blank input.

    Raises:
        ValueError: If ``sentences_per_paragraph`` is smaller than 1.
    """
    import re  # local import: this cell's own import list does not include re

    if sentences_per_paragraph < 1:
        raise ValueError("sentences_per_paragraph must be at least 1")

    # Split by blank lines
    paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
    if len(paragraphs) != 1:
        # Either real paragraph breaks were found, or there is no text at all.
        return paragraphs

    # Fallback: one undivided block of text -> group its sentences.
    sentences = [s for s in re.split(r"(?<=[.!?])\s+", paragraphs[0]) if s]
    return [
        " ".join(sentences[start:start + sentences_per_paragraph])
        for start in range(0, len(sentences), sentences_per_paragraph)
    ]


# Output folder for the paragraph files; exist_ok=True makes re-runs safe.
os.makedirs("paragraphs", exist_ok=True)

for txt_file in glob.glob("transcripts/*.txt"):
    with open(txt_file, "r", encoding="utf-8") as f:
        text = f.read()

    paragraphs = split_paragraphs(text)

    name = os.path.splitext(os.path.basename(txt_file))[0]

    # One paragraph per block, separated by a blank line so the next step can
    # split the file on "\n\n" again.
    with open(f"paragraphs/{name}_paragraphs.txt", "w", encoding="utf-8") as f:
        for p in paragraphs:
            f.write(p + "\n\n")

print("Paragraph-level split done")

Paragraph-level split done


<h3>Text Preprocessing</h3>

In [5]:
import os
import glob
import re

# Ensure the output folder for cleaned paragraphs exists; exist_ok=True makes
# re-running this cell safe.
os.makedirs("clean_paragraphs", exist_ok=True)

def clean_text(text):
    """Normalise one paragraph of transcript text.

    The text is lowercased, every character that is not a word character,
    whitespace, or one of ``. , ! ?`` is removed, runs of whitespace are
    collapsed to a single space, and leading/trailing whitespace is stripped.

    Args:
        text: The paragraph to clean.

    Returns:
        The cleaned paragraph as a string (possibly empty).
    """
    text = text.lower()                     # lowercase
    text = re.sub(r"[^\w\s.,!?]", "", text) # remove noisy symbols
    # Collapse whitespace only after symbols are gone: deleting a symbol that
    # sat between two spaces ("a - b") would otherwise leave a double space.
    text = re.sub(r"\s+", " ", text)        # remove extra spaces
    return text.strip()

# Clean every paragraph file and write the result to clean_paragraphs/ under
# the same file name.
for src_path in glob.glob("paragraphs/*.txt"):
    with open(src_path, "r", encoding="utf-8") as reader:
        raw_blocks = reader.read().split("\n\n")

    # Skip blank blocks, then run each remaining paragraph through clean_text.
    cleaned_blocks = []
    for block in raw_blocks:
        block = block.strip()
        if block:
            cleaned_blocks.append(clean_text(block))

    src_filename = os.path.basename(src_path)
    stem = os.path.splitext(src_filename)[0]

    # Each paragraph is followed by a blank line, matching the input layout.
    with open(f"clean_paragraphs/{stem}.txt", "w", encoding="utf-8") as writer:
        writer.writelines(block + "\n\n" for block in cleaned_blocks)

print("Paragraph text preprocessing done")

Paragraph text preprocessing done
