# Setup

In [135]:
#!pip install -qU langchain-huggingface langchain-ollama langchain-text-splitters langchain-community pypdf "kokoro>=0.3.4" soundfile pydub

In [136]:
from langchain_core.embeddings import Embeddings
from langchain_core.language_models import BaseChatModel
import torch
import gc


class CFG:
    """Notebook-wide settings plus the currently active LLM and embedding model.

    The class attributes act as defaults; the ``update_*`` methods override
    them on the instance, and ``flash`` drops the model references again.
    """

    # Hugging Face model ids, referenced only by the commented-out Hugging Face cells.
    _llm = "Qwen/Qwen2.5-1.5B-Instruct"
    _embeddings = "sentence-transformers/all-MiniLM-L6-v2"
    # Path of the PDF that generate_script() turns into a podcast script.
    doc = "paper.pdf"
    # Device name; only stored here, no code visible in this file reads it.
    device = "cpu"
    # Annotations are quoted so the LangChain types are not evaluated at
    # class-definition time.
    llm_model: "BaseChatModel | None" = None
    embeddings_model: "Embeddings | None" = None

    def update_embeddings(self, model: "Embeddings") -> None:
        """Register ``model`` as the embedding model used for retrieval."""
        self.embeddings_model = model

    def update_llm(self, model: "BaseChatModel") -> None:
        """Register ``model`` as the chat model used by the chains."""
        self.llm_model = model

    def update_device(self, device: str) -> None:
        """Store the device name on this instance."""
        self.device = device

    def flash(self) -> None:
        """Drop the model references held by this object and reclaim memory.

        The attributes are reset to ``None`` rather than deleted: ``del`` on an
        attribute that only exists on the class raises ``AttributeError``, which
        made this method fail when no model had been registered or when it was
        called twice. Callers still read ``None`` afterwards.
        """
        self.llm_model = None
        self.embeddings_model = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

# Single shared configuration object; the cells below register models on it
# and pass it to the script-generation helpers.
cfg = CFG()

## Hugging Face model

In [137]:
# from langchain_huggingface import HuggingFaceEmbeddings

# cfg.update_embeddings(HuggingFaceEmbeddings(model_name=CFG._embeddings))

In [138]:
# from langchain_huggingface import ChatHuggingFace, HuggingFacePipeline

# cfg.update_llm(ChatHuggingFace(llm = HuggingFacePipeline.from_model_id(model_id=CFG._llm, task="text-generation")))

## Ollama model

In [139]:
from langchain_ollama import OllamaEmbeddings

# Embedding model served through Ollama; registered on cfg so that
# _initialize_discussion_chain() can index the paper with it.
embeddings = OllamaEmbeddings(model="nomic-embed-text:latest")

cfg.update_embeddings(embeddings)

In [140]:
from langchain_ollama import ChatOllama

# Chat model served through Ollama; registered on cfg so that every chain
# built from cfg.llm_model uses it.
llm = ChatOllama(model="llama3.2")

cfg.update_llm(llm)

# Template

In [141]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt that asks the model for a podcast outline (titles and bullet points)
# covering the whole paper. Template variable: {paper} - the extracted paper text.
plan_prompt = ChatPromptTemplate.from_template(
    """You are a very clever planner of podcast scripts. You will be given the text of a research paper, and your task will be to generate a plan for a podcast involving 3 persons discussing about the content of the paper in a very engaging, interactive and enthusiastic way. The plan will be structured using titles and bullet points only. The plan for the podcast should follow the structure of the paper. The podcast involves the following persons:
- The host: he will present the paper and its details in a very engaging way. very professional, friendly, warm and enthusiastic.
- The learner: he will ask clever and significative questions about the paper and its content. he is curious and funny.
- The expert: he will provide deep insights, comments and details about the content of the paper and other related topics. he talks less than the two other and his interventions are more profound and detailed.
Example of a structure for the podcast:
# Title: title of the podcast
# Section 1: title of section 1
- bullet point 1
- bullet point 2
- bullet point 3
...
- bullet point n
# Section 2: title of section 2
- bullet point 1
- bullet point 2
- bullet point 3
...
- bullet point n
# Section 3: title of section 3
...
# Section n: title of section n
- bullet point 1
- bullet point 2
- bullet point 3
...
- bullet point n
The paper: {paper}
The podcast plan in titles and bullet points:"""
)

# Prompt that turns one section of the plan into a short three-speaker dialogue.
# Template variables:
#   {section_plan}       - header and bullet points of the section to discuss
#   {previous_dialogue}  - dialogue generated just before, so the model avoids repeating it
#   {additional_context} - paper excerpts supplied by the retriever
discuss_prompt_template = ChatPromptTemplate.from_template(
    """You are a very clever scriptwriter of podcast discussions. You will be given a plan for a section of the middle of a podcast that already started involving 3 persons discussing about the content of a research paper. Your task will be to generate a brief dialogue for the podcast talking about the given section, do not include voice effects, and do not make an introduction. The dialogue should be engaging, interactive, enthusiastic and have very clever transitions and twists. The dialogue should follow the structure of the plan. The podcast involves the following persons:
- The host: he will present the paper and its details in a very engaging way. very professional, friendly, warm and enthusiastic.
- The learner: he will ask clever and significative questions about the paper and its content. he is curious and funny.
- The expert: he will provide deep insights, comments and details about the content of the paper and other related topics. he talks less than the two other and his interventions are more profound and detailed.
Dialogue example 1:
Host: Let's continue with the second section of the paper ...
Learner: I have a question about ...
Expert: I would like to add ...
Dialogue example 2:
Host: Now, let's move on to the next section ...
Expert: I think that ...
Learner: I have a question about ...
Expert: I would like to add ...
Dialogue example 3:
Learner: Should we move on to the next section?
Host: Yes, let's move on to the next section ...
Expert: I think that ...
Section plan: {section_plan}
Previous dialogue (to avoid repetitions): {previous_dialogue}
Additional context:{additional_context}
Brief section dialogue:"""
)

# Prompt that produces the three-interaction podcast introduction.
# Template variable: {paper_head} - the text of the paper that precedes its
# "Introduction" heading (see _get_head).
initial_dialogue_prompt = ChatPromptTemplate.from_template(
    """You are a very clever scriptwriter of podcast introductions. You will be given the title of a paper and a brief glimpse of the content of a research paper. Avoid using sound effects, only text. Avoid finishing with the host, finish the dialogue with the expert. Your task will be to generate an engaging and enthusiastic introduction for the podcast. The introduction should be captivating, interactive, and should make the listeners eager to hear the discussion. The introduction of the podcast should have 3 interactions only. The podcast involves the following persons:
- The host: he will present the paper and its details in a very engaging way. very professional, friendly, warm and enthusiastic.
- The learner: he will ask clever and significative questions about the paper and its content. he is curious and funny.
- The expert: he will provide deep insights, comments and details about the content of the paper and other related topics. he talks less than the two other and his interventions are more profound and detailed.
Introduction example 1:
Host: Welcome to our podcast, today we will be discussing the paper ...
Learner: I am very curious about ...
Expert: I think that ...
Introduction example 2:
Host: Hello everyone, today we have a very interesting paper to discuss ...
Expert: I would like to add ...
Learner: I have a question about ...
Content of the paper: {paper_head}
Brief 3 interactions introduction:"""
)

# Prompt for the final clean-up pass over the concatenated draft script:
# remove audio-effect mentions and reduce repetition.
# Template variable: {draft_script} - the full draft dialogue.
# The label and the inserted script are separated by ": " so the first word of
# the draft is not glued onto the label text.
enhance_prompt = ChatPromptTemplate.from_template(
    """You are a very clever scriptwriter of podcast discussions. You will be given a script for a podcast involving 3 persons discussing about the content of a research paper. Your task will be to enhance the script by removing audio effects mentions and reducing repetition and redundancy. Don't mention sound effects, laughing, chuckling or any other audio effects between brackets. The script should only contain what the persons are saying and not what are they doing or how they are saying it. Enhance the transitions and the twists, and reduce repetition and redundancy.
The draft script: {draft_script}
The enhanced script:"""
)

# Generate script from PDF

## Helper functions

### Parse PDF

In [142]:
from pypdf import PdfReader

def _parse_pdf(pdf_path: str, output_path: str) -> str:
    """Dump the paper's text, up to the page mentioning "Conclusion", to a file.

    Pages are read in order and their extracted text collected. The first page
    whose text contains "Conclusion" is the last one kept, so anything on later
    pages is left out.

    Args:
        pdf_path: Path of the PDF to read.
        output_path: Path of the UTF-8 text file to create or overwrite.

    Returns:
        ``output_path``, for convenient chaining.
    """
    pdf_reader = PdfReader(pdf_path)
    extracted_text = []

    for page in pdf_reader.pages:
        text = page.extract_text()
        if not text:
            # Nothing extractable on this page.
            continue

        extracted_text.append(text)

        # The whole page is already stored above, so it must not be appended a
        # second time from "Conclusion" onwards: that would duplicate the
        # conclusion in the text handed to the LLM and the retriever.
        # Stopping here also avoids extracting pages that would be discarded.
        if "Conclusion" in text:
            break

    with open(output_path, "w", encoding="utf-8") as file:
        file.write("\n".join(extracted_text))

    return output_path

### Get head

In [143]:
def _get_head(pdf_path: str) -> str:
    """Return the text of the PDF that precedes the first "Introduction".

    Pages with extractable text are accumulated in order. On the first page
    containing "Introduction" only the part before that word is kept and
    reading stops. If the word never occurs, the text of every page is returned.
    """
    marker = "Introduction"
    head_parts = []

    for page in PdfReader(pdf_path).pages:
        page_text = page.extract_text()
        if not page_text:
            continue

        cut = page_text.find(marker)
        if cut == -1:
            head_parts.append(page_text)
            continue

        # Keep only what comes before the marker, then stop reading.
        head_parts.append(page_text[:cut])
        break

    return "\n".join(head_parts)

### Prepare discussion chain

In [144]:
from pathlib import Path
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import InMemoryVectorStore
from langchain_core.output_parsers import StrOutputParser
from langchain_text_splitters import RecursiveCharacterTextSplitter

from operator import itemgetter

def _initialize_discussion_chain(txt_file: str | Path, cfg: CFG):
    """Build the retrieval-augmented chain that writes one section's dialogue.

    The paper text is split into overlapping chunks and indexed in an in-memory
    vector store using ``cfg.embeddings_model``. The returned chain expects a
    dict with the keys ``"section_plan"`` and ``"previous_dialogue"``; it uses
    the section plan as the retrieval query, fills ``discuss_prompt_template``
    and returns the reply of ``cfg.llm_model`` as a string.

    Args:
        txt_file: Path of the UTF-8 text file holding the paper.
        cfg: Configuration providing ``embeddings_model`` and ``llm_model``.

    Returns:
        The runnable chain described above.
    """
    # Load, chunk and index the contents of the paper. The file is written as
    # UTF-8 by _parse_pdf, so it is read back as UTF-8 explicitly instead of
    # relying on the platform's default encoding.
    loader = TextLoader(txt_file, encoding="utf-8")
    docs = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(docs)
    vector_store = InMemoryVectorStore.from_documents(
        documents=splits, embedding=cfg.embeddings_model
    )

    # Retrieve and generate using the relevant snippets of the paper.
    retriever = vector_store.as_retriever()

    def format_docs(docs):
        # Join the retrieved chunks into one block of context text.
        return "\n\n".join(doc.page_content for doc in docs)

    discuss_rag_chain = (
        {
            # The section plan doubles as the retrieval query.
            "additional_context": itemgetter("section_plan") | retriever | format_docs,
            "section_plan": itemgetter("section_plan"),
            "previous_dialogue": itemgetter("previous_dialogue"),
        }
        | discuss_prompt_template
        | cfg.llm_model
        | StrOutputParser()
    )
    return discuss_rag_chain

### Generate script

In [145]:
from datetime import datetime
import os


def generate_script(chains: dict, cfg: CFG) -> str:
    """Generate the full podcast script for the paper at ``cfg.doc``.

    Steps: dump the PDF text to a temporary file, ask the planner chain for a
    list of sections, generate an introduction, generate one dialogue per
    section (each call only sees the dialogue generated immediately before it
    as "previous dialogue"), then run the enhancement chain over the whole draft.

    Args:
        chains: Must provide ``"plan_script_chain"``, ``"initial_dialogue_chain"``
            and ``"enhance_chain"``, each exposing ``invoke``.
        cfg: Configuration providing ``doc`` and the models used for retrieval.

    Returns:
        The enhanced script returned by ``chains["enhance_chain"]``.
    """
    # Imported locally: a later cell executes ``import datetime``, which rebinds
    # the module-level name ``datetime`` to the module and would break
    # ``datetime.now()`` if this function were run again afterwards.
    from datetime import datetime

    start_time = datetime.now()

    # Step 1: parse the pdf file
    txt_file = f"text_paper_{start_time.strftime('%Y%m%d%H%M%S')}.txt"
    txt_file = _parse_pdf(cfg.doc, txt_file)

    try:
        with open(txt_file, "r", encoding="utf-8") as file:
            paper = file.read()

        plan = chains["plan_script_chain"].invoke({"paper": paper})
        print("Plan generated...")

        # Step 2: generate the actual script by looping over the plan's sections,
        # starting from the introduction.
        initial_dialogue = chains["initial_dialogue_chain"].invoke(
            {"paper_head": _get_head(cfg.doc)}
        )
        script_parts = [initial_dialogue]
        previous_dialogue = initial_dialogue

        discuss_rag_chain = _initialize_discussion_chain(txt_file, cfg)
        for section in plan:
            section_script = discuss_rag_chain.invoke(
                {"section_plan": section, "previous_dialogue": previous_dialogue}
            )
            script_parts.append(section_script)
            previous_dialogue = section_script

        # Segments are joined with a newline so the last sentence of one segment
        # is not glued onto the first speaker label of the next.
        enhanced_script = chains["enhance_chain"].invoke(
            {"draft_script": "\n".join(script_parts)}
        )
    finally:
        # Remove the temporary text dump even when a chain call fails.
        os.remove(txt_file)

    end_time = datetime.now()
    print(f"Time taken: {end_time - start_time}")
    print("final script generated")
    return enhanced_script

### Prepare script plan

In [146]:
import re
from langchain_core.messages import AIMessage

def parse_script_plan(ai_message: "AIMessage") -> list:
    """Split the planner's reply into one string per podcast section.

    The first line of the reply is treated as the podcast title and skipped.
    Every following header line starts a new section; bullet lines are attached
    to the section in progress; all other lines are ignored. Each section is
    returned as its header and bullets joined by single spaces.

    Recognised headers: markdown ``#`` headings, and Roman or Arabic numerals
    followed by ``.``, ``)`` or ``:`` (``"II. Results"``, ``"2) Outlook"``).
    Recognised bullets: ``-``, ``*``, ``+`` or ``•`` followed by whitespace.
    Leading indentation is ignored for both.

    Args:
        ai_message: Object whose ``content`` attribute holds the plan as text.

    Returns:
        The sections in order of appearance; empty if nothing was recognised.
    """
    # A numeral only counts as a header when punctuation follows it; otherwise
    # any ordinary sentence starting with I, V, X, L, C, D or M would open a
    # new section.
    header_pattern = re.compile(r"^(?:#+|(?:[IVXLCDM]+|\d+)[.):](?=\s|$))")
    bullet_pattern = re.compile(r"^[-*+\u2022]\s")

    sections = []
    current_section = []

    # Split the text by line and skip the first line as the title.
    lines = ai_message.content.strip().splitlines()[1:]

    for raw_line in lines:
        # Strip first so indented headers and nested/tab-indented bullets match.
        line = raw_line.strip()
        if not line:
            continue

        if header_pattern.match(line):
            # A new header closes the section in progress, if any.
            if current_section:
                sections.append(" ".join(current_section))
            current_section = [line]
        elif bullet_pattern.match(line):
            current_section.append(line)

    # Append the last section if it exists.
    if current_section:
        sections.append(" ".join(current_section))

    return sections

# Generate podcast script

In [147]:
# Chains consumed by generate_script(), all driven by the model registered on cfg:
#   plan_script_chain      -> list of section strings (via parse_script_plan)
#   initial_dialogue_chain -> introduction dialogue as text
#   enhance_chain          -> cleaned-up final script as text
chains = {
    "plan_script_chain": plan_prompt | cfg.llm_model | parse_script_plan,
    "initial_dialogue_chain": initial_dialogue_prompt | cfg.llm_model | StrOutputParser(),
    "enhance_chain": enhance_prompt | cfg.llm_model | StrOutputParser(),
}

In [148]:
# The generated script is cached on disk so the slow LLM pipeline only runs once.
podcast = Path("podcast_script.txt")
if podcast.exists():
    # Reuse the cached script. Without this branch `script` would be undefined
    # on later runs and the audio generation below would raise NameError.
    script = podcast.read_text(encoding="utf-8")
else:
    script = generate_script(chains, cfg)
    with podcast.open("w", encoding="utf-8") as file:
        file.write(script)

Plan generated...
Time taken: 0:01:14.814038
final script generated


### Free up memory

In [149]:
# Drop the model references held by cfg and run garbage collection before the
# audio stage. NOTE: the module-level names `llm`, `embeddings` and `chains`
# still reference the models, so this alone does not necessarily release them.
cfg.flash()

# Generate audio

## Generate voices

In [150]:
from kokoro import KPipeline
import soundfile as sf
import datetime

# Shared text-to-speech state for the three speaker functions below:
#   "pipelines"  - one lazily created KPipeline per language code; every call
#                  builds it with identical arguments, so it is reused instead
#                  of being rebuilt for each dialogue line;
#   "last_stamp" - stamp used in the file names of the most recent call.
_TTS_STATE = {"pipelines": {}, "last_stamp": 0}


def _synthesize_clips(text: str, output_dir: str, speaker: str, voice: str) -> None:
    """Synthesize ``text`` with ``voice`` and write one audio file per chunk.

    Files are written to ``./{output_dir}/{speaker}_{stamp}_{i}.mp3`` where
    ``i`` is the chunk index yielded by the pipeline. ``stamp`` is the current
    Unix time in seconds, bumped when necessary so that every call gets a
    strictly larger value than the previous one: merge_mp3_files orders clips
    by this number, and two calls within the same second would otherwise get
    the same stamp and lose their relative order.
    """
    # Imported locally under an alias: the module-level name ``datetime`` is
    # bound to the class by one cell and to the module by another, so its
    # meaning depends on which cell ran last.
    import datetime as _dt

    stamp = max(int(_dt.datetime.now().timestamp()), _TTS_STATE["last_stamp"] + 1)
    _TTS_STATE["last_stamp"] = stamp

    pipeline = _TTS_STATE["pipelines"].get("a")
    if pipeline is None:
        pipeline = _TTS_STATE["pipelines"]["a"] = KPipeline(lang_code="a")

    generator = pipeline(
        text,
        voice=voice,
        speed=1,
        split_pattern=r"\n+",
    )

    # Each item is a 3-tuple; only its last element (the audio) is written out.
    for i, (_gs, _ps, audio) in enumerate(generator):
        sf.write(f"./{output_dir}/{speaker}_{stamp}_{i}.mp3", audio, 24000)


def generate_host(text: str, output_dir: str):
    """Write the host's audio clips for ``text`` into ``output_dir`` (voice ``am_michael``)."""
    _synthesize_clips(text, output_dir, "host", "am_michael")


def generate_expert(text: str, output_dir: str):
    """Write the expert's audio clips for ``text`` into ``output_dir`` (voice ``af_heart``)."""
    _synthesize_clips(text, output_dir, "expert", "af_heart")


def generate_learner(text: str, output_dir: str):
    """Write the learner's audio clips for ``text`` into ``output_dir`` (voice ``af_bella``)."""
    _synthesize_clips(text, output_dir, "learner", "af_bella")

## Merge all voices

In [151]:
from pydub import AudioSegment
import glob

def merge_mp3_files(directory_path: str, output_file: str):
    """Concatenate every ``.mp3`` in ``directory_path`` into ``output_file``.

    Clips are expected to be named ``<speaker>_<10-digit stamp>_<chunk>.mp3``.
    They are ordered by stamp, then by speaker prefix (so the chunks of one
    call stay together even if two calls share a stamp), then by numeric chunk
    index (so chunk 10 follows chunk 9, not chunk 1). Files without a 10-digit
    stamp are appended last, in name order.

    Args:
        directory_path: Directory, relative to the working directory, holding the clips.
        output_file: Path of the merged MP3 to write.
    """

    def _clip_order(filename: str):
        # Sort key implementing the ordering described in the docstring.
        match = re.search(r"(\d{10})(?:_(\d+))?", filename)
        if match is None:
            return (1, 0, "", 0, filename)
        return (
            0,
            int(match.group(1)),
            filename[: match.start()],
            int(match.group(2) or 0),
            filename,
        )

    # Find all .mp3 files in the specified directory
    mp3_files = [os.path.basename(x) for x in glob.glob(f"./{directory_path}/*.mp3")]
    sorted_files = sorted(mp3_files, key=_clip_order)

    # Initialize an empty AudioSegment for merging
    merged_audio = AudioSegment.empty()

    # Merge each mp3 file in sorted order
    for file in sorted_files:
        merged_audio += AudioSegment.from_mp3(f"./{directory_path}/{file}")

    # Export the final merged audio
    merged_audio.export(output_file, format="mp3")
    print(f"Merged file saved as {output_file}")

## Generate podcast

In [152]:
import os
import re

def generate_podcast(script):
    """Turn a "Speaker: text" script into a single merged MP3 file.

    The script is split into turns at every ``Host:``, ``Learner:`` or
    ``Expert:`` label. Each turn is synthesized with the matching voice into a
    temporary directory, the clips are merged into ``podcast_<unix time>.mp3``
    in the working directory, and the temporary directory is then removed.

    Args:
        script: Full dialogue text.
    """
    # Imported locally: ``datetime`` is aliased because the module-level name is
    # bound to the class by one cell and to the module by another; ``shutil`` is
    # only needed here.
    import datetime as _dt
    import shutil

    # create a new directory to store the audio files
    output_dir = f"podcast_{_dt.datetime.now().strftime('%Y%m%d%H%M%S')}"
    os.mkdir(output_dir)

    # Capture "Speaker: Text". A turn ends only at the next speaker *label*
    # (name followed by a colon) or at the end of the script; stopping at the
    # bare words would cut a turn short whenever someone says "Host", "Learner"
    # or "Expert" in a sentence.
    turns = re.findall(
        r"\b(Host|Learner|Expert):\s*(.*?)(?=\b(?:Host|Learner|Expert):|\Z)",
        script,
        re.DOTALL,
    )

    # Direct each turn to the function that renders that speaker's voice.
    voice_for = {
        "Host": generate_host,
        "Learner": generate_learner,
        "Expert": generate_expert,
    }

    for speaker, text in turns:
        # Strip surrounding whitespace and the markdown emphasis markers left
        # over from labels written as "**Host:**".
        text = text.strip().strip("*").strip()
        if not text:
            continue
        voice_for[speaker](text, output_dir)

    # Merge the audio files into a single podcast
    now = int(_dt.datetime.now().timestamp())
    merge_mp3_files(output_dir, f"podcast_{now}.mp3")

    # The directory still contains the individual clips, so os.rmdir (which
    # only removes empty directories) would fail here.
    shutil.rmtree(output_dir)

# Generate podcast

In [153]:
# Synthesize every dialogue turn of `script` and merge the clips into one MP3.
print("Generating podcast audio files...")
generate_podcast(script)
print("Podcast generation complete!")

Generating podcast audio files...
Merged file saved as podcast_1740312186.mp3
Podcast generation complete!
