# Summarize the video transcripts


In [None]:
# Enable the IPython autoreload extension (mode 2) so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from llama_cpp import Llama
from langchain.prompts import PromptTemplate

from transcriber.data.utils import get_resource, check_create_fol

In [None]:
# Resolve the data root through the project helper, then derive the course
# folders used by the rest of the notebook.
data_fol = get_resource("data_fol")
course_fol = data_fol / "course"
# input: the transcripts, iterated topic by topic in the loops below
transcripts_fol = course_fol / "transcripts"
# output: the generated summaries are written under this folder
summaries_fol = course_fol / "summaries"

In [None]:
# Prepare the summaries output folder before anything is written to it
# (presumably creates it when missing — confirm in transcriber.data.utils).
check_create_fol(summaries_fol)

In [None]:
# Load ONE sample transcript to experiment with: skip the first two entries of
# the transcripts folder, take the first topic folder after them, skip its
# first entry and read the first lesson file that follows.
# `Path.iterdir()` yields entries in arbitrary order, so both listings are
# sorted: the positional slices then select the same transcript on every run.
# The cleaned text is left in `transcript_orig_text` for the following cells.
transcripts_fol_iterdir = sorted(transcripts_fol.iterdir())
for topic_fol in transcripts_fol_iterdir[2:]:
    if not topic_fol.is_dir():
        continue
    topic_name = topic_fol.name
    print(topic_name)

    # iterate the lessons in the topic folder
    topic_fol_iterdir = sorted(topic_fol.iterdir())
    for lesson_fp in topic_fol_iterdir[1:]:
        if not lesson_fp.is_file():
            continue
        lesson_name = lesson_fp.stem
        print(lesson_name)

        # load the transcript
        transcript_orig_text = lesson_fp.read_text()
        # remove spaces at beginning and end
        transcript_orig_text = transcript_orig_text.strip()
        # remove newlines
        transcript_orig_text = transcript_orig_text.replace("\n", " ")
        word_count = len(transcript_orig_text.split())
        print(f"word count: {word_count}")
        print(transcript_orig_text)

        # one lesson is enough for the experiments below
        break
    # stop after the first topic folder, whether or not a lesson was found
    break

In [None]:
# Folder holding the quantized model files, resolved by the project helper.
model_fol = get_resource("quant_model_fol")

# List the available model files so one of them can be picked below.
for mn in model_fol.iterdir():
    print(mn.name)

# Model to load; the commented lines are alternatives that can be swapped in.
# model_name = "mistral-7b-instruct-v0.2.Q4_K_M.gguf"
# model_name = "mistral-7b-instruct-v0.2.Q8_0.gguf"
# model_name = "codellama-34b-instruct.Q5_K_M.gguf"
model_name = "mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf"
model_path = model_fol / model_name

In [None]:
# Instruction text placed at the top of the prompt (it fills the `{context}`
# slot of the template defined below).
# Earlier variant, kept for reference: it asks for a one paragraph summary.
# context = """given the following transcript of a speech, please provide a one paragraph summary.
# take care of summarizing only the provided information, without introducing external knowledge.
# do not add an explanation, and do not include the question in the summary.
# """

# Active variant: same instructions without the one-paragraph constraint.
context = """given the following transcript of a speech, please provide a summary.
take care of summarizing only the provided information, without introducing external knowledge.
do not add an explanation, and do not include the question in the summary.
"""

In [None]:
# Prompt layout: the instructions first, then the transcript to summarize.
tmpl_transcript = """{context}
Summarize the following transcript of a speech:
{transcript_orig_text}
"""

# Template object exposing the two slots used in the layout above.
prompt_transcript = PromptTemplate(
    template=tmpl_transcript,
    input_variables=["context", "transcript_orig_text"],
)

# Fill both slots with the instruction text and the sample transcript.
fill_prompt_transcript = prompt_transcript.format(
    transcript_orig_text=transcript_orig_text,
    context=context,
)
# Show the filled prompt (capped at 60000 characters) for a visual check.
print(fill_prompt_transcript[:60000])

In [None]:
# Max sequence length for the local model (2**13 = 8192 tokens).
n_ctx = 2**13
print(f"using {n_ctx=}")

# Longer sequence lengths (e.g. n_ctx=32768) require much more resources.
# n_threads is the number of CPU threads to use: tailor it to your system and
# the resulting performance.
llm = Llama(model_path=str(model_path), n_ctx=n_ctx, n_threads=16)

In [None]:
# Run the local model on the filled prompt, generating at most 512 tokens,
# and dump the raw response.
output = llm(fill_prompt_transcript, max_tokens=512)
print(output)

In [None]:
# top-level keys of the response
print(output.keys())
# generated text of the first choice
print(output["choices"][0]["text"])

In [None]:
# number of choices contained in the response
len(output["choices"])

### Assistant prompt

Now let's create a prompt for the assistant. Follow the steps below to create a
prompt that will help the assistant perform the task of summarizing a video
transcript.

Set the context: Start by specifying the role you want the assistant to play.
This helps set the expectations and provides a frame of reference for the
generated responses.

State the requirements: Clearly state what you need from the assistant, such as
a recommendation, solution, or answer. This helps narrow down the focus of the
conversation.

Define the task: Describe the specific task or action that the assistant should
perform. This provides clarity on the desired outcome.

Provide details: Specify any additional details or considerations that should be
taken into account during the task.

Mention restrictions: Clearly state any limitations or restrictions that should
be followed. This helps avoid generating responses that are not suitable or
desired.

Specify the desired format: Indicate the format or structure in which you would
like the final result to be presented. This ensures the generated output aligns
with your expectations.

Provide examples: Include examples to give the assistant a reference point and
help it understand the type of response you are looking for. This can improve
the quality and relevance of the generated answers.

Remember, assigning the assistant a specific role and providing clear
instructions through a well-constructed prompt increases the likelihood of
obtaining accurate and useful responses.


Context:
You are an editor with strong reading comprehension, concise communication skills and good attention to detail.

Requirements:
You must generate clear and coherent summaries for video transcripts.
The goal is to distill the key information, main points, and important details while maintaining accuracy and relevance.

Task:
Please summarize the provided video transcript on the given topic.

<!-- Your summary should have a schematic structure with sections and lists to capture the essence of the content. -->

Your summary should have a schematic structure with sections for each subtopic in the video transcript.
Where needed, you can use bullet points to capture the essence of the content.

Details:
Take into account the context of the video and focus on highlighting major themes, key takeaways, and any notable examples or explanations provided.

Restrictions:
Only summarize information that is present in the video transcript.
Do not include any additional information or make any assumptions.
Refrain from introducing personal opinions or biases in the summary.
Stick to a neutral and informative tone.

Desired Format:
Format the summary as valid markdown text.

Examples:
For a video on renewable energy, a suitable summary might cover the importance of renewable sources, advancements in technology, and potential environmental benefits. Provide similar structured summaries for various topics based on the specific video content.


# OpenAI


In [None]:
# First draft of the system prompt, one sentence per line.
# NOTE: this value is immediately overwritten by the second assignment below,
# so only the second version is used by the following cells.
context = """Context:
You are an editor with strong reading comprehension, concise communication skills and good attention to detail.

Requirements:
You must generate clear and coherent summaries for video transcripts.
The goal is to distill the key information, main points, and important details while maintaining accuracy and relevance.

Task:
Please summarize the provided video transcript on the given topic.
Your summary should have a schematic structure with sections and lists to capture the essence of the content.

Details:
Take into account the context of the video and focus on highlighting major themes, key takeaways, and any notable examples or explanations provided.

Restrictions:
Only summarize information that is present in the video transcript.
Do not include any additional information or make any assumptions.
Refrain from introducing personal opinions or biases in the summary.
Stick to a neutral and informative tone.

Desired Format:
Format the summary as valid markdown text.
"""

# Active system prompt: each section is written on a single line, and the Task
# section asks for one section per subtopic with bullet points where needed.
context = """Context: You are an editor with strong reading comprehension, concise communication skills and good attention to detail.

Requirements: You must generate clear and coherent summaries for video transcripts. The goal is to distill the key information, main points, and important details while maintaining accuracy and relevance.

Task: Please summarize the provided video transcript on the given topic. Your summary should have a schematic structure with sections for each subtopic in the video transcript. Where needed, you can use bullet points to capture the essence of the content.

Details: Take into account the context of the video and focus on highlighting major themes, key takeaways, and any notable examples or explanations provided.

Restrictions: Only summarize information that is present in the video transcript. Do not include any additional information or make any assumptions. Refrain from introducing personal opinions or biases in the summary. Stick to a neutral and informative tone.

Desired Format: Format the summary as valid markdown text.
"""

In [None]:
from openai import OpenAI

# Create the API client with default settings (presumably the credentials are
# picked up from the environment, e.g. OPENAI_API_KEY — confirm).
client = OpenAI()

In [None]:
# User message for the single-transcript test: a short instruction followed by
# the sample transcript loaded earlier in the notebook.
prompt = f"""Summarize the following transcript of a speech:
{transcript_orig_text}
"""

In [None]:
# The instructions travel as the system message, the transcript request as the
# user message.
messages = [
    {"role": "system", "content": context},
    {"role": "user", "content": prompt},
]
completion = client.chat.completions.create(model="gpt-3.5-turbo", messages=messages)

# Show the whole message object of the first choice.
print(completion.choices[0].message)

In [None]:
# only the text content of the reply
print(completion.choices[0].message.content)

In [None]:
def get_summary_gpt3(
    prompt: str,
    context: str,
    client: "OpenAI",
    *,
    model: str = "gpt-3.5-turbo",
    max_tokens: "int | None" = None,
) -> "str | None":
    """Ask a chat model to answer `prompt` under the instructions in `context`.

    Args:
        prompt: Text sent as the user message (e.g. the transcript to summarize).
        context: Text sent as the system message (role and instructions).
        client: OpenAI client used to send the request.
        model: Name of the chat model to query; defaults to the model this
            notebook has always used.
        max_tokens: Optional cap on the length of the reply. When None the
            argument is not sent at all, so the API default applies.

    Returns:
        The content of the first choice of the completion. It may be None
        when the reply carries no text, so callers should check for that.
    """
    # Only forward the optional settings that were actually provided.
    request_options = {}
    if max_tokens is not None:
        request_options["max_tokens"] = max_tokens

    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": context},
            {"role": "user", "content": prompt},
        ],
        **request_options,
    )
    return completion.choices[0].message.content

In [None]:
# Summarize every lesson transcript and store the result as
# <summaries_fol>/<topic>/<lesson>.md. Lessons that already have a summary
# file are skipped, so the cell can be re-run to resume an interrupted batch.
# `Path.iterdir()` yields entries in arbitrary order, so the listings are
# sorted to process topics and lessons in a stable, readable order.
transcripts_fol_iterdir = sorted(transcripts_fol.iterdir())
for topic_fol in transcripts_fol_iterdir:
    if not topic_fol.is_dir():
        continue
    topic_name = topic_fol.name
    print(topic_name)
    # create the topic folder in the summary folder
    topic_sum_fol = summaries_fol / topic_name
    check_create_fol(topic_sum_fol)

    # iterate the lessons in the topic folder
    topic_fol_iterdir = sorted(topic_fol.iterdir())
    for lesson_fp in topic_fol_iterdir:
        if not lesson_fp.is_file():
            continue
        lesson_name = lesson_fp.stem
        print("  ", lesson_name)

        # build the output md file name
        summary_fp = topic_sum_fol / f"{lesson_name}.md"
        # if the summary file already exists, skip it
        if summary_fp.exists():
            continue

        # load the transcript, trim it and put it on a single line
        transcript_orig_text = lesson_fp.read_text()
        transcript_orig_text = transcript_orig_text.strip()
        transcript_orig_text = transcript_orig_text.replace("\n", " ")
        if not transcript_orig_text:
            # nothing to summarize: do not spend an API call on an empty prompt
            print("    empty transcript")
            continue

        # summarize the transcript
        # NOTE(review): the raw transcript is sent as the user message here,
        # while the single-transcript test above prefixes it with
        # "Summarize the following transcript of a speech:" — confirm which
        # form is intended.
        summary_text = get_summary_gpt3(
            prompt=transcript_orig_text,
            context=context,
            client=client,
        )
        if not summary_text:
            # Do not write an empty file: it would make the `exists()` check
            # above skip this lesson on every later run, so it would never
            # be retried.
            print("    no summary")
            continue

        # write the summary to file; UTF-8 is set explicitly because the
        # generated text may contain characters the locale encoding cannot
        # represent
        summary_fp.write_text(summary_text, encoding="utf-8")