In [None]:
# import libraries

import os
import time

import pandas as pd
from IPython.display import display, Markdown
from openai import OpenAI
from pytube import Playlist
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound

In [None]:
# set up openai client
# Read the key from the OPENAI_API_KEY environment variable so a real secret is
# never saved inside the notebook; the placeholder is only used when it is unset.
api_key = os.environ.get('OPENAI_API_KEY', 'YOUR-API-KEY')
client = OpenAI(api_key=api_key)

#### `fetch_transcript` function

This function gets the transcript of a single YouTube video.

In [None]:
# define function to retrieve transcripts

def fetch_transcript(video_id, title):
    """
    Return the transcript of one YouTube video as a single string.

    The transcript segments returned by YouTubeTranscriptApi are joined with
    single spaces. No exception escapes this function: every failure is turned
    into a human-readable message that is returned in place of the transcript.

    Args:
        video_id (str): The ID of the YouTube video.
        title (str): The title of the video; only used inside failure messages.

    Returns:
        str: The transcript text on success, otherwise one of:
             - "Transcripts are disabled for video: <title>"
             - "No transcript found for video: <title>"
             - "An error occurred for video <title>: <error>"
    """
    try:
        segments = YouTubeTranscriptApi.get_transcript(video_id)
        pieces = []
        for segment in segments:
            pieces.append(segment['text'])
        # joining stays inside the try so a malformed segment is reported too
        return " ".join(pieces)
    except TranscriptsDisabled:
        message = f"Transcripts are disabled for video: {title}"
    except NoTranscriptFound:
        message = f"No transcript found for video: {title}"
    except Exception as e:
        message = f"An error occurred for video {title}: {e}"
    return message

In [None]:
# test case: fetch the transcript of a single video by its ID; the title is only
# used in the message that is returned when the fetch fails
fetch_transcript('2frJsC_Q3I0','How to Read and Summarize Research Papers | Machine Learning & Deep Learning')

#### `download_playlist_data` function

This function retrieves the transcript and metadata of every YouTube video in a playlist, skipping videos whose transcript cannot be fetched.

In [None]:
def _is_fetch_failure(result, title):
    """Return True when `result` is a failure message fetch_transcript builds for `title`."""
    return (
        result == f"Transcripts are disabled for video: {title}"
        or result == f"No transcript found for video: {title}"
        or result.startswith(f"An error occurred for video {title}: ")
    )


def download_playlist_data(playlist_url):
    """
    Downloads transcripts and metadata for all videos in a YouTube playlist and stores them in a dictionary.

    Videos whose transcript could not be fetched (fetch_transcript returned one
    of its failure messages) are left out of the result.

    Args:
        playlist_url (str): The URL of the YouTube playlist.

    Returns:
        dict: A dictionary containing metadata for each video in the playlist.
              The keys are video IDs and the values are dictionaries with the following keys:
              - 'transcript': The transcript of the video.
              - 'title': The title of the video.
              - 'author': The author of the video.
              - 'date_published': The publish date of the video in YYYY-MM-DD format,
                or None when no publish date is available for the video.
              - 'duration': The duration of the video in seconds.
              - 'transcript_fetch_time': Seconds spent fetching the transcript,
                rounded to two decimals.
    """
    playlist = Playlist(playlist_url)
    video_info = {}

    for video in playlist.videos:
        title = video.title

        # get the transcript and time the fetch
        start_time = time.time()
        result = fetch_transcript(video.video_id, title)
        end_time = time.time()

        # Compare against the exact failure messages instead of searching for
        # substrings, so a real transcript that merely contains a phrase such
        # as "No transcript found" is not discarded.
        if _is_fetch_failure(result, title):
            continue

        # the publish date may be missing; avoid calling strftime on None
        publish_date = video.publish_date

        # create dictionary entry for each video in playlist
        video_info[video.video_id] = {
            "transcript": result,
            "title": title,
            'author': video.author,
            "date_published": publish_date.strftime("%Y-%m-%d") if publish_date else None,
            "duration": video.length,
            "transcript_fetch_time": round(end_time-start_time, 2)
        }

    return video_info

In [None]:
# test case: download transcripts and metadata for every video in a sample playlist
test_playlist_url = "https://www.youtube.com/playlist?list=PLLcwGvZPfmzdaC38G9cUgQ27Uoi-KHWNv"
test_playlist_data = download_playlist_data(test_playlist_url)
# prints the whole dictionary, including the full transcript text of each video
print(test_playlist_data)

In [None]:
# show each video ID followed by the seconds spent fetching its transcript
for key in test_playlist_data:
    print(key)
    print(test_playlist_data[key]['transcript_fetch_time'])

#### `compile_prompt` function

In [None]:
def compile_prompt(user_prompt, transcript, topic):
    """
    Append the topic and the transcript to a user prompt.

    Args:
        user_prompt (str): The instructions that open the prompt.
        transcript: The transcript text placed after "Transcript: ".
        topic: The topic placed after "Topic: ".

    Returns:
        str: `user_prompt` immediately followed (no separator is inserted) by
             "Topic: <topic>", a newline, and "Transcript: <transcript>".
    """
    suffix = "Topic: {}\nTranscript: {}".format(topic, transcript)
    return user_prompt + suffix

#### `get_model_response` function

This function queries the OpenAI API with a user prompt and a system prompt. It returns the model's response.

In [None]:
# define model response function

def get_model_response(user_prompt, system_prompt, model="gpt-4o-2024-05-13", max_tokens=1000):
    """
    Sends a system prompt and a user prompt to the OpenAI chat completions API.

    The two prompts are sent as a two-message conversation (system first, then
    user) through the module-level `client`. What the model is asked to do is
    determined entirely by the prompts passed in; this function adds nothing.

    Args:
        user_prompt (str): The user prompt to include in the OpenAI API request.
        system_prompt (str): The system prompt to include in the OpenAI API request.
        model (str): The name of the model to query. Defaults to "gpt-4o-2024-05-13".
        max_tokens (int): Upper limit on the number of tokens in the reply.
            Defaults to 1000.

    Returns:
        The message content of the first choice in the API response.
    """

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "system", "content": system_prompt},
                  {"role": "user", "content": user_prompt}],
        max_tokens=max_tokens
    )

    # only the first choice is used
    return response.choices[0].message.content

In [None]:
# test model response

test_transcript_text = """how do you read a research paper and summarize it research papers are often quite 
intimidating and can be hard to understand because they are written by experts who have been researching in 
their field often for up to decades even and they go right into the mathematical details and talk about the experiments 
and sometimes they forget to actually talk about the topic that they're researching and explain it to newcomers so how can 
you still work through a paper and give a presentation on it or write a summary for your research proposal for example in 
this video i will show you both a quick and a more thorough method for reading the paper that helped me personally when i 
was starting out in the field and was a bit overwhelmed by the papers i would suggest you start with a quick method to get
a rough idea of the paper and then afterwards you can decide if you want to do some more work and get into more of the 
details of the paper or if you already know that you have to read the whole paper because it is for a university 
assignment for example then i would still suggest you start with a quick method because this will give you an intro and 
then dive more into the details later i have one quick disclaimer before we start this video is specifically about computer
science papers and i have most of my experience with machine learning papers okay so enough about that let's get into 
the workflow that i use for every new paper that i read as an example through this video i will use the paper asymmetric 
values beyond sharp and local minima and that is about an optimization method called stochastic gradient percent in deep 
learning and i will link the paper down below in the description box in case you're interested all right let's start with 
the quick method the first thing you'll notice in any paper is probably the title on the top of the page and below that 
you will find the authors of the paper and under the office you can find the university or company that they are currently
working for or who sponsored this paper now this is not extremely important but i always take a look at this to see which
company the authors are from and also after you read a few of the papers in the field you notice some repeating companies 
or authors and that allows you to draw some connections between the topics that they're working on below the authors we 
have the abstract of the paper this will give you a rough summary of what the paper is about and what problem they were 
trying to solve and how they solved it you should quickly read through this but don't worry if you don't understand a lot
just look for some keywords that might be important for your research or just keywords that you already understand and 
that will then give your context and makes you aware of what you should look out for while you read the rest of the paper 
for example in this paper i noticed the keywords generalization performance stochastic gradient descent and flat and type 
minima and these are all things that i already know about and know that they are relevant to what i'm currently working on
so i know that the paper will probably be a good read for me i also again see the mention of the asymmetric valleys like 
they already said in the title so i know that this will probably be an important concept in the paper and i know to keep 
an eye out for it and see how it relates to the other concepts that were mentioned and at the end of the abstract you also
see theoretical explanation and empirically so from that we can infer that the paper will both have mathematical proofs 
and experiments next is the introduction which we will also read and we will read it a bit more carefully than the 
abstract ideally this should give you an introduction to the topic and tell you what the research is about and also 
why should we care about this topic or how is it relevant to solving a specific problem i would suggest that you 
highlight the most important parts and also maybe write down something in your own words and also write down any open 
questions that you might have of things that you don't understand then later as you read the rest of the paper you 
might be able to answer these questions and you have a record of what was maybe unclear at the start so if you then 
give a presentation or write a summary for another inexperienced reader then you can answer these questions upfront 
and hopefully reduce any confusion that the listener might have if you still understand almost nothing after reading 
the introduction multiple times just you know consider chugging the paper into the trash okay i realize this might not 
be possible if you have to read it for a university assignment or another commitment that you have so in that case 
i would recommend that you go through the papers that are cited in the introduction or in the related work section and 
look for one that is very similar in topic to the one you're currently trying to read and then look at that paper and 
maybe read their introduction and hopefully they explain the general research topic in a more accessible way than the 
paper you're currently reading the related work section is most often the chapter that comes after the introduction and 
unless you're trying to explicitly find another paper to read i would just skim read this section because it will show 
you a lot of papers that are slightly related to the one you're currently reading and it will probably not have a lot of 
information that you actually need to understand the paper next up we actually skip ahead to the end and we read the one 
or two chapters that come before the references often they are called conclusion or discussion but sometimes they might 
be entirely missing from the paper in that case don't read the last paper if it's about the details of it but most of 
the time you should find a conclusion so read that and this will again summarize everything that the authors find out 
and oftentimes they will also give you an outlook of the future research areas or future research directions that they 
might have because of the findings they have if you just want to quickly know the gist of the paper then your journey 
ends here you can again read everything you've just read and maybe write down some key findings but definitely remember 
to write something down before moving on to the next paper or project because otherwise you will probably have forgotten
everything that the paper is about in about two days at least that's what happens to me now let's talk about the long way 
through the paper if we want to know more details after reading the quick pass now we read the rest of the paper or the middle of the paper that we just skipped and you should read the paper multiple times and either on the first or second read through begin making some notes in your own words about what you're reading don't try to just reword a sentence really like use your own words to describe the concept even if it might be slightly wrong on this read-through i would still recommend to skip any complicated proofs or really just any proofs and just focus on understanding the concept many papers in the machine learning area also have an appendix that comes after the references and oftentimes those appendix can be 10 to 20 pages long even when the main paper is only eight to nine pages that is because if you submit to a journal or conference you have to be within a certain page limit and anything that didn't make the main paper will then be put into the appendix often the appendix is the place where you find the long mathematical proofs or something so in this case we can just skip it but also sometimes the appendix is used for additional figures or tables about the experiment that the authors did and in this case you can take a quick look at the appendix but in general everything that didn't make the main paper is probably not vital to understanding the topics that you're trying to learn about so don't worry about the appendix until maybe at the end of your work through the paper if you still have time you can take a look but yeah otherwise just don't worry about it after reading through everything and making some notes you're going to do a bit of reflection this step is what most beginners skip and it really makes a huge difference to your understanding and ability to presentate to presentate but it makes a huge difference in your understanding and ability to present the topic that you just researched ask yourself the following questions what are the three to 
five main concepts or statements that the paper is making while doing this try to think about the overarching field of research and what these statements might lead to and how they might impact the future of the research area either during this stage or afterwards i recommend you also make a mind map to write down these concepts and maybe try to find the connection between those concepts or how one of those ideas leads to the next idea how are they connected what is the story behind the paper near the end of your work through try and answer the question what is the main goal or problem that the paper is trying to solve and really state this in the most simple and short way that you can manage while still describing the paper as a whole what is the motivation behind this work and really take your time with this one write down a few different suggestions that you have and then reflect on this and think which one is the most simple while still describing all of the work that went into the paper of course no short answer will summarize the whole paper but really try to boil it down to one motivational idea behind the paper after all of this work you probably understand enough to summarize the paper or give a presentation on the topic especially for a presentation i recommend that you start with the main goal that you just found out and put it on the first slide let your audience know immediately what the motivation for it this presentation and for the paper is and what they can expect from the rest of the presentation because remember context is everything for understanding for a written summary you can do a bit of a longer introduction but still i would suggest to start with the main point and then try and develop a story that connects the three to five main statements that you found out and go through them and in the end comes to a conclusion of the topic i will probably make another video that goes into how i would structure an engaging presentation about a very 
abstract topic but that is it for this video remember introduction and conclusion are the two most important parts to read first and always reflect on the key concept that you're reading and the context of everything when writing a summary don't just reword the sentences and make them shorter but really engage with the content of the paper in all the ways that you possibly can please leave a like if this video helped you and consider subscribing for more machine learning and student content bye"""

# Build the test user prompt around the sample transcript defined above.
# The interpolated name must be `test_transcript_text`; `transcript_text` is not
# defined anywhere in the notebook and raises NameError on a fresh kernel.
test_user_prompt = f"""
    Here is the transcript from a YouTube video about learning Machine Learning:

    <transcript>
    {test_transcript_text}
    </transcript>

    Please carefully read through the entire transcript above. Then answer the following questions:

    1. What are the key points made in this YouTube video? 

    2. What are some actionable steps that one can take based on the advice in this YouTube video? 
    

    Provide your full response inside <result> tags.
    """

test_system_prompt = """You are an expert at summarising YouTube videos on any topic. 
Your job is to read and effectively summarise and answer questions about YouTube videos based on their transcripts. 
Do not hallucinate or fabricate any information. If some piece of information requested above is not found in the transcript, 
simply skip that part. Do not write things like "not found in transcript"."""

# query the model with the test prompts; the returned text is the cell's output
get_model_response(test_user_prompt, test_system_prompt)

#### `summarise_playlist` function

This function puts everything together. Given a playlist URL, a user prompt, a system prompt, and a topic, it gathers the transcripts, queries the LLM for each video, and returns the results.

In [None]:
def summarise_playlist(playlist_url, user_prompt, system_prompt, topic):
    """
    Summarise every video of a playlist for which a transcript was downloaded.

    For each entry returned by download_playlist_data, the user prompt is
    combined with the topic and that video's transcript via compile_prompt,
    and the result is sent to the model together with the system prompt.

    Args:
        playlist_url (str): The URL of the YouTube playlist.
        user_prompt (str): The user prompt that the topic and transcript are appended to.
        system_prompt (str): The system prompt sent with every request.
        topic: The topic passed to compile_prompt for every video.

    Returns:
        dict: Maps each video ID to a dictionary with the keys 'summary',
              'transcript', 'title', 'author', 'date_published', 'duration',
              'transcript_fetch_time' and 'model_response_time' (seconds spent
              waiting for the model, rounded to two decimals).
    """
    # metadata fields copied unchanged from the downloaded entry, in output order
    copied_fields = ("transcript", "title", "author", "date_published",
                     "duration", "transcript_fetch_time")

    videos = download_playlist_data(playlist_url)
    results = {}

    for video_id in videos:
        meta = videos[video_id]
        prompt = compile_prompt(user_prompt, meta['transcript'], topic)

        # time only the model call
        began = time.time()
        model_output = get_model_response(prompt, system_prompt)
        elapsed = time.time() - began

        record = {"summary": model_output}
        for field in copied_fields:
            record[field] = meta[field]
        record["model_response_time"] = round(elapsed, 2)

        results[video_id] = record

    return results

In [None]:
# test case: summarise every video of a sample playlist, reusing the test prompts
# defined earlier and 'Machine Learning' as the topic
test_summaries = summarise_playlist("https://www.youtube.com/playlist?list=PLLcwGvZPfmzcGxeS_5OMR_25VpFRKao2U", 
                                    test_user_prompt, test_system_prompt, 'Machine Learning')


In [None]:
# display the dictionary returned by summarise_playlist
test_summaries

In [None]:
# convert to dataframe: one row per video, with the video ID moved out of the
# index into a 'video_id' column; the transcript column is left out
df = (
    pd.DataFrame.from_dict(test_summaries, orient='index')
    .reset_index()
    .rename(columns={'index': 'video_id'})
    [['video_id', 'title', 'author', 'summary',
      'date_published', 'duration', 'transcript_fetch_time', 'model_response_time']]
)