In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import JSONFormatter
import whisper
import os
from pytube import YouTube
from yt_dlp import YoutubeDL

In [3]:
def extract_video_id(youtube_url):
    """
    Extracts the video id from a youtube url

    Parameters
    ----------
    youtube_url : str
        The youtube url

    Returns
    -------
    str
        The video id
    """
    regex = r"(?:https?:\/\/)?(?:www\.)?(?:youtube\.com\/(?:[^\/\n\s]+\/\S+\/|(?:v|e(?:mbed)?)\/|.*[?&]v=)|youtu\.be\/)([a-zA-Z0-9_-]{11})"
    match = re.search(regex, youtube_url)
    return match.group(1) if match else None

In [9]:
def get_transcript(video_id):
    """
    Gets the transcript of a youtube video

    Parameters
    ----------
    video_id : str
        The video id

    Returns
    -------
    str
        The transcript of the video
    """
    try:
        full_transcript = " "
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        for line in transcript:
            full_transcript += line['text'] + " "
        return full_transcript
    except Exception as e:
        print(e)
        return None

In [10]:
video_id = extract_video_id("https://www.youtube.com/watch?v=7wpfu30FYJM&list=PL2qEL_7r0QISg3wu4D_j9xRJodZsfjBEu")
transcript = get_transcript(video_id)
transcript

" let's get started with our linear algebra review in this video i want to tell you what are matrices and what are vectors a matrix is a rectangular array of numbers written between square brackets so for example here is a matrix i'm going to write a left square bracket and then write in a bunch of numbers and you know these could be features for a machine learning problem or it could be data from somewhere else but for example the specific values don't matter and then i'm gonna close it with another right bracket on the right so that's one matrix and you know here's another example of a matrix mr right one two three four five six so matrix is just another way for saying is a 2d or two dimensional array and the other piece of analogy we need is that the dimensional matrix is going to be written as the number of rows times the number of columns in the matrix so concretely this example on the left this has one two three four rows and it has two columns and so this example on the left i'm

In [58]:
def download_video_and_extract_transcript(link):
    """
    Downloads a youtube video and extracts the transcript

    Parameters
    ----------
    link : str
        The youtube video link

    Returns
    -------
    str
        The transcript of the video
    """
    try:
        # Download audio
        ydl_opts = {
            'format': 'bestaudio/best',
            'outtmpl' : 'audios/%(title)s.%(ext)s',
        }
        
        with YoutubeDL(ydl_opts) as ydl:
            ydl.download([link])
            info_dict = ydl.extract_info(link, download=True)
            filename = ydl.prepare_filename(info_dict)
            print(filename)
        # Extract transcript
        model = whisper.load_model("base")
        result = model.transcribe(filename)
        transcript = result['text']
        return transcript
    except Exception as e:
        print(e)
        return None

In [59]:
download_video_and_extract_transcript("https://www.youtube.com/watch?v=lUUte2o2Sn8")

[youtube] Extracting URL: https://www.youtube.com/watch?v=lUUte2o2Sn8
[youtube] lUUte2o2Sn8: Downloading webpage
[youtube] lUUte2o2Sn8: Downloading tv client config
[youtube] lUUte2o2Sn8: Downloading player 6b3caec8
[youtube] lUUte2o2Sn8: Downloading tv player API JSON
[youtube] lUUte2o2Sn8: Downloading ios player API JSON
[youtube] lUUte2o2Sn8: Downloading m3u8 information
[info] lUUte2o2Sn8: Downloading 1 format(s): 251-2
[download] audios/Gil Strang's Final 18.06 Linear Algebra Lecture.webm has already been downloaded
[download] 100% of   49.41MiB
[youtube] Extracting URL: https://www.youtube.com/watch?v=lUUte2o2Sn8
[youtube] lUUte2o2Sn8: Downloading webpage
[youtube] lUUte2o2Sn8: Downloading tv client config
[youtube] lUUte2o2Sn8: Downloading tv player API JSON
[youtube] lUUte2o2Sn8: Downloading ios player API JSON
[youtube] lUUte2o2Sn8: Downloading m3u8 information
[info] lUUte2o2Sn8: Downloading 1 format(s): 251-2
[download] audios/Gil Strang's Final 18.06 Linear Algebra Lecture.

  checkpoint = torch.load(fp, map_location=device)


" You don't think so, you vote for cabinet rules and coming to party? All right. All right. All right. All right. All right. All right. All right. All right. All right. All right. All right. All right. Oh, yeah. I got it. Yeah. All right. All right. All right. All right. All right. All right. All right. Yeah. All right. I'm going to have a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a li