In [1]:
import os
import re
import time

import boto3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import whisper
import yt_dlp
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import JSONFormatter
from yt_dlp import YoutubeDL

In [2]:
def extract_video_id(youtube_url):
    """
    Pull the 11-character video identifier out of a YouTube link.

    Parameters
    ----------
    youtube_url : str
        Text to search. The pattern recognises ``youtube.com`` links that
        carry the id after ``v/``, ``e/``, ``embed/`` or a ``v=`` query
        parameter, and ``youtu.be/`` short links.

    Returns
    -------
    str or None
        The captured id, or None when the pattern is not found.
    """
    pattern = re.compile(
        r"(?:https?:\/\/)?(?:www\.)?(?:youtube\.com\/(?:[^\/\n\s]+\/\S+\/|(?:v|e(?:mbed)?)\/|.*[?&]v=)|youtu\.be\/)([a-zA-Z0-9_-]{11})"
    )
    found = pattern.search(youtube_url)
    if found is None:
        return None
    # Group 1 is the only capturing group: the 11-character id itself.
    return found.group(1)

In [3]:
def get_transcript_yt_api(video_id):
    """
    Fetch a video's transcript via YouTubeTranscriptApi and flatten it to text.

    Parameters
    ----------
    video_id : str
        The video id

    Returns
    -------
    str or None
        One leading space followed by every segment's text, each followed by
        a single space. None if any exception is raised along the way (the
        exception is printed).
    """
    try:
        segments = YouTubeTranscriptApi.get_transcript(video_id)
        # Each segment contributes "<text> "; the result keeps the leading space.
        pieces = [segment['text'] + " " for segment in segments]
        return " " + "".join(pieces)
    except Exception as error:
        print(error)
        return None

In [4]:
# Smoke-test the caption path: pull the video id out of a playlist-style watch URL,
# then fetch that video's transcript through get_transcript_yt_api.
video_id = extract_video_id("https://www.youtube.com/watch?v=7wpfu30FYJM&list=PL2qEL_7r0QISg3wu4D_j9xRJodZsfjBEu")
transcript = get_transcript_yt_api(video_id)
# Echo the whole transcript string as the cell output.
transcript

" let's get started with our linear algebra review in this video i want to tell you what are matrices and what are vectors a matrix is a rectangular array of numbers written between square brackets so for example here is a matrix i'm going to write a left square bracket and then write in a bunch of numbers and you know these could be features for a machine learning problem or it could be data from somewhere else but for example the specific values don't matter and then i'm gonna close it with another right bracket on the right so that's one matrix and you know here's another example of a matrix mr right one two three four five six so matrix is just another way for saying is a 2d or two dimensional array and the other piece of analogy we need is that the dimensional matrix is going to be written as the number of rows times the number of columns in the matrix so concretely this example on the left this has one two three four rows and it has two columns and so this example on the left i'm

In [5]:
# S3 client shared by upload_s3; no explicit region or credentials are passed here.
s3_client = boto3.client("s3")
# Bucket that upload_s3 writes audio files into.
BUCKET_NAME = "tubequiz-bucket"

In [6]:
def download_audio(link):
    """
    Download the audio track of a YouTube video and convert it to MP3.

    Parameters
    ----------
    link : str
        The youtube url

    Returns
    -------
    str
        Local file name of the extracted audio, ``"<video_id>_audio.mp3"``.

    Raises
    ------
    ValueError
        If no video id can be extracted from ``link``.
    """
    video_id = extract_video_id(link)
    if video_id is None:
        # Without this guard the output would silently be named "None_audio.mp3"
        # and every unrecognised link would overwrite the same file.
        raise ValueError(f"Could not extract a YouTube video id from: {link!r}")

    # No extension in the template: the FFmpegExtractAudio post-processor
    # writes "<output_path>.mp3" and removes the intermediate download.
    output_path = f"{video_id}_audio"
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': output_path,
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192'
        }],
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([link])

    return output_path + '.mp3'

In [7]:
def upload_s3(file_path):
    """
    Upload a local file to the project bucket under the ``audio/`` prefix.

    Parameters
    ----------
    file_path : str
        Path of the local file to upload.

    Returns
    -------
    str
        The ``s3://`` URI of the uploaded object.
    """
    # Key on the file name only, so a path such as "/tmp/x.mp3" becomes
    # "audio/x.mp3" rather than "audio//tmp/x.mp3". Bare file names are unchanged.
    s3_key = f"audio/{os.path.basename(file_path)}"
    s3_client.upload_file(file_path, BUCKET_NAME, s3_key)
    return f"s3://{BUCKET_NAME}/{s3_key}"

In [8]:
# Amazon Transcribe client used by transcribe_audio.
# NOTE(review): pinned to us-west-1, while the recorded transcript URI further down
# is served from us-west-2 and generate_quiz targets us-west-2 — confirm the intended region.
transcribe_client = boto3.client("transcribe", region_name="us-west-1")

In [9]:
def transcribe_audio(s3_uri, job_name, poll_interval=5):
    """Transcribes an audio file using AWS Transcribe.

    Parameters
    ----------
    s3_uri : str
        S3 URI of the MP3 file to transcribe.
    job_name : str
        Name given to the transcription job.
    poll_interval : float, optional
        Seconds to wait between job status checks. Default is 5.

    Returns
    -------
    str or None
        URI of the transcript file when the job completes; None when the
        job fails (the failure reason is printed).
    """
    transcribe_client.start_transcription_job(
        TranscriptionJobName=job_name,
        Media={'MediaFileUri': s3_uri},
        MediaFormat='mp3',
        LanguageCode='en-US'
    )

    # Wait for the job to reach a terminal state. Sleep between polls so the
    # loop does not spin and flood the API with status requests.
    while True:
        status = transcribe_client.get_transcription_job(TranscriptionJobName=job_name)
        job = status['TranscriptionJob']
        state = job['TranscriptionJobStatus']
        if state in ('COMPLETED', 'FAILED'):
            break
        time.sleep(poll_interval)

    if state == 'COMPLETED':
        return job['Transcript']['TranscriptFileUri']

    # Still return None on failure, but say why instead of failing silently.
    print(f"Transcription job {job_name} failed: {job.get('FailureReason', 'unknown reason')}")
    return None

In [14]:
# Download the audio of a sample video; download_audio returns the local MP3 file name.
audio_path = download_audio("https://youtu.be/6M5VXKLf4D4?si=GW_-1F3kF-7Kr5zX")

[youtube] Extracting URL: https://youtu.be/6M5VXKLf4D4?si=GW_-1F3kF-7Kr5zX
[youtube] 6M5VXKLf4D4: Downloading webpage
[youtube] 6M5VXKLf4D4: Downloading tv client config
[youtube] 6M5VXKLf4D4: Downloading player 7d1d50a6
[youtube] 6M5VXKLf4D4: Downloading tv player API JSON
[youtube] 6M5VXKLf4D4: Downloading ios player API JSON
[youtube] 6M5VXKLf4D4: Downloading m3u8 information
[info] 6M5VXKLf4D4: Downloading 1 format(s): 251
[download] Destination: 6M5VXKLf4D4_audio
[download] 100% of    5.75MiB in 00:00:01 at 4.62MiB/s   
[ExtractAudio] Destination: 6M5VXKLf4D4_audio.mp3
Deleting original file 6M5VXKLf4D4_audio (pass -k to keep)


In [15]:
# Display the file name returned by download_audio.
audio_path

'6M5VXKLf4D4_audio.mp3'

In [16]:
# Upload the extracted MP3 to S3 and keep its s3:// URI for Transcribe.
# Needs AWS credentials to be configured: the recorded run failed with NoCredentialsError.
s3_video_uri = upload_s3(audio_path)

NoCredentialsError: Unable to locate credentials

In [None]:
# Video id used to name the transcription job below.
# NOTE(review): this is a different video from the one downloaded above (6M5VXKLf4D4) —
# confirm which video the uploaded audio and the job name are meant to refer to.
video_id = extract_video_id("https://www.youtube.com/watch?v=MfIjxPh6Pys")

In [None]:
# Start a Transcribe job named after the video id and wait for it to finish;
# the result is the transcript file URI (None if the job failed).
transcript_video_uri  = transcribe_audio(s3_video_uri, f"transcription-{video_id}")

In [None]:
# Display the transcript URI returned by transcribe_audio.
# NOTE: the recorded output is a pre-signed URL that embeds an X-Amz-Security-Token;
# clear this cell's output before sharing or committing the notebook.
transcript_video_uri

'https://s3.us-west-2.amazonaws.com/aws-transcribe-us-west-2-prod/841162666835/transcription-MfIjxPh6Pys/76330c46-1510-47e8-8f6a-cedc774f2e6a/asrOutput.json?X-Amz-Security-Token=IQoJb3JpZ2luX2VjEN3%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLXdlc3QtMiJIMEYCIQChryXD8bNaSy6o2TlJQ3IN2l3wUlwUcLUDewdglb5ZHQIhAJIxVqSLyIxOyApSV95vB%2BAqJTe2GZy54zNn9NjIDRIiKrEFCDYQBBoMMDgwMjQ4MzIyMjA2Igw6VpBlJqlTMuHRm%2BMqjgW2jLNKc%2F%2Bs5FtmedK5a7d9%2FTdLr4pkLAlRliy%2F17Gc%2F490bIAGzL72%2BfaJ3mhb0I0PtyT8Rkyf%2B4dGAAYa1CtP%2FVvubeVnpoSHUbGeLz4FZlA%2FYZ2OU2XjQ18wwyY75Z1N9fKQmavM%2FSZ5%2Fx5uFY1VsMqhs0C6hWpscqHWfHvxaxmkA9SCzBQLUkb2ufOL%2BOC7RE%2FfQurLYYhhvP5DVo7dRzMEWGARmnpdsJA%2FdAqZ2wmDKmaRxZkCO%2F1Jh0ZherviXYrZHEKUEplTWMwK%2FM8snU2yoWpmbteFzMVA0Xbu9SUZW8PmoqLbJL%2F%2F%2BNeZBMEuxPJRs0dcadkXSI9FJ5f1CeaBy4iodmmVRs0nl545elQaVKBhjvl%2BbPGDc64IaFEJSoWdPd%2B%2Fo0mTwGlzkqTVYQNleFJFiICb2RmoPnKr7qfNndE7hsl8RACDUb%2BHSjrAX1vmKkg3g60kR4dYWBAEBsGwH0w%2FdfsyWlMjnO4Wjl44bos2eJrVqJDvGYeRg1Q0dcxsRLmCWYrOsntuiiVvLOzUYQh5%2BX%2FWUCzoby

In [None]:
# Fetch the transcript JSON from the URI returned by transcribe_audio.
# A timeout keeps a stalled download from hanging the kernel, and
# raise_for_status surfaces HTTP errors (such as an expired URL) before
# the body is parsed as JSON.
transcript_response = requests.get(transcript_video_uri, timeout=30)
transcript_response.raise_for_status()
transcript_data = transcript_response.json()
# The full text is the first entry of results.transcripts.
transcript = transcript_data['results']['transcripts'][0]['transcript']

In [None]:
# Display the transcript text extracted from the Transcribe result.
transcript

"Hello everyone, uh, welcome to CS 229. Um, today we're going to talk about, uh, deep learning and neural networks. Um, we're going to have two lectures on that one today and a little bit more of it on, uh, Monday. Um, don't hesitate to ask questions during the lecture, uh, so stop me if you don't understand something, and we'll try to build the intuition around your own network together. We will actually start with an algorithm that you guys have seen, uh, previously called logistic regression. Everybody remembers logistic regression. OK, remember it's a classification algorithm. Um, we're going to do that, explain how logistic regression can be interpreted as a neural network specific case of the neural network. And then we will go to neural networks. Sounds good? So the quick intro on deep learning. So deep learning is a is a set of techniques that is let's say a subset of machine learning and it's one of the growing techniques that have been used in the industry specifically for pr

In [None]:
# Module-level Bedrock runtime client (no explicit region).
# NOTE(review): generate_quiz builds its own client pinned to us-west-2, so this
# one appears unused by the visible cells — confirm whether it is still needed.
bedrock_client = boto3.client("bedrock-runtime")

In [None]:
import json
import boto3

def generate_quiz(transcript, model_id="anthropic.claude-3-5-sonnet-20241022-v2:0"):
    """
    Ask a Claude model on AWS Bedrock to write a quiz about a transcript.

    Parameters
    ----------
    transcript : str
        Text the quiz questions should be based on.
    model_id : str, optional
        Bedrock model identifier. Default is Claude 3.5 Sonnet v2.

    Returns
    -------
    str or None
        The text of the first content item in the model's reply (an empty
        string if that item has no ``text`` field), or None if invoking the
        model or reading its reply raises; the error is printed.
    """
    # A fresh runtime client is created on every call, pinned to us-west-2.
    runtime = boto3.client("bedrock-runtime", region_name="us-west-2")

    # Fixed instructions plus an example of the JSON layout the reply should follow.
    quiz_spec = (
        "The quiz should include:\n"
        "- 5 Multiple Choice Questions (MCQs)\n"
        "- 3 Short Answer Questions\n"
        "- Format the output in JSON.\n"
        "Example Output:\n"
        "{\n"
        "  \"mcqs\": [\n"
        "    {\"question\": \"What is AI?\", \"options\": [\"Artificial Intelligence\", \"Automated Input\", \"None\"], \"answer\": \"Artificial Intelligence\"}\n"
        "  ],\n"
        "  \"text_questions\": [\n"
        "    {\"question\": \"Explain how AI models learn?\"}\n"
        "  ]\n"
        "}"
    )
    prompt = f"Generate a quiz from this transcript:\n{transcript}\n" + quiz_spec

    # Request body in the Messages API format used by Claude models.
    request = {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 2000,
        "temperature": 0.7,
        "messages": [{"role": "user", "content": prompt}],
    }
    payload = json.dumps(request)

    try:
        reply = runtime.invoke_model(
            modelId=model_id,
            contentType="application/json",
            accept="application/json",
            body=payload,
        )
        raw_reply = reply['body'].read()
        parsed = json.loads(raw_reply.decode('utf-8'))
        # The quiz text lives in the first item of the reply's content list.
        content_items = parsed.get('content', [{}])
        return content_items[0].get('text', '')
    except Exception as exc:
        print(f"Error invoking model: {exc}")
        return None

# Example Usage
# In a notebook kernel __name__ is "__main__", so this block runs every time the
# cell executes. It relies on `transcript` having been defined by an earlier cell.
if __name__ == "__main__":
    
    quiz_result = generate_quiz(transcript)
    print(quiz_result)

{
  "mcqs": [
    {
      "question": "What type of algorithm is logistic regression primarily used for?",
      "options": ["Classification", "Regression", "Clustering", "Dimensionality Reduction"],
      "answer": "Classification"
    },
    {
      "question": "In a neural network, what do we call neurons that are not connected to each other in the same group?",
      "options": ["Cluster", "Layer", "Hidden Units", "Activation Group"],
      "answer": "Layer"
    },
    {
      "question": "What is the purpose of the softmax function in neural networks?",
      "options": ["To normalize outputs between 0 and 1", "To create a probability distribution that sums to 1", "To activate neurons", "To compute gradients"],
      "answer": "To create a probability distribution that sums to 1"
    },
    {
      "question": "What is broadcasting in the context of neural networks?",
      "options": ["Sending data between layers", "Repeating parameter vectors to match dimensions", "Broadcasting 