In [1]:
import os
import sys
import re
import json
import time

import numpy as np
import pandas as pd
from tqdm import tqdm

from youtube_transcript_api import YouTubeTranscriptApi
from get_timestamps import get_timestamps, get_text
from yt_dlp import YoutubeDL
import whisper
from transformers import pipeline
from datasets import load_dataset, Audio

# Load the Whisper "medium.en" checkpoint once for the whole notebook; this
# module-level `model` is the object download_transcript_using_whisper calls
# `transcribe` on.
model = whisper.load_model("medium.en")

In [2]:
import torch
# Show whether PyTorch reports a usable CUDA device in this kernel.
torch.cuda.is_available()

True

In [3]:
def convert_srt_to_json(srt):
    """
    Reshape a YouTubeTranscriptAPI transcript into a Whisper-style result.

    Every entry of ``srt`` is read through its "start", "duration" and "text"
    keys. The returned dict stores the converted entries under "segments",
    each one carrying "start", "end" (start + duration) and "text".
    """
    segments = [
        {
            "start": entry["start"],
            "end": entry["start"] + entry["duration"],
            "text": entry["text"],
        }
        for entry in srt
    ]
    return {"segments": segments}

In [4]:
def get_timestamps_times(description):
    """Return every h:mm:ss or m:ss style time found in the description, in order of appearance."""
    time_pattern = re.compile(r"\d{1,2}:\d{2}:\d{2}|\d{1,2}:\d{2}")
    return [found.group(0) for found in time_pattern.finditer(description)]

def _clean_chapter_title(raw):
    """Trim whitespace and the separator characters - [ ] | from both ends of a title."""
    title = raw.strip()
    # Alternate separator and whitespace trimming until nothing changes, so mixed
    # decorations such as " - [Intro] " are removed whatever order they come in.
    while True:
        trimmed = title.strip("-[]|").strip()
        if trimmed == title:
            return title
        title = trimmed


def get_timestamps_keywords(description, timestamps):
    """
    Extract the chapter title that follows each timestamp in the description.

    The timestamps are located one after another, each search starting where
    the previous match ended. A repeated timestamp, or one that also occurs
    inside an earlier, longer one ("0:00" inside "10:00"), therefore resolves
    to its own line instead of the first place the text happens to appear.
    The title is the rest of that line with surrounding whitespace, dashes,
    square brackets and pipes removed.

    Args:
        description: Full video description text.
        timestamps: Timestamp strings, normally in order of appearance (as
            returned by get_timestamps_times). Out-of-order entries fall back
            to a search from the start of the description.

    Returns:
        A list of titles, one per timestamp, in the same order.

    Raises:
        IndexError: If a timestamp does not occur in the description (the same
            exception type the previous split-based lookup raised).
    """
    keywords = []
    cursor = 0
    for timestamp in timestamps:
        position = description.find(timestamp, cursor)
        if position == -1:
            # Not found after the previous match: accept an earlier occurrence.
            position = description.find(timestamp)
        if position == -1:
            raise IndexError(f"timestamp {timestamp!r} not found in description")
        cursor = position + len(timestamp)
        rest_of_line = description[cursor:].split("\n")[0]
        keywords.append(_clean_chapter_title(rest_of_line))
    return keywords

In [5]:
def download_transcript_from_API(video_id):
    """Fetch the transcript for ``video_id`` via YouTubeTranscriptApi and return it in the Whisper-style layout."""
    return convert_srt_to_json(YouTubeTranscriptApi.get_transcript(video_id))

In [6]:
def download_audio(url, output_file, ffmpeg_location=None):
    """
    Download the audio of a YouTube URL with yt-dlp and extract it as wav.

    Args:
        url: Video URL handed to ``YoutubeDL.download``.
        output_file: Value passed to yt-dlp as the ``outtmpl`` option.
        ffmpeg_location: Optional path to the ffmpeg binary (or its folder).
            When omitted, the ``FFMPEG_LOCATION`` environment variable is
            used; failing that, the original hard-coded Windows path is used
            if it exists on this machine. If none applies, the option is left
            unset so yt-dlp falls back to its own ffmpeg lookup.
    """
    # Kept only so the machine this notebook was written on keeps working
    # without any configuration.
    legacy_ffmpeg = "C:/Users/aladd/Desktop/Shortcuts/ffmpeg/bin/ffmpeg.exe"

    if ffmpeg_location is None:
        ffmpeg_location = os.environ.get("FFMPEG_LOCATION") or None
    if ffmpeg_location is None and os.path.exists(legacy_ffmpeg):
        ffmpeg_location = legacy_ffmpeg

    ydl_opts = {
        "format": "bestaudio/best",
        "outtmpl": output_file,
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "wav",
                "preferredquality": "196",
            }
        ],
    }
    if ffmpeg_location is not None:
        ydl_opts["ffmpeg_location"] = ffmpeg_location

    with YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])


def download_transcript_using_whisper(video_id, output_file="audio/audio.wav"):
    """
    Download a video's audio and transcribe it with the module-level Whisper ``model``.

    A file already present at ``output_file`` is removed first, the audio is
    fetched through ``download_audio``, and the time spent in
    ``model.transcribe`` is printed. Returns what ``model.transcribe`` returns.
    """
    video_url = f"https://www.youtube.com/watch?v={video_id}"

    # remove whatever a previous run left at the target path
    leftover_present = os.path.exists(output_file)
    if leftover_present:
        os.remove(output_file)

    download_audio(video_url, output_file)

    # run Whisper and measure the wall-clock time of the transcription
    started = time.time()
    transcription = model.transcribe(output_file, verbose=True)
    finished = time.time()
    print(f"Whisper model took {finished-started} seconds to transcribe")
    return transcription

In [9]:
DATA_FILE = "youtube_data_v3_extra_1.csv"
DIR_FOLDER = "raw_dataset_v3"
# Whisper transcription is far slower than the transcript API. It stays off by
# default, matching the previously commented-out call; set to True to use it
# for videos whose API transcript cannot be fetched.
USE_WHISPER_FALLBACK = False

os.makedirs(DIR_FOLDER, exist_ok=True)

df = pd.read_csv(DATA_FILE)
urls = df["url"].tolist()
descriptions = df["description"].tolist()
tags = df["tags"].tolist()
titles = df["title"].tolist()
likes = df["likes"].tolist()
durations = df["duration"].tolist()
comments = df["top_comment"].tolist()

assert (
    len(urls) == len(descriptions) == len(tags) == len(titles)
    == len(likes) == len(durations) == len(comments)
)

# List the output folder once instead of on every iteration; names are added
# below as files are written so duplicate ids in the CSV are still skipped.
existing_files = set(os.listdir(DIR_FOLDER))
skipped_video_ids = []

# Loop through the urls and store one JSON file (transcript + metadata) per video.
for idx in tqdm(range(len(urls))):
    # The id is the value after the first "="; drop any further query parameters.
    video_id = urls[idx].split("=")[1].split("&")[0]
    description = descriptions[idx]
    tag = tags[idx]
    title = titles[idx]
    like = likes[idx]
    duration = durations[idx]
    comment = comments[idx]

    json_name = video_id + ".json"
    if json_name in existing_files:
        # Already downloaded on an earlier run (or earlier in this one).
        continue

    # Chapter timestamps are optional: if the description cannot be parsed
    # (for example it is not a string), store the video without them.
    try:
        timestamps = get_timestamps_times(description)
        keywords = get_timestamps_keywords(description, timestamps)
    except Exception:
        timestamps = []
        keywords = []

    # Start from None on every iteration so a failed download can never reuse
    # the previous video's transcript.
    result = None
    try:
        result = download_transcript_from_API(video_id)
    except Exception:
        # Probably a private video or no transcript available.
        if USE_WHISPER_FALLBACK:
            try:
                result = download_transcript_using_whisper(video_id)
            except Exception:
                print(f"Error downloading with Whisper transcript for {video_id}")

    if result is None:
        skipped_video_ids.append(video_id)
        continue

    # Add metadata to the transcript dict
    result["video_id"] = video_id
    result["description"] = description
    result["tags"] = tag
    result["title"] = title
    result["likes"] = like
    result["duration"] = duration
    result["timestamps"] = timestamps
    result["keywords"] = keywords
    result["comment"] = comment

    # Save json file
    with open(f"{DIR_FOLDER}/{video_id}.json", "w") as f:
        json.dump(result, f)
    existing_files.add(json_name)

print(f"Skipped {len(skipped_video_ids)} videos with no transcript available")

100%|██████████| 951/951 [09:38<00:00,  1.64it/s]
