In [47]:
import re
import requests
import youtube_transcript_api.formatters
import pandas as pd

from openai import OpenAI
from bs4 import BeautifulSoup
from pytube import Playlist
from youtube_transcript_api import YouTubeTranscriptApi, NoTranscriptFound

In [48]:
# use your openai key
# NOTE(review): no key is passed here, so the client presumably reads it from the
# environment (e.g. OPENAI_API_KEY) — confirm against the OpenAI SDK documentation.
client = OpenAI()
# Alternative: pass the key explicitly (do not commit a real key with the notebook).
#client = OpenAI(api_key="sk-xxx")

In [49]:
# get playlist by youtuber ID
def get_playlist(youtuber_id, timeout=30):
    """Collect the playlist IDs found on a channel's "playlists" page.

    Args:
        youtuber_id: Channel path segment appended to https://www.youtube.com/
            (for example "@StrandaSnowboards").
        timeout: Seconds to wait for the HTTP response. Without a timeout
            ``requests`` may block indefinitely on a stalled connection.

    Returns:
        A set of playlist ID strings (each starting with "PL"); empty when the
        page contains none.
    """
    channel_url = "https://www.youtube.com/" + youtuber_id + "/playlists"
    response = requests.get(channel_url, timeout=timeout)
    # The IDs are matched with a regular expression on the raw page text, so
    # there is no need to build (and re-serialise) an HTML parse tree first.
    playlist_pattern = r'"playlistId":"(PL[^"]+)"'
    # A set removes the duplicates produced by IDs that occur several times.
    return set(re.findall(playlist_pattern, response.text))

def get_playlist_name(playlist_id, timeout=30):
    """Return the display name of a playlist, taken from its page <title>.

    Args:
        playlist_id: Playlist ID used as the ``list=`` query parameter.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        The page title with a trailing " - YouTube" removed, or None when the
        page has no <title> tag.
    """
    playlist_url = "https://www.youtube.com/playlist?list=" + playlist_id
    r = requests.get(playlist_url, timeout=timeout)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(r.text, "html.parser")
    title_tag = soup.find("title")
    if title_tag is None:
        # Previously this path hit an unbound local variable.
        return None
    playlist_name = title_tag.text.strip()
    # Only strip the site suffix; a playlist name may itself contain " - ".
    suffix = " - YouTube"
    if playlist_name.endswith(suffix):
        playlist_name = playlist_name[:-len(suffix)]
    return playlist_name

# get video title by video ID
def get_video_title(video_id, timeout=30):
    """Return the title of a video, taken from its watch page <title>.

    Args:
        video_id: Video ID used as the ``v=`` query parameter.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        The page title with a trailing " - YouTube" removed, or None when the
        page has no <title> tag.
    """
    r = requests.get("https://www.youtube.com/watch?v=" + video_id, timeout=timeout)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(r.text, "html.parser")
    title_tag = soup.find("title")
    if title_tag is None:
        return None
    # Use the tag's text rather than its serialised markup: str(tag) keeps HTML
    # escapes (e.g. "&amp;") and any attributes on the tag.
    title = title_tag.text.strip()
    # Only strip the site suffix, not every occurrence inside the title.
    suffix = " - YouTube"
    if title.endswith(suffix):
        title = title[:-len(suffix)]
    return title

# download transcript by video ID
def download_transcript(url):
    """Fetch the auto-generated English transcript of a video as plain text.

    Args:
        url: Despite the name, this value is handed straight to
            ``YouTubeTranscriptApi.list_transcripts`` and the caller in this
            notebook passes a video ID, not a full URL.

    Returns:
        The formatted transcript text, or the placeholder string
        "No transcript found for video." when anything goes wrong while
        fetching or formatting it. Callers filter on that exact string.
    """
    try:
        # Fetching the subtitles
        transcript_list = YouTubeTranscriptApi.list_transcripts(url)
        transcript = transcript_list.find_generated_transcript(['en'])
        # Formatting the transcript as plain text
        formatter = youtube_transcript_api.formatters.TextFormatter()
        text_transcript = formatter.format_transcript(transcript.fetch())
    except Exception:
        # Best effort: any library/network failure maps to the placeholder.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so a long scrape can still be interrupted.
        text_transcript = "No transcript found for video."
    return text_transcript

# download all transcript by youtuber ID
def get_all_transcript(youtuber_id):
    """Download title and transcript for every playlist video of a channel.

    Args:
        youtuber_id: Channel identifier passed to ``get_playlist``
            (for example "@StrandaSnowboards").

    Returns:
        A DataFrame with the columns video_id, video_name, transcript,
        playlist_id and playlist_name, one row per distinct video. The frame
        is empty (but keeps these columns) when no videos are found.
    """
    columns = ["video_id", "video_name", "transcript", "playlist_id", "playlist_name"]
    # create a dict of playlist: playlist ID -> playlist name
    playlist_dict = {
        playlist_id: get_playlist_name(playlist_id)
        for playlist_id in get_playlist(youtuber_id)
    }
    # Keyed by video ID so a video listed in several playlists yields one row;
    # the playlist visited last wins.
    video_dict = {}
    # loop the playlist for each video
    for playlist_id, list_name in playlist_dict.items():
        print(list_name)
        url_playlist = "https://www.youtube.com/playlist?list=" + playlist_id
        # Keep only the value of the v= parameter, dropping any parameters
        # that might follow it in the watch URL.
        video_ids = [
            video_url.split("v=")[1].split("&")[0]
            for video_url in Playlist(url_playlist)
        ]
        # get title and subtitles of each video
        for video_id in video_ids:
            title = get_video_title(video_id)
            print(title)
            transcript = download_transcript(video_id)
            video_dict[video_id] = [title, transcript, playlist_id, list_name]
    # output dataframe; building it from rows with explicit columns also works
    # when no video was found (the transpose-and-rename approach raised then).
    rows = [[video_id] + fields for video_id, fields in video_dict.items()]
    return pd.DataFrame(rows, columns=columns)


# process and clean transcript with Gen AI prompts
# clean_sys is the system prompt: clean_transcripts sends it as the "system"
# message. It describes the raw input, the processing goals and the expected output.
clean_sys = """
            Input:
            YouTube video transcripts in English.
            The transcripts often begin with greetings and casual conversation.
            The content includes the YouTuber's personal wording and opinions.
            The format is unstructured and may contain irrelevant characters.

            Processing Goals:
            Format Standardization: Transform the transcript into a coherent and readable format. This involves structuring the text logically and ensuring consistency in presentation.
            Cleaning and Clarity: Remove extraneous text and characters. Add punctuation to improve clarity, especially where expressions or sentences are ambiguous.
            Narrative Conversion: Shift from a first-person narrative to an objective, third-person perspective. This involves rephrasing subjective statements into objective information.
            Content Focus: Identify and retain only the text relevant to the main topic of the transcript. Exclude any off-topic remarks or digressions to maintain focus on the core subject matter.

            Output:
            A refined and structured document that presents the essential knowledge from the YouTube transcript.
            The text should be free of casual, irrelevant, and subjective elements, offering a clear, objective, and topic-focused read.
            Please don't mention Youtube or Youtuber in the output.
            """

# clean_pmt is the prefix of the "user" message; clean_transcripts appends the
# raw transcript text directly after it.
clean_pmt = """
            Please process below transcripts and your response will only include the processed output.
            
            """

# select gpt model to convert subjective transcript to objective knowledge
def clean_transcripts(text, gpt_model):
    """Rewrite one raw transcript with a chat model and return the result.

    Args:
        text: Raw transcript; appended to the ``clean_pmt`` prefix to form the
            user message.
        gpt_model: Model name forwarded to the chat completions endpoint.

    Returns:
        The message content of the first choice in the completion response.
    """
    system_message = {"role": "system", "content": clean_sys}
    user_message = {"role": "user", "content": clean_pmt + text}
    response = client.chat.completions.create(
        model=gpt_model,
        messages=[system_message, user_message],
    )
    first_choice = response.choices[0]
    return first_choice.message.content

In [55]:
# use youtuber @StrandaSnowboards as an example
df = get_all_transcript("@StrandaSnowboards")
# drop the videos whose transcript is the placeholder string that
# download_transcript returns on failure, then renumber the rows from 0
df = df[df["transcript"] != "No transcript found for video."].reset_index(drop = True)
# optional (left disabled): convert every transcript with the chat model
# df["transcript_converted"] = df["transcript"].map(lambda x: clean_transcripts(x, "gpt-4-1106-preview"))
df.head(2)

Unnamed: 0,video_id,video_name,transcript,playlist_id,playlist_name
0,EDcCG-h96xk,We make boards you can trust!,[Applause]\nhello everybody\nbeing environment...,PLnt6rxj68XCb-XdHReer9BYpu8PG4X6nt,Shapers Words
1,aaIM5TfOLWw,Help me me build the best snowboard possible!,foreign\nhey guys I'm working on some uh reall...,PLnt6rxj68XCb-XdHReer9BYpu8PG4X6nt,Shapers Words


In [53]:
#downloaded transcripts
# pick a single raw transcript (row label 13 of df) as the example input
text = df["transcript"][13]
text

"[Music]\nhi guys\nlet me introduce the descender it's an\nall-mountain directional Ripper that\noffers all the power and control you\ncould ever ask for\nit's nimble enough for some back country\nfreestyle it's burly enough to charge\nreally big lines sidecut is somewhere in\nbetween what you usually would ask for\non a big mountain board so it's fun in\nlower speed each bite is phenomenal it\nhas added bumps here at the inserts\npoints so you have four contact points\nhere here here and here\nwhich really gives good edge bite I've\nbeen carving the out of icy\ngroomers with this board it's just so\nstable that means that it's also super\nsafe when you traverse exposed terrain\nwe're falling is not an option I'm truly\nproud about this board it's developed\nwith feedback from some of Sweden's best\nfree riders among others two-time Big\nMountain champion Steph honestly the\nbottom shape is a camber which extends\nquite a far bit outside the insert\npositions it's a low camber for five

In [54]:
#shaped transcripts
# run the example transcript through clean_transcripts with the chosen model
# and print the rewritten text
clean_text = clean_transcripts(text, "gpt-4-1106-preview")
print(clean_text)

The Descender is a versatile all-mountain directional snowboard designed for advanced riders. It offers a balance of power and control, enabling users to comfortably engage in backcountry freestyle and tackle large lines. Constructed with a moderate sidecut, the board performs well at various speeds and excels in edge grip due to the addition of extra contact points near the insert positions.

The design, enhanced by feedback from prominent Swedish free riders like the two-time Big Mountain champion Steph, features a camber with a low profile of five millimeters, extending past the insert positions. This is complemented by a parabolic camber curve to maintain pop and drive, especially under the rear foot.

With an 8.9-meter sidecut on the 161 model, the board's design aims to provide agility that often lacks in traditional big mountain snowboards. The top sheet sports a matte gray, semi-translucent finish that showcases the natural wood grain and features an image of a mountain range c

In [56]:
!pip install pipreqs

Collecting pipreqs
  Downloading pipreqs-0.4.13-py2.py3-none-any.whl (33 kB)
Collecting yarg
  Downloading yarg-0.1.9-py2.py3-none-any.whl (19 kB)
Installing collected packages: yarg, pipreqs
Successfully installed pipreqs-0.4.13 yarg-0.1.9
You should consider upgrading via the '/Users/jjpjk/.env/bin/python3 -m pip install --upgrade pip' command.


In [58]:
# run pipreqs on the current directory
# NOTE(review): presumably this generates a requirements file for the project — confirm
!python3 -m  pipreqs.pipreqs .

