In [10]:
import pandas as pd
from hugchat import hugchat
from hugchat.login import Login
import json
from tqdm import tqdm
tqdm.pandas()
import regex as re

In [11]:
# Load the secrets from the local JSON file (credentials stay out of the notebook itself)
with open('client_secrets.json', 'r') as file:
    secrets = json.load(file)

# HuggingChat account credentials; both are passed to Login(...) in the next cell
login = secrets['huggingLogin']
password = secrets['huggingPassword']

### Initialize HuggingChat

In [12]:
# Sign in with the credentials read from client_secrets.json.
# sign.login() returns the cookies object whose get_dict() is later handed to hugchat.ChatBot.
sign = Login(login, password)
cookies = sign.login()

# Save cookies to the local directory
# (the call returns the path of the JSON file it wrote, shown as the cell output)
cookie_path_dir = "./cookies_snapshot"
sign.saveCookiesToDir(cookie_path_dir)

'./cookies_snapshot/<email>.json'

In [13]:
# Load cookies when you restart your program:
# sign = Login(login, None)
# cookies = sign.loadCookiesFromDir(cookie_path_dir) # This will detect if the JSON file exists, return cookies if it does and raise an Exception if it's not.

In [14]:
# Create the chatbot client from the login cookies; every query in this notebook goes through this object
chatbot = hugchat.ChatBot(cookies=cookies.get_dict())  # or cookie_path="usercookies/<email>.json"

### Load Shorts Information

In [15]:
# Load the per-video table (title, channel, transcript, duration, top comments, category)
# NOTE(review): the "Unnamed: 0" column in the preview looks like a row index written by an
# earlier to_csv call; consider pd.read_csv(..., index_col=0) if nothing downstream relies on it (confirm).
df_shorts = pd.read_csv("youtube_shorts_description.csv")
df_shorts.head()

Unnamed: 0,Video ID,Video Title,Channel Title,Transcript,Duration,Words per Second,Top10 Comments,Category
0,l9_8_pDTmis,"b'Furthest Away From Me Wins $10,000'",MrBeast,whichever one of these three people can get th...,96.84,1.579926,"['Bro the guy at Disneyland was super smart, s...",Entertainment
1,QYEfTly0pTE,b'We lost contact with ATC over the ocean! #av...,Fly Me to the Fun™,this year we flew our Cessna 150 1800 miles fr...,110.08,1.43532,['Nice! <br><br>It’s always nice when the big ...,Travel & Events
2,jYJTPqU66IY,b'How to renovate your private jet',TheJetBusiness,I was going to say like what's a turnaround ti...,118.28,2.054447,['You know it’s a huge number when that was th...,Travel & Events
3,dBsomKKHhtk,"b'When you finally meet the ""work wife""'",Viva La Dirt League,oh it's my w wife oh the the infamous W wife h...,120.879,1.886184,['Dude went from not realising he&#39;s marrie...,Gaming
4,dTLYweJ08Tg,b'Courtside Kicks CASHES OUT on WHOLE TABLE of...,Courtside Kicks,yo what's good bro so you have a ton of dunks ...,59.679,1.524824,['After the video here is $20 for playing alon...,Entertainment


### Build Query

In [16]:
def create_custom_query(row):
    """Assemble the summarisation prompt for a single video.

    The prompt is a fixed instruction paragraph followed by one
    "Label: value" line per metadata field read from ``row``.

    Args:
        row: Mapping-like record (for example a DataFrame row) that provides
            the keys 'Video Title', 'Channel Title', 'Transcript',
            'Top10 Comments' and 'Category'.

    Returns:
        str: The complete prompt text, ending with a newline.
    """
    instructions = (
        "You are a copywriter, create a 100 word summary of what this Youtube Short is about. "
        "Provide a neutral description. The summary should describe the overall atmosphere "
        "and pace of the video. It should also highlight important events from the video. "
        "Do not include any statements about a viewers response to the content or the overall "
        "viewing experience. Output the raw summary text.\n"
    )

    # (label shown in the prompt, key looked up in the row), in output order
    labelled_fields = (
        ("Title", 'Video Title'),
        ("Channel", 'Channel Title'),
        ("Transcript", 'Transcript'),
        ("Comments", 'Top10 Comments'),
        ("Category", 'Category'),
    )
    metadata_lines = [f"{label}: {row[key]}\n" for label, key in labelled_fields]

    return instructions + "".join(metadata_lines)

In [17]:
# Build one prompt per video (row-wise, axis=1) and store it in a new 'Custom Query' column
df_shorts['Custom Query'] = df_shorts.apply(create_custom_query, axis=1)

### Query HuggingChat

In [18]:
def get_chatbot_summary(row):
    """Send one row's prompt to the chatbot and return the reply as plain text.

    Args:
        row: Record providing the prompt under the 'Custom Query' key.

    Returns:
        str: The chatbot's answer converted to a string.
    """
    prompt = row['Custom Query']
    reply = chatbot.query(prompt)
    # query() hands back a Message object; the regex clean-up needs a str
    return str(reply)

In [19]:
# Query the chatbot once per row (tqdm progress bar via progress_apply) and keep the replies
# in a new 'LLM Summary' column. This is the slow cell: the recorded run took roughly 10 s per row.
df_shorts['LLM Summary'] = df_shorts.progress_apply(get_chatbot_summary, axis=1)

  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 20/20 [03:30<00:00, 10.52s/it]


In [21]:
# Preview the raw chatbot replies (before any clean-up)
df_shorts.head()

Unnamed: 0,Video ID,Video Title,Channel Title,Transcript,Duration,Words per Second,Top10 Comments,Category,Custom Query,LLM Summary
0,l9_8_pDTmis,"b'Furthest Away From Me Wins $10,000'",MrBeast,whichever one of these three people can get th...,96.84,1.579926,"['Bro the guy at Disneyland was super smart, s...",Entertainment,"You are a copywriter, create a 100 word summar...","In this YouTube short, MrBeast challenges thr..."
1,QYEfTly0pTE,b'We lost contact with ATC over the ocean! #av...,Fly Me to the Fun™,this year we flew our Cessna 150 1800 miles fr...,110.08,1.43532,['Nice! <br><br>It’s always nice when the big ...,Travel & Events,"You are a copywriter, create a 100 word summar...",This YouTube short is about a flight from Atl...
2,jYJTPqU66IY,b'How to renovate your private jet',TheJetBusiness,I was going to say like what's a turnaround ti...,118.28,2.054447,['You know it’s a huge number when that was th...,Travel & Events,"You are a copywriter, create a 100 word summar...","Sure, here's a 100-word summary of the video ..."
3,dBsomKKHhtk,"b'When you finally meet the ""work wife""'",Viva La Dirt League,oh it's my w wife oh the the infamous W wife h...,120.879,1.886184,['Dude went from not realising he&#39;s marrie...,Gaming,"You are a copywriter, create a 100 word summar...","In this YouTube short, a man introduces his c..."
4,dTLYweJ08Tg,b'Courtside Kicks CASHES OUT on WHOLE TABLE of...,Courtside Kicks,yo what's good bro so you have a ton of dunks ...,59.679,1.524824,['After the video here is $20 for playing alon...,Entertainment,"You are a copywriter, create a 100 word summar...",This YouTube short features a person interest...


### Clean LLM Summaries 

This doesn't work for 100% of the answers, since the responses vary a lot. I haven't found a pattern that is 100% reliable, but I would say this is "good enough".

In [22]:
import re

# Boilerplate the chatbot sometimes puts in front of the summary,
# e.g. "Sure, here's a 100-word summary of the video:".
# Compiled once so the pattern is not rebuilt for every row.
_SUMMARY_PREAMBLE = re.compile(
    # Opening words: tolerate "here is a" / "here's the", "100 word" as well as
    # "100-word", and a typographic apostrophe.
    r"Sure[,!]? here['’]?s? (?:(?:is|a|the) )*100[- ]word summary of "
    # Preamble on its own line: drop everything through the colon that ends the line ...
    r"(?:[^\n]+:(?=[ \t]*(?:\n|$))"
    # ... otherwise the summary continues on the same line: stop at the FIRST colon
    # so that colons inside the summary itself are kept.
    r"|[^\n:]+:)",
    flags=re.IGNORECASE,
)


def clean_chatbot_response(text):
    """Remove the chatbot's "Sure, here's a 100-word summary of ...:" preamble.

    Matching is case-insensitive. The text after the preamble is returned with
    surrounding whitespace stripped; text without a preamble is only stripped.
    This is a best-effort heuristic: replies phrased differently pass through
    unchanged apart from the strip.

    Args:
        text: The chatbot reply. Non-string values (e.g. None or NaN cells)
            are returned untouched instead of raising.

    Returns:
        The cleaned reply, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        # Missing values cannot contain a preamble; leave them for the caller to handle
        return text
    return _SUMMARY_PREAMBLE.sub('', text).strip()

In [23]:
# Run the preamble clean-up over every reply; the 'LLM Summary' column is overwritten with the cleaned text
df_shorts['LLM Summary'] = df_shorts['LLM Summary'].progress_apply(clean_chatbot_response)

100%|██████████| 20/20 [00:00<00:00, 19911.25it/s]


In [24]:
# Preview the summaries after clean-up
df_shorts.head()

Unnamed: 0,Video ID,Video Title,Channel Title,Transcript,Duration,Words per Second,Top10 Comments,Category,Custom Query,LLM Summary
0,l9_8_pDTmis,"b'Furthest Away From Me Wins $10,000'",MrBeast,whichever one of these three people can get th...,96.84,1.579926,"['Bro the guy at Disneyland was super smart, s...",Entertainment,"You are a copywriter, create a 100 word summar...","In this YouTube short, MrBeast challenges thre..."
1,QYEfTly0pTE,b'We lost contact with ATC over the ocean! #av...,Fly Me to the Fun™,this year we flew our Cessna 150 1800 miles fr...,110.08,1.43532,['Nice! <br><br>It’s always nice when the big ...,Travel & Events,"You are a copywriter, create a 100 word summar...",This YouTube short is about a flight from Atla...
2,jYJTPqU66IY,b'How to renovate your private jet',TheJetBusiness,I was going to say like what's a turnaround ti...,118.28,2.054447,['You know it’s a huge number when that was th...,Travel & Events,"You are a copywriter, create a 100 word summar...","In this video, a salesperson discusses the pro..."
3,dBsomKKHhtk,"b'When you finally meet the ""work wife""'",Viva La Dirt League,oh it's my w wife oh the the infamous W wife h...,120.879,1.886184,['Dude went from not realising he&#39;s marrie...,Gaming,"You are a copywriter, create a 100 word summar...","In this YouTube short, a man introduces his co..."
4,dTLYweJ08Tg,b'Courtside Kicks CASHES OUT on WHOLE TABLE of...,Courtside Kicks,yo what's good bro so you have a ton of dunks ...,59.679,1.524824,['After the video here is $20 for playing alon...,Entertainment,"You are a copywriter, create a 100 word summar...",This YouTube short features a person intereste...


In [25]:
# Persist the table including the prompt and summary columns; index=False keeps the row index out of the CSV
df_shorts.to_csv("youtube_shorts_with_chatbot_summary.csv", index=False)