In [4]:
import requests
from dotenv import load_dotenv
from urllib.parse import urlparse
from urllib.parse import parse_qs
from os import getenv
from requests.models import PreparedRequest
from requests import get

load_dotenv()

False

In [5]:
u = 'https://www.youtube.com/watch?v=9yDDnurRmIY'


def get_video_key(p_url):
    parsed_url = urlparse(p_url)
    captured_value = parse_qs(parsed_url.query)['v'][0]
    return captured_value

get_video_key(u)

'9yDDnurRmIY'

# Understanding the Results of the Video

we call the [GetVideos](https://developers.google.com/youtube/v3/docs/videos/list?apix_params=%7B%22part%22%3A%5B%22snippet%2CcontentDetails%2Cstatistics%22%5D%2C%22id%22%3A%5B%229yDDnurRmIY%22%5D%7D#usage) API which can give the information for single or multiple videos, here we are focusing on the single video.

| Part                        | What is it for                             |
|-----------------------------|--------------------------------------------|
| contentDetails              | Has the Duration of the Video              |
| localizations               | Has the Title and Description of the Video |
| paidProductPlacementDetails | if the Video has any Product ADs related   |
| player                      | URL for embedding the video (IFRAME URL)   |
| snippet                     | has tags and more info. on the Video       |
| statistics                  | views, likes, ... (stats on the video)     |
| topicDetails                | Topic Classification Ig                    |

In [6]:
def add_key(params):
    return {"key": getenv('API_KEY'), **params}

    
def get_video_details(p_url):
    video_key = get_video_key(p_url)
    url = 'https://www.googleapis.com/youtube/v3/videos'
    params = add_key({
      'id': video_key,
      'part': 'contentDetails,localizations,paidProductPlacementDetails,player,snippet,statistics,topicDetails'
    })
    req = PreparedRequest()
    req.prepare_url(url, params)
    return get(req.url)


In [7]:
def search_results(q):
    url = 'https://www.googleapis.com/youtube/v3/search'
    params = add_key({
      'q': q,
      'part': 'snippet',
        'maxResults': 5
    })
    req = PreparedRequest()
    req.prepare_url(url, params)
    return get(req.url)

search_results("how to make a chocolate cake, can you help with me it").json()

{'error': {'code': 403,
  'message': "Method doesn't allow unregistered callers (callers without established identity). Please use API Key or other form of API consumer identity to call this API.",
  'errors': [{'message': "Method doesn't allow unregistered callers (callers without established identity). Please use API Key or other form of API consumer identity to call this API.",
    'domain': 'global',
    'reason': 'forbidden'}],
  'status': 'PERMISSION_DENIED'}}

In [8]:
search_results("how to make a chocolate cake ?").json()

{'error': {'code': 403,
  'message': "Method doesn't allow unregistered callers (callers without established identity). Please use API Key or other form of API consumer identity to call this API.",
  'errors': [{'message': "Method doesn't allow unregistered callers (callers without established identity). Please use API Key or other form of API consumer identity to call this API.",
    'domain': 'global',
    'reason': 'forbidden'}],
  'status': 'PERMISSION_DENIED'}}

In [9]:
resp = get_video_details('https://www.youtube.com/watch?v=9yDDnurRmIY')
resp.json()

{'error': {'code': 403,
  'message': "Method doesn't allow unregistered callers (callers without established identity). Please use API Key or other form of API consumer identity to call this API.",
  'errors': [{'message': "Method doesn't allow unregistered callers (callers without established identity). Please use API Key or other form of API consumer identity to call this API.",
    'domain': 'global',
    'reason': 'forbidden'}],
  'status': 'PERMISSION_DENIED'}}

# Getting the Transcripts

First we can use the [Captions](https://www.googleapis.com/youtube/v3/captions) API in order to fetch the caption id, there can be multiple transcripts or captions for a video so

In [10]:
def get_caption_items(u):
    c_url = 'https://www.googleapis.com/youtube/v3/captions'
    video_key = get_video_key(u)
    params = add_key({
      'videoId': video_key,
      'part': 'snippet'
    })
    req = PreparedRequest()
    req.prepare_url(c_url, params)
    return get(req.url)

In [15]:
resp = get_caption_items('https://www.youtube.com/watch?v=9yDDnurRmIY')
m = resp.json()
m, m["items"][0]["id"]

KeyError: 'items'

In [None]:
def get_captions(track_id):
    c_url = 'https://www.googleapis.com/youtube/v3/captions/id'
    video_key = get_video_key(u)
    params = add_key({
      'id': track_id,
    })
    req = PreparedRequest()
    req.prepare_url(c_url, params)
    return get(req.url)

In [None]:
resp = get_captions(m["items"][0]["id"])
resp

<Response [401]>

# Another Approach

Since the Official Youtube data API does not allow the users who do not own the video to download the captions, but its from Unofficial service.

[youtube-transcript-api](https://github.com/jdepoix/youtube-transcript-api)

`uv add youtube-transcript-api`

In [52]:
from youtube_transcript_api import YouTubeTranscriptApi

ytt_api = YouTubeTranscriptApi()

fetch = ytt_api.fetch(get_video_key('https://www.youtube.com/watch?v=9yDDnurRmIY'))
fetch2 = ytt_api.fetch(get_video_key('https://www.youtube.com/watch?v=y_q741QO_m0'))
fetches = [fetch.to_raw_data(), fetch2.to_raw_data()]
print(fetch)

FetchedTranscript(snippets=[FetchedTranscriptSnippet(text='oh you got to be kidding me zero Ubers', start=0.96, duration=5.52), FetchedTranscriptSnippet(text="we're in the middle of downtown", start=3.659, duration=5.341), FetchedTranscriptSnippet(text='hey get in who the hell are you a really', start=6.48, duration=4.079), FetchedTranscriptSnippet(text="bad start to your day that's who were", start=9.0, duration=3.059), FetchedTranscriptSnippet(text='you just waiting for me out here all day', start=10.559, duration=3.54), FetchedTranscriptSnippet(text='yes sir well do you have like an app to', start=12.059, duration=3.301), FetchedTranscriptSnippet(text="make sure it's safe yeah it's called", start=14.099, duration=3.241), FetchedTranscriptSnippet(text='take my word for it and get the hell in', start=15.36, duration=4.86), FetchedTranscriptSnippet(text='ah you can help me with my bags no we', start=17.34, duration=4.199), FetchedTranscriptSnippet(text="don't work for five star ratings

In [53]:
print(
    fetch.video_id,
    fetch.language,
    fetch.language_code,
    # whether it has been manually created or generated by YouTube
    fetch.is_generated, sep="\n"
)

9yDDnurRmIY
English (auto-generated)
en
True


In [103]:
fetch2.to_raw_data()[:5]

[{'text': "In September 2025, Trump hosted a dinner\nfor Big Tech's top CEOs.",
  'start': 0.0,
  'duration': 4.629},
 {'text': 'The most brilliant people\nare gathered around this table.',
  'start': 4.629,
  'duration': 2.711},
 {'text': '[Trevor]\nMeta, Google, Microsoft, OpenAI.',
  'start': 7.34,
  'duration': 3.462},
 {'text': 'Everyone was there.', 'start': 10.802, 'duration': 1.084},
 {'text': '[Trump]\nThis is definitely a high IQ group.',
  'start': 12.303,
  'duration': 2.044}]

In [None]:
def raw_data_to_single_text(raw_data):
    texts = "".join(f"{i['text'].replace("\n", " ")} \n" for i in raw_data)
    return "POST: " + texts


# raw = fetch.to_raw_data()
# text = raw_data_to_single_text(raw)
# print(text)

In [126]:

def create_list_from_fetches(fetches_list: list):
    return list(map(raw_data_to_single_text, fetches_list))

def format_df(df):
    df["text"] = create_list_from_fetches(df["text"])
    df["text"] = "TITLE: " + df["title"] + "\n\n" + df["text"]
    return df

df = pd.DataFrame({"text":  fetches, "title": ["Uber", "Politics"]})
df = format_df(df)
print(df["text"])
print("\n" + df["text"][1])

0    TITLE: Uber\n\nPOST: oh you got to be kidding ...
1    TITLE: Politics\n\nPOST: In September 2025, Tr...
Name: text, dtype: object

TITLE: Politics

POST: In September 2025, Trump hosted a dinner for Big Tech's top CEOs. 
The most brilliant people are gathered around this table. 
[Trevor] Meta, Google, Microsoft, OpenAI. 
Everyone was there. 
[Trump] This is definitely a high IQ group. 
[Trevor] It was the usual scene: a bunch of people 
around a table praising Trump, thanking him for everything that he's doing. 
[Mark Zuckerberg] Well, thanks for hosting us. 
[Jared Isaacman] Thank you so much. 
[Bill Gates] Thank you. 
[Tim Cook] Thank you for including me. 
[Sam Altman] Thank you for being such a pro-business, 
pro-innovation president. 
[Greg Brockman] Thank you for that. 
[Safra Catz] Thank you. 
But I've got to admit, this dinner in particular, it had me scratching my head. 
It was weird. 
And not just because Zuckerberg got caught on a hot mic trying to be the teacher's pet

/Users/thorkildkappel/Desktop/7. sem/NLP /project/NLP_project_group6/lab copy


In [135]:
f = open('../data/queries.txt', 'r')
content = f.read()

print(len(content.split(",")))

1127
