In [1]:
import requests
from dotenv import load_dotenv
from urllib.parse import urlparse
from urllib.parse import parse_qs
from os import getenv
from requests.models import PreparedRequest
from requests import get

load_dotenv()

True

In [2]:
u = 'https://www.youtube.com/watch?v=9yDDnurRmIY'


def get_video_key(p_url):
    parsed_url = urlparse(p_url)
    captured_value = parse_qs(parsed_url.query)['v'][0]
    return captured_value

get_video_key(u)

'9yDDnurRmIY'

# Understanding the Results of the Video

we call the [GetVideos](https://developers.google.com/youtube/v3/docs/videos/list?apix_params=%7B%22part%22%3A%5B%22snippet%2CcontentDetails%2Cstatistics%22%5D%2C%22id%22%3A%5B%229yDDnurRmIY%22%5D%7D#usage) API which can give the information for single or multiple videos, here we are focusing on the single video.

| Part                        | What is it for                             |
|-----------------------------|--------------------------------------------|
| contentDetails              | Has the Duration of the Video              |
| localizations               | Has the Title and Description of the Video |
| paidProductPlacementDetails | if the Video has any Product ADs related   |
| player                      | URL for embedding the video (IFRAME URL)   |
| snippet                     | has tags and more info. on the Video       |
| statistics                  | views, likes, ... (stats on the video)     |
| topicDetails                | Topic Classification Ig                    |

In [3]:
def add_key(params):
    return {"key": getenv('API_KEY'), **params}

    
def get_video_details(p_url):
    video_key = get_video_key(p_url)
    url = 'https://www.googleapis.com/youtube/v3/videos'
    params = add_key({
      'id': video_key,
      'part': 'contentDetails,localizations,paidProductPlacementDetails,player,snippet,statistics,topicDetails'
    })
    req = PreparedRequest()
    req.prepare_url(url, params)
    return get(req.url)


In [4]:
resp = get_video_details('https://www.youtube.com/watch?v=9yDDnurRmIY')
resp.json()

{'kind': 'youtube#videoListResponse',
 'etag': '2LWxI7f7ax3lC6n6w-ye7goUrME',
 'items': [{'kind': 'youtube#video',
   'etag': 'Z1fsYValXUJGTTEb6HF4kfDSt74',
   'id': '9yDDnurRmIY',
   'snippet': {'publishedAt': '2023-01-13T18:19:12Z',
    'channelId': 'UCqQkpiZwMDNUGEgHzR-EDPw',
    'title': 'Every Taxi Experience be like',
    'description': 'Subscribe if you somewhat kind of enjoy my videos sometimes😉\n\nFollow me ⬇️\nInstagram: www.instagram.com/lando_kalriz/\nTikTok: https://www.tiktok.com/@landokalriz\nFacebook: https://www.facebook.com/Land0kalriz/',
    'thumbnails': {'default': {'url': 'https://i.ytimg.com/vi/9yDDnurRmIY/default.jpg',
      'width': 120,
      'height': 90},
     'medium': {'url': 'https://i.ytimg.com/vi/9yDDnurRmIY/mqdefault.jpg',
      'width': 320,
      'height': 180},
     'high': {'url': 'https://i.ytimg.com/vi/9yDDnurRmIY/hqdefault.jpg',
      'width': 480,
      'height': 360},
     'standard': {'url': 'https://i.ytimg.com/vi/9yDDnurRmIY/sddefault.jpg',

# Getting the Transcripts

First we can use the [Captions](https://www.googleapis.com/youtube/v3/captions) API in order to fetch the caption id, there can be multiple transcripts or captions for a video so

In [5]:
def get_caption_items(u):
    c_url = 'https://www.googleapis.com/youtube/v3/captions'
    video_key = get_video_key(u)
    params = add_key({
      'videoId': video_key,
      'part': 'snippet'
    })
    req = PreparedRequest()
    req.prepare_url(c_url, params)
    return get(req.url)

In [6]:
resp = get_caption_items('https://www.youtube.com/watch?v=9yDDnurRmIY')
m = resp.json()
m, m["items"][0]["id"]

({'kind': 'youtube#captionListResponse',
  'etag': 'uVf_piGN-xSWbyPeCeOU4GPQHJM',
  'items': [{'kind': 'youtube#caption',
    'etag': 'bKbKMAcHAlLeKidAUViGFIPe_pg',
    'id': 'AUieDaY6naV-hW1_Wyk23jAtiGs3uwq3QI8f7zvazbRr5a-lko0',
    'snippet': {'videoId': '9yDDnurRmIY',
     'lastUpdated': '2023-01-13T19:46:11.310475Z',
     'trackKind': 'asr',
     'language': 'en',
     'name': '',
     'audioTrackType': 'unknown',
     'isCC': False,
     'isLarge': False,
     'isEasyReader': False,
     'isDraft': False,
     'isAutoSynced': False,
     'status': 'serving'}}]},
 'AUieDaY6naV-hW1_Wyk23jAtiGs3uwq3QI8f7zvazbRr5a-lko0')

In [13]:
def get_captions(track_id):
    c_url = 'https://www.googleapis.com/youtube/v3/captions/id'
    video_key = get_video_key(u)
    params = add_key({
      'id': track_id,
    })
    req = PreparedRequest()
    req.prepare_url(c_url, params)
    print(req.url)
    return get(req.url)

In [14]:
resp = get_captions(m["items"][0]["id"])
resp

https://www.googleapis.com/youtube/v3/captions/id?key=AIzaSyBBPqWrX0QDCHsmBbTaW6l7pfu1wNHncz8&id=AUieDaY6naV-hW1_Wyk23jAtiGs3uwq3QI8f7zvazbRr5a-lko0


<Response [401]>

# Installation

`uv add google-auth-oauthlib`

install this package and make sure to fetch the token before downloading the transcripts

In [22]:
from google_auth_oauthlib.flow import InstalledAppFlow

def get_token():
    SCOPES = ["https://www.googleapis.com/auth/youtube.force-ssl"]
    flow = InstalledAppFlow.from_client_config(
        {
            "installed": {
                "client_id": getenv('CLIENT_ID'),
                "client_secret": getenv('CLIENT_SECRET'),
                "redirect_uris": ["http://localhost"],
                "auth_uri": "https://accounts.google.com/o/oauth2/auth",
                "token_uri": "https://oauth2.googleapis.com/token"
            }
        },
        SCOPES
    )
    creds = flow.run_local_server()
    return creds.token

In [11]:
t = get_token()
len(t), t[:10]

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=188034312075-ds67sa5l9hm1j79aiatu6jokmbdm5dfi.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8080%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fyoutube.force-ssl&state=hF08ynoIajIsmnFK0jbuhkNm1CZXbw&access_type=offline


(253, 'ya29.a0ATi')

def get_captions(track_id):
    c_url = 'https://www.googleapis.com/youtube/v3/captions/id'
    video_key = get_video_key(u)
    params = {
      'id': track_id,
        'tfmt': 'srt'
    }
    headers = {
        'Authorization': f'Bearer {get_token()}',
        'Accept': 'application/json'
    }
    req = PreparedRequest()
    req.prepare_url(c_url, params)
    print(req.url)
    return get(req.url, headers=headers)

In [23]:
resp = get_captions(m["items"][0]["id"])
resp.text

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=188034312075-ds67sa5l9hm1j79aiatu6jokmbdm5dfi.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8080%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fyoutube.force-ssl&state=DM3QIsX9X7MkW12FWyohdBhSQhgGsU&access_type=offline
https://www.googleapis.com/youtube/v3/captions/id?id=AUieDaY6naV-hW1_Wyk23jAtiGs3uwq3QI8f7zvazbRr5a-lko0&tfmt=srt


'{\n  "error": {\n    "code": 403,\n    "message": "The permissions associated with the request are not sufficient to download the caption track. The request might not be properly authorized, or the video order might not have enabled third-party contributions for this caption.",\n    "errors": [\n      {\n        "message": "The permissions associated with the request are not sufficient to download the caption track. The request might not be properly authorized, or the video order might not have enabled third-party contributions for this caption.",\n        "domain": "youtube.caption",\n        "reason": "forbidden",\n        "location": "id",\n        "locationType": "parameter"\n      }\n    ]\n  }\n}\n'

# Another Approach

Since the Official Youtube data API does not allow the users who do not own the video to download the captions, but its from Unofficial service.

[youtube-transcript-api](https://github.com/jdepoix/youtube-transcript-api)

`uv add youtube-transcript-api`

In [25]:
from youtube_transcript_api import YouTubeTranscriptApi

ytt_api = YouTubeTranscriptApi()

fc = ytt_api.fetch(get_video_key('https://www.youtube.com/watch?v=9yDDnurRmIY'))

In [31]:
print(
    fc.video_id,
    fc.language,
    fc.language_code,
    # whether it has been manually created or generated by YouTube
    fc.is_generated, sep="\n"
)

9yDDnurRmIY
English (auto-generated)
en
True


In [34]:
fc.to_raw_data()[:6]

[{'text': 'oh you got to be kidding me zero Ubers',
  'start': 0.96,
  'duration': 5.52},
 {'text': "we're in the middle of downtown",
  'start': 3.659,
  'duration': 5.341},
 {'text': 'hey get in who the hell are you a really',
  'start': 6.48,
  'duration': 4.079},
 {'text': "bad start to your day that's who were",
  'start': 9.0,
  'duration': 3.059},
 {'text': 'you just waiting for me out here all day',
  'start': 10.559,
  'duration': 3.54},
 {'text': 'yes sir well do you have like an app to',
  'start': 12.059,
  'duration': 3.301}]