In [None]:
from googleapiclient.discovery import build
import pandas as pd
import numpy as np

# Column order of the frame returned by get_playlist_data_slim. Passing it to
# the DataFrame constructor gives an empty playlist the same schema as a
# populated one.
_PLAYLIST_COLUMNS = [
    "videoId",
    "title",
    "description",
    "thumbnail",
    "addedAt",
    "viewCount",
    "likeCount",
    "commentCount",
]


def _fetch_playlist_items(youtube, playlist_id):
    """Page through playlistItems.list and return one dict per playlist entry."""
    playlist_items = []
    next_token = None
    while True:
        resp = youtube.playlistItems().list(
            part="snippet,contentDetails",
            playlistId=playlist_id,
            maxResults=50,
            pageToken=next_token,
            fields="nextPageToken,items(snippet(publishedAt,resourceId/videoId,position,title),contentDetails/videoId)"
        ).execute()

        for item in resp.get("items", []):
            playlist_items.append({
                "videoId": item["contentDetails"]["videoId"],
                "addedAt": item["snippet"]["publishedAt"],  # date added to playlist
                "position": item["snippet"]["position"],
            })

        # No nextPageToken in the response means this was the last page.
        next_token = resp.get("nextPageToken")
        if not next_token:
            return playlist_items


def _video_row(entry, video):
    """Build one output row from a playlist entry and its videos.list item.

    `video` is None when videos.list returned nothing for the id (e.g. a
    private video); every detail field is then NaN.
    """
    if not video:
        return {
            "videoId": entry["videoId"],
            "title": np.nan,
            "description": np.nan,
            "thumbnail": np.nan,
            "addedAt": entry["addedAt"],
            "viewCount": np.nan,
            "likeCount": np.nan,
            "commentCount": np.nan,
        }

    # Use tolerant lookups: a returned item is not guaranteed to carry every
    # requested sub-object (e.g. no "standard" thumbnail, hidden counters), and
    # one incomplete video must not abort the whole fetch with a KeyError.
    snippet = video.get("snippet") or {}
    thumbnails = snippet.get("thumbnails") or {}
    statistics = video.get("statistics") or {}
    return {
        "videoId": entry["videoId"],
        "title": snippet.get("title"),
        "description": snippet.get("description"),
        "thumbnail": (thumbnails.get("standard") or {}).get("url"),
        "addedAt": entry["addedAt"],
        "viewCount": statistics.get("viewCount"),
        "likeCount": statistics.get("likeCount"),
        "commentCount": statistics.get("commentCount"),
    }


def get_playlist_data_slim(api_key, playlist_id):
    """Fetch every video of a YouTube playlist into a DataFrame.

    Parameters
    ----------
    api_key : str
        YouTube Data API v3 developer key.
    playlist_id : str
        Id of the playlist to read.

    Returns
    -------
    pandas.DataFrame
        One row per playlist entry, in playlist order, with the columns
        videoId, title, description, thumbnail, addedAt, viewCount,
        likeCount and commentCount. Statistics are passed through exactly
        as the API returns them. Entries for which videos.list returns no
        item have NaN in every detail column. An empty playlist yields an
        empty frame that still has all the columns.
    """
    youtube = build("youtube", "v3", developerKey=api_key)

    # get all playlist items
    playlist_items = _fetch_playlist_items(youtube, playlist_id)

    # get detail in batch of 50 (the same page size used for the playlist above)
    results = []
    for start in range(0, len(playlist_items), 50):
        chunk = playlist_items[start:start + 50]
        ids = ",".join(entry["videoId"] for entry in chunk)

        vid_resp = youtube.videos().list(
            part="snippet,statistics",
            id=ids,
            fields="items(id,snippet(title,description,thumbnails/standard/url),statistics(viewCount,likeCount,commentCount))"
        ).execute()

        # map by videoId so rows can be emitted in playlist order
        vid_map = {v["id"]: v for v in vid_resp.get("items", [])}
        results.extend(_video_row(entry, vid_map.get(entry["videoId"])) for entry in chunk)

    # make df
    return pd.DataFrame(results, columns=_PLAYLIST_COLUMNS)

In [10]:
import os
from dotenv import load_dotenv
# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

api_key = os.getenv('youtube_auth_api_key')
if not api_key:
    # os.getenv returns None for an unset variable; stop here with a clear
    # message instead of handing an empty key to the API client.
    raise RuntimeError(
        "Environment variable 'youtube_auth_api_key' is not set; "
        "define it in the environment or in a .env file."
    )

# Fetch metadata and statistics for every video in the playlist.
final_data = get_playlist_data_slim(playlist_id= 'PLaGGM00HkKkvDwVXbPG8ZVLyQ3SlKahPK', api_key=api_key)

In [13]:
# Preview the first five rows of the fetched playlist table.
final_data.head(n=5)

Unnamed: 0,videoId,title,description,thumbnail,addedAt,viewCount,likeCount,commentCount
0,Zg4JivSYj9w,They Made This In 60 Crores 😱 | Mirai Movie Re...,One of the most talked about movies is finally...,https://i.ytimg.com/vi/Zg4JivSYj9w/sddefault.jpg,2025-09-12T09:03:14Z,157623,4052,630
1,cLzEdXEm9Nw,The Most Offensive Bollywood Movie Portrayals,I have often thought about #bollywood movie re...,https://i.ytimg.com/vi/cLzEdXEm9Nw/sddefault.jpg,2025-09-10T09:05:36Z,170494,5776,899
2,Qkuxtc42zLU,Salman Must Hate This! | Madharaasi Movie Revi...,The much awaited Tamil film is finally here in...,https://i.ytimg.com/vi/Qkuxtc42zLU/sddefault.jpg,2025-09-08T06:27:48Z,197073,5212,512
3,rjoL1JTbADk,The Bengal Files Movie Review & Analysis | Viv...,The much awaited #bengalfiles is finally here....,https://i.ytimg.com/vi/rjoL1JTbADk/sddefault.jpg,2025-09-06T03:44:07Z,233953,9065,1921
4,_Dq5p-PKIio,Baaghi 4 Movie Review & Analysis,The much awaited #baaghi4 is here! The movie s...,https://i.ytimg.com/vi/_Dq5p-PKIio/sddefault.jpg,2025-09-05T08:37:28Z,307603,9635,1315


In [16]:
import os

# to_csv does not create missing folders, so make sure "Data" exists before
# writing; otherwise this cell fails on a fresh checkout.
os.makedirs("Data", exist_ok=True)

# The default index=True is kept, so the row index is written as an unnamed
# first column and the file layout is unchanged.
final_data.to_csv("Data/TriedRefusedProductions_data.csv")

In [1]:
import yt_dlp

def get_transcript(video_url, lang="en", fmt=None):
    """Return a direct URL to a subtitle track of a video, or None.

    Only metadata is extracted; nothing is downloaded. Uploader-provided
    subtitles in `lang` are preferred; if there are none for that language,
    automatic captions in `lang` are used instead.

    Parameters
    ----------
    video_url : str
        URL of the video to inspect.
    lang : str, default "en"
        Subtitle language code to look up.
    fmt : str or None, default None
        Optional track extension to prefer (e.g. "vtt", "json3"). When None,
        or when no track with that extension exists, the first listed track
        is returned. "subtitlesformat" in the options does not affect which
        track comes first in the metadata.

    Returns
    -------
    str or None
        URL of the chosen subtitle track, or None when neither subtitles nor
        automatic captions exist for `lang`.
    """
    ydl_opts = {
        "skip_download": True,
        "writesubtitles": True,
        "writeautomaticsub": True,
        "subtitleslangs": [lang],
        "subtitlesformat": "vtt",
        "quiet": True,  # suppress logs
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(video_url, download=False) or {}

    # Check each source for this language separately. Choosing one source for
    # all languages would hide auto captions whenever the video has manual
    # subtitles in some other language only.
    for source in ("subtitles", "automatic_captions"):
        tracks = (info.get(source) or {}).get(lang)
        if not tracks:
            # Language missing or an empty track list: try the next source.
            continue

        if fmt is not None:
            for track in tracks:
                if track.get("ext") == fmt and track.get("url"):
                    return track["url"]

        first_url = tracks[0].get("url")
        if first_url:
            return first_url  # direct URL to subtitle file

    return None
        
# Try the helper on one video; the cell displays whatever it returns.
get_transcript(video_url="https://www.youtube.com/watch?v=Zg4JivSYj9w")




'https://www.youtube.com/api/timedtext?v=Zg4JivSYj9w&ei=FjjFaI6EI4HXvPEPvYSC2Qk&caps=asr&opi=112496729&xoaf=5&xowf=1&hl=en&ip=0.0.0.0&ipbits=0&expire=1757780614&sparams=ip%2Cipbits%2Cexpire%2Cv%2Cei%2Ccaps%2Copi%2Cxoaf&signature=5BB98F619A3FB76C8D859EBEB01C44CC4C724A3E.BC179A6D6E52C6C306F0D2718F685D8316B75253&key=yt8&kind=asr&lang=en&fmt=json3'