## Scraping YouTube videos

### Installing required libraries

In [2]:
! pip install google-api-python-client

Defaulting to user installation because normal site-packages is not writeable


In [3]:
! pip install youtube-transcript-api

Defaulting to user installation because normal site-packages is not writeable


### Scraping one YouTube video's details and transcript

In [None]:
from googleapiclient.discovery import build

# The API key is read from the YOUTUBE_API_KEY environment variable so that the
# secret does not have to be stored in the notebook. When the variable is not
# set, the placeholder below is used: replace it with your own API key.
import os

API_key = os.environ.get("YOUTUBE_API_KEY", "Please put your own API key here")

# Defining a function that uses the YouTube API to scrape the basic data of a video
def get_video_details(video_id):
    """Fetch basic metadata for a single YouTube video via the YouTube Data API.

    Args:
        video_id: ID of the video to look up.

    Returns:
        A dict with the keys "title", "channel", "views", "publish_date" and
        "url", or None when the API response contains no item for `video_id`.
    """
    youtube = build("youtube", "v3", developerKey=API_key)
    request = youtube.videos().list(part="snippet,statistics", id=video_id)
    response = request.execute()

    # `items` can be missing or empty when no video matched the ID
    items = response.get("items")
    if not items:
        return None

    video = items[0]
    snippet = video["snippet"]
    return {
        "title": snippet["title"],
        "channel": snippet["channelTitle"],
        # Fall back to "N/A" when no view count is reported for the video
        "views": video.get("statistics", {}).get("viewCount", "N/A"),
        "publish_date": snippet["publishedAt"],
        "url": f"https://www.youtube.com/watch?v={video_id}",
    }

In [None]:
# Smoke test: fetching the details of one example video of the VVD YouTube channel
video_id = "XqtessUPQEY"
details = get_video_details(video_id)
# `details` is a dict of metadata, or None when the API returned no item for this ID
print(details)

{'title': '"Ik sta hier voor de veiligheid van Nederlanders." Dilan Yeşilgöz-Zegerius clasht met FvD.', 'channel': 'VVD', 'views': '669', 'publish_date': '2025-02-19T12:48:47Z', 'url': 'https://www.youtube.com/watch?v=XqtessUPQEY'}


##### Scraping YouTube video transcript in Dutch

In [6]:
from youtube_transcript_api import YouTubeTranscriptApi

# Defining a function that fetches a YouTube transcript through the YouTube transcript API
def get_transcript(video_id, language="nl"):
    """Return the transcript of a video as one space-separated string.

    Args:
        video_id: ID of the video whose transcript is requested.
        language: Language code of the requested transcript (default "nl").

    Returns:
        The joined transcript text. When anything goes wrong, a string that
        starts with "Error fetching transcript: " followed by the exception
        message is returned instead of raising.
    """
    try:
        segments = YouTubeTranscriptApi.get_transcript(video_id, languages=[language])
        # Collect the text of every transcript segment, in order
        pieces = []
        for segment in segments:
            pieces.append(segment["text"])
        return " ".join(pieces)
    except Exception as err:
        return f"Error fetching transcript: {err}"

In [None]:
# Smoke test: fetching the Dutch transcript of the example video of the VVD YouTube channel
# Note: this video is a speech, so the transcript is spoken language (including fillers such as "ehm" and "eh")
video_id = "XqtessUPQEY"
transcript = get_transcript(video_id)
print("\nTranscript:", transcript)


Transcript: ehm de heer Rutte Secretaris generaal van de NAVO heeft uitgesproken dat het hele Westen dus van Wat zei die San Francisco tot Ankara geloof ik eh maar een derde aan wapentuig en munitie kan produceren van wat Rusland produceert hè Dat heeft hij uitgesproken nu dreigen wij in de situatie te zitten vandaag al dat Oekraïne met volledige steun van de NAVO dit eh conflict verliest eh is het dan niet wereldvreemd om te zeggen we gaan als Europa zelf onze broek ophouden en we gaan dan zorgen desnoods om dan zelf maar iets tegen die Russen tweer te stellen dat is toch totaal niet realistisch voorzitter mijn eerste reactie zou niet zijn op mijn rug liggen met mijn pootjes omhoog en aan Poetin vragen hoever die Europa door wil trekken ik sta wel voor de veiligheid van Nederlanders en ik zal er dan vervolgens ook alles aan doen dat wij daar paraat op eh zijn dus of dat is de samenwerking met Europa of dat is toch met onze Amerikaanse bondgenoot op een andere manier maar hier de sugg

### Scraping the previously defined data and transcripts for multiple videos

##### Getting an uploads list for a certain YouTube channel

In [17]:
# Channel IDs of the YouTube channels used in this notebook.
# Finding a channel ID is a bit of a challenge, but can be done by following this tutorial: https://www.youtube.com/watch?v=0oDy2sWPF38
channel_id_VVD = "UCZean7nAZKDGIHANq-MuaGA"
channel_id_BIJ1 = "UCI1wgi8HoU-wAnuSPSTDDUA"
channel_id_GroenLinks = "UCpYJiwLo1KTKElBq_0xAI7A"

# Getting the playlist of uploads for a specific YouTube channel
def get_uploads_playlist_id(channel_id):
    """Look up the ID of the uploads playlist of a YouTube channel.

    Args:
        channel_id: ID of the channel to look up.

    Returns:
        The uploads playlist ID, or None when the API response contains no
        item for `channel_id`.
    """
    youtube = build("youtube", "v3", developerKey=API_key)
    request = youtube.channels().list(
        part="contentDetails",
        id=channel_id
    )
    response = request.execute()

    # Use .get so a response without an `items` key yields None instead of a
    # KeyError, matching the check in get_video_details
    items = response.get("items")
    if not items:
        return None
    return items[0]["contentDetails"]["relatedPlaylists"]["uploads"]

In [None]:
# Smoke test: looking up the uploads playlist of the BIJ1 YouTube channel
uploads_playlist_id = get_uploads_playlist_id(channel_id_BIJ1)
print("Uploads Playlist ID:", uploads_playlist_id)

Uploads Playlist ID: UUI1wgi8HoU-wAnuSPSTDDUA


##### Getting all video IDs from the uploads playlist

In [None]:
import time
import json
import os

# Specifying a cache directory so that results of earlier API calls can be reused
CACHE_DIR = "cache"
# exist_ok=True makes re-running this cell safe and avoids a separate existence check
os.makedirs(CACHE_DIR, exist_ok=True)

# Writing data to a JSON file inside the cache directory
def cache_results(filename, data):
    """Serialise `data` as JSON into the file `filename` under CACHE_DIR.

    An existing file with the same name is overwritten.
    """
    target_path = os.path.join(CACHE_DIR, filename)
    with open(target_path, 'w') as handle:
        json.dump(data, handle)

# Reading previously cached data back from the cache directory
def load_cached_results(filename):
    """Return the JSON content of `filename` under CACHE_DIR, or None if the file does not exist."""
    cache_path = os.path.join(CACHE_DIR, filename)
    # Guard clause: nothing was cached under this name yet
    if not os.path.exists(cache_path):
        return None
    with open(cache_path, 'r') as handle:
        return json.load(handle)

# Defining a function to get all the videos of a specific YouTube playlist
def get_all_videos(playlist_id):
    """List every video in a playlist, using an on-disk cache when available.

    Args:
        playlist_id: ID of the playlist to list (e.g. a channel's uploads playlist).

    Returns:
        A list of dicts with the keys "video_id", "title" and "publish_date".
        The list is read from the cache file "<playlist_id>_all_videos.json"
        when that file exists; otherwise it is fetched page by page from the
        YouTube Data API and then written to the cache.
    """
    cache_filename = f"{playlist_id}_all_videos.json"
    cached_data = load_cached_results(cache_filename)
    # Compare against None: a cached empty list is a valid result and must
    # not trigger a new round of API requests
    if cached_data is not None:
        return cached_data

    youtube = build("youtube", "v3", developerKey=API_key)
    video_list = []
    next_page_token = None

    while True:
        # Requesting one page of playlist items; `fields` limits the response
        # to the values that are used below
        request = youtube.playlistItems().list(
            part="snippet",
            playlistId=playlist_id,
            maxResults=50,
            pageToken=next_page_token,
            fields="items(snippet(resourceId(videoId),title,publishedAt)),nextPageToken"
        )
        response = request.execute()

        # .get guards against a page that carries no `items` key
        for item in response.get("items", []):
            snippet = item["snippet"]
            video_list.append({
                "video_id": snippet["resourceId"]["videoId"],
                "title": snippet["title"],
                "publish_date": snippet["publishedAt"]
            })

        # Go to the next page if there is one
        next_page_token = response.get("nextPageToken")
        if not next_page_token:
            break

        # Implementing sleep time to not hit quota limits
        time.sleep(1)

    cache_results(cache_filename, video_list)
    return video_list

In [None]:
import pandas as pd

# Selecting the channel to scrape; the name is only used to label the output file
channel_name = "GroenLinks"
channel_id = channel_id_GroenLinks

# Getting the uploads playlist ID for the channel
uploads_playlist_id = get_uploads_playlist_id(channel_id)
print("Uploads Playlist ID:", uploads_playlist_id)
if uploads_playlist_id is None:
    # Fail early with a clear message instead of querying the API with a missing ID
    raise ValueError(f"No uploads playlist found for channel ID {channel_id}")

# Fetching all video IDs from the uploads playlist
videos = get_all_videos(uploads_playlist_id)
print(f"Found {len(videos)} videos")

# Creating a list to store the video details and transcripts
video_data = []

# For each video ID, get the video details and transcript
for video in videos:
    video_id = video["video_id"]
    details = get_video_details(video_id)
    if details is None:
        # get_video_details returns None when the API has no item for this ID;
        # fall back to the playlist metadata so the row is kept and the loop
        # does not crash on details["title"]
        details = {
            "title": video["title"],
            "channel": None,
            "views": "N/A",
            "publish_date": video["publish_date"],
            "url": f"https://www.youtube.com/watch?v={video_id}",
        }
    transcript = get_transcript(video_id)

    video_data.append({
        "video_id": video_id,
        "title": details["title"],
        "channel": details["channel"],
        "views": details["views"],
        "publish_date": details["publish_date"],
        "url": details["url"],
        "transcript": transcript
    })

# Converting the list to a DataFrame
df = pd.DataFrame(video_data)

# Exporting the DataFrame to a CSV file named after the scraped channel
df.to_csv(f"youtube_videos_{channel_name}.csv", index=False)

Uploads Playlist ID: UUpYJiwLo1KTKElBq_0xAI7A
Found 453 videos
