## Scraping YouTube videos

### Installing required libraries

In [None]:
! pip install google-api-python-client

In [None]:
! pip install youtube-transcript-api

### Scraping one YouTube video's details and transcript

In [None]:
from googleapiclient.discovery import build

# Replace the placeholder below with your own YouTube Data API key;
# it is passed as `developerKey` to every `build(...)` call in this notebook
API_key = "Input your own"

# Defining a function that uses the YouTube API to scrape the basic data of a video
def get_video_details(video_id):
    """Fetch the basic metadata of a single YouTube video.

    Args:
        video_id: The ID of the video (the ``v=`` part of a watch URL).

    Returns:
        A dict with the keys ``"title"``, ``"channel"``, ``"views"``,
        ``"publish_date"`` and ``"url"``, or ``None`` when the API response
        contains no items for ``video_id``. ``"views"`` is ``"N/A"`` when the
        response carries no ``viewCount``.
    """
    youtube = build("youtube", "v3", developerKey=API_key)
    request = youtube.videos().list(part="snippet,statistics", id=video_id)
    response = request.execute()

    # A missing or empty "items" list means the API returned no video for this ID
    items = response.get("items")
    if not items:
        return None

    video = items[0]
    snippet = video["snippet"]
    return {
        "title": snippet["title"],
        "channel": snippet["channelTitle"],
        "views": video["statistics"].get("viewCount", "N/A"),
        "publish_date": snippet["publishedAt"],
        "url": f"https://www.youtube.com/watch?v={video_id}",
    }

In [None]:
# Testing the function on an example VVD YouTube video
video_id = "XqtessUPQEY"
details = get_video_details(video_id)
print(details)

##### Scraping YouTube video transcript in Dutch

In [None]:
from youtube_transcript_api import YouTubeTranscriptApi

# Defining a function that fetches a YouTube transcript through the YouTube transcript API
def get_transcript(video_id, language="nl"):
    """Return the transcript of a video as one space-joined string.

    Args:
        video_id: The ID of the video whose transcript is requested.
        language: Language code of the transcript to request (default ``"nl"``).

    Returns:
        The text of all transcript entries joined with single spaces. If
        anything goes wrong, a string starting with
        ``"Error fetching transcript: "`` followed by the exception message
        is returned instead of raising.
    """
    try:
        segments = YouTubeTranscriptApi.get_transcript(video_id, languages=[language])
        return " ".join(segment["text"] for segment in segments)
    except Exception as error:
        return f"Error fetching transcript: {error}"

In [None]:
# Testing the function on an example VVD YouTube video
# Note: this particular video is actually a recording of a speech
video_id = "XqtessUPQEY"
transcript = get_transcript(video_id)
print("\nTranscript:", transcript)

### Scraping the previously defined data and transcripts for multiple videos

##### Getting an uploads list for a certain YouTube channel

In [None]:
# Finding a channel ID is a bit of a challenge, but it can be done by following this tutorial: https://www.youtube.com/watch?v=0oDy2sWPF38
channel_id_VVD = "UCZean7nAZKDGIHANq-MuaGA"

# Getting the playlist of uploads for a specific YouTube channel
def get_uploads_playlist_id(channel_id):
    """Look up the ID of a channel's "uploads" playlist.

    Args:
        channel_id: The ID of the YouTube channel.

    Returns:
        The uploads playlist ID taken from the channel's
        ``contentDetails.relatedPlaylists``, or ``None`` when the API response
        contains no items for ``channel_id``.
    """
    youtube = build("youtube", "v3", developerKey=API_key)
    request = youtube.channels().list(
        part="contentDetails",
        id=channel_id
    )
    response = request.execute()

    # Use .get() so a response without an "items" key yields None instead of
    # raising KeyError (same guarded lookup as in get_video_details)
    items = response.get("items")
    if not items:
        return None
    return items[0]["contentDetails"]["relatedPlaylists"]["uploads"]

In [None]:
# Testing the function by looking up the uploads playlist of the VVD channel
uploads_playlist_id = get_uploads_playlist_id(channel_id_VVD)
print("Uploads Playlist ID:", uploads_playlist_id)

##### Getting all videos from the uploads playlist that were published after a given date

In [None]:
import datetime

def get_videos_after_date(playlist_id, after_date_str):
    """Collect the videos of a playlist that were published after a date.

    Pages through the playlist with the YouTube Data API (50 items per
    request) and keeps every item whose ``publishedAt`` timestamp is strictly
    later than the cut-off.

    Args:
        playlist_id: ID of the playlist to read, e.g. a channel's uploads
            playlist.
        after_date_str: Cut-off date as a ``"YYYY-MM-DD"`` string. It is
            parsed to midnight at the start of that day, so videos published
            later on the same day are included.

    Returns:
        A list of dicts with the keys ``"video_id"``, ``"title"`` and
        ``"publish_date"`` (the unparsed timestamp string from the API).

    Raises:
        ValueError: If ``after_date_str`` or a returned timestamp does not
            match the expected format.
    """
    youtube = build("youtube", "v3", developerKey=API_key)
    video_list = []
    next_page_token = None

    # Converting string to datetime object
    after_date = datetime.datetime.strptime(after_date_str, "%Y-%m-%d")

    while True:
        request = youtube.playlistItems().list(
            part="snippet",
            playlistId=playlist_id,
            maxResults=50,
            pageToken=next_page_token,
        )
        response = request.execute()

        for item in response.get("items", []):
            snippet = item["snippet"]
            # NOTE(review): the YouTube API docs describe snippet.publishedAt
            # as the time the item was added to the playlist; for an uploads
            # playlist this presumably equals the upload time - confirm.
            publish_date = snippet["publishedAt"]

            # Convert publish date to datetime
            publish_date_obj = datetime.datetime.strptime(publish_date, "%Y-%m-%dT%H:%M:%SZ")

            if publish_date_obj > after_date:
                video_list.append({
                    "video_id": snippet["resourceId"]["videoId"],
                    "title": snippet["title"],
                    "publish_date": publish_date,
                })

        # Advance the page token once per page, outside the item loop. If the
        # break sat inside the for loop it would only leave that loop, and the
        # while loop would keep re-requesting the first page forever.
        next_page_token = response.get("nextPageToken")
        if not next_page_token:
            break

    return video_list

In [None]:
# Example: get all videos published after January 1, 2024
videos = get_videos_after_date(uploads_playlist_id, "2024-01-01")

# Known issue: this does not work yet because it runs into API quota limits