In [13]:
pip install --upgrade google-api-python-client

Note: you may need to restart the kernel to use updated packages.


In [14]:
import pandas as pd
from datetime import datetime
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

In [15]:
# Your restricted API key
api_key = ''

# Build the YouTube API client
youtube = build("youtube", "v3", developerKey=api_key)

def get_videos_for_period(query, start_date, end_date, total_videos=500):
    all_videos = []  # List to store video data
    videos_per_page = 50  # Maximum results per API call

    try:
        while len(all_videos) < total_videos:
            # Search for videos within the date range
            search_request = youtube.search().list(
                q=query,
                part="snippet",
                maxResults=videos_per_page,
                type="video",
                publishedAfter=start_date,
                publishedBefore=end_date
            )
            search_response = search_request.execute()

            # Extract video IDs and channel IDs
            video_ids = [item["id"]["videoId"] for item in search_response["items"]]
            channel_ids = [item["snippet"]["channelId"] for item in search_response["items"]]

            # Get detailed video statistics
            videos_request = youtube.videos().list(
                part="statistics,snippet",
                id=",".join(video_ids)
            )
            videos_response = videos_request.execute()

            # Get channel statistics (number of subscribers)
            channels_request = youtube.channels().list(
                part="statistics",
                id=",".join(set(channel_ids))  # Ensure unique channel IDs
            )
            channels_response = channels_request.execute()

            # Map channel IDs to subscriber counts
            channel_subscribers = {
                channel["id"]: channel["statistics"].get("subscriberCount", "0")
                for channel in channels_response["items"]
            }

            # Collect video data
            for video in videos_response["items"]:
                channel_id = video["snippet"]["channelId"]
                all_videos.append({
                    "Video ID": video["id"],
                    "Title": video["snippet"]["title"],
                    "Channel": video["snippet"]["channelTitle"],
                    "Channel ID": channel_id,
                    "Subscribers": channel_subscribers.get(channel_id, "0"),
                    "Views": video["statistics"].get("viewCount", 0),
                    "Likes": video["statistics"].get("likeCount", 0),
                    "Comments": video["statistics"].get("commentCount", 0),
                    "Thumbnail": video["snippet"]["thumbnails"]["default"]["url"]
                })

            # Stop if no more results are available
            if len(search_response["items"]) < videos_per_page:
                break

    except HttpError as error:
        print(f"An error occurred: {error}")

    # Convert the collected data into a DataFrame
    df_videos = pd.DataFrame(all_videos)

    # Save the DataFrame as a CSV file
    csv_filename = f"youtube_videos_{start_date[:10]}_to_{end_date[:10]}.csv"
    df_videos.to_csv(csv_filename, index=False, encoding="utf-8")
    print(f"Data saved to {csv_filename}")

    return df_videos

# Define the periods for 2020 to 2024, divided into quarters
quarters = []
years = [2020, 2021, 2022, 2023, 2024]
for year in years:
    # Q1: January to March
    quarters.append((f"{year}-01-01T00:00:00Z", f"{year}-03-31T23:59:59Z"))
    # Q2: April to June
    quarters.append((f"{year}-04-01T00:00:00Z", f"{year}-06-30T23:59:59Z"))
    # Q3: July to September
    quarters.append((f"{year}-07-01T00:00:00Z", f"{year}-09-30T23:59:59Z"))
    # Q4: October to December
    quarters.append((f"{year}-10-01T00:00:00Z", f"{year}-12-31T23:59:59Z"))

In [7]:
# First batch: 2020 to mid-2022
print("Starting batch 1: 2020 to mid-2022")
for start_date, end_date in quarters[:10]:  # First 10 quarters (2020 Q1 to 2022 Q2)
    print(f"Extracting videos for the period: {start_date} to {end_date}")
    get_videos_for_period("meditación", start_date, end_date, total_videos=500)

Starting batch 1: 2020 to mid-2022
Extracting videos for the period: 2020-01-01T00:00:00Z to 2020-03-31T23:59:59Z
Data saved to youtube_videos_2020-01-01_to_2020-03-31.csv
Extracting videos for the period: 2020-04-01T00:00:00Z to 2020-06-30T23:59:59Z
Data saved to youtube_videos_2020-04-01_to_2020-06-30.csv
Extracting videos for the period: 2020-07-01T00:00:00Z to 2020-09-30T23:59:59Z
Data saved to youtube_videos_2020-07-01_to_2020-09-30.csv
Extracting videos for the period: 2020-10-01T00:00:00Z to 2020-12-31T23:59:59Z
Data saved to youtube_videos_2020-10-01_to_2020-12-31.csv
Extracting videos for the period: 2021-01-01T00:00:00Z to 2021-03-31T23:59:59Z
Data saved to youtube_videos_2021-01-01_to_2021-03-31.csv
Extracting videos for the period: 2021-04-01T00:00:00Z to 2021-06-30T23:59:59Z
Data saved to youtube_videos_2021-04-01_to_2021-06-30.csv
Extracting videos for the period: 2021-07-01T00:00:00Z to 2021-09-30T23:59:59Z
Data saved to youtube_videos_2021-07-01_to_2021-09-30.csv
Extrac

In [16]:
# Second batch: mid-2022 to 2024
print("Starting batch 2: mid-2022 to 2024")
for start_date, end_date in quarters[10:]:  # Remaining quarters (2022 Q3 to 2024 Q4)
    print(f"Extracting videos for the period: {start_date} to {end_date}")
    get_videos_for_period("meditación", start_date, end_date, total_videos=500)

Starting batch 2: mid-2022 to 2024
Extracting videos for the period: 2022-07-01T00:00:00Z to 2022-09-30T23:59:59Z
An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/channels?part=statistics&id=UCu1C0mYQjWPFNmmMXJ40X7A%2CUCJQoiWCKVc5EivU9h0p80oA%2CUC2Z5BQhJWVgLX71_frN4oTA%2CUCvZKEunQgNxg6iJh62YGcdQ%2CUCnjsFhatLwYKvcpXJM2uP9g%2CUCeJkA3BPyzVt0UjKXZAGHLQ%2CUCjd1IqMxEJVhI0yfEVvXumQ%2CUCN4vyryy6O4GlIXcXTIuZQQ%2CUCkImJVONIpthOWxmxhF_gyA%2CUCZOIxWxZcVWhav7sWb1nTTQ%2CUCQWbj3GWvkDiEUoSMCK5msA%2CUCb9aiDQXg60uPYyCsJfLrSQ%2CUC6MF-58JaCCE0xMVkIueQ8A%2CUC1i0xzVJYUEmqPSNmPssiCQ%2CUCAdlJqHeesFJCDYRYPO9hvQ%2CUCOHzeyMJNJEW4gOH1ce8TLQ%2CUCNgvH4oG_eKDtnLYpHSEu3A%2CUCdwxjEwpKg26GJv299bxzow%2CUCvJLk6MtHo_0ksGExaXMvCA%2CUCpOAcjJNAp0Y0fhznRrXIJQ%2CUCmuESTgut4_eKeWtCpV4Ppw%2CUC8orFoVp-NU8P7uDWdEmq2A%2CUCJ80RT5ob5mSJT8KRwqGW4g%2CUCUF6CW5FvBXmEDQuOwEsnmA%2CUCKQgiVPqEPU0tImAjFG5lNA%2CUCsGpxss62kB8NE-hlNmcBSA%2CUCQNTZKLVgkFdRNDdBzFsq_Q%2CUCmGGQt4Rq-jgVDNqzCgowXw&key=AIzaSyDB

In [23]:
#After the data cleaning I've dropped 5000 rows. The issue likely arises because the same video is captured multiple times during different
#API calls to YouTube, resulting in repeated entries with the same Video ID. This can happen if the video remains relevant across multiple
#time periods or queries, causing duplication in the final dataset.To avoid this issue in future extractions, we should add the following
#code during the data loading process to filter out duplicate Video ID entries as they are being imported:

In [None]:
# Initialize an empty set to track unique Video IDs
unique_video_ids = set()

# Filter duplicates during CSV loading
for csv_file in csv_files:
    file_path = os.path.join(csv_folder_path, csv_file)
    print(f"Loading {csv_file}...")
    df = pd.read_csv(file_path)

    # Remove videos that have already been processed
    initial_count = len(df)
    df = df[~df['Video ID'].isin(unique_video_ids)]

    # Add the new Video IDs to the set
    unique_video_ids.update(df['Video ID'])
    
    filtered_count = len(df)
    print(f"{filtered_count}/{initial_count} videos added from {csv_file}.")
    
    # Append the filtered DataFrame
    dataframes.append(df)