In [None]:
!pip install --upgrade google--api--python--client



In [None]:
from googleapiclient.discovery import build
import pandas as pd
import numpy as np
from google.colab import userdata
from google.colab import files
import os
import requests
import time
# from PIL import Image   # good to have for later, for image processing
# from io import BytesIO #optional

In [None]:
# Mount Google Drive at /content/drive; the CSV paths used later in this notebook
# all live under '/content/drive/My Drive/'.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:

# Read the API key from Colab's user secrets (stored under the name 'YoutubeDataApiKey')
# so the key never appears in the notebook itself.
api_key = userdata.get('YoutubeDataApiKey')
# Shared YouTube Data API v3 client; the fetch helpers below reference this global.
Youtube = build('youtube', 'v3', developerKey=api_key)

In [None]:
def get_video_ids_from_search(search_query, max_results_per_page=50, num_pages=1):
    """
    Collects video IDs for a search term using search().list(), one page at a time.

    Args:
        search_query (str): Text sent as the `q` parameter of the search.
        max_results_per_page (int): Requested page size. Values above 50 are
            capped at 50, the largest page size the endpoint accepts.
        num_pages (int): Maximum number of result pages to request.

    Returns:
        list: Video IDs in the order the API returned them. IDs are not
              de-duplicated. If a request fails, the IDs gathered before the
              failure are returned.
    """
    # search().list() rejects maxResults above 50; cap it so an oversized
    # request does not make the whole query fail and return nothing.
    page_size = min(max_results_per_page, 50)

    video_ids = []
    next_page_token = None
    print(f"Searching for '{search_query}'...")
    for page_num in range(num_pages):
        try:
            search_response = Youtube.search().list(
                q=search_query,
                type='video',
                part='id',
                maxResults=page_size,
                pageToken=next_page_token
            ).execute()

            # Tolerant lookups: an entry without an id/videoId is skipped rather
            # than aborting the remaining pages of this query.
            for item in search_response.get('items', []):
                resource_id = item.get('id') or {}
                video_id = resource_id.get('videoId')
                if resource_id.get('kind') == 'youtube#video' and video_id:
                    video_ids.append(video_id)

            next_page_token = search_response.get('nextPageToken')
            if not next_page_token:
                print(f"No more pages for '{search_query}' after page {page_num+1}.")
                break
            print(f"Fetched page {page_num+1} for '{search_query}'. Total IDs so far: {len(video_ids)}")
        except Exception as e:
            # Best effort: report the failure and return what has been collected so far.
            print(f"Error on page {page_num+1} for '{search_query}': {e}")
            break

    return video_ids

In [None]:


def get_video_details(video_ids_list):
    """
    Fetches detailed information for a list of video IDs using videos().list().
    It processes IDs in batches of 50 to optimize quota usage.

    Args:
        video_ids_list (list): A list of unique YouTube video IDs.

    Returns:
        pd.DataFrame: A DataFrame containing detailed video metadata, one row
                      per video returned by the API. Counts that the API does
                      not report are stored as 0; other missing fields are
                      left empty. Returns an empty DataFrame if no data is
                      fetched.
    """
    all_video_data = []
    # Process IDs in batches of 50
    for i in range(0, len(video_ids_list), 50):
        batch_ids = video_ids_list[i:i+50]
        video_ids_string = ",".join(batch_ids) # Convert to comma-separated string

        print(f"Fetching details for batch of {len(batch_ids)} videos from index {i} to {i+len(batch_ids)}...")

        try:
            # API call to get video details
            request = Youtube.videos().list(
                part="snippet,statistics,contentDetails",
                id=video_ids_string
            )
            response = request.execute()

            for video in response.get('items', []):
                # Tolerant lookups: a video that lacks one of the requested parts
                # must not raise KeyError, because that would discard it and every
                # later video in this batch.
                snippet = video.get('snippet') or {}
                stats = video.get('statistics') or {}
                content = video.get('contentDetails') or {}

                video_info = {
                    'video_id': video.get('id'),
                    'title': snippet.get('title'),
                    'published_at': snippet.get('publishedAt'),
                    'channel_id': snippet.get('channelId'),
                    'category_id': snippet.get('categoryId'),
                    'view_count': int(stats.get('viewCount') or 0),
                    'like_count': int(stats.get('likeCount') or 0),
                    'comment_count': int(stats.get('commentCount') or 0),
                    'duration': content.get('duration'),
                    'description': snippet.get('description'),
                    'tags': snippet.get('tags', [])
                }
                all_video_data.append(video_info)

        except Exception as e:
            print(f"Error fetching details for batch starting with {batch_ids[0]}: {e}")
            # Continue to next batch even if this one fails
            continue

    return pd.DataFrame(all_video_data)

GET DATA

In [None]:

# Broad query list grouped by theme. NOTE: this name is re-bound to
# category_filler_queries further down, so this list is not what the
# collection loop iterates over in the current run.
search_queries = [
    # Tech & Education
    "python tutorial", "machine learning", "data science", "learn javascript",
    "blender 3d tutorial", "photoshop editing", "unreal engine 5", "unity tutorial",
    # Lifestyle & Hobbies
    "healthy cooking recipes", "home workout", "financial planning", "minimalism",
    "travel vlog japan", "learn to play guitar", "woodworking projects", "gardening for beginners",
    # Entertainment & Reviews
    "product review smartphone", "movie review", "video game walkthrough", "book summary",
    "car review", "stand up comedy", "best of compilation", "fails compilation",
    # Business & Finance
    "stock market analysis", "real estate investing", "small business ideas", "marketing strategies"
]

# Additional queries (music, sports, gaming, animals, film, news).
category_filler_queries = [
    "official music video", "live concert performance", "nba highlights",
    "premier league goals 2025", "minecraft gameplay", "elden ring boss fight",
    "funny cat videos", "dog training tips", "animated short film",
    "latest movie trailers", "breaking news today", "political debate highlights"
]

# From here on, search_queries refers to the filler queries only.
search_queries = category_filler_queries

# 2. Prepare for the collection loop
all_dataframes = []
# Aim for a small, targeted number of videos for these new categories
total_videos_to_fetch = 2000
videos_per_query = total_videos_to_fetch // len(search_queries)
# Pages hold 50 results each; always request at least one page per query.
num_pages = max(1, (videos_per_query // 50))

# 3. Loop through ONLY the new queries
for query in search_queries:
    print(f"\n--- Starting data collection for query: '{query}' ---")
    video_ids = get_video_ids_from_search(query, max_results_per_page=50, num_pages=num_pages)

    if video_ids:
        # dict.fromkeys drops repeated IDs while keeping the order the search
        # returned them in. set() ordering of strings changes between kernel
        # sessions, which made batches and CSV row order non-reproducible.
        unique_ids = list(dict.fromkeys(video_ids))
        video_df = get_video_details(unique_ids)

        if not video_df.empty:
            all_dataframes.append(video_df)
            print(f"Collected {len(video_df)} videos for '{query}'")

    # Short pause between queries to avoid hammering the API.
    time.sleep(1)

# 4. Combine new data with your existing old data
if all_dataframes:
    old_data_path = '/content/drive/My Drive/YTML/YTML_Project_Data/V1_text_raw_data.csv'
    # Work on a copy of the list: inserting old_df into all_dataframes itself
    # would add another copy of the old data every time this cell is re-run.
    frames_to_combine = list(all_dataframes)
    try:
        old_df = pd.read_csv(old_data_path)
        # Old data goes first so that, on duplicate video_ids, the previously
        # collected row is the one drop_duplicates keeps.
        frames_to_combine.insert(0, old_df)
        print("\nSuccessfully loaded previously collected data.")
    except FileNotFoundError:
        print("\nCould not find V1_text_raw_data.csv. Starting with new data only.")

    # Combine all DataFrames and keep the first occurrence of each video_id
    final_df = pd.concat(frames_to_combine, ignore_index=True)
    final_df = final_df.drop_duplicates(subset='video_id')

    print(f"\n--- Total unique videos in the combined dataset: {len(final_df)} ---")

    folder_path = '/content/drive/My Drive/YTML/YTML_Project_Data/'
    file_path = os.path.join(folder_path, 'V2_generalized_raw_data.csv')
    os.makedirs(folder_path, exist_ok=True)
    final_df.to_csv(file_path, index=False)

    print(f"Generalized dataset successfully saved to: {file_path}")
    display(final_df.head())
else:
    print("\nNo new data was collected.")

# all_dataframes = []
# total_videos_to_fetch = 4000 # Your target number of videos
# videos_per_query = total_videos_to_fetch // len(search_queries)
# num_pages = (videos_per_query // 50) + 1 # 50 results per page

# # 2. Loop through queries and collect data
# for query in search_queries:
#     print(f"\n--- Starting data collection for query: '{query}' ---")
#     video_ids = get_video_ids_from_search(query, max_results_per_page=50, num_pages=num_pages)

#     if video_ids:
#         unique_ids = list(set(video_ids))
#         video_df = get_video_details(unique_ids)

#         if not video_df.empty:
#             all_dataframes.append(video_df)
#             print(f"Collected {len(video_df)} videos for '{query}'")

#     time.sleep(1) # Be polite to the API

# # 3. Combine, de-duplicate, and save the final dataset
# if all_dataframes:
#     final_df = pd.concat(all_dataframes, ignore_index=True)
#     final_df.drop_duplicates(subset='video_id', inplace=True)

#     print(f"\n--- Total unique videos collected across all queries: {len(final_df)} ---")

#     folder_path = '/content/drive/My Drive/YTML_Project_Data/'
#     file_path = os.path.join(folder_path, 'V1_diverse_raw_data.csv')
#     os.makedirs(folder_path, exist_ok=True)
#     final_df.to_csv(file_path, index=False)

#     print(f"Diverse dataset successfully saved to: {file_path}")
#     display(final_df.head())
# else:
#     print("\nNo data was collected across any of the queries.")


--- Starting data collection for query: 'official music video' ---
Searching for 'official music video'...
Fetched page 1 for 'official music video'. Total IDs so far: 50
Fetched page 2 for 'official music video'. Total IDs so far: 100
Fetched page 3 for 'official music video'. Total IDs so far: 150
Fetching details for batch of 50 videos from index 0 to 50...
Fetching details for batch of 50 videos from index 50 to 100...
Fetching details for batch of 42 videos from index 100 to 142...
Collected 142 videos for 'official music video'

--- Starting data collection for query: 'live concert performance' ---
Searching for 'live concert performance'...
Fetched page 1 for 'live concert performance'. Total IDs so far: 50
Fetched page 2 for 'live concert performance'. Total IDs so far: 100
Fetched page 3 for 'live concert performance'. Total IDs so far: 150
Fetching details for batch of 50 videos from index 0 to 50...
Fetching details for batch of 50 videos from index 50 to 100...
Fetching de

Unnamed: 0,video_id,title,published_at,channel_id,category_id,view_count,like_count,comment_count,duration,description,tags
0,390Pc6_fdeg,It’s literally perfect 🫠 #coding #java #progra...,2024-12-19T00:40:54Z,UC8Wt7vynFfRnoVtpNpESWjw,22,5839276,150804,2309,PT13S,,[]
1,Sg4GMVMdOPo,Start coding with PYTHON in 5 minutes! 🐍,2024-08-16T16:08:32Z,UC4SVo0Ue36XCfOyb5Lh1viQ,27,349876,10378,452,PT5M50S,#python #pythontutorial #pythoncourse \n\nThis...,"['Python tutorial', 'python course', 'python p..."
2,yZl5FJ3ChkI,Python Roadmap For Beginners (Step By Step),2025-07-21T13:02:04Z,UCgKFOz_KrMbmypWrawtzDQg,28,18443,1760,12,PT53S,"If I was a beginner learning to code, I would ...",[]
3,ICMXzWk66Qg,Making Your Own Compiler! #programming #code #...,2023-07-12T12:00:22Z,UCWyEDE8MxOyGmdsluOpRqwQ,27,35829,1921,21,PT42S,#shorts \n\nFull Video: https://youtu.be/GsCWi...,"['Python', 'Programming', 'Tutorial', 'Guide',..."
4,BplMR0txSeA,"Python on phone, pydroid 3 #python #android #c...",2025-02-19T11:43:05Z,UC8adQ8nskI9jQKTYC0q3O-w,27,90850,0,12,PT26S,Pydroid3: https://play.google.com/store/apps/d...,"['Python', 'pydroi3', 'trending']"


In [None]:
# Next, fetch channel-level data for the channel IDs found in the video dataset.

def get_channel_details(channel_ids_list):
    """
    Fetches channel metadata for a list of channel IDs using channels().list().
    IDs are sent in batches of 50; a batch that raises while being requested
    or parsed is reported and skipped.

    Args:
        channel_ids_list (list): Channel IDs to look up.

    Returns:
        pd.DataFrame: One row per channel returned by the API. A missing
                      subscriber count is stored as NaN; missing view and
                      video counts are stored as 0. Returns an empty
                      DataFrame if no data is fetched.
    """
    channel_rows = []

    for start in range(0, len(channel_ids_list), 50):
        chunk = channel_ids_list[start:start + 50]
        joined_ids = ",".join(chunk)

        print(f"Fetching details for batch of {len(chunk)} channels...")
        try:
            response = Youtube.channels().list(
                part="snippet,statistics,brandingSettings",
                id=joined_ids,
            ).execute()

            for channel in response.get('items', []):
                snippet = channel.get('snippet', {})
                stats = channel.get('statistics', {})
                banner_images = channel.get('brandingSettings', {}).get('image', {})
                avatars = snippet.get('thumbnails', {})

                # An absent/empty subscriber count becomes NaN rather than 0.
                raw_subscribers = stats.get('subscriberCount')
                if raw_subscribers:
                    subscribers = int(raw_subscribers)
                else:
                    subscribers = np.nan

                channel_rows.append({
                    'channel_id': channel.get('id'),
                    'channel_title': snippet.get('title'),
                    'channel_description': snippet.get('description'),
                    'channel_start_date': snippet.get('publishedAt'),
                    'subscriber_count': subscribers,
                    'channel_view_count': int(stats.get('viewCount', 0)),
                    'channel_video_count': int(stats.get('videoCount', 0)),
                    'profile_picture_url': avatars.get('high', {}).get('url'),
                    'banner_image_url': banner_images.get('bannerExternalUrl'),
                })
        except Exception as err:
            # Best effort: report the failure and move on to the next batch.
            print(f"An error occurred: {err}")
            continue

    return pd.DataFrame(channel_rows)



Main Execution Logic

In [None]:
# 1. Read the saved video dataset; it supplies the channel IDs to look up
video_data_path = '/content/drive/My Drive/YTML/YTML_Project_Data/V2_generalized_raw_data.csv'
df_videos = pd.read_csv(video_data_path)

# 2. Reduce the channel_id column to its distinct, non-null values
unique_channel_ids = df_videos['channel_id'].dropna().unique().tolist()
print(f"Found {len(unique_channel_ids)} unique channels to process.")

# 3. Look up details for every distinct channel
df_channels = get_channel_details(unique_channel_ids)

# 4. Write the channel table to Drive, unless nothing came back
if df_channels.empty:
    print("\nNo channel data was collected.")
else:
    folder_path = '/content/drive/My Drive/YTML/YTML_Project_Data/'
    file_path = os.path.join(folder_path, 'V2_merged_channel_data.csv')
    df_channels.to_csv(file_path, index=False)

    print(f"\nComprehensive channel data successfully saved to: {file_path}")
    display(df_channels.head())

Found 3485 unique channels to process.
Fetching details for batch of 50 channels...
Fetching details for batch of 50 channels...
Fetching details for batch of 50 channels...
Fetching details for batch of 50 channels...
Fetching details for batch of 50 channels...
Fetching details for batch of 50 channels...
Fetching details for batch of 50 channels...
Fetching details for batch of 50 channels...
Fetching details for batch of 50 channels...
Fetching details for batch of 50 channels...
Fetching details for batch of 50 channels...
Fetching details for batch of 50 channels...
Fetching details for batch of 50 channels...
Fetching details for batch of 50 channels...
Fetching details for batch of 50 channels...
Fetching details for batch of 50 channels...
Fetching details for batch of 50 channels...
Fetching details for batch of 50 channels...
Fetching details for batch of 50 channels...
Fetching details for batch of 50 channels...
Fetching details for batch of 50 channels...
Fetching details

Unnamed: 0,channel_id,channel_title,channel_description,channel_start_date,subscriber_count,channel_view_count,channel_video_count,profile_picture_url,banner_image_url
0,UCBwmMxybNva6P_5VmxjzwqA,Apna College,"I'm Shradha, Ex-Microsoft Software Engineer, D...",2020-08-05T16:09:28.304314Z,6910000,1122049914,956,https://yt3.ggpht.com/FEcjRtez5od8UowDo6tTt9Wl...,https://yt3.googleusercontent.com/_IDelYCuXmkR...
1,UCwr-evhuzGZgDFrq_1pLt_A,Error Makes Clever,Empowering developers to level up their skills...,2023-04-10T07:37:38.000808Z,1070000,93357249,357,https://yt3.ggpht.com/10iKRvxRHNx00HW9Ch2mQ5Mz...,
2,UC8_RSKwbU1OmZWNEoLV1tQg,Data with Baraa,Hey! I'm Baraa Khatib Salkini - a data guy who...,2022-12-24T21:19:00.052502Z,134000,6235073,311,https://yt3.ggpht.com/dqs3pZO_8d4K2zp2ebhb2klV...,https://yt3.googleusercontent.com/VsxutdFRM36n...
3,UCoTIFmdMX2TY7rpqJALz8uw,Learn Technology,Welcome to Learn Technology - Your Gateway to ...,2023-06-18T05:44:06.596928Z,11900,2120517,314,https://yt3.ggpht.com/00ZNmscQdH57Fz76BVIsyjZH...,https://yt3.googleusercontent.com/URT0QWRAuS0u...
4,UCFkD145ffgXhP4-DyFJmK1w,The Code Run,Hello guys\nWelcome to The Code Run 😄\nPlease ...,2022-07-23T16:04:56.544564Z,97,50579,99,https://yt3.ggpht.com/e3wpwOc0dah39Ho_gK-rgxgo...,https://yt3.googleusercontent.com/7pdnNYs1-JDh...
