In [1]:
import os
import time
from datetime import datetime

import pandas as pd
from googleapiclient.discovery import build

In [2]:
# Read the YouTube Data API key from the environment instead of hardcoding it:
# a key stored in the notebook is exposed to everyone who can read the file.
# Any key that was previously committed here should be treated as leaked and
# rotated in the Google Cloud console.
API_KEY = os.environ.get('YOUTUBE_API_KEY')
if not API_KEY:
    raise RuntimeError(
        "Set the YOUTUBE_API_KEY environment variable before running this notebook."
    )

# Apna College Channel ID
CHANNEL_ID = 'UCBwmMxybNva6P_5VmxjzwqA'

# Initialize YouTube API client
youtube = build('youtube', 'v3', developerKey=API_KEY)

In [3]:
# FUNCTION: Extract Video Metadata
def get_channel_videos(channel_id, max_results=50, client=None, delay=0.5):
    """
    Extract videos from a YouTube channel using paginated search requests.

    Pages through the ``search().list`` endpoint (ordered by date) until a
    response carries no ``nextPageToken``. Extraction is best-effort: if a
    request or the parsing of a response fails, the error is printed and the
    videos collected so far are returned.

    NOTE(review): the search endpoint may not enumerate every upload of a
    large channel; if completeness matters, consider listing the channel's
    uploads playlist instead - confirm against the API documentation.

    Args:
        channel_id: YouTube channel ID
        max_results: Results per API call (clamped to the 1-50 range)
        client: API client exposing ``search().list(...).execute()``.
            Defaults to the module-level ``youtube`` client.
        delay: Seconds to pause between page requests (default 0.5)

    Returns:
        List of video dictionaries with video_id, title, published_date
        and description
    """
    # Only fall back to the module-level client when none is injected, so the
    # function can be exercised without a real API connection.
    api = client if client is not None else youtube

    # The endpoint accepts at most 50 results per call; clamp rather than
    # letting an out-of-range value fail the very first request.
    page_size = max(1, min(max_results, 50))

    videos = []
    next_page_token = None
    page_count = 0

    print(f"Starting video extraction from channel: {channel_id}\n")

    while True:
        # Number of the page being requested; used in the error message so a
        # failure is attributed to the page that actually failed.
        current_page = page_count + 1
        try:
            # API request to search endpoint
            request = api.search().list(
                part='snippet,id',
                channelId=channel_id,
                maxResults=page_size,
                order='date',  # Sort by publish date
                type='video',  # Only get videos, not playlists
                pageToken=next_page_token
            )

            response = request.execute()
            page_count = current_page

            # Extract video information, counting only the items actually kept
            kept_on_page = 0
            for item in response.get('items', []):
                if item['id']['kind'] == 'youtube#video':
                    videos.append({
                        'video_id': item['id']['videoId'],
                        'title': item['snippet']['title'],
                        'published_date': item['snippet']['publishedAt'],
                        'description': item['snippet']['description']
                    })
                    kept_on_page += 1

            print(f"Page {page_count}: Extracted {kept_on_page} videos | Total: {len(videos)}")

            # Check if there are more pages
            next_page_token = response.get('nextPageToken')

            if not next_page_token:
                print("\n✓ All pages processed!")
                break

            # Be nice to the API - small delay between requests
            if delay > 0:
                time.sleep(delay)

        except Exception as e:
            # Deliberate best-effort: report the failure and return what was
            # collected before it.
            print(f"\n✗ Error on page {current_page}: {str(e)}")
            break

    return videos


In [4]:
# FUNCTION: Process and Clean Data
def process_video_data(videos):
    """
    Convert video list to DataFrame and perform basic cleaning.

    Parses ``published_date`` into datetimes, adds a date-only
    ``publish_date`` column, sorts newest first and prints a short summary.
    An empty input yields an empty DataFrame with the expected columns
    instead of failing on the missing ``published_date`` column.

    Args:
        videos: List of video dictionaries (video_id, title, published_date,
            description)

    Returns:
        Pandas DataFrame with processed data
    """
    # Guard the empty case: pd.DataFrame([]) has no columns, so the
    # 'published_date' lookup below would raise KeyError.
    if len(videos) == 0:
        print("\nNo videos to process; returning an empty dataset.")
        return pd.DataFrame(
            columns=['video_id', 'title', 'published_date', 'description', 'publish_date']
        )

    # Create DataFrame
    df = pd.DataFrame(videos)

    # Convert published_date to datetime
    df['published_date'] = pd.to_datetime(df['published_date'])

    # Extract just the date (without time)
    df['publish_date'] = df['published_date'].dt.date

    # Sort by date (newest first)
    df = df.sort_values('published_date', ascending=False).reset_index(drop=True)

    print(f"\n{'='*60}")
    print(f"DATASET SUMMARY")
    print(f"{'='*60}")
    print(f"Total Videos: {len(df)}")
    print(f"Date Range: {df['publish_date'].min()} to {df['publish_date'].max()}")
    print(f"Unique Titles: {df['title'].nunique()}")
    print(f"Duplicate Titles: {len(df) - df['title'].nunique()}")

    return df


In [5]:
# FUNCTION: Basic EDA
def perform_eda(df):
    """
    Perform exploratory data analysis on video metadata.

    Prints missing-value counts, videos per year, the first five titles and
    title-length statistics.

    Args:
        df: DataFrame with at least 'title' and 'published_date' columns

    Returns:
        A copy of ``df`` with added 'year' and 'title_length' columns; the
        DataFrame passed in is left unmodified.
    """
    # Work on a copy so the derived columns do not silently appear on the
    # caller's frame (re-running the cell then stays idempotent).
    df = df.copy()

    print(f"\n{'='*60}")
    print(f"EXPLORATORY DATA ANALYSIS")
    print(f"{'='*60}\n")

    # 1. Check for missing values
    print("1. Missing Values:")
    print(df.isnull().sum())

    # 2. Videos per year
    print("\n2. Videos Published Per Year:")
    df['year'] = pd.to_datetime(df['published_date']).dt.year
    year_counts = df['year'].value_counts().sort_index()
    print(year_counts)

    # 3. Sample titles
    print("\n3. Sample Video Titles (First 5):")
    for idx, title in enumerate(df['title'].head(), 1):
        print(f"   {idx}. {title}")

    # 4. Title length statistics
    df['title_length'] = df['title'].str.len()
    print(f"\n4. Title Length Statistics:")
    print(f"   Average: {df['title_length'].mean():.0f} characters")
    print(f"   Min: {df['title_length'].min()} | Max: {df['title_length'].max()}")

    return df


In [7]:
# Driver cell: extract -> process -> EDA -> save to CSV.
if __name__ == "__main__":
    print("="*60)
    print("YOUTUBE VIDEO METADATA EXTRACTION")
    print("Channel: Apna College")
    print("="*60 + "\n")

    # Step 1: Extract videos
    video_list = get_channel_videos(CHANNEL_ID)

    if not video_list:
        print("No videos found or error occurred.")
    else:
        # Step 2: Process data
        df = process_video_data(video_list)

        # Step 3: Basic EDA
        df = perform_eda(df)

        # Step 4: Save to CSV
        output_file = '../data/apna_college_videos.csv'
        # to_csv does not create missing parent directories, so make sure the
        # target folder exists before writing.
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        df.to_csv(output_file, index=False)
        print(f"\n✓ Data saved to: {output_file}")
        print(f"\nColumns in dataset: {list(df.columns)}")

        # Display first few rows
        print("\n" + "="*60)
        print("SAMPLE DATA (First 3 rows):")
        print("="*60)
        print(df[['video_id', 'title', 'publish_date']].head(3).to_string(index=False))

        # Report success only when the whole pipeline actually ran; previously
        # this banner was printed even when no videos were extracted.
        print("\n" + "="*60)
        print("SCRIPT COMPLETED SUCCESSFULLY!")
        print("="*60)

YOUTUBE VIDEO METADATA EXTRACTION
Channel: Apna College

Starting video extraction from channel: UCBwmMxybNva6P_5VmxjzwqA

Page 1: Extracted 50 videos | Total: 50
Page 2: Extracted 50 videos | Total: 100
Page 3: Extracted 50 videos | Total: 150
Page 4: Extracted 0 videos | Total: 150

✓ All pages processed!

DATASET SUMMARY
Total Videos: 150
Date Range: 2021-09-20 to 2025-11-14
Unique Titles: 142
Duplicate Titles: 8

EXPLORATORY DATA ANALYSIS

1. Missing Values:
video_id          0
title             0
published_date    0
description       0
publish_date      0
dtype: int64

2. Videos Published Per Year:
year
2021    13
2022    76
2023    19
2024    26
2025    16
Name: count, dtype: int64

3. Sample Video Titles (First 5):
   1. Complete Full Stack Web Development Preparation : MERN Stack + 6 Major Projects | New Delta 8.0 🚀
   2. How he got placed in Japan as Software Engineer ? Learnt Japanese with DSA, Development
   3. CISCO - 24LPA | Cisco Ideathon #results #placement #apnacol