# Data Collection

This notebook is for collecting data for the project. You can use the enhanced `scraping.py` module to scrape comments from YouTube videos with improved verbose output showing video titles during the scraping process.

## New Features:
- **Video Title Display**: Progress bars and logs now show video titles instead of just IDs
- **Enhanced Data**: Scraped data includes both video ID and video title columns
- **Better Error Handling**: More informative error messages with video context
- **Flexible API Key**: Can be provided via parameter or environment variable

## 1. Environment Setup

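If you are using Google Colab, run the following cell to clone the repository and install its dependencies. If you are working locally, you can skip this step, provided the `civic_sentiment` package and its dependencies are already installed in your environment.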

In [None]:
import os
from pathlib import Path
import toml
import sys
import subprocess

# Colab environment setup: clone the project, install the dependencies declared
# in its pyproject.toml, and make the package importable from this kernel.

# --- 1. Clone your GitHub repository ---
repo_url = "https://github.com/rnov24/civic_sentiment.git"
repo_name = "civic_sentiment"  # Should match the PROJ_ROOT in config.py
clone_path = Path("/content") / repo_name

if not clone_path.exists():
    # For private repositories, you'll need to use a GitHub token
    # from getpass import getpass
    # token = getpass('Enter your GitHub token: ')
    # repo_url = repo_url.replace("https://", f"https://{token}@")
    subprocess.run(["git", "clone", repo_url, str(clone_path)], check=True)
else:
    print("Repository already cloned.")

# --- 2. Install project dependencies ---
# Re-running this cell must not keep stacking the same entry onto sys.path.
if str(clone_path) not in sys.path:
    sys.path.append(str(clone_path))
pyproject_path = clone_path / "pyproject.toml"

with open(pyproject_path, "r", encoding="utf-8") as f:
    pyproject = toml.load(f)

# Tolerate a pyproject.toml without a [project].dependencies list.
dependencies = pyproject.get("project", {}).get("dependencies", [])
if dependencies:
    # Invoke pip through the kernel's own interpreter so the packages land in
    # the environment this notebook actually imports from.
    subprocess.run(
        [sys.executable, "-m", "pip", "install", *dependencies],
        check=True,
    )
else:
    # `pip install` with no requirements exits with an error, so skip it.
    print("No dependencies declared in pyproject.toml; skipping pip install.")

# --- 3. Now you can import your project modules ---
# The PROJ_ROOT in config.py is now correctly set to /content/civic_sentiment
from civic_sentiment.config import RAW_DATA_DIR

print("\nEnvironment setup complete!")
print(f"Raw data directory: {RAW_DATA_DIR}")

## 2. Scrape YouTube Comments

In [None]:
from civic_sentiment.scraping import scrape_videos
import os

# Scrape comments for the listed videos into `comments_df`.
# `comments_df` is always defined when this cell finishes (possibly empty), so
# the cells below can safely test `comments_df.empty`.

# Get the API key from the environment variable or set it directly
API_KEY = os.getenv("YOUTUBE_API_KEY")


# Example: Scrape comments from multiple videos with enhanced verbose output
video_ids = [
    "LJ8yd0uRvwY",
    "oOf1b1P6fGc",
]

if API_KEY:
    print(f"Scraping comments from {len(video_ids)} videos...")
    comments_df = scrape_videos(API_KEY, video_ids)

    # An empty result may come back without any columns; avoid a KeyError on
    # 'video_id' in that case and report zero videos instead.
    scraped_video_count = (
        comments_df['video_id'].nunique() if 'video_id' in comments_df else 0
    )
    print(f"\n✅ Found {len(comments_df)} comments from {scraped_video_count} videos.")

    # Display video titles that were scraped
    if not comments_df.empty:
        print("\n📺 Videos processed:")
        for video_id, title in comments_df[['video_id', 'video_title']].drop_duplicates().values:
            print(f"  • {video_id}: {title}")
else:
    print("❌ YOUTUBE_API_KEY environment variable not set.")
    print("Please set your YouTube Data API key either:")
    print("1. As an environment variable: export YOUTUBE_API_KEY=your_key")
    print("2. Or modify the API_KEY variable in this cell")

    # Without this, the following cells fail with NameError on `comments_df`
    # instead of reaching their "no data" branches.
    import pandas as pd

    comments_df = pd.DataFrame()

In [None]:
# Preview the scraped comments (including the video_title column) and print
# a short summary: overall counts followed by a per-video breakdown.
if comments_df.empty:
    print("No comments were scraped.")
else:
    print("📊 Sample of scraped comments:")
    print(f"Columns: {list(comments_df.columns)}")
    print(f"Shape: {comments_df.shape}")
    print("\nFirst 5 comments:")
    display(comments_df.head())

    # Headline numbers for the whole scrape
    print("\n📈 Summary:")
    print(f"Total comments: {len(comments_df)}")
    print(f"Unique videos: {comments_df['video_id'].nunique()}")
    print(f"Unique authors: {comments_df['author'].nunique()}")

    # One row per (video_id, video_title) pair with its comment count
    print("\n📺 Comments per video:")
    video_stats = (
        comments_df
        .groupby(['video_id', 'video_title'])
        .size()
        .reset_index(name='comment_count')
    )
    for vid, full_title, n_comments in video_stats.itertuples(index=False, name=None):
        # Truncate long titles to 50 characters so each entry stays on one line
        if len(full_title) > 50:
            title_preview = full_title[:50] + "..."
        else:
            title_preview = full_title
        print(f"  {vid}: {n_comments} comments")
        print(f"    Title: {title_preview}")

In [None]:
# Save the DataFrame to a CSV file
# RAW_DATA_DIR is imported here as well because the setup cell that normally
# provides it is Colab-only and is skipped when working locally.
from civic_sentiment.config import RAW_DATA_DIR

if not comments_df.empty:
    output_path = RAW_DATA_DIR / "comments.csv"
    # DataFrame.to_csv does not create missing parent directories, so make
    # sure the raw-data folder exists (e.g. on a fresh clone).
    output_path.parent.mkdir(parents=True, exist_ok=True)
    comments_df.to_csv(output_path, index=False)
    print(f"💾 Comments saved to {output_path}")
    print(f"📁 File size: {output_path.stat().st_size / 1024:.1f} KB")

    # Show a preview of what was saved
    print("\n📋 Data saved includes:")
    print(f"  • {len(comments_df)} total comments")
    print(f"  • {comments_df['video_id'].nunique()} unique videos")
    print(f"  • Columns: {', '.join(comments_df.columns)}")
else:
    print("⚠️ No data to save - comments DataFrame is empty")