# Data Collection

This notebook collects the raw data for the project. It uses the `civic_sentiment.scraping` module (`scraping.py`) to scrape comments from YouTube videos.

## 1. Environment Setup

If you are using Google Colab, run the following cell to set up your environment. If you are working locally, you can skip this step.

In [None]:
import os
from pathlib import Path
import toml
import sys
import subprocess

# --- 1. Clone your GitHub repository ---
repo_url = "https://github.com/rnov24/civic_sentiment.git"
repo_name = "civic_sentiment"  # Should match the PROJ_ROOT in config.py
clone_path = Path("/content") / repo_name

if not clone_path.exists():
    # For private repositories, you'll need to use a GitHub token
    # from getpass import getpass
    # token = getpass('Enter your GitHub token: ')
    # repo_url = repo_url.replace("https://", f"https://{token}@")
    subprocess.run(["git", "clone", repo_url, str(clone_path)], check=True)
else:
    print("Repository already cloned.")

# --- 2. Install project dependencies ---
# Make the cloned repository importable. Guard against duplicates so that
# re-running this cell does not keep growing sys.path.
if str(clone_path) not in sys.path:
    sys.path.append(str(clone_path))
pyproject_path = clone_path / "pyproject.toml"

# TOML files are UTF-8 by specification; do not rely on the platform default.
with open(pyproject_path, "r", encoding="utf-8") as f:
    pyproject = toml.load(f)

# `project.dependencies` is optional in pyproject.toml, so fall back to an
# empty list instead of failing with a KeyError.
dependencies = pyproject.get("project", {}).get("dependencies", [])
if dependencies:
    # Run pip through the current interpreter so the packages are installed
    # into the environment this kernel is actually using, not whichever
    # `pip` executable happens to be first on PATH.
    subprocess.run(
        [sys.executable, "-m", "pip", "install", *dependencies],
        check=True,
    )
else:
    # `pip install` with no requirements exits with an error, so skip it.
    print("No dependencies declared in pyproject.toml; skipping installation.")

# --- 3. Now you can import your project modules ---
# The PROJ_ROOT in config.py is now correctly set to /content/civic_sentiment
from civic_sentiment.config import RAW_DATA_DIR

print("\nEnvironment setup complete!")
print(f"Raw data directory: {RAW_DATA_DIR}")

## 2. Scrape YouTube Comments

In [None]:
from civic_sentiment.scraping import get_video_comments, main
import os

# Read the API key from the YOUTUBE_API_KEY environment variable
# (None when the variable is not defined).
API_KEY = os.environ.get("YOUTUBE_API_KEY")

# Demo: fetch the comments of one video.
video_id = "dQw4w9WgXcQ"  # Rick Astley - Never Gonna Give You Up
if not API_KEY:
    # Without a key there is nothing to request; just tell the user.
    print("YOUTUBE_API_KEY environment variable not set.")
else:
    comments = get_video_comments(API_KEY, video_id)
    print(f"Found {len(comments)} comments for video {video_id}")

In [None]:
# Demo: scrape the comments of several videos in a single run.
# NOTE(review): results are expected to be saved to a CSV file by
# `main` -- confirm against civic_sentiment.scraping.
video_ids = ["dQw4w9WgXcQ", "o-YBDTqX_ZU"]  # Add your video IDs here
if not API_KEY:
    # API_KEY is read from the environment in the previous cell.
    print("YOUTUBE_API_KEY environment variable not set.")
else:
    main(video_ids)