In [None]:
pip install pandas numpy scikit-learn matplotlib seaborn requests beautifulsoup4 google-api-python-client




# YouTube Video Popularity Prediction — Data Collection

This notebook collects YouTube video metadata from two sources:
1. **Web Scraping:** Extract trending videos from YouTube's public web interface using BeautifulSoup.
2. **YouTube Data API:** Retrieve structured metadata for videos using the YouTube Data API v3.

Collected data will be stored in the `/data` folder as `.csv` files for preprocessing and model training.


# Youtube Video Popularity Predictrion - Scraping

In [1]:
# Mount Google Drive (if not already)
from google.colab import drive
drive.mount('/content/drive')

# Set the working directory
import os
project_path = "/content/drive/MyDrive/youtube-popularity-prediction"
os.chdir(project_path)
print("✅ Working directory set to:", os.getcwd())

# Import libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Working directory set to: /content/drive/MyDrive/youtube-popularity-prediction


## 3. Web Scraping YouTube Trending Data

This section scrapes trending video titles, URLs, and (optionally) other visible metadata from YouTube's Trending page.


In [55]:
pip install requests beautifulsoup4 tqdm pandas lxml




In [63]:
%cd /content/drive/MyDrive/youtube-popularity-prediction/src


/content/drive/MyDrive/youtube-popularity-prediction/src


In [66]:
%%writefile scrape_youtube.py
import requests, json, re, time, random, pandas as pd
from tqdm import tqdm

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/123.0 Safari/537.36"
}

KEYWORDS = [
    "music video", "podcast", "sports highlights", "tech review",
    "news update", "tutorial", "education", "vlog", "movie trailer",
    "finance tips", "travel vlog", "gaming", "reaction video",
    "motivational speech", "fashion haul", "documentary", "interview",
    "live performance", "cooking recipe", "product review"
]

SAVE_PATH = "../data/youtube_scraped_3000.csv"
MAX_VIDEOS = 3000


def extract_videos_from_json(html):
    """Extract JSON data from the ytInitialData object."""
    match = re.search(r"ytInitialData\"[:=]\s*(\{.*?\})\s*;</script>", html, re.S)
    if not match:
        return []
    data = json.loads(match.group(1))
    videos = []
    try:
        contents = data["contents"]["twoColumnSearchResultsRenderer"]["primaryContents"]["sectionListRenderer"]["contents"]
        for section in contents:
            for item in section.get("itemSectionRenderer", {}).get("contents", []):
                video = item.get("videoRenderer")
                if not video:
                    continue
                vid = video.get("videoId")
                title = video["title"]["runs"][0]["text"]
                channel = video.get("ownerText", {}).get("runs", [{}])[0].get("text")
                views = video.get("viewCountText", {}).get("simpleText", "N/A")
                duration = video.get("lengthText", {}).get("simpleText", "N/A")
                videos.append({
                    "url": f"https://www.youtube.com/watch?v={vid}",
                    "title": title,
                    "channel": channel,
                    "views": views,
                    "duration": duration
                })
    except Exception as e:
        print("⚠️ Error parsing JSON:", e)
    return videos


def scrape_youtube_data():
    all_videos = []
    for keyword in tqdm(KEYWORDS, desc="🔍 Scraping search results"):
        search_url = f"https://www.youtube.com/results?search_query={keyword.replace(' ', '+')}"
        resp = requests.get(search_url, headers=HEADERS)
        videos = extract_videos_from_json(resp.text)

        for video in videos:
            all_videos.append(video)
            if len(all_videos) >= MAX_VIDEOS:
                break
        if len(all_videos) >= MAX_VIDEOS:
            break
        time.sleep(random.uniform(2, 4))

    df = pd.DataFrame(all_videos)
    df.drop_duplicates(subset="url", inplace=True)
    df.to_csv(SAVE_PATH, index=False)
    print(f"✅ Saved {len(df)} videos to {SAVE_PATH}")


if __name__ == "__main__":
    scrape_youtube_data()


Overwriting scrape_youtube.py


In [67]:
%cd /content/drive/MyDrive/youtube-popularity-prediction
!python src/scrape_youtube.py


/content/drive/MyDrive/youtube-popularity-prediction
🔍 Scraping search results: 100% 20/20 [41:03<00:00, 123.16s/it]
Traceback (most recent call last):
  File "/content/drive/MyDrive/youtube-popularity-prediction/src/scrape_youtube.py", line 112, in <module>
    scrape_youtube_data()
  File "/content/drive/MyDrive/youtube-popularity-prediction/src/scrape_youtube.py", line 107, in scrape_youtube_data
    df.to_csv(SAVE_PATH, index=False)
  File "/usr/local/lib/python3.12/dist-packages/pandas/util/_decorators.py", line 333, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pandas/core/generic.py", line 3967, in to_csv
    return DataFrameRenderer(formatter).to_csv(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pandas/io/formats/format.py", line 1014, in to_csv
    csv_formatter.save()
  File "/usr/local/lib/python3.12/dist-packages/pandas/io/formats/csvs.py", line 

In [74]:
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/youtube-popularity-prediction/data/youtube_scraped_3000.csv')
print("✅ Rows:", len(df))
df.head()


✅ Rows: 3000


Unnamed: 0,url,title,channel,upload_date,duration,views,category,tags
0,https://www.youtube.com/watch?v=z3XFJxZGLV4,PATREON EXCLUSIVE | From Artist to Mogul (feat...,Joe Budden TV,2025-10-20T06:01:54-07:00,PT175M11S,106989,Entertainment,"['JOE BUDDEN', 'JOE BUDDEN TV', 'Slaughterhous..."
1,https://www.youtube.com/watch?v=nORjJmqe1kM,The Joe Budden Podcast Episode 870 | Hour 2,Joe Budden TV,2025-10-19T05:01:26-07:00,PT186M46S,221091,Entertainment,"['JOE BUDDEN', 'JOE BUDDEN TV', 'Slaughterhous..."
2,https://www.youtube.com/watch?v=ZSxdlP_tLLQ,The Joe Budden Podcast Episode 869 | Tricky Words,Joe Budden TV,2025-10-16T05:01:08-07:00,PT165M54S,265338,Entertainment,"['JOE BUDDEN', 'JOE BUDDEN TV', 'Slaughterhous..."
3,https://www.youtube.com/watch?v=rr8Nqzo52Ps,Joe Budden Podcast Episode 866 | From One Cras...,Joe Budden TV,2025-10-05T05:01:37-07:00,PT197M0S,380974,Entertainment,"['JOE BUDDEN', 'JOE BUDDEN TV', 'Slaughterhous..."
4,https://www.youtube.com/watch?v=jlXp8mmNRW0,The Joe Budden Podcast Episode 865 | Titty F**...,Joe Budden TV,2025-10-02T05:01:32-07:00,PT200M12S,282073,Entertainment,"['JOE BUDDEN', 'JOE BUDDEN TV', 'Slaughterhous..."


In [72]:
import os
os.listdir("data")

['README.md', 'test_file.csv', 'youtube_scraped_3000.csv']

--------------------------------------------------------------------------------

# YouTube Video Popularity - API Collection

---



In [127]:
%cd /content/drive/MyDrive/youtube-popularity-prediction/src


/content/drive/MyDrive/youtube-popularity-prediction/src


In [155]:
%%writefile src/api_youtube.py
import os
import requests
import pandas as pd
from dotenv import load_dotenv
import pathlib
from tqdm import tqdm
import time

# === Load API Key ===
env_path = pathlib.Path("/content/drive/MyDrive/youtube-popularity-prediction/.env")
load_dotenv(dotenv_path=env_path)
API_KEY = os.getenv("api_key")

if not API_KEY:
    raise ValueError(" API key not found. Check .env formatting.")

# === Output file ===
SAVE_PATH = "/content/drive/MyDrive/youtube-popularity-prediction/data/youtube_api_3000.csv"

# === YouTube regions (to reach 3000 videos total) ===
REGIONS = ["US", "IN", "GB", "BR", "JP", "KR", "FR", "DE", "CA", "MX", "RU", "IT", "AU", "ES", "ID"]

def get_trending_videos(region="US", max_results=300):
    """Collect trending videos from a single region."""
    base_url = "https://www.googleapis.com/youtube/v3/videos"
    videos = []
    next_page_token = None

    while len(videos) < max_results:
        params = {
            "part": "snippet,contentDetails,statistics",
            "chart": "mostPopular",
            "regionCode": region,
            "maxResults": 50,
            "pageToken": next_page_token,
            "key": API_KEY
        }

        response = requests.get(base_url, params=params)
        data = response.json()

        if response.status_code != 200:
            print(f" API error {response.status_code} for region {region}: {data}")
            break

        for item in data.get("items", []):
            snippet = item.get("snippet", {})
            stats = item.get("statistics", {})
            content = item.get("contentDetails", {})

            videos.append({
                "region": region,
                "video_id": item.get("id"),
                "title": snippet.get("title"),
                "channel": snippet.get("channelTitle"),
                "category_id": snippet.get("categoryId"),
                "views": stats.get("viewCount"),
                "likes": stats.get("likeCount"),
                "comments": stats.get("commentCount"),
                "upload_date": snippet.get("publishedAt"),
                "duration": content.get("duration"),
                "tags": ", ".join(snippet.get("tags", [])) if "tags" in snippet else "",
                "description": snippet.get("description", "")
            })

            if len(videos) >= max_results:
                break

        next_page_token = data.get("nextPageToken")
        if not next_page_token:
            break

        time.sleep(1)  # polite delay to avoid quota spikes

    return videos


if __name__ == "__main__":
    all_videos = []
    for region in REGIONS:
        print(f" Collecting for region: {region}")
        region_videos = get_trending_videos(region, max_results=300)
        all_videos.extend(region_videos)
        print(f" Collected {len(region_videos)} from {region}. Total so far: {len(all_videos)}")

    df = pd.DataFrame(all_videos)
    df.to_csv(SAVE_PATH, index=False)
    print(f"\n Saved {len(df)} total videos to {SAVE_PATH}")


Overwriting src/api_youtube.py


In [156]:
from dotenv import load_dotenv
import os

env_path = "/content/drive/MyDrive/youtube-popularity-prediction/.env"
load_dotenv(dotenv_path=env_path, override=True)

api_key = os.getenv("api_key")
if api_key:
    print(" Loaded API key prefix:", api_key[:8], "...")
else:
    print(" API key not loaded. Check .env formatting.")



✅ Loaded API key prefix: AIzaSyB7 ...


In [157]:
%cd /content/drive/MyDrive/youtube-popularity-prediction
!python src/api_youtube.py


/content/drive/MyDrive/youtube-popularity-prediction
🌎 Collecting for region: US
✅ Collected 200 from US. Total so far: 200
🌎 Collecting for region: IN
✅ Collected 200 from IN. Total so far: 400
🌎 Collecting for region: GB
✅ Collected 200 from GB. Total so far: 600
🌎 Collecting for region: BR
✅ Collected 200 from BR. Total so far: 800
🌎 Collecting for region: JP
✅ Collected 200 from JP. Total so far: 1000
🌎 Collecting for region: KR
✅ Collected 200 from KR. Total so far: 1200
🌎 Collecting for region: FR
✅ Collected 200 from FR. Total so far: 1400
🌎 Collecting for region: DE
✅ Collected 200 from DE. Total so far: 1600
🌎 Collecting for region: CA
✅ Collected 200 from CA. Total so far: 1800
🌎 Collecting for region: MX
✅ Collected 200 from MX. Total so far: 2000
🌎 Collecting for region: RU
✅ Collected 199 from RU. Total so far: 2199
🌎 Collecting for region: IT
✅ Collected 200 from IT. Total so far: 2399
🌎 Collecting for region: AU
✅ Collected 200 from AU. Total so far: 2599
🌎 Collecting fo

In [158]:
!ls -lh /content/drive/MyDrive/youtube-popularity-prediction/data/youtube_api_3000.csv


-rw------- 1 root root 4.3M Oct 21 20:40 /content/drive/MyDrive/youtube-popularity-prediction/data/youtube_api_3000.csv


In [159]:
import pandas as pd

df = pd.read_csv("/content/drive/MyDrive/youtube-popularity-prediction/data/youtube_api_3000.csv")
print("✅ Rows collected:", len(df))
df.head()


✅ Rows collected: 2999


Unnamed: 0,region,video_id,title,channel,category_id,views,likes,comments,upload_date,duration,tags,description
0,US,pCv0oP9JLKw,Morgan Wallen - 20 Cigarettes (Official Music ...,MorganWallenVEVO,10,481851,20159.0,1816.0,2025-10-20T18:01:00Z,PT3M,"Morgan Wallen, Big Loud Records Mercury Record...","Listen to Morgan Wallen's new album, “I’m The ..."
1,US,grjC63MftfI,Marvel Zombies | Official Zombie Mode Trailer ...,Marvel Rivals,20,373611,23510.0,1706.0,2025-10-20T18:00:42Z,PT1M54S,,🎃 Rivals... the dead are rising.\n\nUnder Khon...
2,US,FZmddh1MuyE,never should've played this again,CoryxKenshin,20,2987710,285150.0,16165.0,2025-10-20T20:50:43Z,PT37M14S,"those nights at fredbears, those, nights, at, ...","WELCOME, back to Those Nights at Fredbears! Re..."
3,US,YcpMVsvK8pk,Madison Beer - bittersweet (Official Music Video),MadisonBeerMusicVEVO,10,540831,65216.0,3046.0,2025-10-21T02:00:06Z,PT4M29S,"sing it loud, dangerous, madison beer dangerou...",Madison Beer - bittersweet (Official Music Vid...
4,US,jcFd1qbRg_I,"Character Trailer - ""Nefer: Shadowbearing Serp...",Genshin Impact,20,387185,47462.0,2746.0,2025-10-21T03:01:00Z,PT2M48S,,"Once a serpent sinks its fangs into its prey, ..."


In [160]:
%cd /content/drive/MyDrive/youtube-popularity-prediction
!git status


/content/drive/MyDrive/youtube-popularity-prediction
Refresh index: 100% (6/6), done.
On branch main
Your branch is up to date with 'origin/main'.

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31mapi_youtube.py[m
	[31mnotebooks/[m
	[31mold_data/[m
	[31mscrape_youtube.py[m
	[31msrc/api_youtube.py[m
	[31msrc/scrape_youtube.py[m

nothing added to commit but untracked files present (use "git add" to track)


In [None]:
!git config --global user.email "sammshtramm@gmail.com"
!git config --global user.name "Your GitHubUsername"


In [161]:
!git add .
!git commit -m "Added completed data collection notebook and scripts"
!git push


Author identity unknown

*** Please tell me who you are.

Run

  git config --global user.email "you@example.com"
  git config --global user.name "Your Name"

to set your account's default identity.
Omit --global to set the identity only in this repository.

fatal: unable to auto-detect email address (got 'root@efe96f72a0b9.(none)')
fatal: could not read Username for 'https://github.com': No such device or address
