In [10]:
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/youtube-popularity-prediction
!ls


Mounted at /content/drive
/content/drive/MyDrive/youtube-popularity-prediction
data  notebooks  old_data  README.md  reports  requirements.txt  src


In [6]:
import pandas as pd
import numpy as np

# Clean column names (remove spaces, lowercase)
df_scraped.columns = df_scraped.columns.str.strip().str.lower()
df_api.columns = df_api.columns.str.strip().str.lower()

# Fill missing numeric values and safely convert to numeric
numeric_cols = ["views", "likes", "comments"]

for df_name, df in [("Scraped", df_scraped), ("API", df_api)]:
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0)
    print(f"✅ {df_name} dataset numeric columns standardized: {[c for c in numeric_cols if c in df.columns]}")

# Fill other missing fields
fill_defaults = {
    "title": "Unknown Title",
    "channel": "Unknown Channel",
    "category": "Unknown",
    "upload_date": pd.NaT,
    "duration": "PT0S",
    "tags": ""
}
df_scraped = df_scraped.fillna(fill_defaults)
df_api = df_api.fillna(fill_defaults)

print("✅ Missing values handled and columns standardized.")


✅ Scraped dataset numeric columns standardized: ['views']
✅ API dataset numeric columns standardized: ['views', 'likes', 'comments']
✅ Missing values handled and columns standardized.


In [10]:
import re
import numpy as np
import pandas as pd
from datetime import datetime

# --- Helper 1: Convert ISO 8601 duration (e.g., 'PT5M33S') → minutes ---
def convert_duration(duration_str):
    if not isinstance(duration_str, str) or not duration_str.startswith("PT"):
        return np.nan
    minutes, seconds = 0, 0
    match_min = re.search(r"(\d+)M", duration_str)
    match_sec = re.search(r"(\d+)S", duration_str)
    if match_min:
        minutes = int(match_min.group(1))
    if match_sec:
        seconds = int(match_sec.group(1))
    return round(minutes + seconds / 60, 2)

# --- Helper 2: Convert upload date safely (handles tz-aware, strings, NaT) ---
def to_datetime_safe(val):
    try:
        dt = pd.to_datetime(val, errors="coerce", utc=True)
        if pd.notnull(dt):
            dt = dt.tz_localize(None)
        return dt
    except Exception:
        return pd.NaT

# --- Helper 3: Keyword count in titles/descriptions ---
def keyword_density(text):
    if not isinstance(text, str):
        return 0
    return len(re.findall(r"\b[a-zA-Z]{3,}\b", text))  # words ≥3 letters

# --- Apply feature transformations to both datasets ---
for name, df in [("Scraped", df_scraped), ("API", df_api)]:
    print(f"🔧 Processing {name} dataset...")

    # Duration in minutes
    if "duration" in df.columns:
        df["duration_mins"] = df["duration"].apply(convert_duration)

    # Upload date and days since upload
    if "upload_date" in df.columns:
        now = pd.Timestamp.now(tz=None)
        df["upload_date"] = df["upload_date"].apply(to_datetime_safe)
        df["days_since_upload"] = (now - df["upload_date"]).dt.days

    # Engagement rate (only if numeric columns exist)
    if all(col in df.columns for col in ["likes", "comments", "views"]):
        df["engagement_rate"] = (pd.to_numeric(df["likes"], errors="coerce") +
                                 pd.to_numeric(df["comments"], errors="coerce")) / \
                                 pd.to_numeric(df["views"], errors="coerce")
        df["engagement_rate"] = df["engagement_rate"].replace([np.inf, -np.inf], np.nan).fillna(0)

    # Keyword features
    if "title" in df.columns:
        df["title_keyword_count"] = df["title"].apply(keyword_density)
    if "description" in df.columns:
        df["desc_keyword_count"] = df["description"].apply(keyword_density)

print("✅ Feature engineering complete!")

# --- Preview results ---
print("\nScraped dataset sample:")
display(df_scraped.head(3))

print("\nAPI dataset sample:")
display(df_api.head(3))



🔧 Processing Scraped dataset...
🔧 Processing API dataset...
✅ Feature engineering complete!

Scraped dataset sample:


Unnamed: 0,url,title,channel,upload_date,duration,views,category,tags,duration_mins,days_since_upload,title_keyword_count
0,https://www.youtube.com/watch?v=z3XFJxZGLV4,PATREON EXCLUSIVE | From Artist to Mogul (feat...,Joe Budden TV,2025-10-20 06:01:54,PT175M11S,106989,Entertainment,"['JOE BUDDEN', 'JOE BUDDEN TV', 'Slaughterhous...",175.18,1,12
1,https://www.youtube.com/watch?v=nORjJmqe1kM,The Joe Budden Podcast Episode 870 | Hour 2,Joe Budden TV,2025-10-19 05:01:26,PT186M46S,221091,Entertainment,"['JOE BUDDEN', 'JOE BUDDEN TV', 'Slaughterhous...",186.77,2,6
2,https://www.youtube.com/watch?v=ZSxdlP_tLLQ,The Joe Budden Podcast Episode 869 | Tricky Words,Joe Budden TV,2025-10-16 05:01:08,PT165M54S,265338,Entertainment,"['JOE BUDDEN', 'JOE BUDDEN TV', 'Slaughterhous...",165.9,5,7



API dataset sample:


Unnamed: 0,region,video_id,title,channel,category_id,views,likes,comments,upload_date,duration,tags,description,duration_mins,days_since_upload,engagement_rate,title_keyword_count,desc_keyword_count
0,US,pCv0oP9JLKw,Morgan Wallen - 20 Cigarettes (Official Music ...,MorganWallenVEVO,10,481851,20159.0,1816.0,2025-10-20 18:01:00,PT3M,"Morgan Wallen, Big Loud Records Mercury Record...","Listen to Morgan Wallen's new album, “I’m The ...",3.0,1,0.045605,6,252
1,US,grjC63MftfI,Marvel Zombies | Official Zombie Mode Trailer ...,Marvel Rivals,20,373611,23510.0,1706.0,2025-10-20 18:00:42,PT1M54S,,🎃 Rivals... the dead are rising.\n\nUnder Khon...,1.9,1,0.067493,8,135
2,US,FZmddh1MuyE,never should've played this again,CoryxKenshin,20,2987710,285150.0,16165.0,2025-10-20 20:50:43,PT37M14S,"those nights at fredbears, those, nights, at, ...","WELCOME, back to Those Nights at Fredbears! Re...",37.23,1,0.100851,5,87


In [11]:
import numpy as np
from sklearn.preprocessing import StandardScaler

# --- Function: Clean and normalize dataset ---
def preprocess_and_normalize(df, dataset_name):
    print(f"\n🧹 Cleaning and normalizing {dataset_name} dataset...")

    # Drop duplicates and reset index
    df = df.drop_duplicates(subset=["title", "channel"], keep="first").reset_index(drop=True)

    # Fill or fix missing numeric data
    numeric_cols = ["views", "likes", "comments", "duration_mins", "days_since_upload", "engagement_rate"]
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")
            df[col].fillna(df[col].median(), inplace=True)

    # Fill text-based columns
    for col in ["title", "description", "channel"]:
        if col in df.columns:
            df[col].fillna("Unknown", inplace=True)

    # --- Log-transform large-scale numeric features (e.g., views, likes) ---
    log_cols = ["views", "likes", "comments"]
    for col in log_cols:
        if col in df.columns:
            df[f"log_{col}"] = np.log1p(df[col])  # log(1+x) handles zeros safely

    # --- Standardize numeric features for ML models ---
    scaler = StandardScaler()
    scale_cols = [c for c in ["log_views", "duration_mins", "days_since_upload", "engagement_rate"] if c in df.columns]
    if scale_cols:
        df[[f"{c}_scaled" for c in scale_cols]] = scaler.fit_transform(df[scale_cols])

    # --- Final check for NaNs ---
    df = df.replace([np.inf, -np.inf], np.nan)
    df.fillna(0, inplace=True)

    print(f"✅ {dataset_name} preprocessing complete. Rows: {len(df)}, Columns: {len(df.columns)}")
    return df


# --- Run preprocessing on both datasets ---
df_scraped_clean = preprocess_and_normalize(df_scraped.copy(), "Scraped")
df_api_clean = preprocess_and_normalize(df_api.copy(), "API")


# --- Save cleaned datasets ---
scraped_path = "/content/drive/MyDrive/youtube-popularity-prediction/data/youtube_scraped_clean.csv"
api_path = "/content/drive/MyDrive/youtube-popularity-prediction/data/youtube_api_clean.csv"

df_scraped_clean.to_csv(scraped_path, index=False)
df_api_clean.to_csv(api_path, index=False)

print("\n💾 Saved cleaned datasets:")
print(f"   📂 Scraped → {scraped_path}")
print(f"   📂 API → {api_path}")

# --- Preview final structure ---
print("\n📊 Scraped sample:")
display(df_scraped_clean.head(3))

print("\n📊 API sample:")
display(df_api_clean.head(3))



🧹 Cleaning and normalizing Scraped dataset...
✅ Scraped preprocessing complete. Rows: 60, Columns: 15

🧹 Cleaning and normalizing API dataset...
✅ API preprocessing complete. Rows: 2362, Columns: 24


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna("Unknown", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always


💾 Saved cleaned datasets:
   📂 Scraped → /content/drive/MyDrive/youtube-popularity-prediction/data/youtube_scraped_clean.csv
   📂 API → /content/drive/MyDrive/youtube-popularity-prediction/data/youtube_api_clean.csv

📊 Scraped sample:


Unnamed: 0,url,title,channel,upload_date,duration,views,category,tags,duration_mins,days_since_upload,title_keyword_count,log_views,log_views_scaled,duration_mins_scaled,days_since_upload_scaled
0,https://www.youtube.com/watch?v=z3XFJxZGLV4,PATREON EXCLUSIVE | From Artist to Mogul (feat...,Joe Budden TV,2025-10-20 06:01:54,PT175M11S,106989,Entertainment,"['JOE BUDDEN', 'JOE BUDDEN TV', 'Slaughterhous...",175.18,1,12,11.580491,-1.360145,1.559359,-0.994401
1,https://www.youtube.com/watch?v=nORjJmqe1kM,The Joe Budden Podcast Episode 870 | Hour 2,Joe Budden TV,2025-10-19 05:01:26,PT186M46S,221091,Entertainment,"['JOE BUDDEN', 'JOE BUDDEN TV', 'Slaughterhous...",186.77,2,6,12.306334,-1.148385,1.705083,-0.993768
2,https://www.youtube.com/watch?v=ZSxdlP_tLLQ,The Joe Budden Podcast Episode 869 | Tricky Words,Joe Budden TV,2025-10-16 05:01:08,PT165M54S,265338,Entertainment,"['JOE BUDDEN', 'JOE BUDDEN TV', 'Slaughterhous...",165.9,5,7,12.488764,-1.095163,1.44268,-0.991868



📊 API sample:


Unnamed: 0,region,video_id,title,channel,category_id,views,likes,comments,upload_date,duration,...,engagement_rate,title_keyword_count,desc_keyword_count,log_views,log_likes,log_comments,log_views_scaled,duration_mins_scaled,days_since_upload_scaled,engagement_rate_scaled
0,US,pCv0oP9JLKw,Morgan Wallen - 20 Cigarettes (Official Music ...,MorganWallenVEVO,10,481851,20159.0,1816.0,2025-10-20 18:01:00,PT3M,...,0.045605,6,252,13.085392,9.911456,7.504942,1.368484,-1.099282,-0.041569,-0.036594
1,US,grjC63MftfI,Marvel Zombies | Official Zombie Mode Trailer ...,Marvel Rivals,20,373611,23510.0,1706.0,2025-10-20 18:00:42,PT1M54S,...,0.067493,8,135,12.830973,10.065224,7.442493,1.190724,-1.176474,-0.041569,0.459166
2,US,FZmddh1MuyE,never should've played this again,CoryxKenshin,20,2987710,285150.0,16165.0,2025-10-20 20:50:43,PT37M14S,...,0.100851,5,87,14.910018,12.560774,9.690666,2.643329,1.302786,-0.041569,1.214762


In [9]:
%cd /content/drive/MyDrive/
!git clone https://github.com/<your-username>/youtube-popularity-prediction.git
%cd youtube-popularity-prediction

[Errno 2] No such file or directory: '/content/drive/MyDrive/'
/content
/bin/bash: line 1: your-username: No such file or directory
[Errno 2] No such file or directory: 'youtube-popularity-prediction'
/content


In [14]:
!git config --global --unset user.email
!git config --global --unset user.name
!git config --global user.email "Samshtramm@gmail.com"
!git config --global user.name "SamShtram"



In [15]:
!git config --global --list


user.email=Samshtramm@gmail.com
user.name=SamShtram


In [11]:
from getpass import getpass
import os

# Store your GitHub credentials securely
os.environ['GITHUB_USER'] = input("input name")
os.environ['GITHUB_TOKEN'] = getpass("input token")

# Configure Git globally
!git config --global user.email "Sammshtramm@gmail.com"
!git config --global user.name "$SamShtram"

# Set up the remote URL to include your token
!git remote set-url origin https://$GITHUB_USER:$GITHUB_TOKEN@github.com/SamShtram/youtube-popularity-prediction


input nameSamShtram
input token··········


In [17]:
!git reset --soft HEAD~1


In [18]:
!git add .
!git commit -m "Clean commit without token: notebooks and scripts"


[main 3f1267e] Clean commit without token: notebooks and scripts
 3 files changed, 1 insertion(+), 134 deletions(-)
 delete mode 100644 api_youtube.py
 delete mode 100644 scrape_youtube.py


In [16]:

!git add .
!git commit -m "Added all notebooks, data, and scripts for full project pipeline"
!git push origin main


[main 63dba0c] Added all notebooks, data, and scripts for full project pipeline
 3 files changed, 1 insertion(+), 134 deletions(-)
 delete mode 100644 api_youtube.py
 delete mode 100644 scrape_youtube.py
Enumerating objects: 7, done.
Counting objects: 100% (7/7), done.
Delta compression using up to 2 threads
Compressing objects: 100% (4/4), done.
Writing objects: 100% (4/4), 1.48 KiB | 101.00 KiB/s, done.
Total 4 (delta 2), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (2/2), completed with 2 local objects.[K
remote: [1;31merror[m: GH013: Repository rule violations found for refs/heads/main.[K
remote: 
remote: - GITHUB PUSH PROTECTION[K
remote:   —————————————————————————————————————————[K
remote:     Resolve the following violations before pushing again[K
remote: 
remote:     - Push cannot contain secrets[K
remote: 
remote:     [K
remote:      (?) Learn how to resolve a blocked push[K
remote:      https://docs.github.com/code-security/secret-scanning/worki