In [1]:
import pandas as pd
import random
import uuid
from datetime import datetime, timedelta
from faker import Faker

# Setup: fixtures shared by the data-generation sections of this script.
# Channel id -> starting subscriber count.
channels = dict(
    StarVT_vtuber_001=150_000,
    TechVerse_tech_002=200_000,
    EduNuggets_edu_003=180_000,
    GameCrush_game_004=250_000,
)
# The simulated period begins 180 days before the moment the script runs.
start_date = datetime.now() - timedelta(days=180)
# Faker instance used to produce placeholder sentences.
fake = Faker()

# --- 1. Video Performance Data ---
# Builds 30 synthetic video rows per channel and writes them to
# "video_performance_data.csv". Each row carries random engagement numbers
# plus the upload timestamp and its derived weekday name / hour.
video_data = []
for cid in channels:
    for i in range(30):  # 30 videos per channel
        # random.randint is inclusive on both ends. Capping the day offset at
        # 179 keeps day + hour offsets strictly inside the 180-day window, so
        # no video is uploaded after the moment start_date was derived from.
        upload_dt = start_date + timedelta(
            days=random.randint(0, 179),
            hours=random.randint(0, 23),
        )
        views = random.randint(10000, 1000000)
        # Same 1000..50000 range as before, but never more likes than views
        # (views >= 10000, so the upper bound always stays above 1000).
        likes = random.randint(1000, min(50000, views))
        video_data.append({
            "channel_id": cid,
            "video_id": f"{cid}_vid_{i+1}",
            # Faker sentences end with a period; strip it for a title look.
            "title": fake.sentence(nb_words=random.randint(3, 7)).replace(".", ""),
            "upload_date": upload_dt,
            "views": views,
            "likes": likes,
            "dislikes": random.randint(0, 2000),
            "comments_count": random.randint(100, 5000),
            "click_through_rate": round(random.uniform(2.0, 12.0), 2),
            "average_watch_time": round(random.uniform(2.5, 15.0), 2),
            "upload_day": upload_dt.strftime("%A"),
            "upload_hour": upload_dt.hour,
        })
video_df = pd.DataFrame(video_data)
video_df.to_csv("video_performance_data.csv", index=False)

# --- 2. Weekly Subscriber Count (26 weeks) ---
# For every channel, walk 26 weekly steps from start_date. Each step adds a
# normally distributed delta (mean 1200, sigma 400, truncated to int) to the
# running total, and the recorded value is floored at 100000.
subscriber_data = []
for cid, base in channels.items():
    running_total = base
    for week in range(26):
        week_start = start_date + timedelta(weeks=week)
        running_total += int(random.normalvariate(1200, 400))
        row = {
            "channel_id": cid,
            "date": f"{week_start:%Y-%m-%d}",
            "subscriber_count": running_total if running_total > 100000 else 100000,
        }
        subscriber_data.append(row)
subs_df = pd.DataFrame(subscriber_data)
subs_df.to_csv("subscriber_data.csv", index=False)

# --- 3. Comments Data (1,000 per channel) ---
# Attaches 1000 synthetic comments to randomly chosen videos of each channel
# and writes them to "comments_data.csv". A comment's timestamp is sampled
# between its video's upload time and the end of the 180-day window, so a
# comment never predates the video it belongs to.
comments_data = []
period_end = start_date + timedelta(days=180)
for cid in channels:
    channel_videos = video_df[video_df["channel_id"] == cid]
    # video_id -> upload timestamp for this channel's videos.
    upload_dates = dict(zip(channel_videos["video_id"], channel_videos["upload_date"]))
    video_ids = list(upload_dates)
    for _ in range(1000):
        vid = random.choice(video_ids)
        uploaded = upload_dates[vid]
        # Clamp at zero: an upload time may land after period_end, in which
        # case the comment is stamped at the upload moment itself.
        span_seconds = max(int((period_end - uploaded).total_seconds()), 0)
        ts = uploaded + timedelta(seconds=random.randint(0, span_seconds))
        comments_data.append({
            "channel_id": cid,
            "video_id": vid,
            "comment_id": str(uuid.uuid4()),
            "comment_text": fake.sentence(nb_words=random.randint(5, 20)),
            "timestamp": ts.strftime("%Y-%m-%d %H:%M:%S"),
        })
comments_df = pd.DataFrame(comments_data)
comments_df.to_csv("comments_data.csv", index=False)
