In [None]:
import pandas as pd
import random
import os
from datetime import datetime, timedelta

# Create the raw_data directory if it doesn't exist
os.makedirs("raw_data", exist_ok=True)

# Seed the RNG before the first draw so that re-running the notebook
# regenerates exactly the same files. Change SEED to get a different dataset.
SEED = 42
random.seed(SEED)

# Define possible values
actions = ["play", "pause", "skip", "forward"]
devices = ["mobile", "desktop", "tablet"]
regions = ["US", "EU", "APAC"]

start_date = datetime(2023, 9, 1)  # first day that gets a log file
days = 7                           # number of consecutive daily log files
user_ids = range(100, 201)         # user ids 100..200 inclusive

# 10 unique content ids drawn (in random order) from the 11 values 1000..1010.
# Both the metadata table and the activity logs draw from this same list.
content_ids = random.sample(range(1000, 1011), 10)

# Build the content catalogue: one row per entry in content_ids, so every
# content_id that can appear in the activity logs also has a metadata record.
titles = [
    "Summer Vibes", "Rock Anthem", "Daily News", "Jazz Nights", "Tech Podcast",
    "Hip Hop Beats", "Indie Wave", "Classical Hits", "Electronic Pulse", "Retro Pop",
]
categories = [
    "Pop", "Rock", "Podcast", "News", "Jazz",
    "Hip-Hop", "Classical", "Indie", "Electronic", "Retro",
]
artists = [
    "DJ Alpha", "The Beats", "News Daily", "Smooth Jazz", "Tech Guru",
    "MC Flow", "Indie Star", "Orchestra Live", "Electro King", "Vintage Crew",
]

# Title, category and artist are each drawn independently, so values can repeat
# across rows and are not matched to one another.
metadata = [
    [
        cid,
        random.choice(titles),
        random.choice(categories),
        random.randint(120, 360),  # length in seconds
        random.choice(artists),
    ]
    for cid in content_ids
]

metadata_columns = ["content_id", "title", "category", "length", "artist"]
metadata_df = pd.DataFrame(metadata, columns=metadata_columns)
metadata_df.to_csv("raw_data/content_metadata.csv", index=False)

# Write one CSV of synthetic user activity for each day, starting at start_date.
log_columns = ["user_id", "content_id", "action", "timestamp", "device", "region", "session_id"]


def _random_log_row(day_start):
    """Return one activity record, as a list ordered like log_columns.

    day_start is the datetime at 00:00:00 of the day being generated; the
    record's timestamp is a random second within that day.
    """
    user_id = random.choice(user_ids)
    content_id = random.choice(content_ids)  # only ids present in the metadata
    action = random.choice(actions)
    offset = timedelta(
        hours=random.randint(0, 23),
        minutes=random.randint(0, 59),
        seconds=random.randint(0, 59),
    )
    device = random.choice(devices)
    region = random.choice(regions)
    session_id = f"sess-{random.randint(1000, 9999)}-{random.choice('ABCDEFGH')}"
    stamp = (day_start + offset).strftime("%Y-%m-%d %H:%M:%S")
    return [user_id, content_id, action, stamp, device, region, session_id]


for day_offset in range(days):
    day_start = start_date + timedelta(days=day_offset)
    date_str = day_start.strftime("%Y-%m-%d")

    # Each day gets between 20 and 30 events (inclusive).
    event_count = random.randint(20, 30)
    data = [_random_log_row(day_start) for _ in range(event_count)]

    df = pd.DataFrame(data, columns=log_columns)
    df.to_csv(f"raw_data/user_logs_{date_str}.csv", index=False)

# Zip the raw_data folder for downloading
import shutil

# make_archive appends the ".zip" extension itself and, because the base name
# is relative, writes user_data.zip into the current working directory with the
# contents of raw_data at the archive root. The archive is therefore already at
# ./user_data.zip, so no follow-up move is required.
shutil.make_archive("user_data", "zip", "raw_data")

print("Data generation complete! Download 'user_data.zip' to access CSV files.")

Data generation complete! Download 'user_data.zip' to access CSV files.


In [None]:
# Trigger a browser download of the archive when running inside Google Colab.
# The google.colab package only exists in the Colab runtime, so the import is
# guarded to keep "Restart & Run All" working in other Jupyter environments.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download("user_data.zip")
else:
    print("google.colab is not available; skipping the browser download of 'user_data.zip'.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>