# Data Preprocessing

Prepare MovieLens 20M for the streaming recommendation engine:
- Download the dataset if it is not already present
- Keep only quality users (>=50 ratings)
- Split each user's ratings by time (80% train / 20% test)
- Create ground truth for evaluation (positively rated test movies per user)

In [None]:
import pandas as pd
import numpy as np
import pickle
import zipfile
import os
import requests
from pathlib import Path
from tqdm import tqdm

In [None]:
# Filesystem layout: raw MovieLens files and processed outputs live under DATA_DIR.
DATA_DIR = Path("..") / "data"
ML20M_DIR = DATA_DIR.joinpath("ml-20m")
RATINGS_FILE = ML20M_DIR.joinpath("ratings.csv")
PROCESSED_DIR = DATA_DIR.joinpath("processed")

# Preprocessing thresholds.
MIN_USER_RATINGS = 50    # users with fewer ratings are dropped
MIN_POSITIVE_TEST = 5    # minimum positive test items for a user to enter the ground truth
RATING_THRESHOLD = 3.5   # ratings >= this value count as positive
TRAIN_RATIO = 0.8        # fraction of each user's ratings assigned to train

## Download MovieLens 20M

In [None]:
def download_movielens():
    """Download and extract MovieLens 20M unless RATINGS_FILE already exists.

    Streams the archive to ``DATA_DIR / "ml-20m.zip"``, extracts it into
    ``DATA_DIR`` and removes the archive afterwards. The archive is also
    removed when the download or extraction fails, so no truncated zip is
    left on disk.

    Raises:
        requests.HTTPError: if the server responds with a 4xx/5xx status.
        requests.RequestException: on connection errors or timeouts.
        zipfile.BadZipFile: if the downloaded archive cannot be read.
    """
    if RATINGS_FILE.exists():
        print(f"Dataset exists: {RATINGS_FILE}")
        return

    DATA_DIR.mkdir(parents=True, exist_ok=True)
    url = "https://files.grouplens.org/datasets/movielens/ml-20m.zip"
    zip_path = DATA_DIR / "ml-20m.zip"

    print(f"Downloading from {url}...")
    try:
        # The timeout bounds the connect and per-read waits, not the total
        # download time; without it a stalled connection would hang forever.
        with requests.get(url, stream=True, timeout=60) as resp:
            # Fail here rather than saving an HTTP error page as the ".zip".
            resp.raise_for_status()
            total = int(resp.headers.get('content-length', 0))

            with open(zip_path, 'wb') as f:
                with tqdm(total=total, unit='B', unit_scale=True) as pbar:
                    for chunk in resp.iter_content(8192):
                        f.write(chunk)
                        pbar.update(len(chunk))

        print("Extracting...")
        with zipfile.ZipFile(zip_path, 'r') as zf:
            zf.extractall(DATA_DIR)
    finally:
        # Remove the archive on success and on failure alike.
        if zip_path.exists():
            zip_path.unlink()

    print(f"Done: {RATINGS_FILE}")

download_movielens()

## Load and Filter Data

In [None]:
# Read the raw ratings and order them by user, then by timestamp within each user.
ratings = pd.read_csv(RATINGS_FILE).sort_values(by=['userId', 'timestamp'])
ratings.reset_index(drop=True, inplace=True)

n_ratings = len(ratings)
n_users = ratings['userId'].nunique()
n_movies = ratings['movieId'].nunique()
print(f"Loaded: {n_ratings:,} ratings, {n_users:,} users, {n_movies:,} movies")

In [None]:
# Keep only users that have at least MIN_USER_RATINGS ratings.
user_counts = ratings.groupby('userId').size()
has_enough_ratings = user_counts >= MIN_USER_RATINGS
quality_users = user_counts.loc[has_enough_ratings].index

from_quality_user = ratings['userId'].isin(quality_users)
ratings = ratings.loc[from_quality_user].copy()

print(f"After filtering (>={MIN_USER_RATINGS} ratings): {len(ratings):,} ratings, {len(quality_users):,} users")

## Train/Test Split

In [None]:
def per_user_split(df, train_ratio=0.8):
    """Split each user's ratings into a leading train part and a trailing test part.

    Rows are split by position within each ``userId`` group, so the split is
    chronological only if ``df`` is already ordered by time within each user.

    Args:
        df: DataFrame with a ``userId`` column.
        train_ratio: Fraction of each user's rows assigned to train; the cut
            position is ``int(n * train_ratio)``.

    Returns:
        ``(train, test)`` DataFrames with fresh 0..n-1 indexes. Users that
        would end up with fewer than 2 test rows go entirely into train.
        Either frame may be empty, but it always has the columns of ``df``.
    """
    train_parts, test_parts = [], []

    for _, udf in df.groupby('userId'):
        n = len(udf)
        split_idx = int(n * train_ratio)

        if n - split_idx < 2:  # need at least 2 test ratings
            train_parts.append(udf)
        else:
            train_parts.append(udf.iloc[:split_idx])
            test_parts.append(udf.iloc[split_idx:])

    # pd.concat raises on an empty list, and a bare pd.DataFrame() has no
    # columns; a zero-row slice of df keeps the schema so callers can still
    # index columns such as 'userId' on an empty result.
    empty = df.iloc[0:0]
    train = pd.concat(train_parts, ignore_index=True) if train_parts else empty.copy()
    test = pd.concat(test_parts, ignore_index=True) if test_parts else empty.copy()
    return train, test

# Split the filtered ratings per user and report the resulting sizes.
train, test = per_user_split(ratings, train_ratio=TRAIN_RATIO)
n_train, n_test = len(train), len(test)
print(f"Train: {n_train:,} | Test: {n_test:,}")

In [None]:
# Compare the user populations of the two splits.
train_users = set(train['userId'])
test_users = set(test['userId'])
shared_users = train_users & test_users
cold_start = test_users - train_users  # test users with no train rows

# With no cold-start users the overlap is reported as 100 without dividing,
# which also avoids a division by zero when there are no test users.
if cold_start:
    overlap_pct = len(shared_users) / len(test_users) * 100
else:
    overlap_pct = 100

print(f"User overlap: {len(shared_users):,} / {len(test_users):,} ({overlap_pct:.1f}%)")
print(f"Cold start users: {len(cold_start)}")

## Create Ground Truth

In [None]:
# Ground truth: for every user, the set of test movies rated at or above
# RATING_THRESHOLD.
is_positive = test['rating'] >= RATING_THRESHOLD
test_positive = test.loc[is_positive]

ground_truth_all = test_positive.groupby('userId')['movieId'].apply(set).to_dict()

# Drop users that have fewer than MIN_POSITIVE_TEST positive test movies.
ground_truth = {
    user_id: liked
    for user_id, liked in ground_truth_all.items()
    if not len(liked) < MIN_POSITIVE_TEST
}

positives_per_user = [len(liked) for liked in ground_truth.values()]
print(f"Ground truth users: {len(ground_truth):,} (filtered from {len(ground_truth_all):,})")
print(f"Avg positive items per user: {np.mean(positives_per_user):.1f}")

## Save Processed Data

In [None]:
# Write all preprocessing outputs to PROCESSED_DIR.
# parents=True creates missing parent directories too, matching the download cell.
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# save train/test
train_save = train[['userId', 'movieId', 'rating', 'timestamp']]
test_save = test[['userId', 'movieId', 'rating', 'timestamp']]

train_save.to_csv(PROCESSED_DIR / "train_ratings.csv", index=False)
test_save.to_csv(PROCESSED_DIR / "test_ratings.csv", index=False)

# save timestamp-ordered version for streaming (critical for watermarks)
# A stable sort keeps rows that share a timestamp in their existing relative
# order; the default single-column sort does not guarantee any order for ties.
train_ordered = train_save.sort_values('timestamp', kind='mergesort')
train_ordered.to_csv(PROCESSED_DIR / "train_ratings_session_ordered.csv", index=False)

# save ground truth
with open(PROCESSED_DIR / "ground_truth.pkl", 'wb') as f:
    pickle.dump(ground_truth, f)

# save stats
stats = {
    'train_ratings': len(train_save),
    'test_ratings': len(test_save),
    'total_users': ratings['userId'].nunique(),
    'ground_truth_users': len(ground_truth),
    'rating_threshold': RATING_THRESHOLD,
    'min_user_ratings': MIN_USER_RATINGS,
}
with open(PROCESSED_DIR / "dataset_stats.pkl", 'wb') as f:
    pickle.dump(stats, f)
