In [2]:
import pandas as pd
import numpy as np

# Read the pre-shrunk MovieLens ratings subset into a DataFrame
ratings_csv = '../subset_ratings.csv'
ratings = pd.read_csv(ratings_csv)



In [3]:
# 1) Split original train/test by user (80% train, 20% test per user)
from sklearn.model_selection import train_test_split

def split_per_user(df, test_frac=0.2, seed=42):
    """Split a ratings frame into train/test parts independently for every user.

    Each user's rows are split with ``train_test_split`` using ``test_frac`` as
    the test size and ``seed`` as the random state. Users with fewer than two
    rows cannot be split, so all of their rows go to the train part.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings table; must contain a ``userId`` column.
    test_frac : float, default 0.2
        Passed to ``train_test_split`` as ``test_size`` for each user.
    seed : int, default 42
        Passed to ``train_test_split`` as ``random_state`` for each user.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Both have a fresh 0..n-1 index. Either may be empty (with the same
        columns as ``df``) when no rows were assigned to it.
    """
    train_list, test_list = [], []
    for _, user_df in df.groupby('userId'):
        if len(user_df) < 2:
            # A single rating cannot be split; keep it for training
            train_list.append(user_df)
        else:
            tr, te = train_test_split(user_df, test_size=test_frac, random_state=seed)
            train_list.append(tr)
            test_list.append(te)

    def _combine(parts):
        # pd.concat raises ValueError on an empty list, so fall back to a
        # zero-row slice of the input (keeps column names and dtypes)
        if not parts:
            return df.iloc[0:0].reset_index(drop=True)
        return pd.concat(parts).reset_index(drop=True)

    return _combine(train_list), _combine(test_list)

# Base split shared by both cold-start simulations (80% train / 20% test per user)
train_df, test_df = split_per_user(ratings, test_frac=0.2, seed=42)



In [4]:
# --- USER COLD-START SIMULATION ---
# A fraction of users is made "cold": only K of their training ratings are kept
# in the training set, and the rest of their training ratings become the test set.

# Parameters
cold_user_frac = 0.05  # 5% of users
cold_user_K    = 2     # keep only 2 ratings
cold_user_seed = 42    # fixes which users are selected so re-runs reproduce the split

all_users = train_df['userId'].unique()
n_cold = int(len(all_users) * cold_user_frac)
# Use a locally seeded generator: no global NumPy seed is set in the visible cells,
# so np.random.choice would pick different users on every run.
cold_users = np.random.default_rng(cold_user_seed).choice(all_users, size=n_cold, replace=False)
# Hash lookup instead of scanning the whole array once per user
cold_user_ids = set(cold_users.tolist())

user_train_rows, user_test_rows = [], []

for uid, grp in train_df.groupby('userId'):
    if uid in cold_user_ids:
        # Sample K ratings for training (or all of them if the user has fewer than K)
        sampled = grp.sample(n=min(cold_user_K, len(grp)), random_state=42)
        user_train_rows.append(sampled)
        # All others go to cold-start test
        user_test_rows.append(grp.drop(sampled.index))
    else:
        # Non-cold users keep their full training history
        user_train_rows.append(grp)

# Build final user-structured sets.
# pd.concat raises on an empty list (e.g. when n_cold == 0), so fall back to a
# zero-row slice of train_df that keeps the same columns.
user_cold_train = (pd.concat(user_train_rows) if user_train_rows else train_df.iloc[0:0]).reset_index(drop=True)
user_cold_test  = (pd.concat(user_test_rows) if user_test_rows else train_df.iloc[0:0]).reset_index(drop=True)




In [5]:
# --- ITEM COLD-START SIMULATION ---
# A fraction of movies is made "cold": all of their ratings are removed from the
# training set, and the test set is restricted to ratings of those movies.

# Parameters
cold_item_frac = 0.05  # 5% of movies
cold_item_seed = 42    # fixes which movies are selected so re-runs reproduce the split

# Candidates are the movies that appear in the base training split
all_items = train_df['movieId'].unique()
n_items = int(len(all_items) * cold_item_frac)
# Use a locally seeded generator: no global NumPy seed is set in the visible cells,
# so np.random.choice would pick different movies on every run.
cold_items = np.random.default_rng(cold_item_seed).choice(all_items, size=n_items, replace=False)

# Remove cold-start items from training
item_cold_train = train_df[~train_df['movieId'].isin(cold_items)].reset_index(drop=True)

# Keep only cold-start items in test.
# Note: this draws from test_df only; the cold-item ratings dropped from train_df
# above are discarded rather than moved into the test set.
item_cold_test = test_df[test_df['movieId'].isin(cold_items)].reset_index(drop=True)



In [6]:
# Report the size of each cold-start split, one line per fact
user_summary = [
    "=== USER COLD-START ===",
    f"Cold users count: {len(cold_users)}",
    f"User Cold-Train shape: {user_cold_train.shape}",
    f"User Cold-Test shape : {user_cold_test.shape}",
]

# The leading newline in the header leaves a blank line between the two sections
item_summary = [
    "\n=== ITEM COLD-START ===",
    f"Cold items count: {len(cold_items)}",
    f"Item Cold-Train shape: {item_cold_train.shape}",
    f"Item Cold-Test shape : {item_cold_test.shape}",
]

for summary_line in user_summary + item_summary:
    print(summary_line)



=== USER COLD-START ===
Cold users count: 500
User Cold-Train shape: (1569707, 4)
User Cold-Test shape : (89213, 4)

=== ITEM COLD-START ===
Cold items count: 352
Item Cold-Train shape: (1560622, 4)
Item Cold-Test shape : (24799, 4)


In [7]:
# Write every split to its own CSV (no index column) for later evaluation
splits_to_save = {
    'user_cold_train.csv': user_cold_train,
    'user_cold_test.csv': user_cold_test,
    'item_cold_train.csv': item_cold_train,
    'item_cold_test.csv': item_cold_test,
}
for csv_name, split_frame in splits_to_save.items():
    split_frame.to_csv(csv_name, index=False)