In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive; the data paths
# used by later cells live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from pathlib import Path
import pandas as pd
import numpy as np

# Dataset folder on the mounted Drive and the ratings file inside it.
DATA_DIR = Path('/content/drive/MyDrive/ml-25m')
RATINGS_CSV = DATA_DIR.joinpath('ratings.csv')

# Column -> dtype mapping; its keys double as the list of columns to load.
ratings_dtypes = {'userId':'int32','movieId':'int32','rating':'float32','timestamp':'int64'}

ratings = pd.read_csv(
    RATINGS_CSV,
    usecols=list(ratings_dtypes),
    dtype=ratings_dtypes,
    engine='pyarrow',  # drop this argument if it raises an error
)

In [4]:
# Cleaning knobs: which duplicate (userId, movieId) row survives, and the
# minimum number of ratings a user needs in order to be kept.
KEEP = 'last'
MIN_K = 20

# Order each user's rows chronologically (movieId breaks timestamp ties), then
# collapse repeated (userId, movieId) pairs; with KEEP='last' the row that
# comes latest in that ordering is the one retained.
ratings = (
    ratings
    .sort_values(['userId','timestamp','movieId'])
    .reset_index(drop=True)
    .drop_duplicates(subset=['userId','movieId'], keep=KEEP)
)

# Keep only users that still have at least MIN_K ratings after de-duplication.
user_counts = ratings['userId'].value_counts()
keep_users = user_counts[user_counts.ge(MIN_K)].index
ratings = ratings.loc[ratings['userId'].isin(keep_users)].copy()

print(f"Ratings after cleaning: {len(ratings):,}  | users: {ratings.userId.nunique():,}")

Ratings after cleaning: 25,000,095  | users: 162,541


In [5]:
def leave_last_n(df: pd.DataFrame, n: int = 1):
    """Per-user temporal split: the last n interactions of each user go to test.

    Rows are ordered per user by ``timestamp`` (ties broken by ``movieId``).
    For every user with more than ``n`` rows, the final ``n`` rows go to the
    test frame and the rest to the train frame. Users with ``n`` rows or fewer
    stay entirely in train, so they never appear in test.

    Parameters
    ----------
    df : pd.DataFrame
        Interactions with at least the columns ``userId``, ``timestamp`` and
        ``movieId``. Assumes ``userId`` has no missing values (the notebook
        loads it as int32) -- TODO confirm if reused on other data.
    n : int, default 1
        Number of trailing interactions per user to hold out. ``n=0`` puts
        every row in train and returns an empty test frame.

    Returns
    -------
    (train_df, test_df) : tuple of pd.DataFrame
        Both frames have the columns of ``df`` and a fresh 0..N-1 index.
        Users appear in order of first appearance in ``df``; rows are
        chronological within each user.

    Raises
    ------
    ValueError
        If ``n`` is negative.
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")
    if len(df) == 0:
        # Nothing to split: return two empty frames that keep df's columns.
        empty = df.iloc[0:0].reset_index(drop=True)
        return empty, empty.copy()

    # Rank users by first appearance so the output keeps the same user order
    # as iterating df.groupby('userId', sort=False).
    user_rank = pd.factorize(df['userId'])[0]
    # np.lexsort treats the LAST key as primary: user, then timestamp, then movieId.
    order = np.lexsort((df['movieId'].to_numpy(), df['timestamp'].to_numpy(), user_rank))
    ordered = df.iloc[order]

    # Position of every row counted from the start and from the end of its
    # user's history; together they give the history length without a loop.
    by_user = ordered.groupby('userId', sort=False)
    from_start = by_user.cumcount().to_numpy()
    from_end = by_user.cumcount(ascending=False).to_numpy()
    history_len = from_start + from_end + 1

    # Held out: one of the user's last n rows AND the user has more than n rows.
    in_test = (from_end < n) & (history_len > n)
    train_df = ordered[~in_test].reset_index(drop=True)
    test_df = ordered[in_test].reset_index(drop=True)
    return train_df, test_df

# Number of most recent interactions per user that leave_last_n holds out for test.
N_LAST = 10
train_df, test_df = leave_last_n(ratings, n=N_LAST)

# Report the size of each split and how many distinct users each one contains.
print(f"Train size: {len(train_df):,} | Test size: {len(test_df):,}")
print(f"Users in train: {train_df.userId.nunique():,} | Users in test: {test_df.userId.nunique():,}")


Train size: 23,374,685 | Test size: 1,625,410
Users in train: 162,541 | Users in test: 162,541


In [6]:
def sanity_check(train_df, test_df):
    """Print the share of users whose train rows do not post-date their test rows.

    A user passes when their latest train ``timestamp`` is less than or equal
    to their earliest test ``timestamp``. Only users present in both frames
    are compared. If ``test_df`` is empty, a warning is printed instead.
    Returns None; the result is only printed.
    """
    if test_df.empty:
        print("Warning: test_df is empty (maybe N_LAST too large or many users with few interactions).")
        return

    # Per-user boundary timestamps on each side of the split.
    last_train = train_df.groupby('userId')['timestamp'].max().rename('t_train_max')
    first_test = test_df.groupby('userId')['timestamp'].min().rename('t_test_min')

    # Inner join keeps only users that have rows in both train and test.
    bounds = last_train.to_frame().join(first_test, how='inner')

    ok_ratio = bounds['t_train_max'].le(bounds['t_test_min']).mean()
    print(f"Temporal order OK for {ok_ratio*100:.2f}% of users (expect 100%).")

# Check that no user's train rows post-date their test rows; prints the pass rate.
sanity_check(train_df, test_df)

Temporal order OK for 100.00% of users (expect 100%).


In [7]:
# Write both splits as CSV next to the source data; N_LAST is embedded in the
# file names so runs with different hold-out sizes do not overwrite each other.
OUT_DIR = DATA_DIR
train_path = OUT_DIR.joinpath(f'train_leave_last_{N_LAST}.csv')
test_path = OUT_DIR.joinpath(f'test_leave_last_{N_LAST}.csv')

for frame, destination in ((train_df, train_path), (test_df, test_path)):
    frame.to_csv(destination, index=False)  # index is not written to the file

print(f"Saved:\n- {train_path}\n- {test_path}")




Saved:
- /content/drive/MyDrive/ml-25m/train_leave_last_10.csv
- /content/drive/MyDrive/ml-25m/test_leave_last_10.csv


In [8]:
# Show the first three rows of each split as plain text, without the index column.
print("\nTrain head:", train_df.head(3).to_string(index=False), sep="\n")
print("\nTest head:", test_df.head(3).to_string(index=False), sep="\n")


Train head:
 userId  movieId  rating  timestamp
      1     5952     4.0 1147868053
      1     2012     2.5 1147868068
      1     2011     2.5 1147868079

Test head:
 userId  movieId  rating  timestamp
      1    27266     4.5 1147879365
      1     8327     5.0 1147879375
      1    32591     5.0 1147879538
