In [3]:
import os
import pandas as pd
import numpy as np


## Load ratings data

In [4]:
data_folder = "../datasets/ml-latest-small/"

In [5]:
ratings = pd.read_csv(os.path.join(data_folder, "ratings.csv")).reset_index()
ratings.head(5)

Unnamed: 0,index,userId,movieId,rating,timestamp
0,0,1,1,4.0,964982703
1,1,1,3,4.0,964981247
2,2,1,6,4.0,964982224
3,3,1,47,5.0,964983815
4,4,1,50,5.0,964982931


## Stratified and Chronological Train-Test Split

For each user, set the oldest 80% rating history as train set, and the newest 20% as test set.

In [6]:
user_to_row_idx = ratings.sort_values("timestamp", ascending=True).groupby("userId")["index"].apply(list)
user_to_row_idx

userId
1      [43, 73, 171, 183, 120, 219, 220, 227, 184, 6,...
2      [232, 246, 260, 258, 253, 256, 235, 251, 244, ...
3      [272, 279, 274, 267, 270, 277, 261, 265, 285, ...
4      [310, 350, 394, 328, 436, 458, 448, 309, 341, ...
5      [553, 554, 524, 533, 542, 544, 551, 537, 525, ...
                             ...                        
606    [97501, 97742, 97679, 97552, 97694, 97829, 975...
607    [98527, 98518, 98628, 98538, 98481, 98494, 985...
608    [99077, 98934, 98699, 99055, 98680, 98897, 987...
609    [99502, 99521, 99510, 99520, 99517, 99506, 995...
610    [99554, 99699, 99649, 100010, 99739, 99541, 99...
Name: index, Length: 610, dtype: object

In [7]:
train_index = sum([i[:int(np.ceil(0.8 * len(i)))] for i in user_to_row_idx], [])
test_index = sum([i[int(np.ceil(0.8 * len(i))):] for i in user_to_row_idx], [])
ratings_train = ratings.iloc[train_index, :][["userId", "movieId", "rating","timestamp"]]
ratings_test = ratings.iloc[test_index, :][["userId", "movieId", "rating","timestamp"]]

In [8]:
ratings_train

Unnamed: 0,userId,movieId,rating,timestamp
43,1,804,4.0,964980499
73,1,1210,5.0,964980499
171,1,2628,4.0,964980523
183,1,2826,4.0,964980523
120,1,2018,5.0,964980523
...,...,...,...,...
100272,610,55067,3.5,1493848671
100629,610,103219,3.5,1493848674
100231,610,51666,2.0,1493848680
100699,610,112727,3.0,1493848682


In [9]:
ratings_test

Unnamed: 0,userId,movieId,rating,timestamp
76,1,1219,2.0,964983393
174,1,2644,4.0,964983393
91,1,1348,4.0,964983393
176,1,2654,5.0,964983393
83,1,1258,3.0,964983414
...,...,...,...,...
100612,610,101739,3.5,1495959269
99540,610,70,4.0,1495959282
99556,610,328,3.5,1495959299
99681,610,2459,3.5,1495959405


In [10]:
ratings_train.to_csv(os.path.join(data_folder, "train.csv"))

In [11]:
ratings_test.to_csv(os.path.join(data_folder, "test.csv"))