In [1]:
import math
import os
import random

import numpy as np
import pandas as pd

In [2]:
# Single seed shared by every random number generator this notebook relies on
# (NumPy's global generator and the stdlib `random` module).
seed = 2023
np.random.seed(seed)
random.seed(seed)

In [3]:
# Load the raw interaction and item tables (tab-separated, with a header row).
# NOTE(review): the counts recorded further down (6040 users / 3706 items /
# 1000209 rows) look like MovieLens-1M rather than 100K -- confirm that these
# paths point at the intended dataset.
inter_df = pd.read_csv("raw-ml-100K/ml-100k.inter", delimiter="\t")
item_df = pd.read_csv("raw-ml-100K/ml-100k.item", delimiter="\t")

In [4]:
# Strip the ":<type>" suffix from the raw column headers so the rest of the
# notebook can use plain column names.
inter_df = inter_df.rename(
    columns={
        "user_id:token": "user_id",
        "item_id:token": "item_id",
        "rating:float": "rating",
        "timestamp:float": "timestamp",
    }
)
item_df = item_df.rename(
    columns={
        "item_id:token": "item_id",
        "movie_title:token_seq": "movie_title",
        # Was `"release_year": "token"`: the key/value were split in the wrong
        # place, so the "release_year:token" column was silently left as-is
        # (rename ignores keys that are not present).
        "release_year:token": "release_year",
        "class:token_seq": "class",
    }
)

In [5]:
# (number of distinct users, number of distinct items, number of interactions)
(inter_df["user_id"].nunique(), inter_df["item_id"].nunique(), inter_df.shape[0])

(6040, 3706, 1000209)

In [6]:
# Sparsity of the user-item matrix: share of (user, item) pairs with no interaction.
1 - inter_df.shape[0] / (inter_df["user_id"].nunique() * inter_df["item_id"].nunique())

0.9553163743776871

In [7]:
# Keep a single row per (user, item) pair; when a pair repeats, the first
# occurrence in file order wins.
inter_df = inter_df.drop_duplicates(subset=["user_id", "item_id"], keep="first")

In [8]:
# Keep only interactions rated strictly above 3; these are the rows the rest of
# the notebook splits into the *_pos_inter train/valid/test sets.
clean_inter_df = inter_df.loc[inter_df["rating"].gt(3)]

In [9]:
# Minimum number of interactions every remaining user AND every remaining item
# must have.
threshold_inter_num = 15

# Iterative core filtering: dropping sparse users can push items below the
# threshold and vice versa, so repeat both passes until a full pass removes
# nothing. Stopping on "no change" (instead of comparing group-size minima)
# also terminates when the frame becomes empty; the previous check compared
# NaN >= threshold in that case, which is always False and looped forever.
while not clean_inter_df.empty:
    rows_before = len(clean_inter_df)

    # Vectorised per-row counts replace groupby(...).filter(lambda ...), which
    # invoked a Python callback once per group.
    user_counts = clean_inter_df["user_id"].map(clean_inter_df["user_id"].value_counts())
    clean_inter_df = clean_inter_df[user_counts >= threshold_inter_num]

    item_counts = clean_inter_df["item_id"].map(clean_inter_df["item_id"].value_counts())
    clean_inter_df = clean_inter_df[item_counts >= threshold_inter_num]

    if len(clean_inter_df) == rows_before:
        break

# Detach from the parent frame so later column assignments act on an
# independent object.
clean_inter_df = clean_inter_df.copy()

In [10]:
# Interactions per user after filtering, followed by overall counts:
# min/max per-user size, total rows, distinct users, distinct items.
sizes = clean_inter_df.groupby("user_id").size()
print(
    sizes.min(),
    sizes.max(),
    clean_inter_df.shape[0],
    clean_inter_df["user_id"].nunique(),
    clean_inter_df["item_id"].nunique(),
)

15 1344 564827 5651 2555


In [11]:
# Re-index users to contiguous integer codes 0..n_users-1 (in order of first
# appearance). `user_id_uniques[code]` recovers the original id.
user_id_codes, user_id_uniques = pd.factorize(clean_inter_df['user_id'])
# Build a new frame instead of writing the column in place: the frame was
# derived by filtering, so an in-place write risks a chained-assignment warning.
clean_inter_df = clean_inter_df.assign(user_id=user_id_codes)

In [12]:
# Re-index items to contiguous integer codes 0..n_items-1 (in order of first
# appearance in the interactions) and apply the same mapping to the item table.
item_id_codes, item_id_uniques = pd.factorize(clean_inter_df['item_id'])
item_id_remap = dict(zip(item_id_uniques, np.arange(len(item_id_uniques), dtype=np.int64)))

# Keep only items that still have interactions. `.copy()` makes the filtered
# table an independent frame so the column write below does not trigger
# SettingWithCopyWarning. The filter uses the ORIGINAL ids (item_id_uniques),
# so it must run before the ids are remapped.
item_df = item_df[item_df["item_id"].isin(item_id_uniques)].copy()

clean_inter_df = clean_inter_df.assign(item_id=item_id_codes)
item_df["item_id"] = item_df["item_id"].map(item_id_remap)

In [13]:
# Shuffle all interactions so the per-user split below is random rather than
# chronological (a chronological split would instead sort by
# ["user_id", "timestamp"]).
# Passing `random_state=seed` makes this cell give the same permutation every
# time it is run, instead of depending on how far NumPy's global generator has
# advanced since it was seeded.
clean_inter_df = clean_inter_df.sample(frac=1, random_state=seed).reset_index(drop=True)

In [14]:
def split_group(group, split_ratio=(0.8, 0.1, 0.1)):
    """Split one group's rows into consecutive train / valid / test chunks.

    Rows are taken in their current order: the first chunk is train, the next
    is valid and the last is test.

    Args:
        group: DataFrame holding the rows of a single group.
        split_ratio: (train, valid, test) fractions. Only the valid and test
            fractions are read; each is rounded UP to a whole number of rows
            and train receives whatever remains.

    Returns:
        A list ``[train, valid, test]`` of three DataFrames that are disjoint
        and together contain every row of ``group`` exactly once.
    """
    num = len(group)
    # Cap the rounded-up sizes at the rows actually available. Without the cap,
    # a small group could produce a negative train size, and the resulting
    # negative iloc offsets made the chunks overlap (same row in train and test).
    test_num = min(math.ceil(num * split_ratio[2]), num)
    valid_num = min(math.ceil(num * split_ratio[1]), num - test_num)
    train_num = num - test_num - valid_num

    train_end = train_num
    valid_end = train_end + valid_num
    return [
        group.iloc[:train_end],
        group.iloc[train_end:valid_end],
        group.iloc[valid_end:],
    ]

# Split each user's (already shuffled) interactions independently; the result
# holds one [train, valid, test] list per user.
splits = (
    clean_inter_df
    .groupby("user_id")
    .apply(split_group)
)

In [15]:
# Transpose the per-user [train, valid, test] lists into three sequences (all
# train chunks, all valid chunks, all test chunks) and stack each into one frame.
train_pos_inter, valid_pos_inter, test_pos_inter = (
    pd.concat(per_user_chunks) for per_user_chunks in zip(*splits)
)

In [16]:
# Row counts of the train / valid / test splits.
tuple(part.shape[0] for part in (train_pos_inter, valid_pos_inter, test_pos_inter))

(446905, 58961, 58961)

In [17]:
# Drop valid/test interactions whose item never appears in the training split.
train_used_item_set = train_pos_inter.item_id.unique()
# Series.isin is used instead of np.isin(..., assume_unique=True): the item_id
# columns contain repeated ids, and promising uniqueness to NumPy can make
# duplicated ids be reported as present even when they are absent from train.
valid_pos_inter = valid_pos_inter[valid_pos_inter["item_id"].isin(train_used_item_set)]
test_pos_inter = test_pos_inter[test_pos_inter["item_id"].isin(train_used_item_set)]

In [18]:
# Row counts of the train / valid / test splits after removing unseen items.
tuple(part.shape[0] for part in (train_pos_inter, valid_pos_inter, test_pos_inter))

(446905, 58961, 58961)

In [19]:

# For users, then for items: print the smallest and largest number of
# interactions per id in the train, valid and test splits (one line per split).
for label, key_column in (("user", "user_id"), ("item", "item_id")):
    print("-"*10 + label + "-"*10)
    for split_frame in (train_pos_inter, valid_pos_inter, test_pos_inter):
        sizes = split_frame.groupby(key_column).size()
        print(sizes.min(), sizes.max())

----------user----------
11 1074
2 135
2 135
----------item----------
8 2165
1 311
1 284


In [20]:
# Persist the three interaction splits and the re-indexed item table as
# tab-separated files without the pandas index.
# Create the output directory first so a missing folder does not fail the
# notebook at its very last step.
os.makedirs("pro-ml-100K", exist_ok=True)
train_pos_inter.to_csv("pro-ml-100K/ml-100K-train.clean", sep='\t', index=False)
valid_pos_inter.to_csv("pro-ml-100K/ml-100K-valid.clean", sep='\t', index=False)
test_pos_inter.to_csv("pro-ml-100K/ml-100K-test.clean", sep='\t', index=False)
item_df.to_csv("pro-ml-100K/ml-100K-item.clean", sep='\t', index=False)