In [20]:
import math
import os
import random

import numpy as np
import pandas as pd

In [21]:
# Seed both Python's `random` module and NumPy's legacy global RNG so that the
# random steps in later cells draw from a fixed stream on a fresh top-to-bottom run.
seed = 2023
random.seed(seed)
np.random.seed(seed)

In [22]:
# Fraction of training rows whose item is replaced by inject_noise_inters;
# it is also used as the suffix of the training file name when saving.
noise_ratio = 0.05

In [23]:
# Load the tab-separated interaction and item tables from the raw dataset directory.
# NOTE(review): item_df is not referenced by any other visible cell — confirm it is still needed.
inter_df = pd.read_csv("raw-ml-100K/ml-100k.inter", sep='\t')
item_df = pd.read_csv("raw-ml-100K/ml-100k.item", sep='\t')

In [24]:
# Strip the ":token" / ":float" type suffixes from the interaction column headers.
inter_df = inter_df.rename(
    columns={
        "user_id:token": "user_id",
        "item_id:token": "item_id",
        "rating:float": "rating",
        "timestamp:float": "timestamp",
    }
)

In [25]:
# (number of users, number of items, number of interactions) in the raw table
(
    inter_df["user_id"].nunique(),
    inter_df["item_id"].nunique(),
    inter_df.shape[0],
)

(943, 1682, 100000)

In [26]:
# Sparsity of the raw user-item matrix: share of (user, item) pairs with no interaction.
1 - inter_df.shape[0] / (
    inter_df["user_id"].nunique() * inter_df["item_id"].nunique()
)

0.9369533063577546

In [27]:
# Keep only the first occurrence of every (user, item) pair.
inter_df = inter_df.drop_duplicates(
    subset=["user_id", "item_id"],
    keep="first",
)

In [28]:
# Treat only ratings strictly above 3 as positive interactions.
clean_inter_df = inter_df.loc[inter_df["rating"] > 3]

In [29]:
threshold_inter_num = 15

# Iterative k-core filtering: alternately drop users and items that have fewer
# than `threshold_inter_num` interactions. Dropping items can push a user back
# below the threshold, so repeat until every remaining user satisfies it.
while True:
    clean_inter_df = clean_inter_df.groupby('user_id').filter(lambda x:len(x)>=threshold_inter_num)
    clean_inter_df = clean_inter_df.groupby('item_id').filter(lambda x:len(x)>=threshold_inter_num)
    # Stop on an empty frame as well: `.min()` of an empty Series is NaN, and
    # `NaN >= threshold` is False, which would otherwise loop forever.
    # The item filter ran last, so only the user side needs re-checking.
    if clean_inter_df.empty or clean_inter_df.groupby('user_id').size().min() >= threshold_inter_num:
        break

In [30]:
# Per-user interaction counts after filtering, followed by a one-line summary:
# number of users, number of items, number of interactions.
sizes = clean_inter_df.groupby("user_id").size()
print(
    clean_inter_df["user_id"].nunique(),
    clean_inter_df["item_id"].nunique(),
    clean_inter_df.shape[0],
)

779 685 49873


In [31]:
# Re-encode user and item ids as consecutive integers starting at 0 (in order of
# first appearance). The *_uniques arrays keep the original id for each new code.
user_id_codes, user_id_uniques = pd.factorize(clean_inter_df["user_id"])
item_id_codes, item_id_uniques = pd.factorize(clean_inter_df["item_id"])

clean_inter_df["user_id"] = user_id_codes
clean_inter_df["item_id"] = item_id_codes

In [32]:
# Shuffle all rows and renumber the index, so the positional per-user split
# below is random. (Alternative kept for reference: sort by ["user_id", "timestamp"]
# instead of shuffling to obtain a chronological split.)
clean_inter_df = (
    clean_inter_df
    .sample(frac=1)
    .reset_index(drop=True)
)

In [33]:
def split_group(group, split_ratio=[0.8,0.1,0.1]):
    """Cut one group of rows into consecutive train / valid / test slices.

    The valid and test sizes are ``ceil(len(group) * ratio)`` for
    ``split_ratio[1]`` and ``split_ratio[2]``; the training slice receives
    whatever is left (``split_ratio[0]`` itself is never read).

    Args:
        group: DataFrame holding the rows of a single group, in split order.
        split_ratio: Sequence of three ratios ``[train, valid, test]``.

    Returns:
        A list ``[train_rows, valid_rows, test_rows]`` of positional slices of ``group``.
    """
    total = len(group)
    n_valid = math.ceil(total * split_ratio[1])
    n_test = math.ceil(total * split_ratio[2])
    n_train = total - n_valid - n_test

    # Cumulative boundaries: [0, train_end, valid_end, test_end].
    bounds = np.cumsum([0, n_train, n_valid, n_test])
    return [group.iloc[lo:hi] for lo, hi in zip(bounds[:-1], bounds[1:])]

# One [train, valid, test] list per user, collected in a Series indexed by user_id.
splits = clean_inter_df.groupby("user_id").apply(split_group)

In [34]:
# Transpose the per-user [train, valid, test] lists into three sequences of
# frames and stack each sequence into a single table.
train_pos_inter, valid_pos_inter, test_pos_inter = (
    pd.concat(list(parts)) for parts in zip(*splits)
)

In [35]:
# Row counts of the (valid, test, train) splits.
tuple(map(len, (valid_pos_inter, test_pos_inter, train_pos_inter)))

(5334, 5334, 39205)

In [36]:
def inject_noise_inters(df, ratio=None):
    """Turn a random share of interactions into noise by swapping their item, in place.

    ``int(ratio * len(df))`` distinct rows are drawn without replacement. For each,
    ``item_id`` is overwritten with a random id from ``[0, df.item_id.nunique())``
    that is not among that user's items in ``df``, and ``noise_flag`` is set to 1.

    Args:
        df: DataFrame with ``user_id`` and ``item_id`` columns and a unique index.
            It is modified in place. A ``noise_flag`` column of zeros is created
            if it does not exist yet.
        ratio: Fraction of rows to corrupt, within ``[0, 1]``. ``None`` (default)
            uses the notebook-level ``noise_ratio``.

    Returns:
        The same ``df`` object, after modification.

    Raises:
        ValueError: if ``ratio`` is outside ``[0, 1]``, if the index of ``df`` is
            not unique, or if a sampled user already interacted with every
            candidate item (no replacement item exists).

    Notes:
        * Candidate ids are ``0 .. nunique-1``, which presumably relies on item ids
          being contiguous codes all present in ``df`` — verify against the caller.
        * Two noisy rows of the same user may receive the same replacement item;
          this is not checked.
    """
    if ratio is None:
        ratio = noise_ratio
    if not 0 <= ratio <= 1:
        raise ValueError(f"ratio must be within [0, 1], got {ratio}")
    if not df.index.is_unique:
        # Rows are addressed by label below; duplicated labels would misalign them.
        raise ValueError("df must have a unique index")

    if 'noise_flag' not in df.columns:
        df['noise_flag'] = 0

    noise_num = int(ratio * len(df))
    if noise_num == 0:
        return df

    # replace=False: every sampled row is distinct, so exactly `noise_num` rows
    # are corrupted (sampling with replacement silently corrupted fewer).
    sample_index = np.random.choice(df.index.values, size=noise_num, replace=False)
    sample_user_ids = df.loc[sample_index, 'user_id'].values
    item_num = df.item_id.nunique()
    # Sets give O(1) membership tests inside the rejection loop.
    user2posItem_dict = {u: set(items) for u, items in df.groupby('user_id')['item_id']}

    # Without a free candidate the rejection loop below could never finish.
    candidate_items = set(range(item_num))
    saturated_users = [
        u for u in set(sample_user_ids.tolist()) if candidate_items <= user2posItem_dict[u]
    ]
    if saturated_users:
        raise ValueError(
            f"no replacement item available for users {saturated_users}"
        )

    # Rejection sampling: redraw only the positions whose draw hit one of the
    # user's own items, until none is left.
    check_index = np.arange(noise_num)
    sample_item_ids = np.zeros_like(check_index)

    while len(check_index) > 0:
        sample_item_ids[check_index] = np.random.randint(0, item_num, size=len(check_index))
        check_index = [
            i
            for i, u, sample_item_id in zip(check_index, sample_user_ids[check_index], sample_item_ids[check_index])
            if sample_item_id in user2posItem_dict[u]
        ]

    df.loc[sample_index, 'item_id'] = sample_item_ids
    df.loc[sample_index, 'noise_flag'] = 1
    return df

# Mark every training row as clean (0) first; inject_noise_inters then sets the
# flag to 1 on the rows whose item it replaces.
train_pos_inter['noise_flag'] = 0
train_pos_inter = inject_noise_inters(train_pos_inter)

In [37]:
# Drop validation and test rows whose item_id does not appear in the training set.
# `assume_unique` is deliberately not set: the item_id columns contain repeated
# ids, and np.isin only guarantees correct results with that flag when both
# inputs are unique.
train_used_item_set = train_pos_inter.item_id.unique()
valid_pos_inter = valid_pos_inter[np.isin(valid_pos_inter.item_id, train_used_item_set)]
test_pos_inter = test_pos_inter[np.isin(test_pos_inter.item_id, train_used_item_set)]

In [38]:
# Row counts of the (valid, test, train) splits after the unseen-item filter.
tuple(map(len, (valid_pos_inter, test_pos_inter, train_pos_inter)))

(5334, 5334, 39205)

In [39]:

# For each grouping key (user, then item) print the smallest and largest group
# size within the train, valid and test splits, one line per split.
for _label, _column in (("user", "user_id"), ("item", "item_id")):
    print("-"*10 + _label + "-"*10)
    for _part in (train_pos_inter, valid_pos_inter, test_pos_inter):
        sizes = _part.groupby(_column).size()
        print(sizes.min(), sizes.max())

----------user----------
11 263
2 33
2 33
----------item----------
9 368
1 44
1 50


In [40]:
# to_csv does not create missing directories, so make sure the output folder
# exists before writing (a fresh checkout would otherwise fail here).
os.makedirs("pro-ml-100K", exist_ok=True)

# The training file carries the noise ratio as its suffix; the validation and
# test files are written with a ".clean" suffix. All are tab-separated, no index.
train_pos_inter.to_csv(f"pro-ml-100K/ml-100K-train.{noise_ratio}", sep='\t', index=False)
valid_pos_inter.to_csv("pro-ml-100K/ml-100K-valid.clean", sep='\t', index=False)
test_pos_inter.to_csv("pro-ml-100K/ml-100K-test.clean", sep='\t', index=False)