In [1]:
import math
import os
import random

import numpy as np
import pandas as pd

In [2]:
# One fixed seed for both the stdlib and the NumPy global generators, so the
# random draws made later through `random` / `np.random` are repeatable.
seed = 2023
for _seed_fn in (random.seed, np.random.seed):
    _seed_fn(seed)

In [3]:
# Noise ratio: inject_noise_inters corrupts int(noise_ratio * len(df)) sampled
# rows, and the value is also used as the suffix of the exported file names.
noise_ratio = 0.2

In [4]:
# Load the raw tab-separated interaction table and item table.
inter_df, item_df = (
    pd.read_csv(raw_path, sep='\t')
    for raw_path in ("raw-ml-1M/ml-1m.inter", "raw-ml-1M/ml-1m.item")
)

In [5]:
# Strip the ":<type>" suffix from the raw column headers.
inter_df = inter_df.rename(columns={"user_id:token":"user_id", "item_id:token":"item_id", "rating:float":"rating", "timestamp:float":"timestamp"})
# The release-year entry previously read {"release_year": "token"}, which does
# not follow the "<name>:<type>" -> "<name>" pattern of every other entry and
# presumably matched no column (rename silently ignores unknown keys).
# NOTE(review): this changes the header written to the exported item file.
item_df = item_df.rename(columns={"item_id:token":"item_id", "movie_title:token_seq":"movie_title", "release_year:token":"release_year", "class:token_seq":"class"})

In [6]:
# Raw dataset size as (#users, #items, #interactions).
(
    inter_df["user_id"].nunique(),
    inter_df["item_id"].nunique(),
    inter_df.shape[0],
)

(6040, 3706, 1000209)

In [7]:
# Sparsity: share of (user, item) pairs that have no interaction row.
1 - inter_df.shape[0] / (inter_df["user_id"].nunique() * inter_df["item_id"].nunique())

0.9553163743776871

In [8]:
# Keep only the first row of every (user, item) pair.
inter_df = inter_df.drop_duplicates(subset=["user_id", "item_id"], keep="first")

In [9]:
# Only interactions rated strictly above 3 are kept.
clean_inter_df = inter_df.loc[inter_df["rating"].gt(3)]

In [10]:
threshold_inter_num = 15

# Iterative core filtering: alternately drop users and items that have fewer
# than `threshold_inter_num` interactions. Removing items can push a user back
# under the threshold (and vice versa), so repeat until a full pass removes
# nothing. Stopping on "no rows removed" also terminates when the frame ends
# up empty, where a min()-based exit test would compare against NaN forever.
while True:
    rows_before = len(clean_inter_df)
    for key in ('user_id', 'item_id'):
        # Size of the group each row belongs to, aligned with the rows.
        group_sizes = clean_inter_df.groupby(key)[key].transform('size')
        clean_inter_df = clean_inter_df[group_sizes >= threshold_inter_num]
    if len(clean_inter_df) == rows_before:
        break

In [11]:
# Per-user interaction counts after filtering, then a one-line summary:
# #users, #items, #interactions.
sizes = clean_inter_df.groupby("user_id").size()
print(
    clean_inter_df["user_id"].nunique(),
    clean_inter_df["item_id"].nunique(),
    clean_inter_df.shape[0],
)

5651 2555 564827


In [12]:
# Keep item metadata only for items that survived the filtering. Both frames
# are results of row filtering, so take explicit copies before overwriting
# their id columns (avoids SettingWithCopy ambiguity).
item_df = item_df[item_df['item_id'].isin(clean_inter_df['item_id'])].copy()
clean_inter_df = clean_inter_df.copy()

# Re-index users and items to integer codes 0..n-1.
user_id_codes, user_id_uniques = pd.factorize(clean_inter_df['user_id'])
clean_inter_df['user_id'] = user_id_codes
item_id_codes, item_id_uniques = pd.factorize(clean_inter_df['item_id'])
item_id_remap = dict(zip(item_id_uniques, np.arange(len(item_id_uniques))))
clean_inter_df['item_id'] = item_id_codes
# Apply the same old-id -> new-id mapping to the metadata so ids stay aligned.
item_df['item_id'] = item_df['item_id'].map(item_id_remap)

In [13]:
# Shuffle all rows and renumber the index, so the positional per-user split
# below is random. (Alternative kept for reference: sorting by
# ["user_id", "timestamp"] instead would make the split chronological.)
clean_inter_df = (
    clean_inter_df
    .sample(frac=1)
    .reset_index(drop=True)
)

In [14]:
def split_group(group, split_ratio=(0.8, 0.1, 0.1)):
    """Split one group of rows into consecutive train / valid / test slices.

    Args:
        group: DataFrame holding the rows of a single group; rows are taken
            positionally in their current order.
        split_ratio: (train, valid, test) fractions. Only the valid and test
            fractions are read; train receives whatever rows remain.

    Returns:
        A list ``[train, valid, test]`` of positional slices of ``group``.
        The slices are disjoint and together cover every row.
    """
    num = len(group)
    # Held-out sizes are rounded up, then capped so they can never exceed the
    # rows available; otherwise the train size would go negative and the
    # slices below would overlap.
    test_num = min(math.ceil(num * split_ratio[2]), num)
    valid_num = min(math.ceil(num * split_ratio[1]), num - test_num)
    train_num = num - test_num - valid_num
    nums = [train_num, valid_num, test_num]
    offsets = [0] + list(np.cumsum(nums))
    return [group.iloc[offsets[i]:offsets[i + 1]] for i in range(len(nums))]

# Run split_group on each user's rows; each result is a [train, valid, test] list.
splits = clean_inter_df.groupby("user_id").apply(split_group)

In [15]:
# Stitch the per-user pieces back together: one frame per split, in
# (train, valid, test) order.
train_pos_inter, valid_pos_inter, test_pos_inter = (
    pd.concat(pieces) for pieces in zip(*splits)
)

In [16]:
# Row counts in (valid, test, train) order.
tuple(len(part) for part in (valid_pos_inter, test_pos_inter, train_pos_inter))

(58961, 58961, 446905)

In [17]:
def inject_noise_inters(df, ratio=None, item_num=None):
    """Corrupt a random subset of interactions with items the user never had.

    A fraction of the rows is sampled; for each sampled row the ``item_id`` is
    replaced by a random id that does not occur among that user's items in
    ``df``, and ``noise_flag`` is set to 1.

    Args:
        df: DataFrame with ``user_id`` and ``item_id`` columns. It is modified
            in place. Rows are addressed by index label, so the index is
            expected to be unique.
        ratio: Fraction of rows to corrupt. Defaults to the notebook-level
            ``noise_ratio`` setting.
        item_num: Replacement ids are drawn uniformly from ``[0, item_num)``.
            Defaults to ``df.item_id.max() + 1`` so that every id present in
            ``df`` can be drawn even when the ids have gaps.

    Returns:
        The same ``df`` object, after modification.

    Note:
        Rejection sampling is used, so the loop does not terminate if a
        sampled user already interacted with every id in ``[0, item_num)``.
    """
    if ratio is None:
        ratio = noise_ratio
    noise_num = int(ratio * len(df))
    if noise_num <= 0:
        return df
    if item_num is None:
        item_num = int(df['item_id'].max()) + 1

    # replace=False: every sampled row is distinct, so exactly `noise_num`
    # rows are corrupted (sampling with replacement yields duplicates and a
    # lower effective ratio).
    sample_index = np.random.choice(df.index.values, size=noise_num, replace=False)
    sample_user_ids = df.loc[sample_index, 'user_id'].values
    # Sets give constant-time membership tests in the rejection loop below.
    user2posItem_dict = df.groupby('user_id')['item_id'].apply(set).to_dict()

    # Positions (into the sample) that still need a valid replacement item.
    check_index = np.arange(len(sample_index))
    sample_item_ids = np.zeros_like(check_index)

    while len(check_index) > 0:
        sample_item_ids[check_index] = np.random.randint(0, item_num, size=len(check_index))
        # Keep only the positions whose draw collided with a known item of the user.
        check_index = [
            i
            for i, u, sample_item_id in zip(check_index, sample_user_ids[check_index], sample_item_ids[check_index])
            if sample_item_id in user2posItem_dict[u]
        ]

    df.loc[sample_index, 'item_id'] = sample_item_ids
    df.loc[sample_index, 'noise_flag'] = 1
    return df

# Add a noise_flag column of 0 to every training row, then let the injector
# set the flag to 1 on the rows whose item it replaces.
train_pos_inter = inject_noise_inters(train_pos_inter.assign(noise_flag=0))

In [18]:
# Drop validation/test interactions whose item does not appear in the
# (noise-injected) training data.
train_used_item_set = train_pos_inter.item_id.unique()
# assume_unique must not be set here: the item_id columns contain repeated
# ids, and np.isin documents that flag as valid only for unique inputs.
# Copies are taken so later column assignments act on independent frames.
valid_pos_inter = valid_pos_inter[valid_pos_inter.item_id.isin(train_used_item_set)].copy()
test_pos_inter = test_pos_inter[test_pos_inter.item_id.isin(train_used_item_set)].copy()

In [19]:
# True when item_df contains an item id that never occurs in clean_inter_df.
# NOTE(review): noise injection rewrites train_pos_inter, not clean_inter_df,
# so this check presumably cannot detect items that vanished from the training
# data - confirm which frame was intended.
flag = (~np.isin(item_df.item_id, clean_inter_df['item_id'].unique())).sum() > 0
flag

False

In [20]:
# Noise injection replaces item ids in the training data, so some items may no
# longer occur there. When that happens, restrict every table to the items
# still present in training and re-index item ids with ONE shared mapping
# derived from the training frame, so train/valid/test/item ids stay aligned.
train_item_ids = train_pos_inter['item_id'].unique()
if flag or not item_df['item_id'].isin(train_item_ids).all():
    item_id_codes, item_id_uniques = pd.factorize(train_pos_inter['item_id'])
    itemId_map_dict = dict(zip(item_id_uniques, np.arange(len(item_id_uniques))))

    def _remap_items(frame):
        """Keep rows whose item occurs in training and apply the new item ids."""
        kept = frame[frame['item_id'].isin(item_id_uniques)].copy()
        kept['item_id'] = kept['item_id'].map(itemId_map_dict)
        return kept

    train_pos_inter = _remap_items(train_pos_inter)
    valid_pos_inter = _remap_items(valid_pos_inter)
    test_pos_inter = _remap_items(test_pos_inter)
    # Rows of clean_inter_df for items absent from training are dropped, since
    # those items have no id in the new numbering.
    clean_inter_df = _remap_items(clean_inter_df)
    item_df = _remap_items(item_df)

In [21]:
# Row counts after filtering, in (valid, test, train) order.
tuple(len(part) for part in (valid_pos_inter, test_pos_inter, train_pos_inter))

(58961, 58961, 446905)

In [22]:

# For users and then for items, print the smallest and largest number of
# interactions per group in the train, valid and test frames (in that order).
for label, key in (("user", "user_id"), ("item", "item_id")):
    print("-"*10 + label + "-"*10)
    for frame in (train_pos_inter, valid_pos_inter, test_pos_inter):
        sizes = frame.groupby(key).size()
        print(sizes.min(), sizes.max())

----------user----------
11 1074
2 135
2 135
----------item----------
29 1797
1 311
1 284


In [23]:
# Export the three splits and the item table as tab-separated files without
# the index; the noise ratio is the file-name suffix (e.g. "ml-1M-train.0.2").
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("pro-ml-1M", exist_ok=True)
train_pos_inter.to_csv(f"pro-ml-1M/ml-1M-train.{noise_ratio}", sep='\t', index=False)
valid_pos_inter.to_csv(f"pro-ml-1M/ml-1M-valid.{noise_ratio}", sep='\t', index=False)
test_pos_inter.to_csv(f"pro-ml-1M/ml-1M-test.{noise_ratio}", sep='\t', index=False)
item_df.to_csv(f"pro-ml-1M/ml-1M-item.{noise_ratio}", sep='\t', index=False)