In [None]:
import gzip
import zipfile
import random
import os
import re
import subprocess
import pandas as pd
import numpy as np
from datetime import datetime
import json


DATASET = 'ml-1m'
RAW_PATH = os.path.join('../../data', DATASET)

In [2]:
# # Load Data

# 1. Load interaction data and item metadata
# 2. Filter out unuseful items
# 3. Calculate basic statistics

with zipfile.ZipFile(os.path.join(RAW_PATH, DATASET + '.zip')) as z:
    if DATASET == 'ml-100k':
        with z.open(os.path.join(DATASET, 'u.data')) as f:
            data_df = pd.read_csv(f, sep="\t", header=None)
        with z.open(os.path.join(DATASET, 'u.item')) as f:
            meta_df = pd.read_csv(f, sep='|', header=None, encoding='ISO-8859-1')
    elif DATASET == 'ml-1m':
        # with z.open(os.path.join(DATASET, 'ratings.dat')) as f:
        with z.open('ml-1m/ratings.dat') as f:
            data_df = pd.read_csv(f, sep='::', header=None, engine='python', encoding_errors='ignore')
        with z.open('ml-1m/movies.dat') as f:
            meta_df = pd.read_csv(f, sep='::', header=None, engine='python', encoding_errors='ignore')

In [3]:
data_df = pd.read_csv("../../data/ml-1m/ratings.dat", sep=r'::', header=None, engine='python', encoding_errors='ignore')
meta_df = pd.read_csv('../../data/ml-1m/movies.dat', sep=r'::', header=None, engine='python', encoding_errors='ignore')

data_df.columns = ['user_id', 'item_id', 'label', 'time']
data_df.head()

genres = [
    'i_Action', 'i_Adventure', 'i_Animation', "i_Children's", 'i_Comedy', 'i_Crime', 
    'i_Documentary', 'i_Drama', 'i_Fantasy', 'i_Film-Noir', 'i_Horror', 'i_Musical', 
    'i_Mystery', 'i_Romance', 'i_Sci-Fi', 'i_Thriller', 'i_War', 'i_Western', 'i_Other'
]
# if DATASET == 'ml-100k':
#     item_df = meta_df.drop([1, 3, 4], axis=1)
#     item_df.columns = ['item_id', 'i_year'] + genres
# elif DATASET == 'ml-1m' or 'ml-10m':
#     item_df = meta_df.copy()
#     item_df.columns = ['item_id', 'title', 'genre']
#     # item_df['title'] = item_df['title'].apply(lambda x: x.decode('ISO-8859-1'))
#     # item_df['genre'] = item_df['genre'].apply(lambda x: x.decode('ISO-8859-1'))
#     genre_dict = dict()
#     for g in genres:
#         genre_dict[g] = np.zeros(len(item_df), dtype=np.int32)
#     item_genre = item_df['genre'].apply(lambda x: x.split('|')).values
#     for idx, genre_lst in enumerate(item_genre):
#         for g in genre_lst:
#             genre_dict['i_' + g][idx] = 1
#     for g in genres:
#         item_df[g] = genre_dict[g]
#     item_df = item_df.drop(columns=['genre'])
# item_df.head()
item_df = meta_df.copy()
item_df.columns = ['item_id', 'title', 'genre']

genre_dict = dict()
for g in genres:
    genre_dict[g] = np.zeros(len(item_df), dtype=np.int32)
item_genre = item_df['genre'].apply(lambda x: x.split('|')).values
for idx, genre_lst in enumerate(item_genre):
    for g in genre_lst:
        genre_dict['i_' + g][idx] = 1
for g in genres:
    item_df[g] = genre_dict[g]
item_df = item_df.drop(columns=['genre'])
item_df.head()

Unnamed: 0,item_id,title,i_Action,i_Adventure,i_Animation,i_Children's,i_Comedy,i_Crime,i_Documentary,i_Drama,...,i_Film-Noir,i_Horror,i_Musical,i_Mystery,i_Romance,i_Sci-Fi,i_Thriller,i_War,i_Western,i_Other
0,1,Toy Story (1995),0,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
item_df = item_df[item_df['item_id'].isin(data_df['item_id'])]

In [5]:
item_df.head()

Unnamed: 0,item_id,title,i_Action,i_Adventure,i_Animation,i_Children's,i_Comedy,i_Crime,i_Documentary,i_Drama,...,i_Film-Noir,i_Horror,i_Musical,i_Mystery,i_Romance,i_Sci-Fi,i_Thriller,i_War,i_Western,i_Other
0,1,Toy Story (1995),0,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Filter items

print('Filter before:', len(data_df))
filter_before = -1
while filter_before != len(data_df):
    filter_before = len(data_df)
    for stage in ['user_id', 'item_id']:
        val_cnt = data_df[stage].value_counts()
        cnt_df = pd.DataFrame({stage: val_cnt.index, 'cnt': val_cnt.values})
        data_df = pd.merge(data_df, cnt_df, on=stage, how='left')
        data_df = data_df[data_df['cnt'] >= 5].drop(columns=['cnt'])
print('Filter after:', len(data_df))
# item_df = item_df[item_df['item_id'].isin(data_df['item_id'])]


# ### Statistics

# n_users = data_df['user_id'].value_counts().size
# n_items = data_df['item_id'].value_counts().size
n_users = len(np.unique(data_df['user_id'].values))
n_items = len(np.unique(data_df['item_id'].values))
n_clicks = len(data_df)
min_time = data_df['time'].min()
max_time = data_df['time'].max()

np.savez("{}/{}_statistic.npz".format(RAW_PATH, DATASET), n_users=n_users, n_items=n_items, n_clicks=n_clicks)
print("1. statistic保存完毕")


time_format = '%Y-%m-%d'

print('# Users:', n_users)
print('# Items:', n_items)
print('# Interactions:', n_clicks)
print('Time Span: {}/{}'.format(
    datetime.utcfromtimestamp(min_time).strftime(time_format),
    datetime.utcfromtimestamp(max_time).strftime(time_format))
)

Filter before: 1000209
Filter after: 999611
1. statistic保存完毕
# Users: 6040
# Items: 3416
# Interactions: 999611
Time Span: 2000-04-25/2003-02-28


In [7]:
data_df.head()

Unnamed: 0,user_id,item_id,label,time
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [10]:
# # Build Dataset

# ### Interaction data


np.random.seed(2024)
# NEG_ITEMS = 99

out_df = data_df[['user_id', 'item_id', 'time']]
out_df = out_df.drop_duplicates(['user_id', 'item_id', 'time'])
out_df.sort_values(by=['time', 'user_id'], kind='mergesort', inplace=True)
out_df = out_df.reset_index(drop=True)
out_df.head()

Unnamed: 0,user_id,item_id,time
0,6040,858,956703932
1,6040,593,956703954
2,6040,2384,956703954
3,6040,1961,956703977
4,6040,2019,956703977


In [8]:
# # Build Dataset

# ### Interaction data


np.random.seed(2024)
# NEG_ITEMS = 99

out1_df = data_df.copy()
out1_df = out1_df.drop_duplicates(['user_id', 'item_id', 'time'])
out1_df.sort_values(by=['time', 'user_id'], kind='mergesort', inplace=True)
out1_df = out1_df.reset_index(drop=True)
out1_df.head()

Unnamed: 0,user_id,item_id,label,time
0,6040,858,4,956703932
1,6040,593,5,956703954
2,6040,2384,4,956703954
3,6040,1961,4,956703977
4,6040,2019,5,956703977


In [11]:
# reindex (start from 1)

uids = sorted(out_df['user_id'].unique())
user2id = dict(zip(uids, range(1, len(uids) + 1)))
iids = sorted(out_df['item_id'].unique())
item2id = dict(zip(iids, range(1, len(iids) + 1)))

out1_df['user_id'] = out1_df['user_id'].apply(lambda x: user2id[x])
out1_df['item_id'] = out1_df['item_id'].apply(lambda x: item2id[x])
out1_df.head()

Unnamed: 0,user_id,item_id,label,time
0,6040,721,4,956703932
1,6040,550,5,956703954
2,6040,2021,4,956703954
3,6040,1631,4,956703977
4,6040,1689,5,956703977


In [12]:
# reindex (start from 1)

uids = sorted(out_df['user_id'].unique())
user2id = dict(zip(uids, range(1, len(uids) + 1)))
iids = sorted(out_df['item_id'].unique())
item2id = dict(zip(iids, range(1, len(iids) + 1)))

out_df['user_id'] = out_df['user_id'].apply(lambda x: user2id[x])
out_df['item_id'] = out_df['item_id'].apply(lambda x: item2id[x])
out_df.head()

Unnamed: 0,user_id,item_id,time
0,6040,721,956703932
1,6040,550,956703954
2,6040,2021,956703954
3,6040,1631,956703977
4,6040,1689,956703977


In [13]:
# leave one out spliting

clicked_item_set = dict()
for user_id, seq_df in out_df.groupby('user_id'):
    clicked_item_set[user_id] = seq_df['item_id'].values.tolist()

with open('{}/{}_clicked_items.json'.format(RAW_PATH, DATASET), 'w') as f:
    json.dump(clicked_item_set, f)
print("2. clicked set保存完毕")

item_count = np.zeros(shape=n_items+1)
for key, values in clicked_item_set.items():
    for item in values:
        item_count[item] += 1
sorted_idx = item_count.argsort()
np.savez('{}/{}_item_count.npz'.format(RAW_PATH, DATASET), counts=item_count, idxes=sorted_idx[1:])
print("3. 冷热门数据保存完毕")

2. clicked set保存完毕
3. 冷热门数据保存完毕


In [15]:
def generate_dev_test(data_df, rate=0.2):
    user_groups = data_df.groupby('user_id')
    user_ids = list(user_groups.groups.keys())
    counts, gts = {}, {}
    n_users = data_df['user_id'].max()
    neg_avg = 0
    for i in range(1, n_users+1):
        if i not in user_ids:
            continue

        cur_group = user_groups.get_group(i)
        gt_num = int(np.ceil(rate * cur_group['item_id'].count()))
        counts[i] = gt_num  # 保存每个用户gt的个数
        neg_avg += gt_num
        cur_tail = cur_group.tail(gt_num)
        cur_gt = cur_tail['item_id'].values
        gts[i] = cur_gt.tolist()  # 保存每个用户的ft
    with open('{}/{}_count.json'.format(RAW_PATH, DATASET), 'w') as fs:
        json.dump(counts, fs)
    print("4. gt count保存完毕")
    with open('{}/{}_ground_truth.json'.format(RAW_PATH, DATASET), 'w') as fs:
        json.dump(gts, fs)
    print("5. gt保存完毕")

    neg_num = int(np.ceil(neg_avg/n_users) * 10)
    print('neg items的数量为{}'.format(neg_num))
    result_dfs = []
    for idx in range(2):
        result_df = None
        user_groups = data_df.groupby('user_id')
        user_ids = list(user_groups.groups.keys())
        for i in range(1, n_users + 1):
            if i not in user_ids:
                continue
            cur_tail = user_groups.get_group(i).tail(counts[i])
            if result_df is None:
                result_df = cur_tail.copy()
            else:
                result_df = pd.concat([result_df, cur_tail], axis=0)
        result_df = result_df.copy()
        # result_df = data_df.groupby('user_id').tail(1).copy()
        # 去除形成test和dev的数据
        data_df = data_df.drop(result_df.index)
        
        result_df = result_df.groupby('user_id').head(1)
        neg_items = []
        a_items = set(np.arange(1, n_items + 1, 1))
        for i in range(1, n_users+1):
            if i not in user_ids:
                continue
            # neg_num = min(counts[i] * 200, n_items - len(clicked_item_set[i])) - counts[i]
            gt_len = counts[i]
            if gt_len > 10:
                gt_len = 10
            cneg_num = neg_num - gt_len + 1
            unclicked = list(set(clicked_item_set[i]) ^ a_items)
            if len(unclicked) <= cneg_num:
                neg_items.append(gts[i][1:gt_len] + unclicked)
            else:
                neg_items.append(gts[i][1:gt_len] + random.sample(unclicked, cneg_num))
            # else:
            #     neg_items.append(random.sample(unclicked, cneg_num))
        result_df['neg_items'] = neg_items
        result_dfs.append(result_df)
    return result_dfs, data_df


leave_df = out_df.groupby('user_id').head(1)
leave1_df = out1_df.groupby('user_id').head(1)
data_df = out_df.drop(leave_df.index)
data1_df = out1_df.drop(leave1_df.index)

[test_df, dev_df], data_df = generate_dev_test(data_df)
[test1_df, dev1_df], data1_df = generate_dev_test(data1_df)
# real_test_df = test_df.groupby('user_id').head(1)
# real_dev_df = dev_df.groupby('user_id').head(1)

train_df = pd.concat([leave_df, data_df]).sort_index()
train1_df = pd.concat([leave1_df, data1_df]).sort_index()

# # 重置索引
# train_df.reset_index(drop=True, inplace=True)
# test_df.reset_index(drop=True, inplace=True)
# dev_df.reset_index(drop=True, inplace=True)

4. gt count保存完毕
5. gt保存完毕
neg items的数量为340
4. gt count保存完毕
5. gt保存完毕
neg items的数量为340


In [11]:
len(train_df), len(dev_df), len(test_df)

(597337, 6040, 6040)

In [16]:
train_df.to_csv(os.path.join(RAW_PATH, 'train.csv'), sep='\t', index=False)
dev_df.to_csv(os.path.join(RAW_PATH, 'dev.csv'), sep='\t', index=False)
test_df.to_csv(os.path.join(RAW_PATH, 'test.csv'), sep='\t', index=False)
train1_df.to_csv(os.path.join(RAW_PATH, 'train2.csv'), sep='\t', index=False)
dev1_df.to_csv(os.path.join(RAW_PATH, 'dev2.csv'), sep='\t', index=False)
test1_df.to_csv(os.path.join(RAW_PATH, 'test2.csv'), sep='\t', index=False)

In [17]:
train1_df

Unnamed: 0,user_id,item_id,label,time
0,6040,721,4,956703932
1,6040,550,5,956703954
2,6040,2021,4,956703954
3,6040,1631,4,956703977
4,6040,1689,5,956703977
...,...,...,...,...
999041,5484,548,4,1045440226
999042,5484,3343,5,1045440226
999043,5484,761,5,1045440283
999044,5484,1005,5,1045440283


In [18]:
morem_c = train1_df.drop(columns=['time'])

In [19]:
morem_c

Unnamed: 0,user_id,item_id,label
0,6040,721,4
1,6040,550,5
2,6040,2021,4
3,6040,1631,4
4,6040,1689,5
...,...,...,...
999041,5484,548,4
999042,5484,3343,5
999043,5484,761,5
999044,5484,1005,5


In [28]:
unique_users = data_df['user_id'].unique()
unique_items = data_df['item_id'].unique()

num_users = len(unique_users)
num_items = len(unique_items)
user_to_index = {user: idx for idx, user in enumerate(unique_users)}
item_to_index = {item: idx for idx, item in enumerate(unique_items)}
rating_matrix = np.zeros((num_users, num_items))
for index, row in morem_c.iterrows():
    user_idx = user_to_index[row['user_id']]
    item_idx = item_to_index[row['item_id']]
    rating_matrix[user_idx, item_idx] = row['label']

In [29]:
rating_matrix

array([[5., 4., 4., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [3., 0., 0., ..., 0., 0., 0.]])

In [30]:
# 创建DataFrame
rating_df = pd.DataFrame(rating_matrix, 
                         index=unique_users,  # 设置行索引为user_id
                         columns=unique_items)  # 设置列名为item_id
rating_df

Unnamed: 0,550,2021,1631,1689,536,1202,197,2692,3014,1456,...,1300,928,374,1375,1242,835,3338,2220,3237,3334
6040,5.0,4.0,4.0,5.0,4.0,3.0,5.0,5.0,4.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6039,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6037,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6036,5.0,0.0,4.0,5.0,0.0,4.0,2.0,4.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1265,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
349,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2910,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
rating_df.to_csv(os.path.join(RAW_PATH, 'morem_c.csv'), sep='\t', index=False)

In [22]:
item_df

Unnamed: 0,item_id,title,i_Action,i_Adventure,i_Animation,i_Children's,i_Comedy,i_Crime,i_Documentary,i_Drama,...,i_Film-Noir,i_Horror,i_Musical,i_Mystery,i_Romance,i_Sci-Fi,i_Thriller,i_War,i_Western,i_Other
0,1,Toy Story (1995),0,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3948,Meet the Parents (2000),0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3879,3949,Requiem for a Dream (2000),0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3880,3950,Tigerland (2000),0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3881,3951,Two Family House (2000),0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [12]:
train_df.head()

Unnamed: 0,user_id,item_id,time
0,6040,721,956703932
1,6040,550,956703954
2,6040,2021,956703954
3,6040,1631,956703977
4,6040,1689,956703977


In [13]:
test_df.head()

Unnamed: 0,user_id,item_id,time,neg_items
909584,1,552,978824268,"[2298, 649, 546, 1, 1993, 1937, 674, 1315, 157..."
904058,2,1490,978299941,"[1160, 1308, 2123, 1173, 671, 2481, 2688, 345,..."
903814,3,1069,978298231,"[909, 1058, 1064, 1378, 1993, 3056, 100, 3340,..."
903710,4,1039,978294260,"[2547, 876, 1624, 3295, 667, 2667, 267, 1437, ..."
903439,5,482,978245891,"[1440, 1389, 380, 6, 2608, 1380, 2801, 2988, 1..."


In [14]:
dev_df.head()

Unnamed: 0,user_id,item_id,time,neg_items
904125,1,2514,978302124,"[2298, 649, 546, 1, 1993, 1937, 674, 1315, 157..."
904013,2,1960,978299666,"[1160, 1308, 2123, 1173, 671, 2481, 2688, 345,..."
903766,3,2233,978297837,"[909, 1058, 1064, 1378, 1993, 3056, 100, 3340,..."
903706,4,2543,978294230,"[2547, 876, 1624, 765, 2217, 2086, 1030, 1376,..."
903399,5,1593,978245314,"[1440, 1389, 380, 6, 2608, 1380, 2801, 2988, 1..."


In [21]:
item_df['item_id'] = item_df['item_id'].apply(lambda x: item2id[x])

if DATASET == 'ml-1m':
    item_df['i_year'] = item_df['title'].apply(lambda x: int(re.match('.+\((\d{4})\)$', x).group(1)))
    item_df = item_df.drop(columns=['title'])
elif DATASET == 'ml-100k':
    item_df['i_year'] = item_df['i_year'].apply(lambda x: int(str(x).split('-')[-1]) if pd.notnull(x) else 0)
seps = [1900, 1940, 1950, 1960, 1970, 1980, 1985] + list(range(1990, int(item_df['i_year'].max() + 2)))
year_dict = {}
for i, sep in enumerate(seps[:-1]):
    for j in range(seps[i], seps[i + 1]):
        year_dict[j] = i + 1
item_df['i_year'] = item_df['i_year'].apply(lambda x: year_dict[x] if x > 0 else 0)
    
item_df.head()

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/home/wuruxin/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3460, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_836989/1546356384.py", line 1, in <module>
    item_df['item_id'] = item_df['item_id'].apply(lambda x: item2id[x])
  File "/home/wuruxin/.local/lib/python3.8/site-packages/pandas/core/series.py", line 4630, in apply
    return SeriesApply(self, func, convert_dtype, args, kwargs).apply()
  File "/home/wuruxin/.local/lib/python3.8/site-packages/pandas/core/apply.py", line 1025, in apply
    return self.apply_standard()
  File "/home/wuruxin/.local/lib/python3.8/site-packages/pandas/core/apply.py", line 1076, in apply_standard
    mapped = lib.map_infer(
  File "pandas/_libs/lib.pyx", line 2834, in pandas._libs.lib.map_infer
  File "/tmp/ipykernel_836989/1546356384.py", line 1, in <lambda>
    item_df['item_id'] = item_df['item_id'].apply(lambda x: item2id[x])
KeyEr

In [48]:
item_df

Unnamed: 0,item_id,i_Action,i_Adventure,i_Animation,i_Children's,i_Comedy,i_Crime,i_Documentary,i_Drama,i_Fantasy,...,i_Horror,i_Musical,i_Mystery,i_Romance,i_Sci-Fi,i_Thriller,i_War,i_Western,i_Other,i_year
0,1,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,13
1,2,0,1,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,13
2,3,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,13
3,4,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,13
4,5,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3948,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,18
3879,3949,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,18
3880,3950,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,18
3881,3951,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,18


In [49]:
out_df = data_df[['user_id', 'item_id', 'time']]
out_df = out_df.drop_duplicates(['user_id', 'item_id', 'time'])
out_df.sort_values(by=['time', 'user_id'], kind='mergesort', inplace=True)
out_df = out_df.reset_index(drop=True)
out_df.head()

Unnamed: 0,user_id,item_id,time
0,6040,550,956703954
1,6040,2021,956703954
2,6040,1631,956703977
3,6040,1689,956703977
4,6040,536,956704056


In [50]:
# reindex (start from 1)

uids = sorted(out_df['user_id'].unique())
user2id = dict(zip(uids, range(1, len(uids) + 1)))
iids = sorted(out_df['item_id'].unique())
item2id = dict(zip(iids, range(1, len(iids) + 1)))

out_df['user_id'] = out_df['user_id'].apply(lambda x: user2id[x])
out_df['item_id'] = out_df['item_id'].apply(lambda x: item2id[x])
out_df.head()

Unnamed: 0,user_id,item_id,time
0,6040,549,956703954
1,6040,2017,956703954
2,6040,1627,956703977
3,6040,1685,956703977
4,6040,535,956704056


In [53]:
item_df.to_csv(os.path.join(RAW_PATH, 'item_meta.csv'), sep='\t', index=False)

In [27]:
import numpy as np
import pandas as pd

In [28]:
data_df = pd.read_csv('/data1/wrx/test/ReChorus/src/emmr_Grocery_GRU4Rec_5.csv', sep='\t')
data_df

Unnamed: 0.1,Unnamed: 0,user_id,rec_item_list
0,0,14681,[21 10 1 2 19]
1,1,14681,[ 5 21 27 19 0]
2,2,14681,[24 1 28 30 0]
3,3,14681,[15 25 0 20 21]
4,4,14681,[ 6 8 24 0 3]
...,...,...,...
14675,14675,14681,[29 5 1 26 28]
14676,14676,14681,[ 5 12 21 13 0]
14677,14677,14681,[ 0 6 17 1 2]
14678,14678,14681,[ 9 16 15 10 2]


In [29]:
data_df = data_df.drop(columns=['user_id'])
data_df

Unnamed: 0.1,Unnamed: 0,rec_item_list
0,0,[21 10 1 2 19]
1,1,[ 5 21 27 19 0]
2,2,[24 1 28 30 0]
3,3,[15 25 0 20 21]
4,4,[ 6 8 24 0 3]
...,...,...
14675,14675,[29 5 1 26 28]
14676,14676,[ 5 12 21 13 0]
14677,14677,[ 0 6 17 1 2]
14678,14678,[ 9 16 15 10 2]


In [30]:
data_df.rename(columns={'Unnamed: 0':'user_id', 'rec_item_list':'ind'}, inplace=True)
data_df

Unnamed: 0,user_id,ind
0,0,[21 10 1 2 19]
1,1,[ 5 21 27 19 0]
2,2,[24 1 28 30 0]
3,3,[15 25 0 20 21]
4,4,[ 6 8 24 0 3]
...,...,...
14675,14675,[29 5 1 26 28]
14676,14676,[ 5 12 21 13 0]
14677,14677,[ 0 6 17 1 2]
14678,14678,[ 9 16 15 10 2]


In [33]:
data_df.to_csv("result_5_emmr_Grocery_and_Gourmet_Food_GRU4Rec.csv", sep='\t')

In [53]:
dataset = "Clothing_Shoes_and_Jewelry"
k = 5
method = "GRU4Rec"

In [54]:
result_df = pd.read_csv('../res_{}/k{}/emmr_{}_{}_{}.csv'.format(dataset, k, dataset, method, k), sep='\t')

In [55]:
result_df

Unnamed: 0.1,Unnamed: 0,user_id,ind
0,0,39387,[20 2 8 19 0]
1,1,39387,[ 0 17 2 4 19]
2,2,39387,[ 5 0 10 12 6]
3,3,39387,[ 0 20 8 6 4]
4,4,39387,[19 15 8 5 11]
...,...,...,...
39381,39381,39387,[20 14 1 12 15]
39382,39382,39387,[ 3 0 15 17 13]
39383,39383,39387,[14 17 16 10 13]
39384,39384,39387,[18 20 15 19 12]


In [56]:
result_df = result_df.drop(columns=['user_id'])
result_df

Unnamed: 0.1,Unnamed: 0,ind
0,0,[20 2 8 19 0]
1,1,[ 0 17 2 4 19]
2,2,[ 5 0 10 12 6]
3,3,[ 0 20 8 6 4]
4,4,[19 15 8 5 11]
...,...,...
39381,39381,[20 14 1 12 15]
39382,39382,[ 3 0 15 17 13]
39383,39383,[14 17 16 10 13]
39384,39384,[18 20 15 19 12]


In [57]:
result_df.rename(columns={'Unnamed: 0':'user_id'}, inplace=True)
result_df

Unnamed: 0,user_id,ind
0,0,[20 2 8 19 0]
1,1,[ 0 17 2 4 19]
2,2,[ 5 0 10 12 6]
3,3,[ 0 20 8 6 4]
4,4,[19 15 8 5 11]
...,...,...
39381,39381,[20 14 1 12 15]
39382,39382,[ 3 0 15 17 13]
39383,39383,[14 17 16 10 13]
39384,39384,[18 20 15 19 12]


In [58]:
result_df.to_csv('../res_{}/k{}/emmr_{}_{}_{}.csv'.format(dataset, k, dataset, method, k), sep='\t')