In [107]:
import pandas as pd
import numpy as np

In [108]:
book_df_origin = pd.read_pickle('./small_Books.pkl')
meta_df_origin = pd.read_pickle('./small_meta_Books.pkl')

In [109]:
book_df = book_df_origin.copy()
meta_df = meta_df_origin.copy()

In [110]:
print(meta_df.columns)
print(book_df.columns)

Index(['title', 'parent_asin', 'genres', 'imdbId', 'tmdbId', 'startYear',
       'titleType', 'runtimeMinutes', 'endYear', 'publish_time'],
      dtype='object')
Index(['user_id', 'parent_asin', 'rating', 'timestamp'], dtype='object')


# 1. seperate cold-start and warm-start items according to item publish time

In [111]:
original_title2id_dict = {}
id_num = 0 
unique_titles = meta_df['title'].unique().tolist()
for title in unique_titles:
    original_title2id_dict[title] = id_num
    id_num+=1
    

In [112]:
meta_df['title_idx'] = meta_df['title'].apply(lambda x:original_title2id_dict[x])

In [113]:
print(book_df.columns, meta_df.columns)

Index(['user_id', 'parent_asin', 'rating', 'timestamp'], dtype='object') Index(['title', 'parent_asin', 'genres', 'imdbId', 'tmdbId', 'startYear',
       'titleType', 'runtimeMinutes', 'endYear', 'publish_time', 'title_idx'],
      dtype='object')


In [114]:
book_df['timestamp'] = pd.to_datetime(book_df['timestamp'], unit='s')

In [115]:
partial_meta_df = meta_df.copy()[['title','parent_asin','title_idx','publish_time']]

In [116]:
merged_book_df = pd.merge(left = book_df, right=partial_meta_df,how = 'left',on='parent_asin', )

In [117]:
print(merged_book_df.columns)

Index(['user_id', 'parent_asin', 'rating', 'timestamp', 'title', 'title_idx',
       'publish_time'],
      dtype='object')


In [118]:
merged_book_df['publish_time'].replace('\\N', np.nan, inplace=True)
merged_book_df.dropna(inplace=True)
merged_book_df['publish_time'] = merged_book_df['publish_time'].astype(int)

In [119]:
data = merged_book_df.copy()

In [120]:
data.columns = ['uid',  'iid','rating', 'timestamp', 'title', 'title_idx','publish_time']

In [121]:
item_publist_earliest_time = data.groupby('title_idx').agg({'publish_time':'min'})


In [122]:
print(item_publist_earliest_time)

           publish_time
title_idx              
0                  1995
1                  1995
2                  1995
3                  1995
4                  1995
...                 ...
87377              2022
87378              2023
87379              2021
87380              1968
87381              2023

[84055 rows x 1 columns]


In [123]:
cold_publish_data = item_publist_earliest_time[item_publist_earliest_time['publish_time'] >= 2023]
cold_publish_items = cold_publish_data.index.tolist()

In [124]:
cold_data = data[data['title_idx'].isin(cold_publish_items)]
print(cold_data['timestamp'].min())
print(cold_data['publish_time'].min())

2019-02-07 22:09:08
2023


In [125]:
item_review_earliest_time = data.groupby('title_idx').agg({'timestamp':'min'})

In [126]:
cold_review_data = item_review_earliest_time[item_review_earliest_time['timestamp'] >= pd.to_datetime('2023-01-01')]
cold_review_items = cold_review_data.index.tolist()

In [127]:
print(len(cold_publish_items))
print(len(cold_review_items))

835
4710


In [128]:
cold_items = set(cold_review_items).intersection(set(cold_publish_items))

In [129]:
print(len(cold_items))

827


# 2.get warm_train, warm_test, cold_test data

In [130]:
warm_start_time = '2020-01-01'
cold_start_time = '2023-06-01'
cold_end_time = '2023-07-30'

In [131]:
data = merged_book_df.copy()
data.columns = ['uid',  'iid','rating', 'timestamp', 'title', 'title_idx','publish_time']
cold_data = data[data['title_idx'].isin(cold_items)]
uncold_data = data[~data['title_idx'].isin(cold_items)]

In [132]:
# delete cold data' interactions
cold_data_test = cold_data[(cold_data['timestamp'] >= pd.to_datetime(cold_start_time)) & (cold_data['timestamp'] <= pd.to_datetime(cold_end_time))]
cold_items_test = cold_data_test['title_idx'].unique().tolist()

In [133]:
old_items = uncold_data['iid'].unique().tolist()
preserve_warm_items = list((set(old_items)))

In [134]:
preserve_warm_data = uncold_data[uncold_data['iid'].isin(preserve_warm_items)]

In [135]:
# process only warm_data
preserve_warm_data_test_index = (preserve_warm_data['timestamp'] >= pd.to_datetime(cold_start_time)) & (preserve_warm_data['timestamp'] <= pd.to_datetime(cold_end_time))
preserve_warm_data_train_index= (preserve_warm_data['timestamp'] >= pd.to_datetime(warm_start_time)) & (preserve_warm_data['timestamp'] < pd.to_datetime(cold_start_time))
preserve_warm_data_test = preserve_warm_data[preserve_warm_data_test_index]
preserve_warm_data_train = preserve_warm_data[preserve_warm_data_train_index]
print(preserve_warm_data_train.shape[0])
print(preserve_warm_data_test.shape[0])

4245008
146745


In [136]:
preserve_warm_data_train['env'] = 0 # warm train
preserve_warm_data_test['env'] = 1 # warm test
cold_data_test['env'] = 3 # cold
all_data = pd.concat([preserve_warm_data_train,preserve_warm_data_test,cold_data_test])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  preserve_warm_data_train['env'] = 0 # warm train
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  preserve_warm_data_test['env'] = 1 # warm test
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cold_data_test['env'] = 3 # cold


In [137]:
def filter_users(all_data):
    user_cnt = all_data['uid'].value_counts()
    warm_users = user_cnt[user_cnt >= 10].index.tolist()
    all_data = all_data[all_data['uid'].isin(warm_users)]
    return all_data

In [138]:
def filer_items(all_data):
    item_cnt = all_data['iid'].value_counts()
    filer_items = item_cnt[item_cnt >=10].index.tolist()
    all_data = all_data[all_data['iid'].isin(filer_items)]

    # filter items again, to make sure the warm items in the test set are warm---delete items with less than 10 inters in the training set
    train_warm_cnt = all_data[all_data['env'] == 0]['iid'].value_counts()
    delete_items = train_warm_cnt[train_warm_cnt < 10].index.unique().tolist()
    all_data = all_data[~all_data['iid'].isin(delete_items)]
    return all_data

In [139]:
all_data = filer_items(all_data)
all_data = filter_users(all_data)

In [140]:
print(all_data.shape[0])
print('warm train inter num',all_data[all_data['env'] == 0].shape[0]) # warm_train
print('warm evaluate inter num',all_data[all_data['env'] == 1].shape[0]) # warm_test
print('cold evaluate inter num',all_data[all_data['env'] == 3].shape[0]) # cold (cold_test)

4252144
warm train inter num 4106115
warm evaluate inter num 141439
cold evaluate inter num 4590


In [141]:
cold_data = all_data[all_data['env'] == 3]

In [142]:
cnt = cold_data['iid'].value_counts()

# 3.save data for TALLRec and CF model

In [143]:
# parent_asin to iid
parent_asin2iid_dict = {}
num = 0
for parent_asin in all_data['iid'].unique().tolist():
    parent_asin2iid_dict[parent_asin] = num
    num += 1
all_data['iid'] = all_data['iid'].apply(lambda x: parent_asin2iid_dict[x])

In [144]:
print(len(parent_asin2iid_dict))

17402


In [145]:
# parent_asin to iid
user_id2uid_dict = {}
num = 0
for user_id in all_data['uid'].unique().tolist():
    user_id2uid_dict[user_id] = num
    num += 1
all_data['uid'] = all_data['uid'].apply(lambda x: user_id2uid_dict[x])

In [146]:
print(len(user_id2uid_dict))

26533


In [147]:
print(all_data.columns)
print(all_data['publish_time'].isnull().sum())


Index(['uid', 'iid', 'rating', 'timestamp', 'title', 'title_idx',
       'publish_time', 'env'],
      dtype='object')
0


In [148]:
all_data.sort_values(by = 'timestamp', inplace=True, ascending=True)
all_data['label'] = all_data['rating'].apply(lambda x: 1 if x>=4 else 0)
all_data.drop_duplicates(inplace=True)

In [149]:
u_inter_all = all_data.groupby('uid').agg({'iid':list, 'label':list, 'title':list, 'timestamp':list, 'env':list})
print(u_inter_all.head(1))

                                                   iid  \
uid                                                      
0    [24, 16, 36, 39, 9, 47, 44, 8, 7, 12, 28, 13, ...   

                                                 label  \
uid                                                      
0    [1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, ...   

                                                 title  \
uid                                                      
0    [Shaun of the Dead (2004), Amelie (Fabuleux de...   

                                             timestamp  \
uid                                                      
0    [2020-01-19 17:08:33, 2020-01-19 17:10:50, 202...   

                                                   env  
uid                                                     
0    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  


In [150]:
import copy
def deal_with_each_u(x,u):
    items = np.array(x.iid)
    labels = np.array(x.label)
    titles = np.array(x.title)
    timestamp = np.array(x.timestamp)
    env = np.array(x.env)
    his = [0] # adding a '0' by default
    his_title = ['']
    results = []
    for i in range(items.shape[0]):
        results.append((u, items[i], timestamp[i], np.array(his), copy.copy(his_title),titles[i], labels[i],env[i]))
        # training data
        if labels[i] > 0: # positive 
            his.append(items[i])
            his_title.append(titles[i])
    return results

In [151]:
results = []
for u in u_inter_all.index:
    results.extend(deal_with_each_u(u_inter_all.loc[u],u))

In [152]:
u_, i_, time_, label_, his_, his_title, title_,env_ = [],[],[],[],[],[],[],[]
for re_ in results:
        u_.append(re_[0])
        i_.append(re_[1])
        time_.append(re_[2])
        his_.append(re_[3][-15:])
        his_title.append(re_[4][-15:])
        title_.append(re_[5])
        label_.append(re_[6])
        env_.append(re_[7])

In [153]:
all_data = pd.DataFrame({"uid":u_,'iid':i_,'label':label_, 'timestamp': time_ , 'his':his_,'his_title':his_title,'title':title_,'env':env_})

In [154]:
print(all_data.shape)
all_data.sort_values(by = 'timestamp', ascending=True,inplace=True)

(4252144, 8)


In [158]:
import random
def under_sample(df, p):
    items = df['iid'].unique().tolist()
    random.seed(2024)
    random.shuffle(items)
    preserve_items = items[ : int(len(items) * p)]
    new_df = df[df['iid'].isin(preserve_items)]
    return new_df

In [159]:
train_data = all_data[all_data['env'] == 0]
warm_test = all_data[all_data['env'] == 1]
cold_test = all_data[all_data['env'] == 3]
#  Under-sample evaluation data for warm items to make their quantity close to that of cold items,
#  as we use the metrics from the mixed validation set to early stop, following previous work.
warm_test = under_sample(warm_test, p = 0.1)
print('warm train inter num', train_data.shape[0]) # warm_train
print('warm evaluate inter num', warm_test.shape[0]) # warm_test
print('cold evaluate inter num', cold_test.shape[0]) # cold (cold_test)
mix_test = pd.concat((warm_test, cold_test))
unique_iids = mix_test['iid'].unique().tolist()
valid_data = []
test_data = []
for iid in unique_iids:
    iid_data = mix_test.loc[mix_test['iid'] == iid]
    split_index = len(iid_data) // 2  
    valid_data.append(iid_data.iloc[:split_index])
    test_data.append(iid_data.iloc[split_index:])
valid_all = pd.concat(valid_data)
test_all = pd.concat(test_data)
warm_valid = valid_all[valid_all['env'] == 1]
cold_valid = valid_all[valid_all['env'] == 3]
warm_test = test_all[test_all['env'] == 1]
cold_test = test_all[test_all['env'] == 3]

warm train inter num 4106115
warm evaluate inter num 14359
cold evaluate inter num 4590


In [160]:
train_data.to_pickle('./train_1.pkl')  # train_warm
valid_all.to_pickle('./valid_0.pkl') # valid_mix
test_all.to_pickle('./test_0.pkl') # valid_mix
warm_valid.to_pickle('./valid_1.pkl') # valid_warm
cold_valid.to_pickle('./valid_2.pkl') # valid_cold
warm_test.to_pickle('./test_1.pkl') # test_warm
cold_test.to_pickle('./test_2.pkl')  # test_cold

