In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the two raw tables from local pickle files (presumably the user-item
# interactions and the item metadata — confirm against the data source).
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
book_df_origin = pd.read_pickle('./small_Books.pkl')
meta_df_origin = pd.read_pickle('./small_meta_Books.pkl')

In [None]:
# Work on copies so the freshly loaded frames stay untouched.
book_df, meta_df = book_df_origin.copy(), meta_df_origin.copy()

In [None]:
# Show which columns each table provides (metadata first, then interactions).
for frame in (meta_df, book_df):
    print(frame.columns)

# 1. Separate cold-start and warm-start items according to item publish time

In [None]:
# Give every distinct title a consecutive integer id, numbered in the order
# the titles first appear in meta_df.
unique_titles = meta_df['title'].unique().tolist()
original_title2id_dict = {title: idx for idx, title in enumerate(unique_titles)}
    

In [None]:
# Attach the integer title id to every metadata row.
meta_df['title_idx'] = [original_title2id_dict[title] for title in meta_df['title']]

In [None]:
# Re-inspect both column sets now that meta_df carries `title_idx`.
print(book_df.columns, meta_df.columns)

In [None]:
# Interpret the raw timestamps as seconds since the Unix epoch.
book_df = book_df.assign(timestamp=pd.to_datetime(book_df['timestamp'], unit='s'))

In [None]:
# Only these metadata columns are needed for the join with the interactions.
meta_columns = ['title', 'parent_asin', 'title_idx', 'publish_time']
partial_meta_df = meta_df[meta_columns].copy()

In [None]:
# Left join: every interaction row is kept, even when its parent_asin has no
# matching metadata row (those rows get NaN in the metadata columns).
merged_book_df = book_df.merge(partial_meta_df, on='parent_asin', how='left')

In [None]:
# Inspect the column order of the merged frame; the positional rename of
# `data.columns` further down depends on it.
print(merged_book_df.columns)

In [None]:
# Clean `publish_time`:
#   1. the literal string '\N' is turned into NaN,
#   2. every row that has a NaN in ANY column is dropped,
#   3. the remaining publish_time values are cast to int.
# The replacement is assigned back rather than done with
# `df[col].replace(..., inplace=True)`: that chained in-place call mutates a
# temporary under pandas Copy-on-Write, leaving '\N' in the frame and making
# the int cast fail.
merged_book_df = (
    merged_book_df
    .assign(publish_time=lambda frame: frame['publish_time'].replace('\\N', np.nan))
    .dropna()
    .astype({'publish_time': int})
)

In [None]:
# Work on a copy so that renaming columns below does not touch merged_book_df.
data = merged_book_df.copy()

In [None]:
# Positional rename to the short column names used in the rest of the notebook.
# NOTE(review): this relies on the merged frame having exactly these seven
# columns in this order — confirm against the printed column list.
short_names = ['uid', 'iid', 'rating', 'timestamp', 'title', 'title_idx', 'publish_time']
data = data.set_axis(short_names, axis=1)

In [None]:
# Smallest publish_time recorded for each title, indexed by title_idx.
item_publist_earliest_time = data.groupby('title_idx')[['publish_time']].min()


In [None]:
# Eyeball the per-title earliest publish_time table.
print(item_publist_earliest_time)

In [None]:
# Titles whose earliest publish_time is 2023 or later are cold by publish time.
published_late = item_publist_earliest_time['publish_time'] >= 2023
cold_publish_data = item_publist_earliest_time.loc[published_late]
cold_publish_items = cold_publish_data.index.tolist()

In [None]:
# Sanity check: earliest review timestamp and earliest publish_time among the
# interactions of the publish-time-cold titles.
cold_data = data.loc[data['title_idx'].isin(cold_publish_items)]
for column in ('timestamp', 'publish_time'):
    print(cold_data[column].min())

In [None]:
# Timestamp of the first recorded interaction for each title, indexed by title_idx.
item_review_earliest_time = data.groupby('title_idx')[['timestamp']].min()

In [None]:
# Titles whose first interaction happened on or after 2022-10-01 are cold by
# review time.
first_review_cutoff = pd.to_datetime('2022-10-01')
reviewed_late = item_review_earliest_time['timestamp'] >= first_review_cutoff
cold_review_data = item_review_earliest_time.loc[reviewed_late]
cold_review_items = cold_review_data.index.tolist()

In [None]:
# Number of titles that are cold by publish time and by first-review time.
print(len(cold_publish_items))
print(len(cold_review_items))

In [None]:
# A title is cold only if it satisfies BOTH criteria (publish time and first review).
cold_items = set(cold_review_items) & set(cold_publish_items)

In [None]:
# Number of titles that pass both cold-start criteria.
print(len(cold_items))

# 2. Get warm_train, warm_test, and cold_test data

In [None]:
# Time boundaries, parsed with pd.to_datetime where they are used below.
warm_start_time = '2020-01-01'  # earliest timestamp kept in the warm training split (inclusive)
cold_start_time = '2023-01-01'  # training stops before this date; both evaluation windows start here
cold_end_time = '2023-10-01'  # last timestamp of the evaluation windows (inclusive)

In [None]:
# Rebuild `data` from the merged frame and split it into the interactions of
# cold titles and of all remaining titles.
data = merged_book_df.copy()
data.columns = ['uid', 'iid', 'rating', 'timestamp', 'title', 'title_idx', 'publish_time']
is_cold_title = data['title_idx'].isin(cold_items)
cold_data = data[is_cold_title]
uncold_data = data[~is_cold_title]

In [None]:
# Keep only the cold-title interactions inside the evaluation window
# [cold_start_time, cold_end_time] (both ends inclusive); everything else is dropped.
eval_start = pd.to_datetime(cold_start_time)
eval_end = pd.to_datetime(cold_end_time)
cold_data_test = cold_data[cold_data['timestamp'].between(eval_start, eval_end)]
cold_items_test = cold_data_test['title_idx'].unique().tolist()

In [None]:
# Every item id that occurs in the non-cold interactions is preserved as a warm item.
old_items = uncold_data['iid'].unique().tolist()
# `old_items` is already unique, so the set round-trip only changes the order.
preserve_warm_items = list((set(old_items)))

In [None]:
# Restrict the non-cold interactions to the preserved warm items (currently all of them).
preserve_warm_data = uncold_data[uncold_data['iid'].isin(preserve_warm_items)]

In [None]:
# Split the warm interactions by time:
#   train = [warm_start_time, cold_start_time)
#   test  = [cold_start_time, cold_end_time]
warm_ts = preserve_warm_data['timestamp']
train_begin = pd.to_datetime(warm_start_time)
test_begin = pd.to_datetime(cold_start_time)
test_finish = pd.to_datetime(cold_end_time)
preserve_warm_data_test_index = (warm_ts >= test_begin) & (warm_ts <= test_finish)
preserve_warm_data_train_index = (warm_ts >= train_begin) & (warm_ts < test_begin)
preserve_warm_data_test = preserve_warm_data[preserve_warm_data_test_index]
preserve_warm_data_train = preserve_warm_data[preserve_warm_data_train_index]
print(len(preserve_warm_data_train))
print(len(preserve_warm_data_test))

In [None]:
# Tag every split with an `env` code and stack them into one frame:
#   0 = warm train, 1 = warm test, 3 = cold test.
# `assign` returns a new frame, so no column is written onto a boolean-filtered
# slice (which raises SettingWithCopyWarning).
preserve_warm_data_train = preserve_warm_data_train.assign(env=0)  # warm train
preserve_warm_data_test = preserve_warm_data_test.assign(env=1)  # warm test
cold_data_test = cold_data_test.assign(env=3)  # cold
all_data = pd.concat([preserve_warm_data_train, preserve_warm_data_test, cold_data_test])


In [None]:
def filter_users(all_data, min_inter=50):
    """Keep only the interactions of sufficiently active users.

    Args:
        all_data: DataFrame with a 'uid' column, one row per interaction.
        min_inter: Minimum number of rows a user must have in ``all_data``
            to be kept. Defaults to 50, the previously hard-coded threshold.

    Returns:
        The rows of ``all_data`` whose 'uid' occurs at least ``min_inter`` times.
    """
    user_cnt = all_data['uid'].value_counts()
    warm_users = user_cnt[user_cnt >= min_inter].index
    return all_data[all_data['uid'].isin(warm_users)]

In [None]:
def filer_items(all_data, min_inter=50, min_train_inter=50):
    """Drop rarely-interacted items and warm items that are not warm in training.

    Two passes are applied:

    1. Items with fewer than ``min_inter`` rows in ``all_data`` (all ``env``
       values together) are removed.
    2. Among the items that occur in the warm splits (``env`` 0 = warm train,
       ``env`` 1 = warm test), those with fewer than ``min_train_inter`` rows
       in the warm training split are removed. An item with no training rows
       at all counts as 0 and is removed too, so no warm-test item survives
       without training interactions. Items that occur only with other
       ``env`` values (the cold items, ``env`` 3) are not touched by this pass.

    Args:
        all_data: DataFrame with 'iid' and 'env' columns, one row per interaction.
        min_inter: Overall per-item threshold. Defaults to 50.
        min_train_inter: Per-item threshold inside the warm training split.
            Defaults to 50.

    Returns:
        The filtered DataFrame.
    """
    item_cnt = all_data['iid'].value_counts()
    frequent_items = item_cnt[item_cnt >= min_inter].index
    all_data = all_data[all_data['iid'].isin(frequent_items)]

    # Count training rows for every warm item; reindexing with fill_value=0
    # makes items that never appear in training show up with a count of zero.
    train_warm_cnt = all_data.loc[all_data['env'] == 0, 'iid'].value_counts()
    warm_items = all_data.loc[all_data['env'].isin([0, 1]), 'iid'].unique()
    train_warm_cnt = train_warm_cnt.reindex(warm_items, fill_value=0)
    delete_items = train_warm_cnt[train_warm_cnt < min_train_inter].index
    return all_data[~all_data['iid'].isin(delete_items)]

In [None]:
# Alternate the user filter and the item filter for five rounds, printing the
# number of remaining interactions before the first round and after each one.
print(len(all_data))
for round_idx in range(5):
    all_data = filer_items(filter_users(all_data))
    print(len(all_data))

In [None]:
# Report the total number of interactions and the size of each env split.
print(len(all_data))
split_captions = (
    ('warm train inter num', 0),      # warm_train
    ('warm evaluate inter num', 1),   # warm_test
    ('cold evaluate inter num', 3),   # cold (cold_test)
)
for caption, env_code in split_captions:
    print(caption, all_data[all_data['env'] == env_code].shape[0])

In [None]:
# Cold-item interactions that survived the filtering (env code 3).
cold_data = all_data[all_data['env'] == 3]

In [None]:
# Number of remaining interactions per cold item.
cnt = cold_data['iid'].value_counts()

# 3. Save data for TALLRec and CF models

In [None]:
# Re-index items: map each original item id (parent_asin) to a consecutive
# integer, numbered from 0 in order of first appearance in all_data.
parent_asin2iid_dict = {
    parent_asin: new_id
    for new_id, parent_asin in enumerate(all_data['iid'].unique().tolist())
}
all_data['iid'] = all_data['iid'].map(parent_asin2iid_dict)

In [None]:
# Number of distinct items after filtering.
print(len(parent_asin2iid_dict))

In [None]:
# Re-index users: map each original user id to a consecutive integer,
# numbered from 0 in order of first appearance in all_data.
user_id2uid_dict = {
    user_id: new_id
    for new_id, user_id in enumerate(all_data['uid'].unique().tolist())
}
all_data['uid'] = all_data['uid'].map(user_id2uid_dict)

In [None]:
# Number of distinct users after filtering.
print(len(user_id2uid_dict))

In [None]:
# Sanity check: list the columns and count missing publish_time values.
print(all_data.columns)
print(all_data['publish_time'].isnull().sum())


In [None]:
# Order the interactions chronologically, derive a binary label
# (1 when rating >= 4, otherwise 0) and remove exact duplicate rows.
all_data = all_data.sort_values(by='timestamp', ascending=True)
all_data['label'] = (all_data['rating'] >= 4).astype('int64')
all_data = all_data.drop_duplicates()

In [None]:
# One row per user; each listed column is collected into a list that keeps the
# row order of all_data.
list_columns = ['iid', 'label', 'title', 'timestamp', 'env']
u_inter_all = all_data.groupby('uid').agg({column: list for column in list_columns})
print(u_inter_all.head(1))

In [None]:
import copy
def deal_with_each_u(x,u):
    """Turn one user's interaction lists into per-interaction samples.

    Args:
        x: Object exposing the attributes ``iid``, ``label``, ``title``,
            ``timestamp`` and ``env`` as equally long sequences.
        u: The user id stored in every emitted sample.

    Returns:
        A list with one tuple per interaction:
        ``(u, item, timestamp, history_ids, history_titles, title, label, env)``.
        The history holds the positively-labelled items that precede the
        interaction in the sequence; it always starts with the placeholder
        id ``0`` and the placeholder title ``''``.
    """
    item_ids = np.array(x.iid)
    label_arr = np.array(x.label)
    title_arr = np.array(x.title)
    time_arr = np.array(x.timestamp)
    env_arr = np.array(x.env)

    # NOTE(review): the placeholder id 0 equals the first id handed out by the
    # item re-indexing in this notebook — confirm that downstream code can
    # tell the padding entry from that real item.
    pos_ids = [0]
    pos_titles = ['']
    samples = []
    for pos, item in enumerate(item_ids):
        # Snapshot the history BEFORE the current interaction is added to it.
        samples.append((
            u, item, time_arr[pos],
            np.array(pos_ids), list(pos_titles),
            title_arr[pos], label_arr[pos], env_arr[pos],
        ))
        # Only positive interactions extend the history.
        if label_arr[pos] > 0:
            pos_ids.append(item)
            pos_titles.append(title_arr[pos])
    return samples

In [None]:
# Flatten the per-user sample lists into one list, following the order of
# u_inter_all's index.
results = [
    sample
    for u in u_inter_all.index
    for sample in deal_with_each_u(u_inter_all.loc[u], u)
]

In [None]:
# Unpack the sample tuples into one list per output column. Both history
# fields are truncated to their 15 most recent entries.
u_ = [sample[0] for sample in results]
i_ = [sample[1] for sample in results]
time_ = [sample[2] for sample in results]
his_ = [sample[3][-15:] for sample in results]
his_title = [sample[4][-15:] for sample in results]
title_ = [sample[5] for sample in results]
label_ = [sample[6] for sample in results]
env_ = [sample[7] for sample in results]

In [None]:
# Rebuild all_data as one row per sample, now carrying the truncated
# interaction history (`his`, `his_title`) of the user at that point.
all_data = pd.DataFrame({"uid":u_,'iid':i_,'label':label_, 'timestamp': time_ , 'his':his_,'his_title':his_title,'title':title_,'env':env_})

In [None]:
# Show the sample table's shape, then restore global chronological order
# (the samples were generated user by user).
print(all_data.shape)
all_data = all_data.sort_values(by='timestamp', ascending=True)

In [None]:
import random
def under_sample(df, p, seed=2024):
    """Keep the interactions of a reproducible random subset of items.

    The distinct item ids are shuffled with a generator seeded by ``seed`` and
    the first ``int(n_items * p)`` of them are kept.

    A private ``random.Random`` instance is used instead of re-seeding the
    module-level generator, so calling this function does not reset the global
    random state. For the same seed the shuffle is identical to
    ``random.seed(seed); random.shuffle(items)``.

    Args:
        df: DataFrame with an 'iid' column.
        p: Fraction of distinct items to keep, between 0 and 1.
        seed: Seed of the shuffle. Defaults to 2024.

    Returns:
        The rows of ``df`` whose 'iid' is among the kept items.
    """
    items = df['iid'].unique().tolist()
    random.Random(seed).shuffle(items)
    preserve_items = items[: int(len(items) * p)]
    return df[df['iid'].isin(preserve_items)]

In [None]:
train_data = all_data[all_data['env'] == 0]
warm_test = all_data[all_data['env'] == 1]
cold_test = all_data[all_data['env'] == 3]

# Under-sample evaluation data for warm items to make their quantity close to that of cold items,
# as we use the metrics from the mixed validation set to early stop, following previous work.
warm_test = under_sample(warm_test, p=0.02)

print('warm train inter num', train_data.shape[0])  # warm_train
print('warm evaluate inter num', warm_test.shape[0])  # warm_test
print('cold evaluate inter num', cold_test.shape[0])  # cold (cold_test)
mix_test = pd.concat((warm_test, cold_test))
unique_iids = mix_test['iid'].unique().tolist()
valid_data = []
test_data = []
# One groupby pass replaces a full boolean scan of mix_test per item.
# sort=False yields the items in order of first appearance (the same order as
# unique_iids) and groupby keeps the row order inside every group, so the
# resulting splits are unchanged.
for iid, iid_data in mix_test.groupby('iid', sort=False):
    n_rows = len(iid_data)
    env_code = iid_data['env'].iloc[0]
    if env_code == 1:  # For warm items: first half -> valid, second half -> test
        split_index_valid = n_rows // 2
        valid_data.append(iid_data.iloc[:split_index_valid])
        test_data.append(iid_data.iloc[split_index_valid:])
    elif env_code == 3:  # For cold items: first quarter -> valid, second quarter -> test
        # NOTE(review): the second half of each cold item's rows is used in
        # neither split — confirm this is intentional.
        split_index_valid = n_rows // 4
        split_index_test = n_rows // 2
        valid_data.append(iid_data.iloc[:split_index_valid])
        test_data.append(iid_data.iloc[split_index_valid:split_index_test])
valid_all = pd.concat(valid_data)
test_all = pd.concat(test_data)

# Separate warm and cold validation and test sets
warm_valid = valid_all[valid_all['env'] == 1]
cold_valid = valid_all[valid_all['env'] == 3]
warm_test = test_all[test_all['env'] == 1]
cold_test = test_all[test_all['env'] == 3]

In [None]:
# Persist every split as a pickle file in the working directory.
# File suffix: 0 = train / mixed (warm + cold), 1 = warm only, 2 = cold only.
train_data.to_pickle('./train0.pkl')  # train_warm
valid_all.to_pickle('./valid0.pkl') # valid_mix
test_all.to_pickle('./test0.pkl') # test_mix
warm_valid.to_pickle('./valid1.pkl') # valid_warm
cold_valid.to_pickle('./valid2.pkl') # valid_cold
warm_test.to_pickle('./test1.pkl') # test_warm
cold_test.to_pickle('./test2.pkl')  # test_cold
