In [1]:
from itertools import groupby
import os
import json
import datetime
import pickle
from unicodedata import category
import pandas as pd
import numpy as np
from sklearn import preprocessing

filter_min = 5
dividing_line = 0.8

cat = 'Software'
data_directory = 'datasets/' + cat + '/'
data_path = (data_directory+'Software.json')

users_id, items_id, ratings, reviews, times = [],[],[],[],[]
np.random.seed(2022)

# chang the time format
def str_to_days(s):
    st_date = datetime.date(1970, 1, 1)
    cur_date = datetime.date(
        int(s.split(', ')[1]), int(s.split(', ')[0].split(' ')[0]),
        int(s.split(', ')[0].split(' ')[1]))
    return (cur_date - st_date).days

with open(data_path,'r') as f:
    for line in f:
        js = json.loads(line)
        if str(js['reviewerID']) == 'unknown':
            print("unknown")
            continue
        if str(js['asin']) == 'unknown':
            print("unknown2")
            continue
        try:
            reviews.append(js['reviewText'])
        except KeyError:
            continue
        users_id.append(str(js['reviewerID']))
        items_id.append(str(js['asin']) )
        ratings.append(float(js['overall']))
        times.append((int(js['unixReviewTime'])))
        # times.append(str_to_days(js['reviewTime']))

data = pd.DataFrame({
    'user_id': pd.Series(users_id),
    'item_id': pd.Series(items_id),
    'ratings': pd.Series(ratings),
    'reviews': pd.Series(reviews),
    'times': pd.Series(times)
})[['user_id', 'item_id', 'ratings', 'reviews', 'times']]


print("============ %s ============" % data_path)
print("================= raw info =============================")
print("#users: %d" % len(data.user_id.unique()))
print("#items: %d" % len(data.item_id.unique()))
print("#actions: %d" % len(data))

#users: 1826
#items: 802
#actions: 12804


In [2]:



# =====================================================================================
# drop duplicated user-item pairs
data.drop_duplicates(subset=['user_id','item_id'], keep='first', inplace=True)

# discard cold-start items
count_i = data.groupby('item_id').user_id.count()
item_keep = count_i[count_i >= filter_min].index
data = data[data['item_id'].isin(item_keep)]

# discard cold-start users
count_u = data.groupby('user_id').item_id.count()
user_keep = count_u[count_u >= filter_min].index
data = data[data['user_id'].isin(user_keep)]

print("========================================================")
print("============== drop some data ==========================")
# output statistical information
n = len(data.user_id.unique())
m = len(data.item_id.unique())
p = len(data)
print("#users: %d" % n)
print("#items: %d" % m)
print("#actions: %d" % p)
print("density: %.4f" % (p/n/m))

count_u = data.groupby(['user_id']).item_id.count()
print('sequence length:')
print(count_u.describe())

# =====================================================================================
# sort by time
data.sort_values(by=['times'], kind='mergesort', inplace=True)

# =====================================================================================
# find the time dividing line
time = list(data.times)
div_line = time[int(len(time)*dividing_line)]

df1 = data[data.times<=div_line]
df2 = data[data.times>div_line]
df1_user = set(df1.user_id)
df2_user = set(df2.user_id)

user_m = set(df1_user & df2_user)
user_ml = set(df1_user | user_m)

df_m = data[data.user_id.isin(list(user_m))]
df_ml = data[data.user_id.isin(list(user_ml))]

print("==============================================================")
print("============= find the time dividing line (ml) ===============")
print("#users: %d" % len(user_ml))
print("#items: %d" % len(df_ml.item_id.unique()))
print("#actions: %d" % len(df_ml))
print("density: %.4f" % (len(df_ml)/len(user_ml)/len(df_ml.item_id.unique())))

count_dfml = df_ml.groupby(['user_id']).item_id.count()
print('sequence length:')
print(count_dfml.describe())

# =====================================================================================
# split the data
train_ml = df1[df1['user_id'].isin(list(user_ml))]
test_ml = df2[df2['user_id'].isin(list(user_ml))]

print("========================================================")
print("============== split the data ==========================")
print("#train actions: %d" % train_ml.shape[0])
print("#test actions: %d" % test_ml.shape[0])
print("#split rate: %.4f " % (test_ml.shape[0]/(train_ml.shape[0]+test_ml.shape[0])))

# =====================================================================================
# calculate the length of the sequence
def cal_len(df):
    return len(df)

a = train_ml.groupby('user_id').apply(cal_len)
a = a.reset_index()
a.columns = ['user_id', 'train_len']

b = test_ml.groupby('user_id').apply(cal_len)
b = b.reset_index()
b.columns = ['user_id', 'test_len']

res = pd.merge(a, b, how='inner', on=['user_id'])
# =====================================================================================
# filter the data --> the length of the sequence >= 5
user_l = list(user_ml -user_m)
user_seq5 = list(res[res.train_len>=5].user_id)
user_seq5 = user_l + user_seq5
df_seq5 = data[data.user_id.isin(list(user_seq5))]

# ================================================================================
# split data into test set, valid set and train set

df_train = df_seq5[df_seq5.times<=div_line]
df_test = df_seq5[df_seq5.times>div_line]


train = df_train[df_train['user_id'].isin(list(user_seq5))]
train['flag'] = list(['train']*train.shape[0])

test_valid = df_test[df_test['user_id'].isin(list(user_seq5))]

# split the test dataset
def cal_valid(df):
    return df.head(1)

test_valid = test_valid.sort_values('times')
test_valid['flag'] = test_valid.index

valid = test_valid.groupby('user_id').apply(cal_valid)

test_index = set(test_valid.flag.unique()) - set(valid.flag.unique())
test = test_valid[test_valid['flag'].isin(list(test_index))]

valid['flag'] = list(['valid']*valid.shape[0])
test['flag'] = list(['test']*test.shape[0])

# drop cold-start items in valid set and test set
valid = valid[valid.item_id.isin(train.item_id)]
test = test[test.user_id.isin(valid.user_id) & (
    test.item_id.isin(train.item_id) | test.item_id.isin(valid.item_id))]


# reset the user/item id
df_concat = pd.concat([train,test,valid],axis=0)

le = preprocessing.LabelEncoder()
df_concat['user_id'] = le.fit_transform(df_concat['user_id'])+1
df_concat['item_id'] = le.fit_transform(df_concat['item_id'])+1

train = df_concat[df_concat.flag=='train']
valid = df_concat[df_concat.flag=='valid']
test = df_concat[df_concat.flag=='test']

# print the data static
users_num = len(train.user_id.unique())
items_num = len(train.item_id.unique())
action_num = train.shape[0] + test.shape[0] + valid.shape[0]

print("========================================================")
print("=========== train_len >=5 ================")
print("#users: %d" % users_num)
print("#items: %d" % items_num)
print("#actions: %d" % action_num)
print("density: %.4f" % (action_num/items_num/users_num))
print("#train actions: %d" % train.shape[0])
print("#test&valid users: %d" % len(test_valid.user_id.unique()))
print("#test&valid actions: %d" % (test.shape[0]+valid.shape[0]))
print("#split rate: %.4f " % ((test.shape[0]+valid.shape[0])/(train.shape[0]+ \
                             (test.shape[0]+valid.shape[0]))))
print("#valid actions  %d " % (valid.shape[0]))
print("#test acitons  %d " % ( test.shape[0]))
print("#valid : test  %.4f " % (valid.shape[0]/(test.shape[0]+valid.shape[0])))


# =====================================================================================
# save the item_seq to txt
video_games_train = train.loc[:,['user_id','item_id']]
#video_games_train.to_csv(data_directory+category_name+'_train.txt',sep=' ', index=False,header=False)

video_games_test = test.loc[:,['user_id','item_id']]
#video_games_test.to_csv(data_directory+category_name+'_test.txt',sep=' ', index=False,header=False)

video_games_valid = valid.loc[:,['user_id','item_id']]
#video_games_valid.to_csv(data_directory+category_name+'_valid.txt',sep=' ', index=False,header=False)

# =====================================================================================
# processing the timestamp
def PreprocessData_Games(df):
    # for col in ("user", "item"):
    #     df[col] = df[col].astype(np.int32)
    df['ts'] = pd.to_datetime(df['ts'],unit='s')
    df = df.sort_values(by=['ts'])
    df['year'], df['month'], df['day'], df['dayofweek'], df['dayofyear'] , df['week'] = zip(*df['ts'].map(lambda x: [x.year,x.month,x.day,x.dayofweek,x.dayofyear,x.week]))
    df['year']-=df['year'].min()
    df['year']/=df['year'].max()
    df['month']/=12
    df['day']/=31
    df['dayofweek']/=7
    df['dayofyear']/=365
    df['week']/=4

    df.fillna(0,inplace=True)

    DATEINFO = {}
    UsersDict = {}
    for index, row in df.iterrows() :
      userid = int(row['user'])
      itemid = int(row['item'])

      year = row['year']
      month = row['month']
      day = row['day']
      dayofweek = row['dayofweek']
      dayofyear = row['dayofyear']
      week = row['week']
      DATEINFO[(userid,itemid)] = [year, month, day, dayofweek, dayofyear, week]

    return df, DATEINFO

df_concat = pd.concat([train,test,valid],axis=0)
cxt = df_concat.loc[:,['user_id','item_id','times']]
cxt.rename(columns={'user_id':'user','item_id':'item','times':'ts'},inplace=True)

df_time,DATEINFO = PreprocessData_Games(cxt)


#users: 1421
#items: 709
#actions: 10166
density: 0.0101
sequence length:
count    1421.000000
mean        7.154117
std         3.609382
min         5.000000
25%         5.000000
50%         6.000000
75%         8.000000
max        52.000000
Name: item_id, dtype: float64
#users: 1354
#items: 709
#actions: 9736
density: 0.0101
sequence length:
count    1354.000000
mean        7.190547
std         3.640216
min         5.000000
25%         5.000000
50%         6.000000
75%         8.000000
max        52.000000
Name: item_id, dtype: float64
#train actions: 8137
#test actions: 1599
#split rate: 0.1642 
#users: 951
#items: 635
#actions: 7038
density: 0.0117
#train actions: 6855
#test&valid users: 355
#test&valid actions: 183
#split rate: 0.0260 
#valid actions  128 
#test acitons  55 
#valid : test  0.6995 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['flag'] = list(['test']*test.shape[0])


In [3]:


# pick the negetive item
sample_pop = False
sample_num = 100
sr_user2items = df_concat.groupby(['user_id']).item_id.unique()
df_negative = pd.DataFrame({'user_id': df_concat.user_id.unique()})

# sample according to popularity
if sample_pop == True:
    sr_item2pop = df_seq5.item_id.value_counts(sort=True, ascending=False)
    arr_item = sr_item2pop.index.values
    arr_pop = sr_item2pop.values

    def get_negative_sample(pos):
        neg_idx = ~np.in1d(arr_item, pos)
        neg_item = arr_item[neg_idx]
        neg_pop = arr_pop[neg_idx]
        neg_pop = neg_pop / neg_pop.sum()

        return np.random.choice(neg_item, size=sample_num, replace=False, p=neg_pop)

    arr_sample = df_negative.user_id.apply(
        lambda x: get_negative_sample(sr_user2items[x])).values

# sample uniformly
else:
    arr_item = df_concat.item_id.unique()
    arr_sample = df_negative.user_id.apply(
        lambda x: np.random.choice(
            arr_item[~np.in1d(arr_item, sr_user2items[x])], size=sample_num, replace=False)).values

# output negative data
df_negative = pd.concat([df_negative, pd.DataFrame(list(arr_sample))], axis='columns')


In [4]:
df_concat

Unnamed: 0,user_id,item_id,ratings,reviews,times,flag
144,894,13,4.0,"I got tired of the Win98 crashes, so decided t...",966988800,train
143,803,13,1.0,It is worst piece of crap I ever had to instal...,977443200,train
155,188,15,5.0,"Finally! A really fun, exciting, game for girl...",978566400,train
141,188,12,5.0,This is the second game in the Nancy Drew soft...,980294400,train
127,188,11,4.0,I have now played all 3 of the Nancy Drew myst...,980985600,train
...,...,...,...,...,...,...
"(AVU1ILDDYW301, 11366)",918,626,5.0,This listing is for a few different versions. ...,1448755200,valid
"(AVXGN90BU95P8, 11587)",919,633,5.0,I've tried 2 other top selling courses that ar...,1445644800,valid
"(AW42V7SDGU8LR, 11004)",924,618,5.0,Why get anything else?,1483315200,valid
"(AX1EFGUXGMSQG, 8754)",930,493,5.0,Highly effective AV,1449100800,valid


In [5]:
train_df = df_concat[df_concat['flag'] == 'train']

# 按用户分组，收集 itemid 列表（如果要去重可加 .unique()，如果要保持顺序建议不加）
user_train = train_df.groupby('user_id')['item_id'].apply(list).to_dict()

In [6]:
test_df = df_concat[df_concat['flag'].isin(['test'])]

# 创建 userid 到 itemid 的映射字典
user_test = dict(zip(test_df['user_id'], test_df['item_id']))

In [7]:
valid_df = df_concat[df_concat['flag'].isin(['valid'])]

# 创建 userid 到 itemid 的映射字典
user_valid = dict(zip(valid_df['user_id'], valid_df['item_id']))

In [8]:
cxt = DATEINFO

In [9]:
user_negative = df_negative.set_index('user_id').apply(lambda row: row.tolist(), axis=1).to_dict()


In [10]:
import json
with open(data_directory+'user_train.json', 'w') as f:
    json.dump(user_train, f)
with open(data_directory+'user_test.json', 'w') as f:
    json.dump(user_test, f)
with open(data_directory+'user_valid.json', 'w') as f:
    json.dump(user_valid, f)
with open(data_directory+'user_negative.json', 'w') as f:
    json.dump(user_negative, f)

In [11]:
import pickle
with open(data_directory+'cxtdict.pkl', 'wb') as f:
    pickle.dump(cxt, f)

In [12]:
df_concat

Unnamed: 0,user_id,item_id,ratings,reviews,times,flag
144,894,13,4.0,"I got tired of the Win98 crashes, so decided t...",966988800,train
143,803,13,1.0,It is worst piece of crap I ever had to instal...,977443200,train
155,188,15,5.0,"Finally! A really fun, exciting, game for girl...",978566400,train
141,188,12,5.0,This is the second game in the Nancy Drew soft...,980294400,train
127,188,11,4.0,I have now played all 3 of the Nancy Drew myst...,980985600,train
...,...,...,...,...,...,...
"(AVU1ILDDYW301, 11366)",918,626,5.0,This listing is for a few different versions. ...,1448755200,valid
"(AVXGN90BU95P8, 11587)",919,633,5.0,I've tried 2 other top selling courses that ar...,1445644800,valid
"(AW42V7SDGU8LR, 11004)",924,618,5.0,Why get anything else?,1483315200,valid
"(AX1EFGUXGMSQG, 8754)",930,493,5.0,Highly effective AV,1449100800,valid


In [13]:
import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertModel
import numpy as np
from tqdm import tqdm

# 配置
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
batch_size = 8  # 根据显存情况调整

# 模型和分词器
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)
model.eval()

# 准备数据
texts = df_concat['reviews'].tolist()

# 批量处理函数
def encode_batch(batch_texts):
    return tokenizer(
        batch_texts,
        padding=True,
        truncation=True,
        return_tensors='pt',
        max_length=512
    )

# 用于存储所有 embedding
all_embeddings = []

# 分批处理文本
for i in tqdm(range(0, len(texts), batch_size)):
    batch_texts = texts[i:i+batch_size]
    inputs = encode_batch(batch_texts)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    # 获取每个样本的 [CLS] token 的 embedding
    batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
    all_embeddings.extend(batch_embeddings)

# 转为 DataFrame 列
df_concat['embedding'] = all_embeddings


  from .autonotebook import tqdm as notebook_tqdm
  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|██████████| 880/880 [01:45<00:00,  8.36it/s]


In [14]:
user_item_embedding = {(row['user_id'], row['item_id']): row['embedding'] for _, row in df_concat.iterrows()}
import pickle
with open(data_directory+'/ItemFeatures.pkl', 'wb') as f:
    pickle.dump(user_item_embedding, f)

In [15]:
def average_embedding(group):
    # 计算每个item的平均embedding
    avg_embedding = np.mean(np.stack(group['embedding']), axis=0)
    return pd.Series({'avg_embedding': avg_embedding})

# 按照itemid分组并计算平均embedding
item_avg_embeddings = df_concat.groupby('item_id').apply(average_embedding)

In [16]:
item_embedding = dict(zip(item_avg_embeddings.index, item_avg_embeddings['avg_embedding'].tolist()))

In [17]:
item_embedding = {k: v.tolist() if isinstance(v, np.ndarray) else v for k, v in item_embedding.items()}

# 保存为 JSON
with open(data_directory + '/itemFeat.json', 'w') as f:
    json.dump(item_embedding, f)