In [1]:
import json
import os
from collections import defaultdict
from datetime import datetime

import numpy as np
import pandas as pd

In [2]:
# Load the raw Yelp dumps. Each file is JSON Lines: one JSON object per line.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform
# default, and lines are parsed while streaming so the raw text of the whole
# file is never held in memory alongside the parsed records.
# NOTE: the variable name `buisness_data` (sic) is kept as-is because other
# cells may refer to it.
with open("../raw/Yelp/yelp_academic_dataset_business.json", encoding="utf-8") as f:
    # Blank lines (e.g. a trailing newline at end of file) are skipped instead
    # of crashing json.loads.
    buisness_data = [json.loads(line) for line in f if line.strip()]

with open("../raw/Yelp/yelp_academic_dataset_review.json", encoding="utf-8") as f:
    inter_data = [json.loads(line) for line in f if line.strip()]

In [3]:
# Flatten the raw review records into an interaction table with the columns
# user_id, business_id, rating and timestamp (float seconds since the epoch).
inter_df = pd.DataFrame(
    {
        'user_id': [review['user_id'] for review in inter_data],
        'business_id': [review['business_id'] for review in inter_data],
        'rating': [review['stars'] for review in inter_data],
    }
)

# Parse every date string in one vectorized call (raises on a malformed date,
# like strptime would).
review_time = pd.to_datetime(
    pd.Series([review['date'] for review in inter_data], dtype='object'),
    format='%Y-%m-%d %H:%M:%S',
)

# Convert to epoch seconds treating the wall-clock value as UTC.
# datetime.timestamp() on a naive datetime would interpret it in the machine's
# local time zone, while the split cell decodes this column with
# pd.to_datetime(..., unit='s'), i.e. as UTC. Mixing the two shifts every
# timestamp by the local UTC offset and makes the result depend on the host.
inter_df['timestamp'] = (review_time - pd.Timestamp('1970-01-01')) / pd.Timedelta(seconds=1)

In [6]:
# Keep only positive feedback: interactions rated strictly above 3 stars.
inter_df = inter_df[inter_df['rating'].gt(3)]

In [7]:
# Count the distinct users, distinct businesses and total interactions.
user_count, business_count = (
    inter_df[id_column].nunique() for id_column in ('user_id', 'business_id')
)
interaction_count = len(inter_df)
print(
    'User: %d, Business: %d, Interaction: %d'
    % (user_count, business_count, interaction_count)
)

User: 1464850, Business: 147491, Interaction: 4684545


In [8]:
# K-core filtering: repeatedly drop users and businesses with fewer than K
# interactions until every remaining user and business has at least K.
# Removing one side can push the other side below K, hence the loop.
K = 20
while True:
    user_count = inter_df['user_id'].value_counts()
    item_count = inter_df['business_id'].value_counts()
    users_to_remove = user_count.loc[user_count.lt(K)].index
    items_to_remove = item_count.loc[item_count.lt(K)].index

    # Fixed point reached: nothing left to prune.
    if users_to_remove.empty and items_to_remove.empty:
        break

    # A row is dropped if either its user or its business is too sparse.
    is_sparse_row = (
        inter_df['user_id'].isin(users_to_remove)
        | inter_df['business_id'].isin(items_to_remove)
    )
    inter_df = inter_df[~is_sparse_row].reset_index(drop=True)

In [9]:
# Count the distinct users, distinct businesses and total interactions
# that remain in inter_df at this point.
user_count = inter_df['user_id'].nunique()
business_count = inter_df['business_id'].nunique()
interaction_count = len(inter_df)
count_summary = (user_count, business_count, interaction_count)
print('User: %d, Business: %d, Interaction: %d' % count_summary)

User: 13814, Business: 11484, Interaction: 624159


In [18]:
# Add a `history` column: for every interaction, the space-joined business_ids
# the same user interacted with earlier (in timestamp order). A user's first
# interaction gets the empty string.
def _running_history(business_ids):
    """Return, per position in the group, the space-joined ids that precede it."""
    ordered_ids = business_ids.values
    return [' '.join(ordered_ids[:end]) for end in range(len(ordered_ids))]


inter_df = inter_df.sort_values(by=['user_id', 'timestamp'])
inter_df['history'] = (
    inter_df.groupby('user_id')['business_id'].transform(_running_history)
)

In [20]:
# Turn the epoch-second column into pandas datetimes and find the covered range.
event_time = pd.to_datetime(inter_df['timestamp'], unit='s')
inter_df['timestamp'] = event_time
start_time = event_time.min()
end_time = event_time.max()

# Cut-offs at 80% and 90% of the elapsed time span (not of the row count).
time_span = end_time - start_time
time_80 = start_time + time_span * 0.8
time_90 = start_time + time_span * 0.9

# Train: up to time_80; valid: (time_80, time_90]; test: after time_90.
is_train = event_time.le(time_80)
is_valid = event_time.gt(time_80) & event_time.le(time_90)
is_test = event_time.gt(time_90)
train_df = inter_df[is_train]
valid_df = inter_df[is_valid]
test_df = inter_df[is_test]

# Report the number of interactions that landed in each split.
train_interaction_count = len(train_df)
valid_interaction_count = len(valid_df)
test_interaction_count = len(test_df)
print(
    'Train interaction: %d, Valid interaction: %d, Test interaction: %d'
    % (train_interaction_count, valid_interaction_count, test_interaction_count)
)

Train interaction: 447161, Valid interaction: 110176, Test interaction: 66822


In [30]:
# Show the two timestamp cut-offs used for the train/valid/test split.
for cutoff_label, cutoff in (("time_80: ", time_80), ("time_90: ", time_90)):
    print(cutoff_label, cutoff)

time_80:  2018-09-04 08:16:19.200000
time_90:  2020-05-12 22:02:32.100000


In [21]:
# Drop training rows with an empty history.
train_df = train_df[train_df['history'].ne('')]
# Drop training rows whose history lists fewer than 5 businesses.
train_has_enough_history = train_df['history'].apply(
    lambda history: len(history.split()) >= 5
)
train_df = train_df[train_has_enough_history]

In [23]:
# Drop validation rows with an empty history.
valid_df = valid_df[valid_df['history'].ne('')]
# Drop validation rows whose history lists fewer than 5 businesses.
valid_has_enough_history = valid_df['history'].apply(
    lambda history: len(history.split()) >= 5
)
valid_df = valid_df[valid_has_enough_history]

# Collect every business_id that appears in the training split, either as a
# target business or inside a history string.
train_business_id_set = set(train_df['business_id'].unique()) | set(
    train_df['history'].str.split().explode().unique()
)

# Keep only validation rows whose target business and whole history are
# covered by that training vocabulary.
valid_df = valid_df[valid_df['business_id'].isin(train_business_id_set)]
valid_history_is_known = valid_df['history'].apply(
    lambda history: train_business_id_set.issuperset(history.split())
)
valid_df = valid_df[valid_history_is_known]

In [25]:
# Drop test rows with an empty history.
test_df = test_df[test_df['history'].ne('')]
# Drop test rows whose history lists fewer than 5 businesses.
test_has_enough_history = test_df['history'].apply(
    lambda history: len(history.split()) >= 5
)
test_df = test_df[test_has_enough_history]

# Keep only test rows whose target business and whole history are covered by
# train_business_id_set, the set of business_ids seen in the training split
# (it must already exist in the notebook namespace when this cell runs).
test_df = test_df[test_df['business_id'].isin(train_business_id_set)]
test_history_is_known = test_df['history'].apply(
    lambda history: train_business_id_set.issuperset(history.split())
)
test_df = test_df[test_history_is_known]

In [28]:
# Report the number of interactions left in each split after filtering.
train_interaction_count = len(train_df)
valid_interaction_count = len(valid_df)
test_interaction_count = len(test_df)
print(
    'Train interaction: %d, Valid interaction: %d, Test interaction: %d'
    % (train_interaction_count, valid_interaction_count, test_interaction_count)
)
# Mean history length (number of business_ids in `history`) per row, printed
# for the train, valid and test splits in that order.
for split_df in (train_df, valid_df, test_df):
    print(split_df['history'].apply(lambda history: len(history.split())).mean())

Train interaction: 385589, Valid interaction: 49353, Test interaction: 8865
40.98368470054903
36.974104917634186
31.7863508178229


In [29]:
# Write the three splits to CSV without the DataFrame index.
# The output directory is created first: to_csv does not create missing
# parent directories, and failing here would waste the whole preprocessing run.
os.makedirs('../processed/Yelp', exist_ok=True)
train_df.to_csv('../processed/Yelp/Yelp.train.csv', index=False)
valid_df.to_csv('../processed/Yelp/Yelp.valid.csv', index=False)
test_df.to_csv('../processed/Yelp/Yelp.test.csv', index=False)