In [1]:
from collections import defaultdict
import os
import torch
import random
import numpy as np
import pandas as pd
import json
import pickle
import gzip
from tqdm import tqdm

def load_pickle(filename):
    """Read the pickle file at `filename` and return the deserialized object.

    Only use this on files you trust: unpickling can execute arbitrary code.
    """
    with open(filename, "rb") as handle:
        obj = pickle.load(handle)
    return obj

def save_pickle(data, filename):
    """Serialize `data` into the file `filename` with the highest pickle protocol.

    An existing file at `filename` is overwritten.
    """
    protocol = pickle.HIGHEST_PROTOCOL
    with open(filename, "wb") as handle:
        pickle.dump(data, handle, protocol=protocol)

def load_json(file_path):
    """Parse the JSON document stored at `file_path` and return the resulting object.

    The file is decoded as UTF-8 explicitly so that the result does not depend
    on the platform's default text encoding.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)
    
def ReadLineFromFile(path):
    """Return the lines of the text file at `path` as a list of strings.

    Trailing newline characters are removed from every line; any other
    whitespace is kept.
    """
    with open(path, 'r') as source:
        return [raw_line.rstrip('\n') for raw_line in source]

def parse(path):
    """Yield one decoded record per line of the gzip-compressed file at `path`.

    Each line is first interpreted as a Python literal (this covers dumps
    written as Python dict reprs, e.g. with single-quoted strings); if that
    fails, the line is parsed as JSON instead.

    Args:
        path: Path of the gzip file to read.

    Yields:
        The object decoded from each line, in file order.

    Raises:
        ValueError: If a line is neither a Python literal nor valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    # Local import: `ast` is not among the notebook's top-level imports.
    import ast

    # The context manager closes the gzip handle when the generator is
    # exhausted, closed early, or garbage-collected.
    with gzip.open(path, 'rb') as g:
        for l in g:
            try:
                # literal_eval only accepts literals, so a crafted line cannot
                # run arbitrary code the way eval() would.
                record = ast.literal_eval(l.decode('utf-8'))
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                record = json.loads(l)
            # Yield outside the try block so that closing the generator
            # (GeneratorExit raised at the yield) is never swallowed.
            yield record
        
# Set seeds: fix the RNG state of every library used in this notebook so that
# repeated runs give the same results.
seed = 2022
random.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)

In [2]:
def Amazon2014(dataset_name, min_rating_score, data_core_num=None):
    '''
    Load (user, item, timestamp) interactions from an Amazon 2014 review dump.

    The reviews are read from ./{dataset_name}/raw/reviews_{dataset_name}.json.gz,
    or from the "_{data_core_num}" variant of that file when data_core_num is given.
    Reviews whose rating is less than or equal to min_rating_score are skipped.

    Fields of a raw review record (those marked * are used here):
      * reviewerID     - ID of the reviewer, e.g. A2SUAM1J3GNN3B
      * asin           - ID of the product, e.g. 0000013714
        reviewerName   - name of the reviewer
        helpful        - helpfulness rating of the review, e.g. [2, 3]
        reviewText     - text of the review
      * overall        - rating of the product, e.g. 5.0
        summary        - summary of the review
      * unixReviewTime - time of the review (unix time), e.g. 1252800000
        reviewTime     - time of the review (raw), e.g. "09 13, 2009"

    Returns:
        A list of (reviewerID, asin, int(unixReviewTime)) tuples in file order.
    '''
    if data_core_num is None:
        data_file = f"./{dataset_name}/raw/reviews_{dataset_name}.json.gz"
    else:
        data_file = f"./{dataset_name}/raw/reviews_{dataset_name}_{data_core_num}.json.gz"

    # Keep a review unless its rating is at or below the threshold.
    return [
        (record['reviewerID'], record['asin'], int(record['unixReviewTime']))
        for record in parse(data_file)
        if not float(record['overall']) <= min_rating_score
    ]

def Amazon2014_item_meta(dataset_name, data_maps):
    '''
    Collect the raw metadata record of every item that survived preprocessing.

    Metadata is read from ./{dataset_name}/raw/meta_{dataset_name}.json.gz and
    only records whose 'asin' is a key of data_maps['item2id'] are kept.

    Fields of a raw metadata record:
      asin        - ID of the product, e.g. "0000031852"
      title       - name of the product, e.g. "Girls Ballet Tutu Zebra Hot Pink"
      description - description of the product
      price       - price in US dollars (at time of crawl), e.g. 3.17
      imUrl       - url of the product image (str)
      related     - related products, e.g.
                    {"also_bought": [...], "also_viewed": [...], "bought_together": [...]}
      salesRank   - sales rank information, e.g. {"Toys & Games": 211836}
      brand       - brand name, e.g. "Coxlures"
      categories  - list of categories the product belongs to,
                    e.g. [["Sports & Outdoors", "Other Sports", "Dance"]]

    Args:
        dataset_name: Name used to locate the raw metadata file.
        data_maps: Mapping holding an 'item2id' dict keyed by raw asin.

    Returns:
        Dict mapping asin -> full metadata record. If an asin occurs several
        times in the file, the last record wins.
    '''
    meta_file = f"./{dataset_name}/raw/meta_{dataset_name}.json.gz"
    # Test membership against the dict itself (hash lookup) instead of a list
    # of its keys, which would cost a linear scan for every metadata record.
    wanted_asins = data_maps['item2id']
    datas = {}
    for info in parse(meta_file):
        asin = info['asin']
        if asin in wanted_asins:
            datas[asin] = info
    return datas

def Amazon2014_user_meta(dataset_name, data_maps):
    """Placeholder for Amazon 2014 user metadata.

    Both arguments are ignored and the function always returns None.
    """
    return None



def Yelp(date_min, date_max, rating_score):
    """Load (user, item, timestamp) interactions from the Yelp review dump.

    Reads ./Yelp/raw/yelp_academic_dataset_review.json, one JSON object per line.

    Args:
        date_min: Lower bound for the review 'date'; earlier reviews are skipped.
        date_max: Upper bound for the review 'date'; later reviews are skipped.
        rating_score: Reviews whose 'stars' is <= this value are skipped.

    The date bounds are compared with the raw 'date' value using < and >, so
    they must be in the same format as the file's dates
    (e.g. '2019-01-01 00:00:00' for lexicographic string comparison).

    Returns:
        A list of (user_id, business_id, time) tuples in file order, where
        time is the date with '-', ':' and ' ' removed, converted to int
        (2004-10-12 10:13:32 -> 20041012101332).
    """
    data_file = './Yelp/raw/yelp_academic_dataset_review.json'
    datas = []
    # Stream the file line by line inside a context manager: the handle is
    # always closed and the whole dump never has to fit in memory. The progress
    # bar therefore shows a running count without a total.
    with open(data_file, encoding='utf-8') as f:
        for line in tqdm(f):
            review = json.loads(line.strip())
            user = review['user_id']
            item = review['business_id']
            rating = review['stars']
            date = review['date']
            if date < date_min or date > date_max or float(rating) <= rating_score:
                continue

            # Turn the date into a directly comparable integer.
            time = date.replace('-','').replace(':','').replace(' ','')
            datas.append((user, item, int(time)))
    return datas

def Yelp_item_meta(datamaps):
    """Collect the raw Yelp business record of every item kept by preprocessing.

    Reads ./Yelp/raw/yelp_academic_dataset_business.json, one JSON object per
    line, and keeps the records whose 'business_id' is a key of
    datamaps['item2id'].

    Args:
        datamaps: Mapping holding an 'item2id' dict keyed by raw business id.

    Returns:
        Dict mapping business_id -> full business record.
    """
    meta_file = './Yelp/raw/yelp_academic_dataset_business.json'
    # Hash lookup on the dict instead of a linear scan over a list of its keys.
    wanted_ids = datamaps['item2id']
    meta_infos = {}
    # Stream inside a context manager so the handle is closed and the file is
    # not loaded into memory at once (the progress bar shows a count only).
    with open(meta_file, encoding='utf-8') as f:
        for line in tqdm(f):
            info = json.loads(line)
            business_id = info['business_id']
            if business_id in wanted_ids:
                meta_infos[business_id] = info
    return meta_infos

def Yelp_user_meta(data_maps):
    """Collect the raw Yelp user record of every user kept by preprocessing.

    Reads ./Yelp/raw/yelp_academic_dataset_user.json, one JSON object per line,
    and keeps the records whose 'user_id' is a key of data_maps['user2id'].

    Args:
        data_maps: Mapping holding a 'user2id' dict keyed by raw user id.

    Returns:
        Dict mapping user_id -> full user record.
    """
    meta_file = './Yelp/raw/yelp_academic_dataset_user.json'
    # Hash lookup on the dict instead of a linear scan over a list of its keys.
    wanted_ids = data_maps['user2id']
    meta_infos = {}
    # Stream inside a context manager so the handle is closed and the file is
    # not loaded into memory at once (the progress bar shows a count only).
    with open(meta_file, encoding='utf-8') as f:
        for line in tqdm(f):
            info = json.loads(line)
            user_id = info['user_id']
            if user_id in wanted_ids:
                meta_infos[user_id] = info
    return meta_infos

# Build the per-user interaction dictionary
def get_interaction(datas):
    """Group (user, item, time) triples into time-ordered item lists per user.

    Args:
        datas: Iterable of (user, item, time) triples.

    Returns:
        Dict mapping each user to the list of their items sorted by ascending
        time. Items with equal time keep their input order, and users appear
        in order of first occurrence.
    """
    events_by_user = {}
    for user, item, time in datas:
        events_by_user.setdefault(user, []).append((item, time))

    # Order every user's events chronologically and drop the timestamps.
    return {
        user: [item for item, _ in sorted(events, key=lambda event: event[1])]
        for user, events in events_by_user.items()
    }

# Check whether the interactions satisfy the K-core condition
def check_Kcore(user_items, user_core, item_core):
    """Count interactions per user and per item and test the K-core condition.

    Args:
        user_items: Dict mapping user -> list of items.
        user_core: Minimum number of interactions required per user.
        item_core: Minimum number of interactions required per item.

    Returns:
        (user_count, item_count, is_kcore): two defaultdict(int) counters and a
        bool that is True when no counted user is below user_core and no
        counted item is below item_core. Users whose item list is empty never
        enter user_count and are therefore not checked.
    """
    user_count, item_count = defaultdict(int), defaultdict(int)
    for user, items in user_items.items():
        for item in items:
            user_count[user] += 1
            item_count[item] += 1

    user_violation = any(num < user_core for num in user_count.values())
    item_violation = any(num < item_core for num in item_count.values())
    return user_count, item_count, not (user_violation or item_violation)

# K-core filtering
def filter_Kcore(user_items, user_core, item_core):
    """Repeatedly prune users and items until the K-core condition holds.

    In every pass, users with fewer than user_core interactions are removed and
    the remaining users lose every item that had fewer than item_core
    interactions before the pass. Passes repeat until nothing violates the
    thresholds.

    Args:
        user_items: Dict mapping user -> list of items. It is modified in place.
        user_core: Minimum number of interactions required per user.
        item_core: Minimum number of interactions required per item.

    Returns:
        The same (pruned) user_items dict.
    """
    user_count, item_count, isKcore = check_Kcore(user_items, user_core, item_core)
    while True:
        # Measure users by the length of their list rather than by user_count:
        # a user whose items were all filtered out has an empty list, is
        # invisible to check_Kcore, and would otherwise survive in the result.
        short_users = [user for user, items in user_items.items() if len(items) < user_core]
        if isKcore and not short_users:
            break
        for user in short_users:  # drop the user entirely
            user_items.pop(user)
        for user, items in user_items.items():
            user_items[user] = [item for item in items if item_count[item] >= item_core]
        user_count, item_count, isKcore = check_Kcore(user_items, user_core, item_core)
    return user_items

# Remap raw ids to consecutive ids
def id_map(user_items): # user_items dict
    """Replace raw user/item identifiers with consecutive string ids.

    Users are visited in a random order (random.shuffle, so the result depends
    on the state of the `random` module). Users and items receive the ids
    "0", "1", ... in order of first appearance during that traversal.

    Args:
        user_items: Dict mapping raw user -> list of raw items.

    Returns:
        (final_data, user_num, item_num, data_maps) where final_data maps each
        new user id to the list of new item ids, user_num / item_num are the
        numbers of ids assigned, and data_maps holds the four lookup tables
        'user2id', 'item2id', 'id2user' and 'id2item'.
    """
    user2id, id2user = {}, {}  # raw -> uid, uid -> raw
    item2id, id2item = {}, {}  # raw -> iid, iid -> raw
    final_data = {}

    shuffled_users = list(user_items.keys())
    random.shuffle(shuffled_users)

    for raw_user in shuffled_users:
        if raw_user not in user2id:
            new_uid = str(len(user2id))
            user2id[raw_user] = new_uid
            id2user[new_uid] = raw_user

        mapped_items = []
        for raw_item in user_items[raw_user]:
            if raw_item not in item2id:
                new_iid = str(len(item2id))
                item2id[raw_item] = new_iid
                id2item[new_iid] = raw_item
            mapped_items.append(item2id[raw_item])

        final_data[user2id[raw_user]] = mapped_items

    data_maps = {
        'user2id': user2id,
        'item2id': item2id,
        'id2user': id2user,
        'id2item': id2item
    }
    return final_data, len(user2id), len(item2id), data_maps

In [None]:
from typing import Literal


def main(dataset_name, data_type:Literal["Amazon2014", "Yelp"]="Amazon2014",
         min_rating_score=0.0, user_core=5, item_core=5):
    """Run the preprocessing pipeline for one dataset and save the results as JSON.

    Steps: load the raw interactions, build time-ordered item sequences per
    user, apply K-core filtering, remap raw ids to consecutive string ids,
    print dataset statistics, collect item/user metadata, then write the
    interaction sequences and an item title/description table.

    Args:
        dataset_name: Dataset identifier. Only "Beauty2014" and "Yelp" are
            supported by the item export. For Amazon data it also selects the
            raw files under ./{dataset_name}/raw/.
        data_type: Which loader family to use, "Amazon2014" or "Yelp".
        min_rating_score: Interactions rated less than or equal to this value
            are dropped.
        user_core: Minimum number of interactions every kept user must have.
        item_core: Minimum number of interactions every kept item must have.

    Raises:
        ValueError: If data_type or dataset_name is not supported. This is
            checked before any data is read.

    Side effects:
        Writes {data_type}/{data_type}.inter.json and
        {data_type}/{data_type}.item.json (the directory is created if needed)
        and prints progress and statistics.
    """
    # Fail fast: an unsupported name would otherwise only surface after the
    # whole pipeline has run and the interaction file has been written.
    if data_type not in ("Amazon2014", "Yelp"):
        raise ValueError(f"Data type {data_type} is not supported!")
    if dataset_name not in ("Beauty2014", "Yelp"):
        raise ValueError(f"Dataset {dataset_name} is not supported!")

    # Read the raw data
    if data_type == 'Yelp':
        date_max = '2019-12-31 00:00:00'
        date_min = '2019-01-01 00:00:00'
        datas = Yelp(date_min, date_max, min_rating_score)
    else:
        datas = Amazon2014(dataset_name, min_rating_score=min_rating_score, data_core_num=5)

    # Build the per-user interaction sequences
    user_items = get_interaction(datas)
    print(f'{dataset_name} Raw data has been processed! Lower than {min_rating_score} are deleted!')
    
    # Apply K-core filtering
    user_items = filter_Kcore(user_items, user_core=user_core, item_core=item_core)
    print(f'User {user_core}-core complete! Item {item_core}-core complete!')

    # Remap raw ids to consecutive ids
    user_items, user_num, item_num, data_maps = id_map(user_items)

    # Basic dataset statistics: number of users, items, interactions, sparsity, ...
    user_count, item_count, _ = check_Kcore(user_items, user_core=user_core, item_core=item_core)
    user_count_list = list(user_count.values())
    user_avg, user_min, user_max = np.mean(user_count_list), np.min(user_count_list), np.max(user_count_list)
    user_std = np.std(user_count_list)
    item_count_list = list(item_count.values())
    item_avg, item_min, item_max = np.mean(item_count_list), np.min(item_count_list), np.max(item_count_list)
    item_std = np.std(item_count_list)
    interact_num = np.sum(user_count_list)
    sparsity = (1 - interact_num / (user_num * item_num)) * 100
    show_info = f'Total User: {user_num}, Avg User: {user_avg:.4f}, Std User: {user_std: .4f}, Min Len: {user_min}, Max Len: {user_max}\n' + \
                f'Total Item: {item_num}, Avg Item: {item_avg:.4f}, Std Item: {item_std: .4f}, Min Inter: {item_min}, Max Inter: {item_max}\n' + \
                f'Iteraction Num: {interact_num}, Sparsity: {sparsity:.2f}%'
    print(show_info)


    print('\nBegin extracting items meta infos...')
    if data_type == 'Amazon2014':
        item_meta_infos = Amazon2014_item_meta(dataset_name, data_maps)
        # attribute_num, avg_attribute, datamaps, item2attributes = get_attribute_Amazon(meta_infos, data_maps, attribute_core)
    else:
        item_meta_infos = Yelp_item_meta(data_maps)
        # attribute_num, avg_attribute, datamaps, item2attributes = get_attribute_Yelp(meta_infos, data_maps, attribute_core)
    print('Items meta infos extracted complete!')


    print('\nBegin extracting users meta infos...')
    # NOTE(review): user_meta_infos is only consumed by the disabled user
    # export at the end of this function.
    if data_type == 'Amazon2014':
        user_meta_infos = Amazon2014_user_meta(dataset_name, data_maps)
    else:
        user_meta_infos = Yelp_user_meta(data_maps)
    print('Users meta infos extracted complete!')


    print('\nBegin saving data...')
    # -------------- Save Data ---------------
    # NOTE(review): the output files are named after data_type, not
    # dataset_name, so different datasets of the same type would share one
    # output location. Kept as-is; confirm this is intended.
    os.makedirs(f'{data_type}', exist_ok=True)

    # Save the interaction data. UTF-8 is set explicitly because
    # ensure_ascii=False writes non-ASCII characters verbatim.
    with open(f'{data_type}/{data_type}.inter.json', 'w', encoding='utf-8') as f:
        json.dump(user_items, f, ensure_ascii=False, indent=4)

    # Save the item data
    items = {}
    missing_meta = 0
    for iid, item in data_maps['id2item'].items():
        # Items without a metadata record get the same placeholder values that
        # are used for missing individual fields.
        info = item_meta_infos.get(item)
        if info is None:
            missing_meta += 1
            info = {}
        if dataset_name == "Beauty2014":
            title = info.get('title', f"title_{iid}")
            description = info.get('description', f"description_{iid}")
        else:  # "Yelp" (dataset_name was validated at the top)
            title = info.get('name', f"title_{iid}")
            description = info.get('categories', f"description_{iid}")
        items[iid] = {
            "title": title,
            "description": description
        }
    if missing_meta:
        print(f'Warning: {missing_meta} items have no metadata record; placeholders were written.')
    with open(f'{data_type}/{data_type}.item.json', 'w', encoding='utf-8') as f:
        json.dump(items, f, ensure_ascii=False, indent=4)
    
    # # Save the user data (disabled)
    # user_file = f'{dataset_name}/{dataset_name}.user.csv'
    # if dataset_name == "Beauty2014":
    #     users = []
    #     for uid, user in data_maps['id2user'].items():
    #         users.append([uid])
    #     user_df = pd.DataFrame(users, columns=['user_id'])
    #     user_df.to_csv(user_file, index=False, sep=',', quoting=1, encoding='utf-8')
    # elif dataset_name == "Yelp":
    #     users = []
    #     for uid, user in data_maps['id2user'].items():
    #         info = user_meta_infos[user]
    #         name = info.get('name', f"name_{uid}")
    #         users.append([uid, name])
    #     user_df = pd.DataFrame(users, columns=['user_id', 'name'])
    #     user_df.to_csv(user_file, index=False, sep=',', quoting=1, encoding='utf-8')

    print('Data saved complete!')

In [4]:
# Preprocess the Amazon 2014 "Beauty2014" dataset.
dataset_name = "Beauty2014"
main(dataset_name=dataset_name, data_type="Amazon2014")

Beauty2014 Raw data has been processed! Lower than 0.0 are deleted!
User 5-core complete! Item 5-core complete!
Total User: 22363, Avg User: 8.8764, Std User:  8.1636, Min Len: 5, Max Len: 204
Total Item: 12101, Avg Item: 16.4038, Std Item:  23.6090, Min Inter: 5, Max Inter: 431
Iteraction Num: 198502, Sparsity: 99.93%

Begin extracting items meta infos...
Items meta infos extracted complete!

Begin extracting users meta infos...
Users meta infos extracted complete!

Begin saving data...
Data saved complete!
