In [1]:
import json
import numpy as np
import pandas as pd
import pickle
import copy
from tqdm import tqdm
import csv

In [None]:
# Dataset name; it is interpolated into the output CSV path
# ('../../mod_data/{dataset}/{dataset}.csv') when the renumbered data is written.
dataset = 'Yelp'

#### Convert JSON to CSV

In [2]:
file_path = './yelp_academic_dataset_review.json'
csv_path = './yelp_academic_dataset_review.csv'

# The input is a JSON-lines file (one JSON object per line). The keys of the
# first record define the CSV columns.
headers = None
with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines instead of crashing in json.loads.
            continue
        line_contents = json.loads(line)
        headers = line_contents.keys()
        break

# Fail with a clear message rather than a NameError when the file has no records.
if headers is None:
    raise ValueError(f"{file_path} contains no JSON records")
print(headers)

with open(csv_path, 'w', newline='', encoding='utf-8') as fout, \
        open(file_path, 'r', encoding='utf-8') as new_file:
    # Pass a concrete list so the writer does not hold a live view of the
    # first record's dict.
    writer = csv.DictWriter(fout, list(headers))
    writer.writeheader()
    for line in new_file:
        if not line.strip():
            continue
        line_contents = json.loads(line)
        # DictWriter raises ValueError if a record has a key that is not in
        # `headers`; missing keys are written as empty fields.
        writer.writerow(line_contents)

dict_keys(['votes', 'user_id', 'review_id', 'stars', 'date', 'text', 'type', 'business_id'])


#### Show statistics of dataset

In [8]:
# Load only the two identifier columns, selected by their CSV header names.
# Selecting by name (instead of relabelling all columns by position and then
# dropping six of them) avoids parsing the large review text column and cannot
# silently mislabel columns if the field order in the CSV differs.
rating_df = (
    pd.read_csv(
        './yelp_academic_dataset_review.csv',
        sep=',',
        usecols=['user_id', 'business_id'],
    )
    .rename(columns={'user_id': 'userId', 'business_id': 'itemId'})
    # Fix the column order explicitly: userId first, itemId second.
    .loc[:, ['userId', 'itemId']]
)

# Print the number of users, items and interactions
print("Dataset statistics: ")
print(f"> No. of users: {len(rating_df['userId'].unique())}")
print(f"> No. of items: {len(rating_df['itemId'].unique())}")
print(f"> No. of interactions: {rating_df.shape[0]}")

Dataset statistics: 
> No. of users: 366715
> No. of items: 60785
> No. of interactions: 1569264


#### Data filter

In [11]:
# Drop duplicate records: keep only the first row of each (userId, itemId) pair.
# The operation is in place, so rating_df itself is modified.
rating_df.drop_duplicates(subset =['userId', 'itemId'], keep = 'first', inplace = True)

In [14]:
# Work on an independent copy so rating_df stays untouched by the filtering.
rdf = rating_df.copy()
# Attach, to every row, how many interactions its user and its item have.
for id_col, freq_col in (('userId', 'user_freq'), ('itemId', 'item_freq')):
    rdf[freq_col] = rdf.groupby(id_col)[id_col].transform('count')
print(rdf)

                         userId                  itemId  user_freq  item_freq
0        Xqd0DzHaiyRqVH3WRG7hzg  vcNAWiLM4dR7D2nwwJ7nCA          1          9
1        H1kH6QZV7Le4zqTRNxoZow  vcNAWiLM4dR7D2nwwJ7nCA          1          9
2        zvJCcrpm2yOZrxKffwGQLA  vcNAWiLM4dR7D2nwwJ7nCA         79          9
3        KBLW4wJA_fwoWmMhiHRVOA  vcNAWiLM4dR7D2nwwJ7nCA          1          9
5        Qrs3EICADUKNFoUq2iHStA  vcNAWiLM4dR7D2nwwJ7nCA          1          9
...                         ...                     ...        ...        ...
1569259  voIs5XRJJm_q7T1fII-iZQ  6TPxhpHqFedjMvBuw6pF3w          5         19
1569260  jUNtpHz7026QIf7Al_JNYw  6TPxhpHqFedjMvBuw6pF3w          2         19
1569261  u-z4zWDTW604g_N63hXqUw  6TPxhpHqFedjMvBuw6pF3w          5         19
1569262  58Zra9meHRvfpSVXT1kzaA  6TPxhpHqFedjMvBuw6pF3w          7         19
1569263  vYhGmN_Zb1a2-lSFK9c-bA  2EKGrbf2_81MrtjKZeOTng         22          1

[1521160 rows x 4 columns]


In [15]:
# Minimum number of interactions a user / an item must have to be kept:
# the filtering loop below drops rows whose user_freq or item_freq is
# smaller than the corresponding threshold.
user_threshold = 13
item_threshold = 13

In [16]:
# Repeatedly remove users and items that have fewer interactions than their
# threshold. Dropping rows changes the counts of the remaining users/items,
# so the pass is repeated until nothing is below threshold any more.
while True:
    sparse_users = rdf['user_freq'] < user_threshold
    sparse_items = rdf['item_freq'] < item_threshold
    if not (sparse_users.any() or sparse_items.any()):
        break
    # 1) drop the rows of under-threshold users, then refresh the item counts
    rdf.drop(index=rdf.index[sparse_users], inplace=True)
    rdf['item_freq'] = rdf.groupby('itemId')['itemId'].transform('count')
    # 2) drop the rows of under-threshold items (using the refreshed counts)
    rdf.drop(index=rdf.index[rdf['item_freq'] < item_threshold], inplace=True)
    # 3) recompute both counts for the next check
    for id_col, freq_col in (('userId', 'user_freq'), ('itemId', 'item_freq')):
        rdf[freq_col] = rdf.groupby(id_col)[id_col].transform('count')

In [17]:
# Show the number of users, items and the sparsity after preprocessing
usercnt = len(rdf['userId'].unique())
itemcnt = len(rdf['itemId'].unique())
print("total user: ", usercnt)
print("total item: ", itemcnt)
# NOTE: the printed value is interactions / (users * items), i.e. the fraction
# of user-item pairs that have an interaction. Guard against an empty frame
# (everything filtered out), which would otherwise raise ZeroDivisionError.
cells = usercnt * itemcnt
print('sparsity: ' + str((len(rdf) / cells) if cells else float('nan')))
# Drop the helper frequency columns and renumber the rows from 0.
# errors='ignore' keeps this cell re-runnable: a second execution no longer
# raises KeyError because the columns are already gone.
rdf = rdf.drop(columns=['user_freq', 'item_freq'], errors='ignore').reset_index(drop=True)

total user:  13991
total item:  10437
sparsity: 0.0032039033675181775


#### Renumber users and items

In [18]:
# Map every original user / item id to a consecutive integer index, assigned
# in order of first appearance in rdf.
user_dic = dict()
item_dic = dict()

# Iterate over the two columns by name. This avoids positional lookups on a
# string-labelled row Series (row[1][0]), which newer pandas deprecates, and
# is much cheaper than building a Series per row with iterrows().
for user, item in tqdm(zip(rdf['userId'], rdf['itemId']), total=rdf.shape[0]):
    # setdefault only inserts when the id is new; len(...) is the next free index.
    user_dic.setdefault(user, len(user_dic))
    item_dic.setdefault(item, len(item_dic))

# Next free index for each mapping (equals the number of distinct ids).
user_idx = len(user_dic)
item_idx = len(item_dic)

100%|██████████| 467847/467847 [00:12<00:00, 37388.05it/s]


In [19]:
# Write the renumbered (userId, itemId) pairs to the dataset's output CSV.
header = ['userId', 'itemId']
# newline='' is required when handing a file to csv.writer; without it every
# row ends in '\r\r\n' on Windows (an apparent blank line after each record).
with open(f'../../mod_data/{dataset}/{dataset}.csv', 'w', newline='', encoding='utf-8') as fp:
    writer = csv.writer(fp)
    writer.writerow(header)
    # Iterate by column name rather than by position inside each row Series.
    rows = zip(rdf.index, rdf['userId'], rdf['itemId'])
    for row_label, user, item in tqdm(rows, total=rdf.shape[0]):
        try:
            writer.writerow([user_dic[user], item_dic[item]])
        except KeyError as e:
            # Report the unmapped id together with the row's index label and
            # carry on with the remaining rows.
            print(e, row_label)

100%|██████████| 467847/467847 [00:12<00:00, 36120.65it/s]
