In [1]:
import json
import numpy as np
import pandas as pd
import pickle
import copy
from tqdm import tqdm
import csv

In [9]:
dataset = 'BookCrossing'

In [2]:
# Remove all implicit rating records where rating equal to 0
rating_df = pd.read_csv('./BX-Book-Ratings.csv', sep=';', names=["userId", "itemId", "rating"], skiprows=1, encoding='unicode_escape')
rating_df.drop(rating_df.index[rating_df['rating'] == 0], axis=0, inplace=True)
# Print the number of users, items and interactions
print("Dataset statistics: ")
print(f"> No. of users: {len(rating_df['userId'].unique())}")
print(f"> No. of items: {len(rating_df['itemId'].unique())}")
print(f"> No. of interactions: {rating_df.shape[0]}")

Dataset statistics: 
> No. of users: 77805
> No. of items: 185973
> No. of interactions: 433671


In [3]:
# Drop the column of 'rating' and duplicate records
rating_df.drop('rating', axis=1, inplace=True)
rating_df.drop_duplicates(subset =['userId', 'itemId'], keep = 'first', inplace = True)

In [4]:
# Copy rating_df to rdf
rdf = copy.copy(rating_df)
# Calculate the total number of interactions of every user and item
rdf['user_freq'] = rdf.groupby('userId')['userId'].transform('count')
rdf['item_freq'] = rdf.groupby('itemId')['itemId'].transform('count')
print(rdf)

         userId       itemId  user_freq  item_freq
1        276726   0155061224          1          1
3        276729   052165615X          2          1
4        276729   0521795028          2          1
6        276736   3257224281          1          4
7        276737   0600570967          1          1
...         ...          ...        ...        ...
1149773  276704   0806917695          5          1
1149775  276704   1563526298          5          3
1149777  276709   0515107662          1         18
1149778  276721   0590442449          1          2
1149779  276723  05162443314          1          1

[433671 rows x 4 columns]


In [5]:
# Thresholds for user and item
user_threshold = 6
item_threshold = 6

In [6]:
# Remove users and items where their interactions less than thresholds
while (rdf['user_freq'].min() < user_threshold or rdf['item_freq'].min() < item_threshold) :
    rdf.drop(rdf.index[rdf['user_freq'] < user_threshold], inplace=True)
    rdf['item_freq'] = rdf.groupby('itemId')['itemId'].transform('count')
    rdf.drop(rdf.index[rdf['item_freq'] < item_threshold], inplace=True)
    rdf['user_freq'] = rdf.groupby('userId')['userId'].transform('count')
    rdf['item_freq'] = rdf.groupby('itemId')['itemId'].transform('count')   

In [7]:
# Show the number of users, items and the sparsity after preprocessing
usercnt = len(rdf['userId'].unique())
itemcnt = len(rdf['itemId'].unique())
print("total user: ", usercnt)
print("total item: ", itemcnt)
print('sparsity: ' + str(len(rdf) * 1.0 / (usercnt * itemcnt)))
# Drop the column of 'user_freq' and 'item_freq'
rdf.drop('user_freq', axis=1, inplace=True)
rdf.drop('item_freq', axis=1, inplace=True)
rdf.reset_index(drop=True, inplace=True)

total user:  5107
total item:  6515
sparsity: 0.002867266738909366


In [8]:
user_dic = dict()
item_dic = dict()

user_idx = 0
item_idx = 0

for row in tqdm(rdf.iterrows(), total=rdf.shape[0]):
  if row[1][0] not in user_dic.keys():
    user_dic[row[1][0]] = user_idx
    user_idx += 1
  # add a new book id with an index
  if row[1][1] not in item_dic.keys():
    item_dic[row[1][1]] = item_idx
    item_idx += 1

100%|██████████| 95400/95400 [00:02<00:00, 37059.49it/s]


In [10]:
header = ['userId', 'itemId']
with open(f'../../mod_data/{dataset}/{dataset}.csv', 'w', encoding='utf-8') as fp:
    writer = csv.writer(fp)
    writer.writerow(header)
    for row in tqdm(rdf.iterrows(), total=rdf.shape[0]):
        try:
            writer.writerow([user_dic[row[1][0]], item_dic[row[1][1]]])
        except KeyError as e:
            print(e, row[0])

100%|██████████| 95400/95400 [00:02<00:00, 34846.76it/s]
