In [1]:
import json
import numpy as np
import pandas as pd
import pickle
import copy
from tqdm import tqdm
import csv

In [None]:
# Name of the dataset being preprocessed; it is interpolated into the output
# directory and file name when the renumbered interactions are written to CSV.
dataset = "Epinions"

In [2]:
# Load the raw ratings: a space-separated file without a header row, so the
# three column names are supplied explicitly.
rating_df = pd.read_csv(
    './ratings_data.txt',
    sep=' ',
    names=["userId", "itemId", "rating"],
)
# Report how many distinct users / items and how many rows were loaded
print("Dataset statistics: ")
print(f"> No. of users: {rating_df['userId'].nunique(dropna=False)}")
print(f"> No. of items: {rating_df['itemId'].nunique(dropna=False)}")
print(f"> No. of interactions: {len(rating_df)}")

Dataset statistics: 
> No. of users: 40163
> No. of items: 139738
> No. of interactions: 664824


In [3]:
# Convert explicit ratings to implicit positive feedback.
# Only interactions rated strictly above 3 are kept (a rating of exactly 3 is
# discarded as well). The filter is a single vectorized comparison instead of
# a Python loop over every row, so it does not depend on the index being a
# default 0..n-1 range.
# After filtering, the 'rating' column carries no information, so only the
# (userId, itemId) pair is retained, and repeated pairs are collapsed to their
# first occurrence. Original index labels of the surviving rows are preserved.
rating_df = (
    rating_df.loc[rating_df['rating'] > 3, ['userId', 'itemId']]
    .drop_duplicates(subset=['userId', 'itemId'], keep='first')
)

In [4]:
# Work on an independent copy so that rating_df itself is left untouched
rdf = rating_df.copy()
# Attach to every row the number of interactions of its user and of its item
for id_col, freq_col in (('userId', 'user_freq'), ('itemId', 'item_freq')):
    rdf[freq_col] = rdf.groupby(id_col)[id_col].transform('count')
print(rdf)

        userId  itemId  user_freq  item_freq
0            1     100        160          4
1            1     101        160          2
4            1     103        160          2
6            1     105        160          8
7            1     106        160        196
...        ...     ...        ...        ...
664819   49289   30791         13          9
664820   49289    3862         13        393
664821   49289    3939         13        117
664822   49289   60213         13          5
664823   49289   62722         13          3

[495393 rows x 4 columns]


In [5]:
# Minimum number of interactions a user / an item needs in order to be kept;
# rows whose user or item falls below these values are removed by the
# filtering loop.
user_threshold, item_threshold = 9, 9

In [6]:
# Repeatedly prune rows of users and items that have too few interactions.
# Removing a user's rows lowers the counts of the items they touched (and the
# other way round), so the pruning is repeated until no row is below either
# threshold.
while (rdf['user_freq'] < user_threshold).any() or (rdf['item_freq'] < item_threshold).any():
    # Step 1: remove every row that belongs to an under-threshold user
    sparse_user_rows = rdf.index[rdf['user_freq'] < user_threshold]
    rdf.drop(index=sparse_user_rows, inplace=True)
    # Step 2: item counts are now out of date; refresh them, then remove every
    # row that belongs to an under-threshold item
    rdf['item_freq'] = rdf.groupby('itemId')['itemId'].transform('count')
    sparse_item_rows = rdf.index[rdf['item_freq'] < item_threshold]
    rdf.drop(index=sparse_item_rows, inplace=True)
    # Step 3: refresh both counters so the loop condition sees current values
    for id_col, freq_col in (('userId', 'user_freq'), ('itemId', 'item_freq')):
        rdf[freq_col] = rdf.groupby(id_col)[id_col].transform('count')

In [7]:
# Summarise what is left after filtering: distinct users, distinct items
usercnt = rdf['userId'].nunique(dropna=False)
itemcnt = rdf['itemId'].nunique(dropna=False)
print("total user: ", usercnt)
print("total item: ", itemcnt)
# The figure printed under the label 'sparsity' is the number of interactions
# divided by (users * items), i.e. the filled fraction of the user-item matrix.
filled_fraction = len(rdf) * 1.0 / (usercnt * itemcnt)
print('sparsity: ' + str(filled_fraction))
# The helper frequency columns are no longer needed; remove them and renumber
# the rows from 0 so the index has no gaps.
rdf.drop(columns=['user_freq', 'item_freq'], inplace=True)
rdf.reset_index(drop=True, inplace=True)

total user:  8521
total item:  6941
sparsity: 0.003188711750071575


#### Renumber users and items

In [8]:
# Build lookup tables that map each original user id / item id to a dense
# 0-based index. Indices are handed out in order of first appearance in rdf.
user_dic = dict()
item_dic = dict()

# The two id columns are read by label and walked in lockstep. This avoids
# positional lookups such as row[1][0] on a label-indexed Series (deprecated
# in recent pandas releases) and avoids building one Series per row.
for user_id, item_id in tqdm(zip(rdf['userId'], rdf['itemId']), total=rdf.shape[0]):
    # setdefault only inserts when the id is new, so len(...) at that moment
    # is exactly the next unused index
    user_dic.setdefault(user_id, len(user_dic))
    item_dic.setdefault(item_id, len(item_dic))

# Next free index for each table (equals the number of distinct ids seen);
# kept under the original names in case later cells read them.
user_idx = len(user_dic)
item_idx = len(item_dic)

100%|██████████| 188594/188594 [00:04<00:00, 42988.54it/s]


In [9]:
# Write the renumbered (userId, itemId) pairs to
# ../../mod_data/<dataset>/<dataset>.csv, one interaction per line, with a
# header row. The target directory is expected to exist already.
header = ['userId', 'itemId']
# newline='' is what the csv module asks for when it is handed a file object:
# without it, platforms that translate line endings end up with an extra
# blank line after every record.
with open(f'../../mod_data/{dataset}/{dataset}.csv', 'w', encoding='utf-8', newline='') as fp:
    writer = csv.writer(fp)
    writer.writerow(header)
    # Columns are read by label (not by position) and iterated together with
    # the row label, which is only needed for the diagnostic message below.
    labelled_pairs = zip(rdf.index, rdf['userId'], rdf['itemId'])
    for row_label, user_id, item_id in tqdm(labelled_pairs, total=rdf.shape[0]):
        try:
            writer.writerow([user_dic[user_id], item_dic[item_id]])
        except KeyError as e:
            # An id missing from the lookup tables: report it together with
            # the row it came from and continue with the remaining rows.
            print(e, row_label)
100%|██████████| 188594/188594 [00:04<00:00, 41850.11it/s]
