In [1]:
import json
import numpy as np
import pandas as pd
import pickle
import copy
from tqdm import tqdm
import csv

In [None]:
dataset = 'ML1M'

#### Show statistics of dataset

In [5]:
rating_df = pd.read_csv('./ratings.dat', sep='::', names=["userId", "itemId", "rating", "timestamp"])
rating_df.drop(columns=['timestamp'], inplace=True)
# Print the number of users, items and interactions
print("Dataset statistics: ")
print(f"> No. of users: {len(rating_df['userId'].unique())}")
print(f"> No. of items: {len(rating_df['itemId'].unique())}")
print(f"> No. of interactions: {rating_df.shape[0]}")

  return func(*args, **kwargs)


Dataset statistics: 
> No. of users: 6040
> No. of items: 3706
> No. of interactions: 1000209


#### Data filter

In [6]:
# Drop the column of 'rating' and duplicate records
rating_df.drop('rating', axis=1, inplace=True)
rating_df.drop_duplicates(subset =['userId', 'itemId'], keep = 'first', inplace = True)

In [7]:
# Copy rating_df to rdf
rdf = copy.copy(rating_df)
# Calculate the total number of interactions of every user and item
rdf['user_freq'] = rdf.groupby('userId')['userId'].transform('count')
rdf['item_freq'] = rdf.groupby('itemId')['itemId'].transform('count')
print(rdf)

         userId  itemId  user_freq  item_freq
0             1    1193         53       1725
1             1     661         53        525
2             1     914         53        636
3             1    3408         53       1315
4             1    2355         53       1703
...         ...     ...        ...        ...
1000204    6040    1091        341        373
1000205    6040    1094        341       1229
1000206    6040     562        341        478
1000207    6040    1096        341        344
1000208    6040    1097        341       2269

[1000209 rows x 4 columns]


In [8]:
# Thresholds for user and item
user_threshold = 0
item_threshold = 0

In [10]:
# Remove users and items where their interactions less than threshold
while (rdf['user_freq'].min() < user_threshold or rdf['item_freq'].min() < item_threshold) :
    rdf.drop(rdf.index[rdf['user_freq'] < user_threshold], inplace=True)
    rdf['item_freq'] = rdf.groupby('itemId')['itemId'].transform('count')
    rdf.drop(rdf.index[rdf['item_freq'] < item_threshold], inplace=True)
    rdf['user_freq'] = rdf.groupby('userId')['userId'].transform('count')
    rdf['item_freq'] = rdf.groupby('itemId')['itemId'].transform('count')   

In [11]:
# Show the number of users, items and the sparsity after preprocessing
usercnt = len(rdf['userId'].unique())
itemcnt = len(rdf['itemId'].unique())
print("total user: ", usercnt)
print("total item: ", itemcnt)
print('sparsity: ' + str(len(rdf) * 1.0 / (usercnt * itemcnt)))
# Drop the column of 'user_freq' and 'item_freq'
rdf.drop('user_freq', axis=1, inplace=True)
rdf.drop('item_freq', axis=1, inplace=True)
rdf.reset_index(drop=True, inplace=True)

total user:  6040
total item:  3706
sparsity: 0.044683625622312845


#### Renumber users and items

In [12]:
user_dic = dict()
item_dic = dict()

user_idx = 0
item_idx = 0

for row in tqdm(rdf.iterrows(), total=rdf.shape[0]):
  if row[1][0] not in user_dic.keys():
    user_dic[row[1][0]] = user_idx
    user_idx += 1
  # add a new book id with an index
  if row[1][1] not in item_dic.keys():
    item_dic[row[1][1]] = item_idx
    item_idx += 1

100%|██████████| 1000209/1000209 [00:23<00:00, 41889.30it/s]


In [13]:
header = ['userId', 'itemId']
with open(f'../../mod_data/{dataset}/{dataset}.csv', 'w', encoding='utf-8') as fp:
    writer = csv.writer(fp)
    writer.writerow(header)
    for row in tqdm(rdf.iterrows(), total=rdf.shape[0]):
        try:
            writer.writerow([user_dic[row[1][0]], item_dic[row[1][1]]])
        except KeyError as e:
            print(e, row[0])

100%|██████████| 1000209/1000209 [00:24<00:00, 40149.07it/s]
