In [1]:
import json
import numpy as np
import pandas as pd
import pickle
import copy
from tqdm import tqdm
import csv

In [None]:
dataset = 'LFM360K'

#### Show statistics of dataset

In [2]:
rating_df = pd.read_csv('./usersha1-artmbid-artname-plays.tsv', sep='\t', names=["userId", "itemId", "itemName", "playcounts"])
# Print the number of users, items and interactions
print("Dataset statistics: ")
print(f"> No. of users: {len(rating_df['userId'].unique())}")
print(f"> No. of items: {len(rating_df['itemId'].unique())}")
print(f"> No. of interactions: {rating_df.shape[0]}")

Dataset statistics: 
> No. of users: 358868
> No. of items: 160113
> No. of interactions: 17535655


#### Remove abnormal records

In [3]:
nan_list= []
for i in tqdm(rating_df.iterrows(), total=rating_df.shape[0]):
    if (type(rating_df['itemId'][i[0]]) != type('string')):
        nan_list.append(i[0])
rating_df.drop(nan_list, inplace=True)

100%|██████████| 17535655/17535655 [08:10<00:00, 35729.15it/s]


In [4]:
print("Dataset statistics: ")
print(f"> No. of users: {len(rating_df['userId'].unique())}")
print(f"> No. of items: {len(rating_df['itemId'].unique())}")
print(f"> No. of interactions: {rating_df.shape[0]}")

Dataset statistics: 
> No. of users: 358858
> No. of items: 160112
> No. of interactions: 17309518


#### Data filter

In [5]:
# Drop playcounts less than 20
rating_df.drop("itemName", axis=1, inplace=True)
rating_df.drop(rating_df.index[rating_df['playcounts'] < 20], axis=0, inplace=True)
# Drop the column of 'playcounts' and duplicate records
rating_df.drop('playcounts', axis=1, inplace=True)
rating_df.drop_duplicates(subset =['userId', 'itemId'], keep = 'first', inplace = True)

In [6]:
# Copy rating_df to rdf
rdf = copy.copy(rating_df)
# Calculate the total number of interactions of every user and item
rdf['user_freq'] = rdf.groupby('userId')['userId'].transform('count')
rdf['item_freq'] = rdf.groupby('itemId')['itemId'].transform('count')
print(rdf)

                                            userId  \
0         00000c289a1829a808ac09c00daf10bc3c4e223b   
1         00000c289a1829a808ac09c00daf10bc3c4e223b   
2         00000c289a1829a808ac09c00daf10bc3c4e223b   
3         00000c289a1829a808ac09c00daf10bc3c4e223b   
4         00000c289a1829a808ac09c00daf10bc3c4e223b   
...                                            ...   
17535629                              sep 20, 2008   
17535630                              sep 20, 2008   
17535631                              sep 20, 2008   
17535632                              sep 20, 2008   
17535633                              sep 20, 2008   

                                        itemId  user_freq  item_freq  
0         3bd73256-3905-4f3a-97e2-8b341527f805         49         83  
1         f2fb0ff0-5679-42ec-a55c-15109ce6e320         49       7724  
2         b3ae82c2-e60b-4551-a76d-6620f1b456aa         49        716  
3         3d6bbeb7-f90e-4d10-b440-e153c0d10b53         49       126

In [7]:
# Thresholds for user and item
user_threshold = 50
item_threshold = 15

In [11]:
# Remove users and items where their interactions less than threshold
while (rdf['user_freq'].min() < user_threshold or rdf['item_freq'].min() < item_threshold) :
    rdf.drop(rdf.index[rdf['user_freq'] < user_threshold], inplace=True)
    rdf['item_freq'] = rdf.groupby('itemId')['itemId'].transform('count')
    rdf.drop(rdf.index[rdf['item_freq'] < item_threshold], inplace=True)
    rdf['user_freq'] = rdf.groupby('userId')['userId'].transform('count')
    rdf['item_freq'] = rdf.groupby('itemId')['itemId'].transform('count')   

In [12]:
# Show the number of users, items and the sparsity after preprocessing
usercnt = len(rdf['userId'].unique())
itemcnt = len(rdf['itemId'].unique())
print("total user: ", usercnt)
print("total item: ", itemcnt)
print('sparsity: ' + str(len(rdf) * 1.0 / (usercnt * itemcnt)))
# Drop the column of 'user_freq' and 'item_freq'
rdf.drop('user_freq', axis=1, inplace=True)
rdf.drop('item_freq', axis=1, inplace=True)
rdf.reset_index(drop=True, inplace=True)

total user:  52966
total item:  15263
sparsity: 0.0035667806253268397


#### Renumber users and items

In [13]:
user_dic = dict()
item_dic = dict()

user_idx = 0
item_idx = 0

for row in tqdm(rdf.iterrows(), total=rdf.shape[0]):
  if row[1][0] not in user_dic.keys():
    user_dic[row[1][0]] = user_idx
    user_idx += 1
  # add a new book id with an index
  if row[1][1] not in item_dic.keys():
    item_dic[row[1][1]] = item_idx
    item_idx += 1

100%|██████████| 2883457/2883457 [01:16<00:00, 37806.23it/s]


In [14]:
header = ['userId', 'itemId']
with open(f'../../mod_data/{dataset}/{dataset}.csv', 'w', encoding='utf-8') as fp:
    writer = csv.writer(fp)
    writer.writerow(header)
    for row in tqdm(rdf.iterrows(), total=rdf.shape[0]):
        try:
            writer.writerow([user_dic[row[1][0]], item_dic[row[1][1]]])
        except KeyError as e:
            print(e, row[0])

100%|██████████| 2883457/2883457 [01:20<00:00, 36036.75it/s]
