In [1]:
import os
import time
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
import pandas as pd
import torch
import json
import random

In [39]:
# Folder holding the dataset files; every path below is derived from it.
data_dir = "./data/ml-25m"

# Column names of ratings.csv, kept in variables so later cells avoid repeating literals.
user_col, item_col = "userId", "movieId"
value_col, time_col = "rating", "timestamp"

# Read the complete ratings table, report its shape and preview the first rows.
rating_path = os.path.join(data_dir, "ratings.csv")
df_raw = pd.read_csv(rating_path)
print("dimension:", df_raw.shape)
df_raw.head()

dimension: (25000095, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [40]:
# Column names of movies.csv.
title_col, genre_col = "title", "genres"

# Read the movie metadata and discard rows whose genre field is the
# "(no genres listed)" placeholder.
item_info_path = os.path.join(data_dir, "movies.csv")
df_item = pd.read_csv(item_info_path)
df_item = df_item.loc[df_item[genre_col].ne("(no genres listed)")]
print("dimension: ", df_item.shape)
df_item.head()

dimension:  (57361, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [41]:
# Keep only ratings of 4.0 or higher, then join the movie metadata on the item column.
# merge() defaults to an inner join, so ratings for movies absent from df_item are dropped.
df_rating = (
    df_raw.loc[df_raw[value_col].ge(4.0)]
    .copy()
    .merge(df_item, on=item_col)
)

### Remove unpopular items and inactive users

In [42]:
def threshold_user_item(df, uid_min, iid_min, user_key=None, item_key=None):
    """Iteratively drop sparse users and items until both minimums hold.

    Each pass first removes every user with fewer than ``iid_min`` (non-null)
    item entries, then removes every item with fewer than ``uid_min``
    (non-null) user entries among the rows that are left. Dropping items can
    push users back under their minimum, so passes repeat until one of them
    removes no row.

    Statistics (user count, item count and rows / (users * items) * 100,
    printed under the label "sparsity") are shown before and after filtering.

    Args:
        df: DataFrame containing the user and item columns.
        uid_min: minimum number of user entries an item needs to be kept.
        iid_min: minimum number of item entries a user needs to be kept.
        user_key: name of the user column; ``None`` falls back to the
            module-level ``user_col``.
        item_key: name of the item column; ``None`` falls back to the
            module-level ``item_col``.

    Returns:
        The filtered DataFrame. It may be empty when the thresholds remove
        everything; the frame passed in is not modified.
    """
    user_key = user_col if user_key is None else user_key
    item_key = item_col if item_key is None else item_key

    def _print_stats(frame):
        # Print user/item counts and the fill ratio of the user x item grid.
        n_users = frame[user_key].unique().shape[0]
        n_items = frame[item_key].unique().shape[0]
        n_cells = n_users * n_items
        # An empty frame has no grid at all: report 0 instead of dividing by zero.
        sparsity = float(frame.shape[0]) / float(n_cells) * 100 if n_cells else 0.0
        print("number of users: {}".format(n_users))
        print("number of items: {}".format(n_items))
        print("sparsity:{:4.4f}%".format(sparsity))

    print("info")
    _print_stats(df)

    while True:
        starting_shape = df.shape[0]

        # Users first ...
        per_user = df.groupby(user_key)[item_key].count()
        df = df[~df[user_key].isin(per_user[per_user < iid_min].index)]

        # ... then items, counted on the already reduced frame.
        per_item = df.groupby(item_key)[user_key].count()
        df = df[~df[item_key].isin(per_item[per_item < uid_min].index)]

        # A pass that removed nothing means both constraints are satisfied.
        if df.shape[0] == starting_shape:
            break

    _print_stats(df)

    return df

In [43]:
# user_min_num is the minimum number of users per item (passed as uid_min),
# item_min_num is the minimum number of items per user (passed as iid_min).
user_min_num, item_min_num = 10, 10
new_df_rating = threshold_user_item(
    df_rating,
    uid_min=user_min_num,
    iid_min=item_min_num,
)

info
number of users: 162342
number of items: 38715
sparsity:0.1980%
number of users: 154051
number of items: 15306
sparsity:0.5227%


In [44]:
# Index 0 is reserved for the "<pad>" token in both vocabularies; real ids are numbered
# from 1 in the order in which they first appear in new_df_rating.
movieId_to_iid = {"<pad>": 0}
iid_to_movieId = {0: "<pad>"}
for idx, movieId in enumerate(new_df_rating.movieId.unique().tolist(), start=1):
    movieId_to_iid[movieId] = idx
    iid_to_movieId[idx] = movieId

userId_to_uid = {"<pad>": 0}
uid_to_userId = {0: "<pad>"}
for idx, userId in enumerate(new_df_rating.userId.unique().tolist(), start=1):
    userId_to_uid[userId] = idx
    uid_to_userId[idx] = userId

In [45]:
# Vocabulary sizes. Both maps contain the "<pad>" entry at index 0, so each number is
# one larger than the count of real users / items.
user_num, item_num = len(uid_to_userId), len(iid_to_movieId)
print("user_num:", user_num)
print("item_num:", item_num)

user_num: 154052
item_num: 15307


In [46]:
# Swap the raw ids for the contiguous indices from the id vocabularies.
# `assign` returns a fresh frame, so nothing is written into the boolean-filtered slice
# that threshold_user_item returned (that write is what triggers SettingWithCopyWarning),
# and `Series.map` with a dict replaces the per-row Python lambda.
new_df_rating = new_df_rating.assign(
    userId=new_df_rating["userId"].map(userId_to_uid),
    movieId=new_df_rating["movieId"].map(movieId_to_iid),
)
# map() yields NaN for an id missing from the vocabulary, where the lambda raised
# KeyError; keep failing loudly in that case.
assert new_df_rating[["userId", "movieId"]].notna().all().all(), "id missing from vocabulary"

In [47]:
# For every user index, collect the list of item indices that user has in new_df_rating,
# and expose the result as a plain dict {user: [items]}.
user_itemlist_dict = (
    new_df_rating.groupby(["userId"])["movieId"]
    .apply(list)
    .to_dict()
)

In [48]:
# All distinct item indices; split_train_test reads this global as its negative-sampling pool.
train_itemlist = list(new_df_rating["movieId"].unique())

In [12]:
# Scratch check: random.sample draws without replacement, so 10 draws are 10 distinct values.
tmp = list(range(100))
a = random.sample(tmp, 10)
print(a)
print(len(set(a)))

[95, 66, 7, 41, 44, 65, 99, 12, 94, 42]
10


In [54]:
def split_train_test(user_data, neg_sample=500, train_ratio=0.7, valid_ratio=0.8, item_pool=None):
    """Randomly split every user's item list into train / valid / test interactions.

    For each user a shuffled copy of the item list is cut at
    ``int(n * train_ratio)`` and ``int(n * valid_ratio)``: the first slice is
    training, the middle slice validation and the remainder test. Every
    training interaction additionally gets negative items sampled without
    replacement from the pool items that are not among that user's training
    positives (the user's validation / test items are not excluded).

    Args:
        user_data: mapping ``user -> list of items``. It is left unmodified.
        neg_sample: number of negatives attached to each training interaction.
            When a user has fewer candidates than this, all candidates are used.
        train_ratio: fraction of a user's items that goes to training.
        valid_ratio: cumulative fraction at which validation ends and test begins.
        item_pool: iterable of candidate items for negative sampling; ``None``
            falls back to the module-level ``train_itemlist``.

    Returns:
        ``(train, valid, test)``. ``train`` rows are
        ``[user, positive_item, [negative_items]]``; ``valid`` and ``test``
        rows are ``[user, item]``.

    Note:
        Uses the global ``random`` module state; seed it beforehand for a
        reproducible split.
    """
    if item_pool is None:
        item_pool = train_itemlist
    # Loop invariant: build the candidate set once instead of once per user.
    candidate_items = set(item_pool)

    train = []
    valid = []
    test = []

    # Iterate over the users actually present in user_data (not over a global
    # vocabulary size, which also counts the padding entry and overruns the list).
    for user_i, items in user_data.items():
        # Shuffle a copy so the caller's lists keep their order.
        itemlist_i = list(items)
        random.shuffle(itemlist_i)
        itemnum_i = len(itemlist_i)

        valid_threshold = int(itemnum_i * train_ratio)
        test_threshold = int(itemnum_i * valid_ratio)

        train_itemlist_i = itemlist_i[:valid_threshold]
        valid_itemlist_i = itemlist_i[valid_threshold:test_threshold]
        test_itemlist_i = itemlist_i[test_threshold:]

        # Negatives are pool items the user has no training interaction with.
        train_negitemlist_i = list(candidate_items - set(train_itemlist_i))
        # random.sample raises ValueError when asked for more than the population.
        neg_count = min(neg_sample, len(train_negitemlist_i))

        for item_j in train_itemlist_i:
            sampled_negitemlist_i = random.sample(train_negitemlist_i, k=neg_count)
            train.append([user_i, item_j, sampled_negitemlist_i])

        for item_j in valid_itemlist_i:
            valid.append([user_i, item_j])

        for item_j in test_itemlist_i:
            test.append([user_i, item_j])

    print("train_num", len(train))
    print("valid num", len(valid))
    print("test num", len(test))

    return train, valid, test

In [None]:
# Seed Python's RNG first: split_train_test shuffles and samples through the global
# `random` module, so without a seed every run writes a different split to disk.
random.seed(42)
train_data, valid_data, test_data = split_train_test(user_itemlist_dict)

In [None]:
# Name the columns at construction time: an empty split then still yields a correctly
# labelled (empty) frame instead of failing on a column-count mismatch when the
# labels are assigned afterwards.
train_df = pd.DataFrame(train_data, columns=["userid", "pos_itemid", "neg_itemid"])
valid_df = pd.DataFrame(valid_data, columns=["userid", "itemid"])
test_df = pd.DataFrame(test_data, columns=["userid", "itemid"])
print("train num", len(train_df))
print("valid num", len(valid_df))
print("test num", len(test_df))

In [34]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,userid,itemid
0,1,37
1,1,27
2,1,17
3,1,15
4,1,31


In [35]:
# Persist the three splits as pickled DataFrames inside data_dir.
train_data_file = "train_data.pickle"
valid_data_file = "valid_data.pickle"
test_data_file = "test_data.pickle"

train_data_abs_file = os.path.join(data_dir, train_data_file)
valid_data_abs_file = os.path.join(data_dir, valid_data_file)
test_data_abs_file = os.path.join(data_dir, test_data_file)

# Written in the same order as before: train, valid, test.
for split_df, split_path in (
    (train_df, train_data_abs_file),
    (valid_df, valid_data_abs_file),
    (test_df, test_data_abs_file),
):
    split_df.to_pickle(split_path)

In [36]:
# Stringify the keys and values of every id map before writing them out as JSON.
# NOTE: this rebinds the four map names, so after this cell the integer-keyed
# versions are no longer available under these names.
userId_to_uid = {str(raw): str(idx) for raw, idx in userId_to_uid.items()}
movieId_to_iid = {str(raw): str(idx) for raw, idx in movieId_to_iid.items()}

uid_to_userId = {str(idx): str(raw) for idx, raw in uid_to_userId.items()}
iid_to_movieId = {str(idx): str(raw) for idx, raw in iid_to_movieId.items()}

vocab = {
    "userindex2uid": userId_to_uid,
    "uid2userindex": uid_to_userId,
    "itemindex2iid": movieId_to_iid,
    "iid2itemindex": iid_to_movieId,
}
vocab_file = "vocab.json"

with open(os.path.join(data_dir, vocab_file), "w", encoding="utf8") as f:
    json.dump(vocab, f)

In [49]:
# Scratch tensor (3 rows, 4 columns) of standard-normal samples for the masking experiments below.
a = torch.randn(3, 4)

In [50]:
# Display the freshly sampled tensor.
a

tensor([[ 0.2862, -1.0019,  0.0103,  0.0317],
        [ 0.7306,  0.9805,  1.1846, -1.0710],
        [-1.0851, -0.9494,  0.0846, -1.1893]])

In [51]:
# For every row, overwrite the column positions listed in the matching row of b with -inf.
# scatter_ works in place and returns the modified tensor, which is what the cell displays.
b = torch.tensor([[1, 0, 0], [2, 1, 3], [1, 2, 3]])
a.scatter_(dim=1, index=b, value=float("-inf"))

tensor([[   -inf,    -inf,  0.0103,  0.0317],
        [ 0.7306,    -inf,    -inf,    -inf],
        [-1.0851,    -inf,    -inf,    -inf]])

In [52]:
# Display a again: scatter_ modified it in place.
a

tensor([[   -inf,    -inf,  0.0103,  0.0317],
        [ 0.7306,    -inf,    -inf,    -inf],
        [-1.0851,    -inf,    -inf,    -inf]])

In [54]:
# Largest remaining value (and its column index) per row, taken along the last dimension.
torch.topk(a, k=1, dim=-1)

torch.return_types.topk(
values=tensor([[ 0.0317],
        [ 0.7306],
        [-1.0851]]),
indices=tensor([[3],
        [0],
        [0]]))

In [55]:
# Set the first column of every row to -inf, in place.
a[:, 0] = float("-inf")

In [56]:
# Display the tensor after masking column 0.
a

tensor([[  -inf,   -inf, 0.0103, 0.0317],
        [  -inf,   -inf,   -inf,   -inf],
        [  -inf,   -inf,   -inf,   -inf]])

In [58]:
# Element-wise test of the first row against exactly 0, shown as a list of booleans.
list(a[0].numpy() == 0)

[False, False, False, False]

In [20]:
# Scratch: draw one random integer from [0, 10) using NumPy's global RNG.
np.random.randint(10)

6