In [2]:
import os
import time
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
import pandas as pd
import torch
import json
import random

In [53]:
data_dir = "./data/ml-100k"
file_path = os.path.join(data_dir, 'u.data')

names = ['user_id', 'item_id', 'rating', 'timestamp']
df_raw = pd.read_csv(file_path, sep = '\t', names = names)

value_col = "rating"
user_col = "user_id"
item_col = "item_id"
print("dimension:", df_raw.shape)
df_raw.head()  

dimension: (100000, 4)


Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [54]:
# title_col = "title"
# genre_col = "genres"

# item_info_path = os.path.join(data_dir, "movies.csv")
# df_item = pd.read_csv(item_info_path)
# df_item = df_item[df_item[genre_col]!="(no genres listed)"]
# print("dimension: ", df_item.shape)
# df_item.head()

In [55]:
df_rating = df_raw[df_raw[value_col] >= 4.0].copy()
# df_rating = df_rating.merge(df_item, on=item_col)

### remove unpopular items and unpopular users

In [56]:
def threshold_user_item(df, uid_min, iid_min):
    n_users = df[user_col].unique().shape[0]
    n_items = df[item_col].unique().shape[0]
    sparsity = float(df.shape[0])/float(n_users*n_items)*100
    print("info")
    print("number of users: {}".format(n_users))
    print("number of items: {}".format(n_items))
    print("sparsity:{:4.4f}%".format(sparsity))
    
    done = False
    while not done:
        starting_shape = df.shape[0]
        item_counts = df.groupby(user_col)[item_col].count()
        df = df[~df[user_col].isin(item_counts[item_counts < iid_min].index.tolist())]
        user_counts = df.groupby(item_col)[user_col].count()
        df = df[~df[item_col].isin(user_counts[user_counts < uid_min].index.tolist())]
        ending_shape = df.shape[0]
        if starting_shape == ending_shape:
            done = True
            
    n_users = df[user_col].unique().shape[0]
    n_items = df[item_col].unique().shape[0]
    sparsity = float(df.shape[0])/float(n_users*n_items)*100
    print("number of users: {}".format(n_users))
    print("number of items: {}".format(n_items))
    print("sparsity:{:4.4f}%".format(sparsity))
    
    return df

In [57]:
user_min_num = 10
item_min_num = 10
new_df_rating = threshold_user_item(df_rating, user_min_num, item_min_num)

info
number of users: 942
number of items: 1447
sparsity:4.0625%
number of users: 887
number of items: 822
sparsity:7.2367%


In [58]:
movieId_to_iid = {}
movieId_to_iid["<pad>"] = 0
iid_to_movieId = {}
iid_to_movieId[0] = "<pad>"

for (idx, movieId) in enumerate(new_df_rating[item_col].unique().tolist()):
    movieId_to_iid[movieId] = idx+1
    iid_to_movieId[idx+1] = movieId
    
userId_to_uid = {}
userId_to_uid["<pad>"] = 0
uid_to_userId = {}
uid_to_userId[0] = "<pad>"

for (idx, userId) in enumerate(new_df_rating[user_col].unique().tolist()):
    userId_to_uid[userId] = idx+1
    uid_to_userId[idx+1] = userId 

In [59]:
user_num = len(uid_to_userId)
item_num = len(iid_to_movieId)
print("user_num:", user_num)
print("item_num:", item_num)

user_num: 888
item_num: 823


In [60]:
new_df_rating["user_id"] = new_df_rating[user_col].apply(lambda x: userId_to_uid[x])
new_df_rating["item_id"] = new_df_rating[item_col].apply(lambda x: movieId_to_iid[x])

In [62]:
user_itemlist_dict = new_df_rating.groupby([user_col]).item_id.apply(list)
user_itemlist_dict = dict(user_itemlist_dict)

In [63]:
train_itemlist = list(new_df_rating[item_col].unique())

In [64]:
iid_to_movieId

{0: '<pad>',
 1: 474,
 2: 465,
 3: 1014,
 4: 222,
 5: 387,
 6: 1042,
 7: 392,
 8: 486,
 9: 144,
 10: 1,
 11: 95,
 12: 277,
 13: 234,
 14: 98,
 15: 193,
 16: 88,
 17: 274,
 18: 603,
 19: 32,
 20: 16,
 21: 304,
 22: 327,
 23: 201,
 24: 1137,
 25: 241,
 26: 4,
 27: 100,
 28: 322,
 29: 181,
 30: 196,
 31: 384,
 32: 143,
 33: 423,
 34: 515,
 35: 219,
 36: 919,
 37: 26,
 38: 427,
 39: 512,
 40: 165,
 41: 248,
 42: 229,
 43: 237,
 44: 480,
 45: 366,
 46: 518,
 47: 111,
 48: 625,
 49: 338,
 50: 25,
 51: 1016,
 52: 154,
 53: 498,
 54: 209,
 55: 23,
 56: 382,
 57: 208,
 58: 328,
 59: 496,
 60: 132,
 61: 174,
 62: 118,
 63: 96,
 64: 151,
 65: 307,
 66: 648,
 67: 346,
 68: 514,
 69: 789,
 70: 317,
 71: 195,
 72: 200,
 73: 750,
 74: 1147,
 75: 379,
 76: 815,
 77: 479,
 78: 272,
 79: 955,
 80: 504,
 81: 466,
 82: 135,
 83: 117,
 84: 273,
 85: 231,
 86: 294,
 87: 137,
 88: 164,
 89: 298,
 90: 79,
 91: 455,
 92: 755,
 93: 673,
 94: 172,
 95: 216,
 96: 452,
 97: 61,
 98: 48,
 99: 483,
 100: 220,
 101: 

In [65]:
train_itemlist

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185

In [85]:
def split_train_test(user_data):
    neg_sample = 500
    train = []
    valid = []
    test = []
    
    train_ratio = 0.8
    valid_ratio = 0.9
#     test_ratio = 0.1
    
    userlist = list(user_data.keys())
    usernum = len(userlist)
        
    for i in range(usernum):
        user_i = userlist[i]
        
        itemlist_i = user_data[user_i]
        random.shuffle(itemlist_i)
        itemnum_i = len(itemlist_i)
        
        valid_threshold = int(itemnum_i*train_ratio)
        test_threshold = int(itemnum_i*valid_ratio)
        
        train_itemlist_i = itemlist_i[:valid_threshold]
        valid_itemlist_i = itemlist_i[valid_threshold:test_threshold]
        test_itemlist_i = itemlist_i[test_threshold:]
        
        train_negitemlist_i = set(train_itemlist_i)^set(train_itemlist)
        train_negitemlist_i = list(train_negitemlist_i)
        
        for item_j in train_itemlist_i:
            sampled_negitemlist_i = random.sample(train_negitemlist_i, k=neg_sample)
            train.append([user_i, item_j, sampled_negitemlist_i])
        
        for item_j in valid_itemlist_i:
            valid.append([user_i, item_j])
            
        for item_j in test_itemlist_i:
            test.append([user_i, item_j])
            
    print("train_num", len(train))
    print("valid num", len(valid))
    print("test num", len(test))
    
    return train, valid, test

In [86]:
train_data, valid_data, test_data = split_train_test(user_itemlist_dict)

train_num 41864
valid num 5224
test num 5676


In [87]:
train_df = pd.DataFrame(train_data)
train_df.columns=["userid", "pos_itemid", "neg_itemid"]
valid_df = pd.DataFrame(valid_data)
valid_df.columns=["userid", "itemid"]
test_df = pd.DataFrame(test_data)
test_df.columns=["userid", "itemid"]
print("train num", len(train_df))
print("valid num", len(valid_df))
print("test num", len(test_df))

train num 41864
valid num 5224
test num 5676


In [88]:
train_df.head()

Unnamed: 0,userid,pos_itemid,neg_itemid
0,1,108,"[524, 300, 689, 227, 135, 475, 623, 58, 337, 6..."
1,1,139,"[165, 308, 701, 46, 428, 26, 228, 736, 146, 35..."
2,1,55,"[348, 505, 390, 635, 158, 603, 802, 604, 682, ..."
3,1,421,"[483, 404, 63, 127, 649, 606, 42, 554, 588, 37..."
4,1,90,"[262, 807, 697, 670, 607, 395, 595, 202, 234, ..."


In [89]:
train_data_file = "train_data.pickle"
train_data_abs_file = os.path.join(data_dir, train_data_file)
train_df.to_pickle(train_data_abs_file)

valid_data_file = "valid_data.pickle"
valid_data_abs_file = os.path.join(data_dir, valid_data_file)
valid_df.to_pickle(valid_data_abs_file)

test_data_file = "test_data.pickle"
test_data_abs_file = os.path.join(data_dir, test_data_file)
test_df.to_pickle(test_data_abs_file)

In [90]:
userId_to_uid = {str(k):str(userId_to_uid[k]) for k in userId_to_uid}
movieId_to_iid = {str(k):str(movieId_to_iid[k]) for k in movieId_to_iid}

uid_to_userId = {str(k):str(uid_to_userId[k]) for k in uid_to_userId}
iid_to_movieId = {str(k):str(iid_to_movieId[k]) for k in iid_to_movieId}

vocab = {"userindex2uid": userId_to_uid, "uid2userindex":uid_to_userId, "itemindex2iid": movieId_to_iid, "iid2itemindex":iid_to_movieId}
vocab_file = "vocab.json"

with open(os.path.join(data_dir, vocab_file), "w", encoding="utf8") as f:
    json.dump(vocab, f)

In [72]:
data_dir

'./data/ml-100k'

In [78]:
# train_df.userid.unique()
print(max(train_df.pos_itemid.unique()), min(train_df.pos_itemid.unique()))

822 1


In [49]:
a = torch.randn((3, 4))

In [50]:
a

tensor([[ 0.2862, -1.0019,  0.0103,  0.0317],
        [ 0.7306,  0.9805,  1.1846, -1.0710],
        [-1.0851, -0.9494,  0.0846, -1.1893]])

In [51]:
b = torch.tensor([[1, 0, 0], [2, 1, 3], [1, 2, 3]])
a.scatter_(1, b, float("-inf"))

tensor([[   -inf,    -inf,  0.0103,  0.0317],
        [ 0.7306,    -inf,    -inf,    -inf],
        [-1.0851,    -inf,    -inf,    -inf]])

In [52]:
a

tensor([[   -inf,    -inf,  0.0103,  0.0317],
        [ 0.7306,    -inf,    -inf,    -inf],
        [-1.0851,    -inf,    -inf,    -inf]])

In [54]:
torch.topk(a, 1, -1)

torch.return_types.topk(
values=tensor([[ 0.0317],
        [ 0.7306],
        [-1.0851]]),
indices=tensor([[3],
        [0],
        [0]]))

In [55]:
a[:, 0] = float("-inf")

In [56]:
a

tensor([[  -inf,   -inf, 0.0103, 0.0317],
        [  -inf,   -inf,   -inf,   -inf],
        [  -inf,   -inf,   -inf,   -inf]])

In [58]:
list(a[0].numpy() == 0)

[False, False, False, False]

In [20]:
np.random.randint(10)

6