In [126]:
import os
import time
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
import pandas as pd
import torch
import json

In [2]:
# path = os.getcwd()
# os.chdir(os.path.join("..", "..", "notebook_format"))

In [131]:
data_dir = "./data/ml-25m"

user_col = "userId"
item_col = "movieId"
value_col = "rating"
time_col = "timestamp"

rating_path = os.path.join(data_dir, "ratings.csv")
df_raw = pd.read_csv(rating_path)
print("dimension:", df_raw.shape)
df_raw.head()

dimension: (25000095, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [132]:
title_col = "title"
genre_col = "genres"

item_info_path = os.path.join(data_dir, "movies.csv")
df_item = pd.read_csv(item_info_path)
df_item = df_item[df_item[genre_col]!="(no genres listed)"]
print("dimension: ", df_item.shape)
df_item.head()

dimension:  (57361, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [133]:
class Item:
    
    def __init__(self, _id, title, genres, score=None):
        self.id = _id
        self.title = title
        self.score = score
        self.genres = genres
        
    def __repr__(self):
        return self.title
    
def create_item_mapping(df_item, item_col, title_col, genre_col):
    item_mapping = {}
    for row in df_item.itertuples():
        item_id = getattr(row, item_col)
        item_title = getattr(row, title_col)
        item_genre = getattr(row, genre_col)
        
        splitted = item_genre.split("|")
        genre_ratio = 1./len(splitted)
        
        item_genre = {genre: genre_ratio for genre in splitted}
        item = Item(item_id, item_title, item_genre)
        item_mapping[item_id] = item
        
    return item_mapping

item_mapping = create_item_mapping(df_item, item_col, title_col, genre_col)
print(item_mapping[1])

Toy Story (1995)


In [134]:
df_rating = df_raw[df_raw[value_col] >= 4.0].copy()
df_rating = df_rating.merge(df_item, on=item_col)

for col in (user_col, item_col):
    df_rating[col] = df_rating[col].astype('category')

userindex = df_rating.userId.tolist()
uid = df_rating[user_col].cat.codes
itemindex = df_rating.movieId.tolist()
iid = df_rating[item_col].cat.codes

print(len(userindex), len(uid), max(uid), min(uid))
print(len(itemindex), len(iid), max(iid), min(iid))

print("dimension", df_rating.shape)
print(df_rating.head())

12442311 12442311 162341 0
12442311 12442311 38714 0
dimension (12442311, 6)
  userId movieId  rating   timestamp                title  \
0      1     296     5.0  1147880044  Pulp Fiction (1994)   
1      3     296     5.0  1439474476  Pulp Fiction (1994)   
2      4     296     4.0  1573938898  Pulp Fiction (1994)   
3      5     296     4.0   830786155  Pulp Fiction (1994)   
4      7     296     4.0   835444730  Pulp Fiction (1994)   

                        genres  
0  Comedy|Crime|Drama|Thriller  
1  Comedy|Crime|Drama|Thriller  
2  Comedy|Crime|Drama|Thriller  
3  Comedy|Crime|Drama|Thriller  
4  Comedy|Crime|Drama|Thriller  


In [148]:
itemIndex2iid_dict["1"]

'0'

In [135]:
userIndex2uid_dict = {userindex_i:uid[i] for i, userindex_i in enumerate(userindex)}
itemIndex2iid_dict = {itemindex_i:iid[i] for i, itemindex_i in enumerate(itemindex)}
print("user num", len(userIndex2uid_dict), "item num", len(itemIndex2iid_dict))

user num 162342 item num 38715


In [136]:
df_rating["userId"] = df_rating.userId.apply(lambda x: userIndex2uid_dict[x])
df_rating["movieId"] = df_rating.movieId.apply(lambda x: itemIndex2iid_dict[x])

In [137]:
user_itemlist_dict = df_rating.groupby(["userId"]).movieId.apply(list)
user_itemlist_dict = dict(user_itemlist_dict)

In [138]:
def split_train_test(user_data):
    sample_threshold_per_user = 10
    train = []
    valid = []
    test = []
    
    train_ratio = 0.8
    valid_ratio = 0.9
#     test_ratio = 0.1
    
    userlist = list(user_data.keys())
    usernum = len(userlist)
        
    for i in range(usernum):
        user_i = userlist[i]
        
        itemlist_i = user_data[user_i]
        
        itemnum_i = len(itemlist_i)
        if itemnum_i < sample_threshold_per_user:
            continue
        
        valid_threshold = int(itemnum_i*train_ratio)
        test_threshold = int(itemnum_i*valid_ratio)
        
        train_itemlist_i = itemlist_i[:valid_threshold]
        valid_itemlist_i = itemlist_i[valid_threshold:test_threshold]
        test_itemlist_i = itemlist_i[test_threshold:]
        
#         print(user_i)
#         print(train_itemlist_i)
        
        for item_j in train_itemlist_i:
            train.append([user_i, item_j])
        
        for item_j in valid_itemlist_i:
            valid.append([user_i, item_j])
            
        for item_j in test_itemlist_i:
            test.append([user_i, item_j])
            
    print("train_num", len(train))
    print("valid num", len(valid))
    print("test num", len(test))
    
    return train, valid, test

In [139]:
train_data, valid_data, test_data = split_train_test(user_itemlist_dict)

train_num 9849079
valid num 1230185
test num 1309174


In [140]:
train_df = pd.DataFrame(train_data)
train_df.columns=["userid", "itemid"]
valid_df = pd.DataFrame(valid_data)
valid_df.columns=["userid", "itemid"]
test_df = pd.DataFrame(test_data)
test_df.columns=["userid", "itemid"]
print("train num", len(train_df))
print("valid num", len(valid_df))
print("test num", len(test_df))

train num 9849079
valid num 1230185
test num 1309174


In [141]:
train_data_file = "train_data.pickle"
train_data_abs_file = os.path.join(data_dir, train_data_file)
train_df.to_pickle(train_data_abs_file)

valid_data_file = "valid_data.pickle"
valid_data_abs_file = os.path.join(data_dir, valid_data_file)
valid_df.to_pickle(valid_data_abs_file)

test_data_file = "test_data.pickle"
test_data_abs_file = os.path.join(data_dir, test_data_file)
test_df.to_pickle(test_data_abs_file)

In [142]:
userIndex2uid_dict = {str(k):str(userIndex2uid_dict[k]) for k in userIndex2uid_dict}
itemIndex2iid_dict = {str(k):str(itemIndex2iid_dict[k]) for k in itemIndex2iid_dict}

vocab = {"userindex2uid": userIndex2uid_dict, "itemindex2iid": itemIndex2iid_dict}
vocab_file = "vocab.json"

with open(os.path.join(data_dir, vocab_file), "w", encoding="utf8") as f:
    json.dump(vocab, f)

In [143]:
user_num = train_df.userid.nunique()
item_num = train_df.itemid.nunique()
print("user num", user_num)
print("item num", item_num)

user num 154108
item num 17775


In [122]:
class MF(torch.nn.Module):
    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        self.item_factors = torch.nn.Embedding(n_items, n_factors)
    
    def forward(self, user, item):
        return (self.user_factors(user)*self.item_factors(item)).sum(1)
    
loss_func = torch.nn.CrossEntropyLoss()
model = MF(user_num, item_num)

In [123]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [None]:
epoch_num = 10
for epoch_index in range(epoch_num):
    


In [19]:
# def create_user_item_csr_matrix(data, user_col, item_col, value_col):
#     rows = data[user_col].cat.codes
#     cols = data[item_col].cat.codes
#     values = data[value_col].astype(np.float32)
    
#     return csr_matrix((values, (rows, cols)))

# user_item = create_user_item_csr_matrix(df_rating, user_col, item_col, value_col)
# print(user_item.shape)

(162342, 38715)


In [58]:
# df = pd.DataFrame({'vertebrates': ['Bird', 'Bird', 'Mammal', 'Fish',\
#                                      'Amphibian', 'Reptile', 'Mammal']})
# print(df.vertebrates.astype("category").cat.codes)
# print(df.vertebrates.astype("category").cat.categories)

In [59]:
# np.random.seed(1234)
# user_item_train, user_item_test = train_test_split(user_item, train_percentage=0.8)
# print(user_item_train.shape)