# Dependencies

In [2]:
import torch
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split

# Dataset

In [2]:
MASK_RATIO = 0.25

In [11]:
douban_dataset = pd.read_csv('douban/moviereviews_cleaned.csv',sep='\t',header=0,names=["user_id", "movie_id", "rating", "comment", "time", "labels", "useful_num", "CategoryID", "ID"],engine='python')

In [12]:
douban_dataset.head()

Unnamed: 0,user_id,movie_id,rating,comment,time,labels,useful_num,CategoryID,ID
0,2349,21292,3,明明嘴上说着不要，为什么身体这么诚实。英文无字幕。,2018-06-25,,,1,1
1,2349,34584,4,逃出死循环。,2018-06-23,,,1,2
2,2349,3,5,4.5星。奇幻，感动。,2018-06-21,,,1,3
3,2349,13510,3,这么帅，不科学。无字幕。,2018-06-20,,,0,4
4,2349,17317,4,青春的放纵，良心的抉择。拍摄美。那年女主20岁，9年后有了《西部世界》,2018-06-10,,,1,5


In [8]:
douban_dataset.shape

(1278402, 9)

In [13]:
douban_dataset.groupby('user_id').count()[["movie_id"]].sort_values(by="user_id")

Unnamed: 0_level_0,movie_id
user_id,Unnamed: 1_level_1
1,141
2,34
3,447
4,18
5,299
...,...
2714,385
2715,256
2716,318
2717,561


In [15]:
douban_dataset.groupby('movie_id').count()[["user_id"]].sort_values(by="movie_id")

Unnamed: 0_level_0,user_id
movie_id,Unnamed: 1_level_1
1,438
2,279
3,527
4,779
5,552
...,...
34889,1
34890,1
34891,1
34892,1


In [35]:
USERS_CNT = 2718



In [63]:
def generate_sparse_matrix(dataset):
    items_max_id = dataset.movie_id.max()

    items = np.zeros(shape = (items_max_id, USERS_CNT), dtype = np.float32)

    for index, row in dataset.iterrows():
        items[int(row['movie_id']-1), int(row['user_id'])-1] = row['rating']

    return items

In [64]:
ratings_sparse_matrix = generate_sparse_matrix(douban_dataset)

In [58]:
print(ratings_sparse_matrix.shape)
print(ratings_sparse_matrix[0].shape)
ratings_sparse_matrix[0]

(34893, 2718)
(2718,)


array([4., 0., 0., ..., 0., 0., 4.], dtype=float32)

In [59]:
train_ratings, test_ratings = train_test_split(ratings_sparse_matrix, test_size=0.1)

In [60]:
train_ratings.shape, test_ratings.shape

((31403, 2718), (3490, 2718))

In [61]:
from torch.utils.data import Dataset

class RatingsDataset(Dataset):
    
    def __init__(self, ratings):
        self.ratings = douban_dataset.copy()
        self.known_indices = self.get_known_indices()
        self.normalize()
        self.subtract_mean()
        
        self.masked_ratings, self.masked_indices = self.mask_ratings()
        
        
    def __len__(self):
        return len(self.ratings)
    
    def __getitem__(self, index):
        
        result = {
            'inp': torch.from_numpy(np.array(self.masked_ratings[index])).float(), 
            'out': torch.from_numpy(np.array(self.ratings[index])).float(),
            'known_indices': self.known_indices[index],
            'masked_indices': self.masked_indices[index]
        }
        
        return result
        
    
    def get_known_indices(self):
        known_indices = np.zeros(shape = self.ratings.shape)

        for index, rating in enumerate(self.ratings):
            print(rating)
            known = np.where(rating > 0)[0].tolist()
            known_indices[index][known] = 1
            
        return known_indices
        
    def mask_ratings(self):
    
        masked_ratings = self.ratings.copy()
        masked_indices = np.zeros(shape = self.ratings.shape)
                
        for index, rating in enumerate(masked_ratings):
            
            known = np.where(self.known_indices[index] == 1)[0].tolist()
            known_cnt = len(known)
            masked_features_cnt = int(MASK_RATIO * known_cnt)
            masked = random.sample(known, masked_features_cnt)
            rating[masked] = 0
                        
            masked_indices[index][masked] = 1
            
            
        return masked_ratings, masked_indices
    
#     def normalize(self):
        
#         for index, rating in enumerate(self.ratings):
#             known = np.where(self.known_indices[index] == 1)[0]
            
#             if len(known) > 0:
#                 min_rating = rating[known].min()
#                 max_rating = rating[known].max()
#                 average_rating = (max_rating + min_rating) / 2
#                 range_rating = (max_rating - min_rating) / 2
#                 print(range_rating)
#                 rating[known] -= average_rating
#                 rating[known] /= range_rating
    
    def normalize(self):
        
        for index, rating in enumerate(self.ratings):
            print(index)
            known = np.where(self.known_indices[index] == 1)[0]
            
            if len(known) > 0:
                rating[known] -= 3
                rating[known] /= 2
    
    def subtract_mean(self):
        
        for index, rating in enumerate(self.ratings):
            
            known = np.where(self.known_indices[index] == 1)[0]
            
            if len(known) > 0:
                mean = rating[known].mean()
                rating[known] -= mean


In [62]:
train_dataset = RatingsDataset(train_ratings)

user_id


TypeError: '>' not supported between instances of 'str' and 'int'

In [18]:
test_dataset = RatingsDataset(test_ratings)