# Dependencies

In [1]:
import torch
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split

# Dataset

In [2]:
MASK_RATIO = 0.25

In [3]:
douban_dataset = pd.read_csv('douban/moviereviews_cleaned.csv',sep='\t',header=0,names=["user_id", "movie_id", "rating", "comment", "time", "labels", "useful_num", "CategoryID", "ID"],engine='python')

In [4]:
douban_dataset.head()

Unnamed: 0,user_id,movie_id,rating,comment,time,labels,useful_num,CategoryID,ID
0,2349,21292,3,明明嘴上说着不要，为什么身体这么诚实。英文无字幕。,2018-06-25,,,1,1
1,2349,34584,4,逃出死循环。,2018-06-23,,,1,2
2,2349,3,5,4.5星。奇幻，感动。,2018-06-21,,,1,3
3,2349,13510,3,这么帅，不科学。无字幕。,2018-06-20,,,0,4
4,2349,17317,4,青春的放纵，良心的抉择。拍摄美。那年女主20岁，9年后有了《西部世界》,2018-06-10,,,1,5


In [5]:
douban_dataset.groupby('user_id').count()[["movie_id"]].sort_values(by="user_id")

Unnamed: 0_level_0,movie_id
user_id,Unnamed: 1_level_1
1,141
2,34
3,447
4,18
5,299
...,...
2714,385
2715,256
2716,318
2717,561


In [6]:
douban_dataset.shape

(1278401, 9)

In [7]:
douban_dataset.head()

Unnamed: 0,user_id,movie_id,rating,comment,time,labels,useful_num,CategoryID,ID
0,2349,21292,3,明明嘴上说着不要，为什么身体这么诚实。英文无字幕。,2018-06-25,,,1,1
1,2349,34584,4,逃出死循环。,2018-06-23,,,1,2
2,2349,3,5,4.5星。奇幻，感动。,2018-06-21,,,1,3
3,2349,13510,3,这么帅，不科学。无字幕。,2018-06-20,,,0,4
4,2349,17317,4,青春的放纵，良心的抉择。拍摄美。那年女主20岁，9年后有了《西部世界》,2018-06-10,,,1,5


In [8]:
train_ratings, test_ratings = train_test_split(douban_dataset, test_size=0.1)

In [9]:
print("Max UserID: ", max(douban_dataset['user_id']))
print("UserIDs count: ", len(set(douban_dataset['user_id'])))

Max UserID:  2718
UserIDs count:  2712


In [10]:
def generate_sparse_matrix(dataset):
    items_max_id = dataset.movie_id.max()
    users_max_id = dataset.user_id.max()


    items = np.zeros(shape = (items_max_id, users_max_id), dtype = np.float32)

    for index, row in dataset.iterrows():
        items[int(row['movie_id'] - 1), int(row['user_id'] - 1)] = row['rating']

    return items

In [11]:
train_sparse_matrix = generate_sparse_matrix(train_ratings)


In [12]:
test_sparse_matrix = generate_sparse_matrix(test_ratings)
print(test_sparse_matrix.shape)

(34885, 2718)


In [13]:
print("Train Sparse Matrix: ", train_sparse_matrix.shape)
print("Test Sparse Matrix: ", test_sparse_matrix.shape)

Train Sparse Matrix:  (34893, 2718)
Test Sparse Matrix:  (34885, 2718)


In [14]:
train_ratings.shape, test_ratings.shape

((1150560, 9), (127841, 9))

In [15]:
from torch.utils.data import Dataset

class RatingsDataset(Dataset):
    
    def __init__(self, ratings):
        self.ratings = ratings.copy()
        self.known_indices = self.get_known_indices()
        self.normalize()
        self.subtract_mean()
        
        self.masked_ratings, self.masked_indices = self.mask_ratings()
        
        
    def __len__(self):
        return len(self.ratings)
    
    def __getitem__(self, index):
        
        result = {
            'inp': torch.from_numpy(np.array(self.masked_ratings[index])).float(), 
            'out': torch.from_numpy(np.array(self.ratings[index])).float(),
            'known_indices': self.known_indices[index],
            'masked_indices': self.masked_indices[index]
        }
        
        return result
        
    
    def get_known_indices(self):
        known_indices = np.zeros(shape = self.ratings.shape)

        for index, rating in enumerate(self.ratings):
            known = np.where(rating > 0)[0].tolist()
            known_indices[index][known] = 1
            
        return known_indices
        
    def mask_ratings(self):
    
        masked_ratings = self.ratings.copy()
        masked_indices = np.zeros(shape = self.ratings.shape)
                
        for index, rating in enumerate(masked_ratings):
            
            known = np.where(self.known_indices[index] == 1)[0].tolist()
            known_cnt = len(known)
            masked_features_cnt = int(MASK_RATIO * known_cnt)
            masked = random.sample(known, masked_features_cnt)
            rating[masked] = 0
                        
            masked_indices[index][masked] = 1
            
            
        return masked_ratings, masked_indices
    
    def normalize(self):
        
        for index, rating in enumerate(self.ratings):
            known = np.where(self.known_indices[index] == 1)[0]
            
            if len(known) > 0:
                rating[known] -= 3
                rating[known] /= 2
    
    def subtract_mean(self):
        
        for index, rating in enumerate(self.ratings):
            
            known = np.where(self.known_indices[index] == 1)[0]
            
            if len(known) > 0:
                mean = rating[known].mean()
                rating[known] -= mean

In [16]:
train_dataset = RatingsDataset(train_sparse_matrix)
print("Input: ", train_dataset[0]['inp'], "Input Length: ", len(train_dataset[0]['inp']))
print("Output: ", train_dataset[0]['out'], "Output Length: ", len(train_dataset[5]['out']))

test_dataset = RatingsDataset(test_sparse_matrix)
print("Input: ", test_dataset[0]['inp'], "Input Length: ", len(test_dataset[5]['inp']))
print("Output: ", test_dataset[0]['out'], "Output Length: ", len(test_dataset[5]['out']))

Input:  tensor([0.3957, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.3957]) Input Length:  2718
Output:  tensor([0.3957, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.3957]) Output Length:  2718
Input:  tensor([0., 0., 0.,  ..., 0., 0., 0.]) Input Length:  2718
Output:  tensor([0., 0., 0.,  ..., 0., 0., 0.]) Output Length:  2718


In [17]:
print(train_dataset[2])

{'inp': tensor([0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.3399]), 'out': tensor([0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.3399]), 'known_indices': array([0., 0., 0., ..., 0., 0., 1.]), 'masked_indices': array([0., 0., 0., ..., 0., 0., 0.])}


# Network

In [18]:
ALPHA = 1
BETA = 0.5

In [19]:
import torch.nn as nn


inputSize = douban_dataset.user_id.max()
class Denoising_Model(nn.Module):
    def __init__(self):
        super(Denoising_Model,self).__init__()
        self.encoder=nn.Sequential(
                      nn.Linear(inputSize, 770),  # There are 6040 users in movieLens-1M
                      nn.Tanh()
                      )

        self.decoder=nn.Sequential(
                      nn.Linear(770, inputSize),
                      nn.Tanh()
                      )


    def forward(self,x):
        x=self.encoder(x)
        x=self.decoder(x)
        return x
    
network = Denoising_Model()
print(network)

Denoising_Model(
  (encoder): Sequential(
    (0): Linear(in_features=2718, out_features=770, bias=True)
    (1): Tanh()
  )
  (decoder): Sequential(
    (0): Linear(in_features=770, out_features=2718, bias=True)
    (1): Tanh()
  )
)


In [20]:
def denosingLoss(output, target, known, masked):

        loss = 0

        for index, out in enumerate(output):
            out = output[index]

            known_indices = np.where(known[index] == 1)[0]
            masked_indices = np.where(masked[index] == 1)[0]
            known_masked_diff = list(set(known_indices) - set(masked_indices))        

            masked_output = output[index][masked_indices]
            masked_target = target[index][masked_indices]
            known_masked_diff_output = output[index][known_masked_diff]
            known_masked_diff_target = target[index][known_masked_diff]

            if len(masked_output) > 0 and len(known_masked_diff_output) > 0:
                loss += ALPHA * torch.sum(torch.square(torch.sub(masked_output, masked_target))) \
                        + BETA * torch.sum(torch.square(torch.sub(known_masked_diff_output, known_masked_diff_target)))

        return loss / BATCH_SIZE

In [21]:
print(len(train_dataset))

34893


In [22]:
def init_weights(m):
    if type(m) == nn.Linear:
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0)

In [23]:
def train(model, masked_ratings, target, optimizer, criterion, known, masked):   
#     masked_ratings,target=masked_ratings.to(device),target.to(device)
    
    # Forward Pass
    output = model(masked_ratings)
    loss = denosingLoss(output, target, known, masked)
        
    #Backward Pass---------------------
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
   # scheduler.step()

    return loss, output


In [26]:
from torch.optim import Adam
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch import optim
from torch.utils.data import DataLoader

# if torch.cuda.is_available() == True:
#     device="cuda:0"
# else:
device ="cpu"

EPOCHS = 20
BATCH_SIZE = 35

train_loader = DataLoader(dataset = train_dataset, batch_size = BATCH_SIZE, shuffle = True)
model = Denoising_Model().to(device)
init_weights(model)
criterion = nn.MSELoss()
# optimizer=Adam(model.parameters(),lr=0.07,weight_decay=0.05)
optimizer = optim.SGD(model.parameters(), lr = 0.07, weight_decay = 0.05)
# exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size = 1, gamma = 0.3)

epoch_loss = 0
epoch_loss = []

for epoch in range(EPOCHS): 
    acc_epoch_loss = 0
    
    for bidx, batch in enumerate(train_loader):
        
        x_train = batch['inp']
        y_train = batch['out']
        known = batch['known_indices']
        masked = batch['masked_indices']
        
        loss, predictions = train(model, x_train, y_train, optimizer, criterion, known, masked)
        acc_epoch_loss += loss
    
    epoch_loss.append(acc_epoch_loss / len(train_dataset))        
    print('Epoch {} Loss : {}'.format((epoch+1), epoch_loss[epoch] ))

Epoch 1 Loss : 0.38296809792518616
Epoch 2 Loss : 0.4057289659976959
Epoch 3 Loss : 0.4151080846786499
Epoch 4 Loss : 0.4117725193500519


KeyboardInterrupt: 

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

from torch.nn import MSELoss


def test():
    noRatings = 0
    
    # True values
    y_true = []    
    # Predicted values
    y_pred = []
    
    test_loader = DataLoader(dataset = test_dataset, batch_size = 12, shuffle = False)
    
            
    for bidx, batch in enumerate(test_loader):
            x_train = batch['inp']
            for i, tensor in enumerate(batch['known_indices']):
                if(torch.count_nonzero(batch['known_indices'][i]) > 0):
                    noRatings+=1
            y_true.append(x_train)
            y_pred.append(model(x_train))
            acc_rmse = 0
            
            for idx, tensor in enumerate (y_pred[bidx]):
                mse = mean_squared_error(y_true[bidx][idx].detach().numpy(), tensor.detach().numpy()) / noRatings
                rmse = sqrt(mse)*2
                acc_rmse+=rmse
            
            if(bidx % 10 == 0):
                print("Batch: ", bidx+1, " Current RMSE: ", acc_rmse)    

    
test()