# Dependencies

In [1]:
import torch
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split

# Dataset

In [3]:
MASK_RATIO = 0.25

In [5]:
movies = pd.read_csv('ml-1m/movies.dat',sep='::',header=None,names=["MovieID", "Title", "Genres"],engine='python')
ratings = pd.read_csv('ml-1m/ratings.dat',sep='::',header=None,names=["UserID", "MovieID", "Rating", "Timestamp"],engine='python')
users = pd.read_csv('ml-1m/users.dat',sep='::',header=None,names=["UserID", "Gender", "Age", "Occupation", "Zip-code"],engine='python')

In [6]:
movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
users.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [6]:
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [7]:
movies.shape, ratings.shape, users.shape

((3883, 3), (1000209, 4), (6040, 5))

In [8]:
ratings.groupby('UserID').count()[["MovieID"]].sort_values(by="MovieID")

Unnamed: 0_level_0,MovieID
UserID,Unnamed: 1_level_1
947,20
4068,20
2530,20
341,20
5258,20
...,...
1181,1521
1941,1595
4277,1743
1680,1850


In [9]:
ratings.groupby('MovieID').count()[["UserID"]].sort_values(by="UserID")

Unnamed: 0_level_0,UserID
MovieID,Unnamed: 1_level_1
402,1
2214,1
3382,1
2217,1
2218,1
...,...
480,2672
1210,2883
1196,2990
260,2991


In [11]:
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [10]:
USERS_CNT = 6040

In [7]:
train_ratings, test_ratings = train_test_split(ratings, test_size=0.1)

In [8]:
def generate_sparse_matrix(dataset):
    items_max_id = dataset.MovieID.max()

    items = np.zeros(shape = (items_max_id, USERS_CNT), dtype = np.float32)

    for index, row in dataset.iterrows():
        items[int(row['MovieID'] - 1), int(row['UserID'] - 1)] = row['Rating']

    return items

In [11]:
train_sparse_matrix = generate_sparse_matrix(train_ratings)


In [12]:
test_sparse_matrix = generate_sparse_matrix(test_ratings)
print(test_sparse_matrix.shape)

(3952, 6040)


In [13]:
print("Train Sparse Matrix: ", train_sparse_matrix.shape)
print("Test Sparse Matrix: ", test_sparse_matrix.shape)

Train Sparse Matrix:  (3952, 6040)
Test Sparse Matrix:  (3952, 6040)


In [14]:
train_ratings.shape, test_ratings.shape

((900188, 4), (100021, 4))

In [15]:
from torch.utils.data import Dataset

class RatingsDataset(Dataset):
    
    def __init__(self, ratings):
        self.ratings = ratings.copy()
        self.known_indices = self.get_known_indices()
        self.normalize()
        self.subtract_mean()
        
        self.masked_ratings, self.masked_indices = self.mask_ratings()
        
        
    def __len__(self):
        return len(self.ratings)
    
    def __getitem__(self, index):
        
        result = {
            'inp': torch.from_numpy(np.array(self.masked_ratings[index])).float(), 
            'out': torch.from_numpy(np.array(self.ratings[index])).float(),
            'known_indices': self.known_indices[index],
            'masked_indices': self.masked_indices[index]
        }
        
        return result
        
    
    def get_known_indices(self):
        known_indices = np.zeros(shape = self.ratings.shape)

        for index, rating in enumerate(self.ratings):
            known = np.where(rating > 0)[0].tolist()
            known_indices[index][known] = 1
            
        return known_indices
        
    def mask_ratings(self):
    
        masked_ratings = self.ratings.copy()
        masked_indices = np.zeros(shape = self.ratings.shape)
                
        for index, rating in enumerate(masked_ratings):
            
            known = np.where(self.known_indices[index] == 1)[0].tolist()
            known_cnt = len(known)
            masked_features_cnt = int(MASK_RATIO * known_cnt)
            masked = random.sample(known, masked_features_cnt)
            rating[masked] = 0
                        
            masked_indices[index][masked] = 1
            
            
        return masked_ratings, masked_indices
    
    def normalize(self):
        
        for index, rating in enumerate(self.ratings):
            known = np.where(self.known_indices[index] == 1)[0]
            
            if len(known) > 0:
                rating[known] -= 3
                rating[known] /= 2
    
    def subtract_mean(self):
        
        for index, rating in enumerate(self.ratings):
            
            known = np.where(self.known_indices[index] == 1)[0]
            
            if len(known) > 0:
                mean = rating[known].mean()
                rating[known] -= mean

In [16]:
train_dataset = RatingsDataset(train_sparse_matrix)
print("Input: ", train_dataset[0]['inp'], "Input Length: ", len(train_dataset[0]['inp']))
print("Output: ", train_dataset[0]['out'], "Output Length: ", len(train_dataset[5]['out']))

test_dataset = RatingsDataset(test_sparse_matrix)
print("Input: ", test_dataset[0]['inp'], "Input Length: ", len(test_dataset[5]['inp']))
print("Output: ", test_dataset[0]['out'], "Output Length: ", len(test_dataset[5]['out']))

Input:  tensor([ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000, -0.5718]) Input Length:  6040
Output:  tensor([ 0.4282,  0.0000,  0.0000,  ...,  0.0000,  0.0000, -0.5718]) Output Length:  6040
Input:  tensor([0., 0., 0.,  ..., 0., 0., 0.]) Input Length:  6040
Output:  tensor([0., 0., 0.,  ..., 0., 0., 0.]) Output Length:  6040


In [17]:
print(train_dataset[2])

{'inp': tensor([0., 0., 0.,  ..., 0., 0., 0.]), 'out': tensor([0., 0., 0.,  ..., 0., 0., 0.]), 'known_indices': array([0., 0., 0., ..., 0., 0., 0.]), 'masked_indices': array([0., 0., 0., ..., 0., 0., 0.])}


# Network

In [18]:
ALPHA = 1
BETA = 0.5

In [19]:
import torch.nn as nn


inputSize = 6040
class Denoising_Model(nn.Module):
    def __init__(self):
        super(Denoising_Model,self).__init__()
        self.encoder=nn.Sequential(
                      nn.Linear(inputSize, 770),  # There are 6040 users in movieLens-1M
                      nn.Tanh()
                      )

        self.decoder=nn.Sequential(
                      nn.Linear(770, inputSize),
                      nn.Tanh()
                      )


    def forward(self,x):
        x=self.encoder(x)
        x=self.decoder(x)
        return x
    
network = Denoising_Model()
print(network)

Denoising_Model(
  (encoder): Sequential(
    (0): Linear(in_features=6040, out_features=770, bias=True)
    (1): Tanh()
  )
  (decoder): Sequential(
    (0): Linear(in_features=770, out_features=6040, bias=True)
    (1): Tanh()
  )
)


In [20]:
def denosingLoss(output, target, known, masked):

        loss = 0

        for index, out in enumerate(output):
            out = output[index]

            known_indices = np.where(known[index] == 1)[0]
            masked_indices = np.where(masked[index] == 1)[0]
            known_masked_diff = list(set(known_indices) - set(masked_indices))        

            masked_output = output[index][masked_indices]
            masked_target = target[index][masked_indices]
            known_masked_diff_output = output[index][known_masked_diff]
            known_masked_diff_target = target[index][known_masked_diff]

            if len(masked_output) > 0 and len(known_masked_diff_output) > 0:
                loss += ALPHA * torch.sum(torch.square(torch.sub(masked_output, masked_target))) \
                        + BETA * torch.sum(torch.square(torch.sub(known_masked_diff_output, known_masked_diff_target)))

        return loss / BATCH_SIZE

In [21]:
print(len(train_dataset))

3952


In [22]:
def init_weights(m):
    if type(m) == nn.Linear:
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0)

In [23]:
def train(model, masked_ratings, target, optimizer, criterion, known, masked):   
#     masked_ratings,target=masked_ratings.to(device),target.to(device)
    
    # Forward Pass
    output = model(masked_ratings)
    loss = denosingLoss(output, target, known, masked)
        
    #Backward Pass---------------------
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
   # scheduler.step()

    return loss, output


In [24]:
from torch.optim import Adam
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch import optim
from torch.utils.data import DataLoader

# if torch.cuda.is_available() == True:
#     device="cuda:0"
# else:
device ="cpu"

EPOCHS = 20
BATCH_SIZE = 35

train_loader = DataLoader(dataset = train_dataset, batch_size = BATCH_SIZE, shuffle = True)
model = Denoising_Model().to(device)
init_weights(model)
criterion = nn.MSELoss()
# optimizer=Adam(model.parameters(),lr=0.07,weight_decay=0.05)
optimizer = optim.SGD(model.parameters(), lr = 0.07, weight_decay = 0.05)
#exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size = 1, gamma = 0.3)

epoch_loss = 0
epoch_loss = []

for epoch in range(EPOCHS): 
    acc_epoch_loss = 0
    
    for bidx, batch in enumerate(train_loader):
        
        x_train = batch['inp']
        y_train = batch['out']
        known = batch['known_indices']
        masked = batch['masked_indices']
        
        loss, predictions = train(model, x_train, y_train, optimizer, criterion, known, masked)
        acc_epoch_loss += loss
    
    epoch_loss.append(acc_epoch_loss / len(train_dataset))        
    print('Epoch {} Loss : {}'.format((epoch+1), epoch_loss[epoch] ))
    


Epoch 1 Loss : 0.8248584866523743
Epoch 2 Loss : 0.6867635846138
Epoch 3 Loss : 0.6472795009613037
Epoch 4 Loss : 0.6294000148773193
Epoch 5 Loss : 0.6098825335502625
Epoch 6 Loss : 0.5980748534202576
Epoch 7 Loss : 0.5899688601493835
Epoch 8 Loss : 0.5837532877922058
Epoch 9 Loss : 0.5784732103347778
Epoch 10 Loss : 0.5743650197982788
Epoch 11 Loss : 0.5706970691680908
Epoch 12 Loss : 0.5676262378692627
Epoch 13 Loss : 0.5658714771270752
Epoch 14 Loss : 0.5627363920211792
Epoch 15 Loss : 0.5616626143455505
Epoch 16 Loss : 0.5594909191131592
Epoch 17 Loss : 0.5623922348022461
Epoch 18 Loss : 0.5585438013076782
Epoch 19 Loss : 0.5576106309890747
Epoch 20 Loss : 0.5572994947433472


In [33]:
test_batch = []
train_batch = []
known_batch = []
all_ratings_cnt = 0
batch_size = 12
minibatches = []

for idx, test in enumerate(test_dataset):
    known_ratings_cnt = len(np.where(test['known_indices'] == 1)[0])
    
    if known_ratings_cnt > 0:
        
        
        all_ratings_cnt += known_ratings_cnt
        train_batch.append(train_dataset[idx]['inp'])
        known_batch.append(torch.tensor(test['known_indices']))
        test_batch.append(test['inp'])
        
    if len(test_batch) >= batch_size:
        minibatches.append((torch.stack(train_batch), torch.stack(test_batch), torch.stack(known_batch)))
        test_batch.clear()
        train_batch.clear()
        known_batch.clear()
        
print(minibatches[0])
       

(tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000, -0.5718],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]), tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]), tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64))


In [32]:
from sklearn.metrics import mean_squared_error
from math import sqrt

from torch.nn import MSELoss


def test():
    criterion = nn.MSELoss()
    noRatings = 0
    input, target, minibatch = {}, {}, {}

    
    mse = 0        
    with torch.no_grad():
        for bidx, batch in enumerate(minibatches):
                train_batch = batch[0]
                test_batch = batch[1]
                known_batch = batch[2]
                y_predictions = model(train_batch)

                for idx, test in enumerate(test_batch):
                    y_true = test[np.where(known_batch[idx].numpy() == 1)[0]]
                    y_pred = y_predictions[idx][np.where(known_batch[idx].numpy() == 1)[0]]
                    
                   # print(torch.sum((y_pred - y_true)**2))
                    #print(torch.sum(y_pred - y_true)**2)
                    mse += torch.sum((y_pred - y_true)**2)

    print(all_ratings_cnt)  
    mse = mse / all_ratings_cnt
    rmse = sqrt(mse)
    print(rmse  * 2)
    
test()


100021
0.7592135976054041
