# Dependencies

In [1]:
import torch
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split

# Dataset

In [2]:
MASK_RATIO = 0.25

In [3]:
movies = pd.read_csv('ml-1m/movies.dat',sep='::',header=None,names=["MovieID", "Title", "Genres"],engine='python')
ratings = pd.read_csv('ml-1m/ratings.dat',sep='::',header=None,names=["UserID", "MovieID", "Rating", "Timestamp"],engine='python')
users = pd.read_csv('ml-1m/users.dat',sep='::',header=None,names=["UserID", "Gender", "Age", "Occupation", "Zip-code"],engine='python')

In [4]:
movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
users.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [6]:
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [7]:
movies.shape, ratings.shape, users.shape

((3883, 3), (1000209, 4), (6040, 5))

In [8]:
ratings.groupby('UserID').count()[["MovieID"]].sort_values(by="MovieID")

Unnamed: 0_level_0,MovieID
UserID,Unnamed: 1_level_1
947,20
4068,20
2530,20
341,20
5258,20
...,...
1181,1521
1941,1595
4277,1743
1680,1850


In [9]:
ratings.groupby('MovieID').count()[["UserID"]].sort_values(by="UserID")

Unnamed: 0_level_0,UserID
MovieID,Unnamed: 1_level_1
402,1
2214,1
3382,1
2217,1
2218,1
...,...
480,2672
1210,2883
1196,2990
260,2991


In [10]:
USERS_CNT = 6040



In [31]:
def generate_sparse_matrix(dataset):
    items_max_id = dataset.MovieID.max()
    print(items_max_id)

    items = np.zeros(shape = (items_max_id, USERS_CNT), dtype = np.float32)

    for index, row in dataset.iterrows():
        items[int(row['MovieID'] - 1), int(row['UserID'] - 1)] = row['Rating']

    return items

In [32]:
ratings_sparse_matrix = generate_sparse_matrix(ratings)

3952


In [33]:
print(ratings_sparse_matrix.shape)
print(ratings_sparse_matrix[0].shape)
ratings_sparse_matrix[0]

(3952, 6040)
(6040,)


array([5., 0., 0., ..., 0., 0., 3.], dtype=float32)

In [14]:
train_ratings, test_ratings = train_test_split(ratings_sparse_matrix, test_size=0.1)

In [15]:
train_ratings.shape, test_ratings.shape

((3556, 6040), (396, 6040))

In [34]:
from torch.utils.data import Dataset

class RatingsDataset(Dataset):
    
    def __init__(self, ratings):
        self.ratings = ratings.copy()
        self.known_indices = self.get_known_indices()
        self.normalize()
        self.subtract_mean()
        
        self.masked_ratings, self.masked_indices = self.mask_ratings()
        
        
    def __len__(self):
        return len(self.ratings)
    
    def __getitem__(self, index):
        
        result = {
            'inp': torch.from_numpy(np.array(self.masked_ratings[index])).float(), 
            'out': torch.from_numpy(np.array(self.ratings[index])).float(),
            'known_indices': self.known_indices[index],
            'masked_indices': self.masked_indices[index]
        }
        
        return result
        
    
    def get_known_indices(self):
        known_indices = np.zeros(shape = self.ratings.shape)

        for index, rating in enumerate(self.ratings):
            print(rating)
            known = np.where(rating > 0)[0].tolist()
            known_indices[index][known] = 1
            
        return known_indices
        
    def mask_ratings(self):
    
        masked_ratings = self.ratings.copy()
        masked_indices = np.zeros(shape = self.ratings.shape)
                
        for index, rating in enumerate(masked_ratings):
            
            known = np.where(self.known_indices[index] == 1)[0].tolist()
            known_cnt = len(known)
            masked_features_cnt = int(MASK_RATIO * known_cnt)
            masked = random.sample(known, masked_features_cnt)
            rating[masked] = 0
                        
            masked_indices[index][masked] = 1
            
            
        return masked_ratings, masked_indices
    
#     def normalize(self):
        
#         for index, rating in enumerate(self.ratings):
#             known = np.where(self.known_indices[index] == 1)[0]
            
#             if len(known) > 0:
#                 min_rating = rating[known].min()
#                 max_rating = rating[known].max()
#                 average_rating = (max_rating + min_rating) / 2
#                 range_rating = (max_rating - min_rating) / 2
#                 print(range_rating)
#                 rating[known] -= average_rating
#                 rating[known] /= range_rating
    
    def normalize(self):
        
        for index, rating in enumerate(self.ratings):
            known = np.where(self.known_indices[index] == 1)[0]
            
            if len(known) > 0:
                rating[known] -= 3
                rating[known] /= 2
    
    def subtract_mean(self):
        
        for index, rating in enumerate(self.ratings):
            
            known = np.where(self.known_indices[index] == 1)[0]
            
            if len(known) > 0:
                mean = rating[known].mean()
                rating[known] -= mean


In [35]:
train_dataset = RatingsDataset(train_ratings)

[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 5.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 4. 4. 0.]
[0. 0. 0. ... 0. 0. 5.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 4.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0.

[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 4.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 5.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 1.]
[0. 0. 0. ... 0. 0. 5.]
[0. 3. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 5.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[5. 0. 5. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 4.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0.

[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 4.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 4. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 4.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 5. 5.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 4. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 3.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0.

[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 5.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 2.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 4.]
[0. 0. 0. ... 0.

[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 5. 0. ... 0. 4. 4.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 4.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 2. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 4. 5.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 4.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0.

In [18]:
test_dataset = RatingsDataset(test_ratings)

# Network

In [22]:
!pip3 install torch-sparse



In [28]:
import torch.nn as nn
import sparselinear as sl


inputSize = 6040
class Denoising_Model(nn.Module):
    def __init__(self):
        super(Denoising_Model,self).__init__()
        self.encoder=nn.Sequential(
                      sl.SparseLinear(inputSize, 500),  # There are 6040 users in movieLens-1M
                      nn.Tanh()
                      )

        self.decoder=nn.Sequential(
                      nn.Linear(500, inputSize),
                      nn.Tanh()
                      )


    def forward(self,x):
        x=self.encoder(x)
        x=self.decoder(x)
        return x
    
network = Denoising_Model()
print(network)

AttributeError: 'NoneType' object has no attribute 'origin'

In [29]:
print(len(train_dataset))

3556


In [None]:
ALPHA = 1
BETA = 0.5

# def denoising_loss(output, target, known, masked):
    
#     loss = 0
    
#     for index, out in enumerate(output):
#         out = output[index]
        
#         known_indices = np.where(known[index] == 1)[0]
#         masked_indices = np.where(masked[index] == 1)[0]
#         known_masked_diff = list(set(known_indices) - set(masked_indices))        
        
#         masked_output = output[index][masked_indices] 
#         masked_target = target[index][masked_indices] 
#         known_masked_diff_output = output[index][known_masked_diff] 
#         known_masked_diff_target = target[index][known_masked_diff]
        
        
#         if len(masked_output) > 0 and len(known_masked_diff_output) > 0:
#             loss += ALPHA*torch.sum((masked_output - masked_target)**2) \
#                     + BETA*torch.sum((known_masked_diff_output - known_masked_diff_target)**2)
            
# #             print(ALPHA*torch.mean((masked_output - masked_target)**2) \
# #                     + BETA*torch.mean((known_masked_diff_output - known_masked_diff_target)**2))
            
# #             print(loss)
            
#     return loss / BATCH_SIZE

In [None]:
from torch.autograd import Variable

class DenoisingLoss(nn.Module):
    
    def __init__(self, alpha=1, beta=0.5):
      super().__init__()
      self.alpha = alpha
      self.beta = beta
    
    def forward(self, output, target, known, masked):

        loss = 0

        for index, out in enumerate(output):
            out = output[index]

            known_indices = np.where(known[index] == 1)[0]
            masked_indices = np.where(masked[index] == 1)[0]
            known_masked_diff = list(set(known_indices) - set(masked_indices))        

            masked_output = Variable(output[index][masked_indices], requires_grad=True)
            masked_target = Variable(target[index][masked_indices] , requires_grad=True)
            known_masked_diff_output = Variable(output[index][known_masked_diff], requires_grad=True)
            known_masked_diff_target = Variable(target[index][known_masked_diff], requires_grad=True)

            #print(masked_output)
           # print(masked_target)
            if len(masked_output) > 0 and len(known_masked_diff_output) > 0:
                loss += ALPHA*torch.sum((masked_output - masked_target)**2) \
                        + BETA*torch.sum((known_masked_diff_output - known_masked_diff_target)**2)

        return loss / BATCH_SIZE


In [None]:
def train(model, inp, target, optimizer, known, masked, criterion):   
   # masked_ratings,known_ratings=masked_ratings.to(device),known_ratings.to(device)

    # Forward Pass
    output = model(inp)
    loss = criterion.forward(output, target, known, masked)
    #Backward Pass---------------------
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss, output


In [None]:
from torch.optim import Adam
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch import optim
from torch.utils.data import DataLoader

if torch.cuda.is_available()==True:
    device="cuda:0"
else:
    device ="cpu"
    
#criterion = nn.MSELoss()
criterion=DenoisingLoss(ALPHA, BETA)
EPOCHS = 20
BATCH_SIZE = 28

train_loader = DataLoader(dataset = train_dataset, batch_size = BATCH_SIZE, shuffle = True)
model=Denoising_Model().to(device)
optimizer=optim.SGD(model.parameters(),lr=0.07,weight_decay=0.05)
# optimizer = Adam(network.parameters(), lr = 0.007)

epoch_loss=0
for epoch in range(EPOCHS):       
    for bidx, batch in enumerate(train_loader):
       
        x_train = batch['inp']
        y_train = batch['out']
        known = batch['known_indices']
        masked = batch['masked_indices']
        loss, predictions = train(network, x_train, y_train, optimizer, known, masked, criterion)
        epoch_loss += loss
        
    print('Epoch {} Loss : {}'.format((epoch+1),loss))



In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt


def test():
    criterion=nn.MSELoss()
    noRatings = 0
    input, target, minibatch = {}, {}, {}
    # TODO: Replace 'out'/'inp' 
    y_true = [test_dataset[i]['out'] for i in range(len(test_dataset))]
    y_pred = []

    test_loader = DataLoader(dataset = test_dataset, batch_size = BATCH_SIZE, shuffle = False)
    for bidx, batch in enumerate(test_loader):
            x_train = batch['inp']
            y_pred.append(model(x_train))

    y_pred = [x.item() for i in range(len(y_pred)) for x in y_pred[i] ]

    rmse = sqrt(mean_squared_error(y_true, y_pred))
    print("RMSE: ", rmse)

test()