# Dependencies

In [32]:
import torch
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split

# Dataset

In [33]:
MASK_RATIO = 0.25

In [34]:
movies = pd.read_csv('ml-1m/movies.dat',sep='::',header=None,names=["MovieID", "Title", "Genres"],engine='python')
ratings = pd.read_csv('ml-1m/ratings.dat',sep='::',header=None,names=["UserID", "MovieID", "Rating", "Timestamp"],engine='python')
users = pd.read_csv('ml-1m/users.dat',sep='::',header=None,names=["UserID", "Gender", "Age", "Occupation", "Zip-code"],engine='python')

In [35]:
movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [36]:
users.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [37]:
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [38]:
movies.shape, ratings.shape, users.shape

((3883, 3), (1000209, 4), (6040, 5))

In [39]:
ratings.groupby('UserID').count()[["MovieID"]].sort_values(by="MovieID")

Unnamed: 0_level_0,MovieID
UserID,Unnamed: 1_level_1
947,20
4068,20
2530,20
341,20
5258,20
...,...
1181,1521
1941,1595
4277,1743
1680,1850


In [40]:
ratings.groupby('MovieID').count()[["UserID"]].sort_values(by="UserID")

Unnamed: 0_level_0,UserID
MovieID,Unnamed: 1_level_1
402,1
2214,1
3382,1
2217,1
2218,1
...,...
480,2672
1210,2883
1196,2990
260,2991


In [41]:
USERS_CNT = 6040

In [42]:
## normalize the rating between [-1, 1]
ratings.Rating =(ratings.Rating-3)/2

In [43]:
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,1.0,978300760
1,1,661,0.0,978302109
2,1,914,0.0,978301968
3,1,3408,0.5,978300275
4,1,2355,1.0,978824291


In [44]:
def generate_sparse_matrix(dataset):
    items_max_id = dataset.MovieID.max()

    items = np.zeros(shape = (items_max_id, USERS_CNT), dtype = np.float32)

    for index, row in dataset.iterrows():
        items[int(row['MovieID'] - 1), int(row['UserID'] - 1)] = row['Rating']

    return items

In [45]:
ratings_sparse_matrix = generate_sparse_matrix(ratings)

In [46]:
print(ratings_sparse_matrix.shape)
print(ratings_sparse_matrix[0].shape)
ratings_sparse_matrix[0]

(3952, 6040)
(6040,)


array([1., 0., 0., ..., 0., 0., 0.], dtype=float32)

In [47]:
train_ratings, test_ratings = train_test_split(ratings_sparse_matrix, test_size=0.1)

In [48]:
train_ratings.shape, test_ratings.shape

((3556, 6040), (396, 6040))

In [49]:
from torch.utils.data import Dataset


class RatingsDataset(Dataset):
    
    def __init__(self, ratings):
        self.ratings = ratings
        self.known_indices = self.get_known_indeces()
        self.masked_ratings, self.masked_indices = self.mask_ratings()
        
        
    def __len__(self):
        return len(self.ratings)
    
    def __getitem__(self, index):
        
        result = {
            'inp':  torch.from_numpy(np.array(self.masked_ratings[index])).float(), 
            'out': torch.from_numpy(np.array(self.ratings[index])).float()
#             'known_indices': self.known_indices[index],
#             'masked_indices': self.masked_indices[index]
        }
        
        return result
        
    
    def get_known_indeces(self):
        known_indices = {}
        for index, rating in enumerate(self.ratings):
            known_indices[index] =  np.where(rating > 0)[0].tolist()
            
        return known_indices
    
   
        
    def mask_ratings(self):
    
        masked_ratings = self.ratings.copy()
        masked_indices = {}
                
        for index, rating in enumerate(masked_ratings):
            
            known = self.known_indices[index]
            known_cnt = len(known)
                        
            masked_features_cnt = int(MASK_RATIO * known_cnt)
            masked = random.sample(known, masked_features_cnt)
            
            rating[masked] = 0
                        
            masked_indices[index] = masked
            
            
        return masked_ratings, masked_indices

In [50]:
train_dataset = RatingsDataset(train_ratings)

In [51]:
test_dataset = RatingsDataset(test_ratings)

# Network

In [76]:
import torch.nn as nn

inputSize = 6040
class Denoising_Model(nn.Module):
    def __init__(self):
        super(Denoising_Model,self).__init__()
        self.encoder=nn.Sequential(
                      nn.Linear(inputSize, 100),  # There are 6040 users in movieLens-1M
                      nn.Tanh()
                      )

        self.decoder=nn.Sequential(
                      nn.Linear(100, inputSize),
                      nn.Tanh()
                      )


    def forward(self,x):
        x=self.encoder(x)
        x=self.decoder(x)
        return x
    
network = Denoising_Model()
print(network)

Denoising_Model(
  (encoder): Sequential(
    (0): Linear(in_features=6040, out_features=100, bias=True)
    (1): Tanh()
  )
  (decoder): Sequential(
    (0): Linear(in_features=100, out_features=6040, bias=True)
    (1): Tanh()
  )
)


In [59]:
print(len(train_dataset))

3556


In [54]:
def train(model, masked_ratings, known_ratings, optimizer, criterion):   
    masked_ratings,known_ratings=masked_ratings.to(device),known_ratings.to(device)
    # Forward Pass
    output=model(masked_ratings)
    loss=criterion(output,known_ratings)
    #Backward Pass---------------------
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss, output


In [79]:
from torch.optim import Adam
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch import optim
from torch.utils.data import DataLoader

if torch.cuda.is_available()==True:
    device="cuda:0"
else:
    device ="cpu"

criterion = nn.MSELoss()
EPOCHS = 20
BATCH_SIZE = 28

train_loader = DataLoader(dataset = train_dataset, batch_size = BATCH_SIZE, shuffle = True)
model=Denoising_Model().to(device)
criterion=nn.MSELoss()
# optimizer=optim.SGD(model.parameters(),lr=0.07,weight_decay=0.05)
optimizer = Adam(network.parameters(), lr = 0.007)

epoch_loss=0
for epoch in range(EPOCHS):       
    for bidx, batch in enumerate(train_loader):
        x_train = batch['inp']
        y_train = batch['out']

        loss, predictions = train(network,x_train,y_train, optimizer, criterion)
        epoch_loss += loss
        
    print('Epoch {} Loss : {}'.format((epoch+1),loss))



3556
True
Epoch 1 Loss : 0.009264114312827587
Epoch 2 Loss : 0.012533292174339294
Epoch 3 Loss : 0.020862579345703125
Epoch 4 Loss : 0.02182365395128727
Epoch 5 Loss : 0.015517272986471653
Epoch 6 Loss : 0.024610206484794617
Epoch 7 Loss : 0.03386373817920685
Epoch 8 Loss : 0.013549523428082466
Epoch 9 Loss : 0.01635218970477581
Epoch 10 Loss : 0.024760758504271507
Epoch 11 Loss : 0.01519069168716669
Epoch 12 Loss : 0.01267748512327671
Epoch 13 Loss : 0.026379330083727837
Epoch 14 Loss : 0.025091085582971573
Epoch 15 Loss : 0.0225560013204813
Epoch 16 Loss : 0.016098754480481148
Epoch 17 Loss : 0.02474430575966835
Epoch 18 Loss : 0.018545040860772133
Epoch 19 Loss : 0.017396394163370132
Epoch 20 Loss : 0.021031230688095093


In [81]:
from sklearn.metrics import mean_squared_error
from math import sqrt


def test():
    criterion=nn.MSECriterion()
    noRatings = 0
    input, target, minibatch = {}, {}, {}
    # TODO: Replace 'out'/'inp' 
    y_true = [test_dataset[i]['out'].item() for i in range(len(test_dataset))]
    y_pred = []

    trial_loader = DataLoader(dataset = test_dataset, batch_size = BATCH_SIZE, shuffle = False)
    for bidx, batch in enumerate(test_loader):
            x_train = batch['inp']
            y_pred.append(net(x_train))

    y_pred = [x.item() for i in range(len(y_pred)) for x in y_pred[i] ]

    rmse = sqrt(mean_squared_error(y_true, y_pred))
    print("RMSE: ", rmse)

test()

AttributeError: module 'torch.nn' has no attribute 'MSECriterion'