# Dependencies

In [118]:
import random

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch import nn, optim

# Dataset

In [60]:
# Fraction of each row's known ratings that RatingsDataset.mask_ratings zeroes
# out when it builds the corrupted ("masked") model input.
MASK_RATIO = 0.25

In [61]:
# Load the three MovieLens-1M tables. The files carry no header row and use
# '::' as the field separator, so column names are supplied explicitly.
movies = pd.read_csv(
    'ml-1m/movies.dat',
    sep='::',
    header=None,
    names=["MovieID", "Title", "Genres"],
    engine='python',
)
ratings = pd.read_csv(
    'ml-1m/ratings.dat',
    sep='::',
    header=None,
    names=["UserID", "MovieID", "Rating", "Timestamp"],
    engine='python',
)
users = pd.read_csv(
    'ml-1m/users.dat',
    sep='::',
    header=None,
    names=["UserID", "Gender", "Age", "Occupation", "Zip-code"],
    engine='python',
)

In [62]:
movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [63]:
users.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [64]:
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [65]:
movies.shape, ratings.shape, users.shape

((3883, 3), (1000209, 4), (6040, 5))

In [66]:
# Count rating rows per user (non-null MovieID entries), sorted ascending.
ratings.groupby('UserID')[["MovieID"]].count().sort_values(by="MovieID")

Unnamed: 0_level_0,MovieID
UserID,Unnamed: 1_level_1
947,20
4068,20
2530,20
341,20
5258,20
...,...
1181,1521
1941,1595
4277,1743
1680,1850


In [67]:
# Count rating rows per movie (non-null UserID entries), sorted ascending.
ratings.groupby('MovieID')[["UserID"]].count().sort_values(by="UserID")

Unnamed: 0_level_0,UserID
MovieID,Unnamed: 1_level_1
402,1
2214,1
3382,1
2217,1
2218,1
...,...
480,2672
1210,2883
1196,2990
260,2991


In [68]:
# Number of user columns in the rating matrix. generate_sparse_matrix indexes
# columns by UserID - 1, so the largest UserID is the bound that matters;
# deriving it from the loaded table avoids a hard-coded dataset size.
USERS_CNT = int(users.UserID.max())

In [250]:
## normalize the rating between [-1, 1]
# Keep the raw ratings in their own column and always normalize from it, so
# re-running this cell cannot rescale values that were already normalized.
if "RawRating" not in ratings.columns:
    ratings["RawRating"] = ratings["Rating"]
ratings["Rating"] = (ratings["RawRating"] - 3) / 2
# NOTE: a rating of 3 maps to exactly 0.0, which is also the value that
# generate_sparse_matrix leaves in cells that have no rating at all.

In [251]:
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,1.0,978300760
1,1,661,0.0,978302109
2,1,914,0.0,978301968
3,1,3408,0.5,978300275
4,1,2355,1.0,978824291


In [258]:
def generate_sparse_matrix(dataset, users_cnt=None):
    """Build a dense item-by-user rating matrix from a long-format ratings table.

    Args:
        dataset: DataFrame with 'MovieID', 'UserID' and 'Rating' columns. Both
            ID columns are used as 1-based positions.
        users_cnt: number of columns (users) in the result. Defaults to the
            notebook-level ``USERS_CNT``.

    Returns:
        float32 ndarray of shape (max MovieID, users_cnt) in which entry
        [MovieID - 1, UserID - 1] holds that row's rating and every other
        entry is 0. An empty ``dataset`` yields an array with zero rows.
    """
    if users_cnt is None:
        users_cnt = USERS_CNT

    if len(dataset) == 0:
        return np.zeros(shape=(0, users_cnt), dtype=np.float32)

    # Shift the 1-based IDs to 0-based array positions.
    movie_rows = dataset['MovieID'].to_numpy(dtype=np.int64) - 1
    user_cols = dataset['UserID'].to_numpy(dtype=np.int64) - 1

    items = np.zeros(shape=(int(movie_rows.max()) + 1, users_cnt), dtype=np.float32)

    # One vectorized assignment replaces a Python-level loop over every row.
    # Assumes each (MovieID, UserID) pair occurs at most once - TODO confirm.
    items[movie_rows, user_cols] = dataset['Rating'].to_numpy(dtype=np.float32)

    return items

In [259]:
ratings_sparse_matrix = generate_sparse_matrix(ratings)

In [151]:
print(ratings_sparse_matrix.shape)
print(ratings_sparse_matrix[0].shape)
ratings_sparse_matrix[0]

(3952, 6040)
(6040,)


array([5., 0., 0., ..., 0., 0., 3.])

In [152]:
# Hold out 10% of the matrix rows for testing. The fixed random_state makes the
# shuffle, and therefore the split, reproducible across kernel restarts.
train_ratings, test_ratings = train_test_split(ratings_sparse_matrix, test_size=0.1, random_state=42)

In [153]:
train_ratings.shape, test_ratings.shape

((3556, 6040), (396, 6040))

In [229]:
from torch.utils.data import Dataset

class RatingsDataset(Dataset):
    """Rating rows paired with a randomly masked copy, for denoising training.

    Each sample is one row of ``ratings``. A value of 0 is treated as "no
    rating"; every non-zero entry counts as known, so negative values (which
    ratings rescaled to [-1, 1] contain) are kept. A rating whose value is
    exactly 0 cannot be told apart from a missing one.

    Args:
        ratings: 2-D numpy array, one sample per row. It is not modified.
        mask_ratio: fraction (0..1) of each row's known entries to zero out in
            the model input. Defaults to the notebook-level ``MASK_RATIO``.

    Raises:
        ValueError: if the mask ratio is outside [0, 1].
    """

    def __init__(self, ratings, mask_ratio=None):
        self.ratings = ratings
        self.mask_ratio = MASK_RATIO if mask_ratio is None else mask_ratio
        if not 0 <= self.mask_ratio <= 1:
            raise ValueError("mask_ratio must be between 0 and 1")
        self.known_indices = self.get_known_indices()
        self.masked_ratings, self.masked_indices = self.mask_ratings()

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, index):
        """Return the masked input, the clean target and both index lists of a row.

        NOTE: 'known_indices' and 'masked_indices' are plain lists whose length
        varies from row to row. The default DataLoader collate function expects
        equal-length sequences, so batching these samples likely needs a custom
        ``collate_fn``.
        """
        return {
            'inp': torch.from_numpy(np.array(self.masked_ratings[index])).float(),
            'out': torch.from_numpy(np.array(self.ratings[index])).float(),
            'known_indices': self.known_indices[index],
            'masked_indices': self.masked_indices[index],
        }

    def get_known_indices(self):
        """Map each row index to the list of column positions holding a rating."""
        known_indices = {}

        for index, rating in enumerate(self.ratings):
            # Non-zero test instead of "> 0": negative ratings are real ratings.
            known_indices[index] = np.flatnonzero(rating).tolist()

        return known_indices

    def mask_ratings(self):
        """Zero out a random share of every row's known entries.

        Returns:
            (masked_ratings, masked_indices): a copy of ``ratings`` with the
            chosen entries set to 0, and a dict mapping each row index to the
            list of positions that were zeroed.
        """
        masked_ratings = self.ratings.copy()
        masked_indices = {}

        for index, rating in enumerate(masked_ratings):
            known = self.known_indices[index]

            masked_features_cnt = int(self.mask_ratio * len(known))
            masked = random.sample(known, masked_features_cnt)

            # `rating` is a view into the copy, so this edits masked_ratings only.
            rating[masked] = 0

            masked_indices[index] = masked

        return masked_ratings, masked_indices

In [230]:
train_ratings_dataset = RatingsDataset(train_ratings)

In [247]:
test_ratings_dataset = RatingsDataset(test_ratings)

# Network

In [12]:
class Denoising_Model(nn.Module):
    """Autoencoder with one hidden layer and tanh activations on both halves.

    The final tanh keeps every output value within [-1, 1].

    Args:
        input_size: width of the input and output vectors. Defaults to the
            notebook-level ``USERS_CNT`` (one entry per user).
        hidden_size: width of the hidden (bottleneck) layer.
    """

    def __init__(self, input_size=None, hidden_size=770):
        # The zero-argument form avoids repeating (and mistyping) the class name.
        super().__init__()

        if input_size is None:
            input_size = USERS_CNT  # There are 6040 users in movieLens-1M

        self.encoder = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.Tanh(),
        )

        self.decoder = nn.Sequential(
            nn.Linear(hidden_size, input_size),
            nn.Tanh(),
        )

    def forward(self, x):
        """Encode ``x`` to the hidden size and decode it back to the input size."""
        x = self.encoder(x)
        x = self.decoder(x)
        return x

In [19]:
from torch.optim import Adam
from torch.utils.data import DataLoader
from tqdm import tqdm

def train(loader=None, model=None, epochs=20, lr=0.07, weight_decay=0.05, progress=True):
    """Train a denoising model with SGD on an MSE reconstruction loss.

    Args:
        loader: iterable of batches. A batch is either a dict with 'inp' and
            'out' entries (the keys RatingsDataset.__getitem__ produces) or a
            sequence whose first two items are the corrupted and clean tensors.
            Defaults to the notebook-level ``train_loader``, which must then be
            defined before calling.
        model: module to train. Defaults to a fresh ``Denoising_Model()``.
        epochs: number of passes over ``loader``.
        lr: SGD learning rate.
        weight_decay: SGD weight decay.
        progress: wrap each pass in a tqdm progress bar when True.

    Returns:
        The trained model, on the device used for training.
    """
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    if loader is None:
        loader = train_loader
    if model is None:
        model = Denoising_Model()

    model = model.to(device)
    model.train()
    criterion = nn.MSELoss()
    optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay)

    for epoch in range(epochs):
        # Reset per epoch so the printed value is this epoch's summed loss,
        # not a running total over all epochs so far.
        epoch_loss = 0
        batches = tqdm(loader) if progress else loader
        for batch in batches:
            if isinstance(batch, dict):
                dirty, clean = batch['inp'], batch['out']
            else:
                dirty, clean = batch[0], batch[1]

            # Flatten everything after the batch dimension and force float32.
            dirty = dirty.view(dirty.size(0), -1).float().to(device)
            clean = clean.view(clean.size(0), -1).float().to(device)

            # Forward Pass
            output = model(dirty)
            loss = criterion(output, clean)

            # Backward Pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
        print('Epoch {} Loss : {}'.format((epoch+1), epoch_loss))

    return model

In [20]:
from sklearn.metrics import mean_squared_error
from math import sqrt


def test():
    criterion=nn.MSECriterion()
    noRatings = 0
    input, target, minibatch = {}, {}, {}
    # TODO: Replace 'out'/'inp' 
    y_true = [test_dataset[i]['out'].item() for i in range(len(test_dataset))]
    y_pred = []

    trial_loader = DataLoader(dataset = test_dataset, batch_size = BATCH_SIZE, shuffle = False)
    for bidx, batch in enumerate(test_loader):
            x_train = batch['inp']
            y_pred.append(net(x_train))

    y_pred = [x.item() for i in range(len(y_pred)) for x in y_pred[i] ]

    rmse = sqrt(mean_squared_error(y_true, y_pred))
    print("RMSE: ", rmse)
