# Dependencies

In [1]:
import torch
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split

# Dataset

In [4]:
movies = pd.read_csv('ml-1m/movies.dat',sep='::',header=None,names=["MovieID", "Title", "Genres"],engine='python')
ratings = pd.read_csv('ml-1m/ratings.dat',sep='::',header=None,names=["UserID", "MovieID", "Rating", "Timestamp"],engine='python')
users = pd.read_csv('ml-1m/users.dat',sep='::',header=None,names=["UserID", "Gender", "Age", "Occupation", "Zip-code"],engine='python')

In [5]:
movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
users.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [7]:
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [8]:
movies.shape, ratings.shape, users.shape

((3883, 3), (1000209, 4), (6040, 5))

In [9]:
ratings.groupby('UserID').count()[["MovieID"]].sort_values(by="MovieID")

Unnamed: 0_level_0,MovieID
UserID,Unnamed: 1_level_1
947,20
4068,20
2530,20
341,20
5258,20
...,...
1181,1521
1941,1595
4277,1743
1680,1850


In [10]:
ratings.groupby('MovieID').count()[['UserID']].sort_values(by='UserID')

Unnamed: 0_level_0,UserID
MovieID,Unnamed: 1_level_1
402,1
2214,1
3382,1
2217,1
2218,1
...,...
480,2672
1210,2883
1196,2990
260,2991


In [11]:
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [18]:
ratings['UserID'].nunique()

6040

In [23]:
ratings['UserID'].max()

6040

In [19]:
ratings['MovieID'].nunique()

3706

In [20]:
ratings['MovieID'].max()

3952

In [21]:
movies['MovieID'].max()

3952

In [22]:
movies['MovieID'].nunique()

3883

In [24]:
USERS_CNT = ratings['UserID'].max()
ITEMS_CNT = ratings['MovieID'].max()

In [25]:
train_ratings, test_ratings = train_test_split(ratings, test_size=0.1)

In [31]:
train_ratings.shape, test_ratings.shape

((900188, 4), (100021, 4))

In [28]:
def generate_sparse_matrix(dataset):

    items = np.zeros(shape = (ITEMS_CNT, USERS_CNT), dtype = np.float32)

    for index, row in dataset.iterrows():
        items[int(row['MovieID'] - 1), int(row['UserID'] - 1)] = row['Rating']

    return items

In [29]:
train_sparse_matrix = generate_sparse_matrix(train_ratings)
test_sparse_matrix = generate_sparse_matrix(test_ratings)

In [30]:
print("Train Sparse Matrix: ", train_sparse_matrix.shape)
print("Test Sparse Matrix: ", test_sparse_matrix.shape)

Train Sparse Matrix:  (3952, 6040)
Test Sparse Matrix:  (3952, 6040)


In [12]:
MASK_RATIO = 0.25

In [38]:
from torch.utils.data import Dataset

class RatingsDataset(Dataset):
    
    def __init__(self, ratings):
        self.ratings = ratings.copy()
        self.known_indices = self.get_known_indices()
        self.normalize()
        self.subtract_mean()
        
        self.masked_ratings, self.masked_indices = self.mask_ratings()
        
        
    def __len__(self):
        return len(self.ratings)
    
    def __getitem__(self, index):
        
        result = {
            'inp': torch.from_numpy(np.array(self.masked_ratings[index])).float(), 
            'out': torch.from_numpy(np.array(self.ratings[index])).float(),
            'known_indices': self.known_indices[index],
            'masked_indices': self.masked_indices[index]
        }
        
        return result
        
    
    def get_known_indices(self):
        known_indices = np.zeros(shape = self.ratings.shape)

        for idx, rating in enumerate(self.ratings):
            known = np.where(rating > 0)[0]
            known_indices[idx][known] = 1
            
        return known_indices
        
    
    # normalize between -1 and 1
    def normalize(self):
        
        for idx, rating in enumerate(self.ratings):
            known = np.where(self.known_indices[idx] == 1)[0]
            
            if len(known) > 0:
                rating[known] -= 3
                rating[known] /= 2
    
    def subtract_mean(self):
        
        for idx, rating in enumerate(self.ratings):
            known = np.where(self.known_indices[idx] == 1)[0]

            if len(known) > 0:
                mean = rating[known].mean()
                rating[known] -= mean
                
    def mask_ratings(self):
    
        masked_ratings = self.ratings.copy()
        masked_indices = np.zeros(shape = self.ratings.shape)
                
        for idx, rating in enumerate(masked_ratings):
            
            known = np.where(self.known_indices[idx] == 1)[0].tolist()
            known_cnt = len(known)
            masked_cnt = int(MASK_RATIO * known_cnt)
            masked = random.sample(known, masked_cnt)
            rating[masked] = 0
                        
            masked_indices[idx][masked] = 1
            
            
        return masked_ratings, masked_indices

In [39]:
train_dataset = RatingsDataset(train_sparse_matrix)
print("Input: ", train_dataset[0]['inp'], "Input Length: ", len(train_dataset[0]['inp']))
print("Output: ", train_dataset[0]['out'], "Output Length: ", len(train_dataset[0]['out']))

test_dataset = RatingsDataset(test_sparse_matrix)
print("Input: ", test_dataset[0]['inp'], "Input Length: ", len(test_dataset[0]['inp']))
print("Output: ", test_dataset[0]['out'], "Output Length: ", len(test_dataset[0]['out']))

Input:  tensor([0., 0., 0.,  ..., 0., 0., 0.]) Input Length:  6040
Output:  tensor([ 0.4262,  0.0000,  0.0000,  ...,  0.0000,  0.0000, -0.5738]) Output Length:  6040
Input:  tensor([0., 0., 0.,  ..., 0., 0., 0.]) Input Length:  6040
Output:  tensor([0., 0., 0.,  ..., 0., 0., 0.]) Output Length:  6040


In [40]:
print(train_dataset[1])

{'inp': tensor([0., 0., 0.,  ..., 0., 0., 0.]), 'out': tensor([0., 0., 0.,  ..., 0., 0., 0.]), 'known_indices': array([0., 0., 0., ..., 0., 0., 0.]), 'masked_indices': array([0., 0., 0., ..., 0., 0., 0.])}


In [44]:
len(np.where(train_dataset[1]['known_indices'] == 1)[0])

635

In [45]:
len(np.where(train_dataset[1]['masked_indices'] == 1)[0])

158

In [51]:
np.where(train_dataset[1]['known_indices'] == 1)

(array([  17,   22,   26,   39,   43,   47,   52,   59,   61,   74,   82,
          91,  108,  113,  118,  126,  135,  145,  146,  147,  148,  162,
         194,  197,  203,  221,  222,  228,  244,  261,  263,  269,  270,
         271,  292,  301,  307,  309,  320,  328,  336,  354,  365,  367,
         386,  402,  412,  423,  428,  430,  437,  448,  473,  474,  475,
         508,  532,  540,  548,  549,  562,  565,  570,  600,  628,  633,
         640,  661,  668,  677,  691,  694,  698,  709,  711,  712,  713,
         730,  745,  752,  764,  776,  779,  797,  809,  816,  868,  876,
         879,  889,  910,  958,  970,  983, 1003, 1009, 1031, 1041, 1049,
        1050, 1057, 1079, 1086, 1095, 1099, 1100, 1108, 1111, 1114, 1118,
        1119, 1124, 1131, 1140, 1151, 1167, 1201, 1202, 1206, 1219, 1229,
        1243, 1245, 1263, 1264, 1265, 1270, 1297, 1300, 1302, 1313, 1316,
        1329, 1332, 1338, 1339, 1347, 1368, 1370, 1376, 1379, 1388, 1391,
        1402, 1420, 1424, 1446, 1447, 

In [48]:
np.where(train_dataset[1]['masked_indices'] == 1)

(array([  22,   43,   82,  108,  126,  221,  222,  261,  269,  271,  354,
         367,  437,  448,  473,  532,  548,  549,  565,  600,  628,  712,
         764,  776,  797,  809,  816,  983, 1003, 1108, 1111, 1140, 1151,
        1167, 1219, 1229, 1245, 1300, 1313, 1316, 1329, 1347, 1368, 1370,
        1376, 1402, 1420, 1448, 1495, 1541, 1598, 1674, 1715, 1732, 1780,
        1853, 1890, 1898, 1919, 1942, 1995, 2053, 2087, 2096, 2108, 2118,
        2241, 2257, 2469, 2509, 2518, 2550, 2565, 2582, 2591, 2594, 2623,
        2664, 2674, 2906, 2961, 3023, 3081, 3092, 3109, 3118, 3139, 3154,
        3195, 3223, 3291, 3311, 3366, 3461, 3482, 3500, 3510, 3521, 3608,
        3609, 3617, 3654, 3666, 3675, 3691, 3725, 3755, 3776, 3791, 3807,
        3818, 3833, 3933, 3944, 3945, 3970, 4063, 4085, 4095, 4126, 4139,
        4226, 4311, 4317, 4359, 4362, 4403, 4445, 4489, 4507, 4576, 4599,
        4718, 4807, 4823, 4866, 4956, 5025, 5053, 5113, 5217, 5350, 5364,
        5614, 5656, 5662, 5697, 5750, 

In [52]:
train_dataset[1]['inp'][17]

tensor(-0.6071)

In [65]:
train_dataset[1]['inp'][5853]

tensor(0.)

In [54]:
train_dataset[1]['out'][17]

tensor(-0.6071)

In [64]:
train_dataset[1]['out'][5853]

tensor(0.3929)

# Network

In [66]:
import torch.nn as nn


inputSize = 6040
class Denoising_Model(nn.Module):
    def __init__(self):
        super(Denoising_Model,self).__init__()
        self.encoder=nn.Sequential(
                      nn.Linear(inputSize, 770),  # There are 6040 users in movieLens-1M
                      nn.Tanh()
                      )

        self.decoder=nn.Sequential(
                      nn.Linear(770, inputSize),
                      nn.Tanh()
                      )


    def forward(self,x):
        x=self.encoder(x)
        x=self.decoder(x)
        return x
    
network = Denoising_Model()
print(network)

Denoising_Model(
  (encoder): Sequential(
    (0): Linear(in_features=6040, out_features=770, bias=True)
    (1): Tanh()
  )
  (decoder): Sequential(
    (0): Linear(in_features=770, out_features=6040, bias=True)
    (1): Tanh()
  )
)


In [150]:
ALPHA = 1
BETA = 0.5
REG_PARAM = 0.005

In [158]:
def denosingLoss(output, target, known, masked):

    loss = 0
        
    for idx, out in enumerate(output):
        known_idx = np.where(known[idx] == 1)[0]
        masked_idx = np.where(masked[idx] == 1)[0]
        not_masked_idx = list(set(known_idx) - set(masked_idx))
        
        masked_output = output[idx][masked_idx]
        masked_target = target[idx][masked_idx]
        
        not_masked_output = output[idx][not_masked_idx]
        not_masked_target = target[idx][not_masked_idx]

        if len(masked_idx) > 0:
            loss += ALPHA * torch.sum(torch.square(torch.sub(masked_output, masked_target))) 
            
        if len(not_masked_idx) > 0:
            loss += BETA * torch.sum(torch.square(torch.sub(not_masked_output, not_masked_target)))
        
        # loss += REG_PARAM * torch.sum(torch.square(weights)) Add regularization

    return loss 

In [159]:
print(len(train_dataset))

3952


In [160]:
def init_weights(m):
    if type(m) == nn.Linear:
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0)

In [161]:
def train(model, inp, target, optimizer, known, masked):   
#     masked_ratings,target=masked_ratings.to(device),target.to(device)
    
    # Forward Pass
    output = model(inp)
    loss = denosingLoss(output, target, known, masked)
        
    #Backward Pass---------------------
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
   # scheduler.step()

    return loss, output


In [162]:
from torch.optim import Adam
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch import optim
from torch.utils.data import DataLoader

# if torch.cuda.is_available() == True:
#     device="cuda:0"
# else:
device ="cpu"

EPOCHS = 20
BATCH_SIZE = 35

train_loader = DataLoader(dataset = train_dataset, batch_size = BATCH_SIZE, shuffle = True)
model = Denoising_Model().to(device)
init_weights(model)

#criterion = nn.MSELoss()
# optimizer=Adam(model.parameters(),lr=0.07,weight_decay=0.05)
optimizer = optim.SGD(model.parameters(), lr = 0.07, weight_decay = 0.05)
#exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size = 1, gamma = 0.3)

epoch_loss = 0
epoch_loss = []

for epoch in range(EPOCHS): 
    acc_epoch_loss = 0
    
    for bidx, batch in enumerate(train_loader):
        
        x_train = batch['inp']
        y_train = batch['out']
        known = batch['known_indices']
        masked = batch['masked_indices']
        
        loss, predictions = train(model, x_train, y_train, optimizer, known, masked)
        acc_epoch_loss += loss
    
    epoch_loss.append(acc_epoch_loss / len(train_dataset))  
    
    print('Epoch {} Loss : {}'.format((epoch+1), epoch_loss[epoch] ))
    


Epoch 1 Loss : 156.32398986816406
Epoch 2 Loss : 136.69903564453125
Epoch 3 Loss : 130.4447784423828
Epoch 4 Loss : 125.8723373413086
Epoch 5 Loss : 122.0219497680664
Epoch 6 Loss : 119.12550354003906
Epoch 7 Loss : 116.94558715820312
Epoch 8 Loss : 115.00597381591797
Epoch 9 Loss : 114.39061737060547
Epoch 10 Loss : 113.645263671875
Epoch 11 Loss : 112.80889892578125
Epoch 12 Loss : 112.53568267822266
Epoch 13 Loss : 112.33924102783203
Epoch 14 Loss : 112.1200942993164
Epoch 15 Loss : 111.87872314453125
Epoch 16 Loss : 111.5798568725586
Epoch 17 Loss : 111.89928436279297
Epoch 18 Loss : 111.86024475097656
Epoch 19 Loss : 111.86781311035156
Epoch 20 Loss : 111.67546081542969


In [163]:
test_batch = []
train_batch = []
known_batch = []
all_ratings_cnt = 0
batch_size = 35
minibatches = []

for idx, test in enumerate(test_dataset):
    known_ratings_cnt = len(np.where(test['known_indices'] == 1)[0])
    
    if known_ratings_cnt > 0:
        
        
        all_ratings_cnt += known_ratings_cnt
        train_batch.append(train_dataset[idx]['inp'])
        known_batch.append(torch.tensor(test['known_indices']))
        test_batch.append(test['inp'])
        
    if len(test_batch) >= batch_size:
        minibatches.append((torch.stack(train_batch), torch.stack(test_batch), torch.stack(known_batch)))
        test_batch.clear()
        train_batch.clear()
        known_batch.clear()
        
print(minibatches[0])
       

(tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0572],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0185]]), tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]), tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64))


In [164]:
from sklearn.metrics import mean_squared_error
from math import sqrt

from torch.nn import MSELoss


def test():
    criterion = nn.MSELoss()
    noRatings = 0
    input, target, minibatch = {}, {}, {}

    
    mse = 0        
    with torch.no_grad():
        for bidx, batch in enumerate(minibatches):
                train_batch = batch[0]
                test_batch = batch[1]
                known_batch = batch[2]
                y_predictions = model(train_batch)

                for idx, test in enumerate(test_batch):
                    y_true = test[np.where(known_batch[idx].numpy() == 1)[0]]
                    y_pred = y_predictions[idx][np.where(known_batch[idx].numpy() == 1)[0]]
                    
                    mse += torch.sum((y_pred - y_true)**2)

    print(all_ratings_cnt)  
    mse = mse / all_ratings_cnt
    rmse = sqrt(mse)
    print(rmse)
    
test()


100021
0.9517879488358227
