# Dependencies

In [1]:
import torch
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split

C:\ProgramData\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.GK7GX5KEQ4F6UYO3P26ULGBQYHGQO7J4.gfortran-win_amd64.dll
C:\ProgramData\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.PYQHXLVVQ7VESDPUVUADXEVJOBGHJPAY.gfortran-win_amd64.dll


# Dataset

In [2]:
movies = pd.read_csv('ml-1m/movies.dat',sep='::',header=None,names=["MovieID", "Title", "Genres"],engine='python')
ratings = pd.read_csv('ml-1m/ratings.dat',sep='::',header=None,names=["UserID", "MovieID", "Rating", "Timestamp"],engine='python')
users = pd.read_csv('ml-1m/users.dat',sep='::',header=None,names=["UserID", "Gender", "Age", "Occupation", "Zip-code"],engine='python')

In [3]:
movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
users.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [5]:
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [6]:
movies.shape, ratings.shape, users.shape

((3883, 3), (1000209, 4), (6040, 5))

In [7]:
ratings.groupby('UserID').count()[["MovieID"]].sort_values(by="MovieID")

Unnamed: 0_level_0,MovieID
UserID,Unnamed: 1_level_1
947,20
4068,20
2530,20
341,20
5258,20
...,...
1181,1521
1941,1595
4277,1743
1680,1850


In [8]:
ratings.groupby('MovieID').count()[['UserID']].sort_values(by='UserID')

Unnamed: 0_level_0,UserID
MovieID,Unnamed: 1_level_1
402,1
2214,1
3382,1
2217,1
2218,1
...,...
480,2672
1210,2883
1196,2990
260,2991


In [9]:
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [10]:
ratings['UserID'].nunique()

6040

In [11]:
ratings['UserID'].max()

6040

In [12]:
ratings['MovieID'].nunique()

3706

In [13]:
ratings['MovieID'].max()

3952

In [14]:
movies['MovieID'].max()

3952

In [15]:
movies['MovieID'].nunique()

3883

In [16]:
USERS_CNT = ratings['UserID'].max()
ITEMS_CNT = ratings['MovieID'].max()

In [17]:
train_ratings, test_ratings = train_test_split(ratings, test_size=0.1)

In [18]:
train_ratings.shape, test_ratings.shape

((900188, 4), (100021, 4))

In [19]:
def generate_sparse_matrix(dataset):

    items = np.zeros(shape = (ITEMS_CNT, USERS_CNT), dtype = np.float32)

    for index, row in dataset.iterrows():
        items[int(row['MovieID'] - 1), int(row['UserID'] - 1)] = row['Rating']

    return items

In [20]:
train_sparse_matrix = generate_sparse_matrix(train_ratings)
test_sparse_matrix = generate_sparse_matrix(test_ratings)

In [22]:
print("Train Sparse Matrix: ", train_sparse_matrix.shape)
print("Test Sparse Matrix: ", test_sparse_matrix.shape)

Train Sparse Matrix:  (3952, 6040)
Test Sparse Matrix:  (3952, 6040)


In [23]:
MASK_RATIO = 0.25

In [24]:
from torch.utils.data import Dataset

class RatingsDataset(Dataset):
    
    def __init__(self, ratings):
        self.ratings = ratings.copy()
        self.known_indices = self.get_known_indices()
        self.normalize()
        self.subtract_mean()
        
        self.masked_ratings, self.masked_indices = self.mask_ratings()
        
        
    def __len__(self):
        return len(self.ratings)
    
    def __getitem__(self, index):
        
        result = {
            'inp': torch.from_numpy(np.array(self.masked_ratings[index])).float(), 
            'out': torch.from_numpy(np.array(self.ratings[index])).float(),
            'known_indices': self.known_indices[index],
            'masked_indices': self.masked_indices[index]
        }
        
        return result
        
    
    def get_known_indices(self):
        known_indices = np.zeros(shape = self.ratings.shape)

        for idx, rating in enumerate(self.ratings):
            known = np.where(rating > 0)[0]
            known_indices[idx][known] = 1
            
        return known_indices
        
    
    # normalize between -1 and 1
    def normalize(self):
        
        for idx, rating in enumerate(self.ratings):
            known = np.where(self.known_indices[idx] == 1)[0]
            
            if len(known) > 0:
                rating[known] -= 3
                rating[known] /= 2
    
    def subtract_mean(self):
        
        for idx, rating in enumerate(self.ratings):
            known = np.where(self.known_indices[idx] == 1)[0]

            if len(known) > 0:
                mean = rating[known].mean()
                rating[known] -= mean
                
    def mask_ratings(self):
    
        masked_ratings = self.ratings.copy()
        masked_indices = np.zeros(shape = self.ratings.shape)
                
        for idx, rating in enumerate(masked_ratings):
            
            known = np.where(self.known_indices[idx] == 1)[0].tolist()
            known_cnt = len(known)
            masked_cnt = int(MASK_RATIO * known_cnt)
            masked = random.sample(known, masked_cnt)
            rating[masked] = 0
                        
            masked_indices[idx][masked] = 1
            
            
        return masked_ratings, masked_indices

In [25]:
train_dataset = RatingsDataset(train_sparse_matrix)
print("Input: ", train_dataset[0]['inp'], "Input Length: ", len(train_dataset[0]['inp']))
print("Output: ", train_dataset[0]['out'], "Output Length: ", len(train_dataset[0]['out']))

test_dataset = RatingsDataset(test_sparse_matrix)
print("Input: ", test_dataset[0]['inp'], "Input Length: ", len(test_dataset[0]['inp']))
print("Output: ", test_dataset[0]['out'], "Output Length: ", len(test_dataset[0]['out']))

Input:  tensor([0.4289, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]) Input Length:  6040
Output:  tensor([ 0.4289,  0.0000,  0.0000,  ...,  0.0000,  0.0000, -0.5711]) Output Length:  6040
Input:  tensor([0., 0., 0.,  ..., 0., 0., 0.]) Input Length:  6040
Output:  tensor([0., 0., 0.,  ..., 0., 0., 0.]) Output Length:  6040


In [26]:
print(train_dataset[1])

{'inp': tensor([0., 0., 0.,  ..., 0., 0., 0.]), 'out': tensor([0., 0., 0.,  ..., 0., 0., 0.]), 'known_indices': array([0., 0., 0., ..., 0., 0., 0.]), 'masked_indices': array([0., 0., 0., ..., 0., 0., 0.])}


In [27]:
len(np.where(train_dataset[1]['known_indices'] == 1)[0])

626

In [28]:
len(np.where(train_dataset[1]['masked_indices'] == 1)[0])

156

In [29]:
np.where(train_dataset[1]['known_indices'] == 1)

(array([   9,   12,   17,   22,   26,   39,   43,   47,   52,   59,   61,
          74,   91,  113,  118,  126,  135,  146,  147,  148,  156,  162,
         194,  197,  203,  221,  222,  228,  244,  261,  263,  269,  270,
         271,  283,  292,  300,  301,  307,  309,  320,  322,  328,  336,
         354,  365,  367,  402,  412,  423,  428,  430,  435,  437,  448,
         473,  474,  475,  508,  519,  530,  532,  540,  548,  562,  570,
         600,  628,  633,  661,  668,  677,  691,  694,  698,  709,  712,
         713,  719,  720,  730,  745,  756,  764,  776,  779,  797,  809,
         816,  868,  876,  879,  889,  892,  910,  924,  954,  970,  983,
        1003, 1014, 1031, 1041, 1049, 1050, 1057, 1079, 1086, 1095, 1099,
        1100, 1108, 1111, 1114, 1118, 1124, 1140, 1151, 1167, 1175, 1180,
        1201, 1202, 1206, 1219, 1229, 1245, 1263, 1264, 1270, 1300, 1302,
        1313, 1316, 1329, 1332, 1338, 1347, 1368, 1370, 1376, 1379, 1388,
        1391, 1394, 1402, 1420, 1421, 

In [30]:
np.where(train_dataset[1]['masked_indices'] == 1)

(array([   9,   22,   52,  113,  126,  148,  156,  162,  197,  261,  269,
         328,  336,  475,  530,  548,  562,  600,  633,  691,  720,  730,
         764,  809,  816,  876,  924,  954,  983, 1095, 1099, 1100, 1124,
        1151, 1167, 1202, 1245, 1302, 1391, 1605, 1640, 1688, 1763, 1840,
        1930, 1942, 1968, 1979, 1982, 1995, 2062, 2087, 2156, 2250, 2318,
        2410, 2488, 2539, 2582, 2674, 2741, 2742, 2776, 2847, 2886, 2906,
        2908, 2925, 2938, 2961, 2965, 2998, 3031, 3079, 3092, 3100, 3127,
        3140, 3157, 3162, 3195, 3223, 3258, 3271, 3291, 3390, 3433, 3500,
        3502, 3508, 3511, 3538, 3577, 3581, 3609, 3649, 3697, 3710, 3714,
        3806, 3807, 3818, 3820, 3950, 3970, 3993, 3998, 4005, 4041, 4047,
        4085, 4111, 4140, 4152, 4221, 4226, 4237, 4278, 4317, 4343, 4344,
        4489, 4509, 4566, 4632, 4665, 4686, 4818, 4901, 4956, 4974, 5000,
        5010, 5111, 5221, 5229, 5231, 5301, 5400, 5407, 5451, 5492, 5535,
        5569, 5613, 5626, 5688, 5719, 

In [31]:
train_dataset[1]['inp'][17]

tensor(-0.6062)

In [32]:
train_dataset[1]['inp'][5853]

tensor(0.3938)

In [33]:
train_dataset[1]['out'][17]

tensor(-0.6062)

In [34]:
train_dataset[1]['out'][5853]

tensor(0.3938)

# Network

In [35]:
import torch.nn as nn


inputSize = 6040
class Denoising_Model(nn.Module):
    def __init__(self):
        super(Denoising_Model,self).__init__()
        self.encoder=nn.Sequential(
                      #nn.BatchNorm1d(6040),
                      nn.Linear(inputSize, 770),  # There are 6040 users in movieLens-1M
                      nn.Tanh()
                      )

        self.decoder=nn.Sequential(
                      #nn.BatchNorm1d(770),
                      nn.Linear(770, inputSize),
                      nn.Tanh()
                      )


    def forward(self,x):
        x=self.encoder(x)
        x=self.decoder(x)
        return x
    
network = Denoising_Model()
print(network)

Denoising_Model(
  (encoder): Sequential(
    (0): Linear(in_features=6040, out_features=770, bias=True)
    (1): Tanh()
  )
  (decoder): Sequential(
    (0): Linear(in_features=770, out_features=6040, bias=True)
    (1): Tanh()
  )
)


In [36]:
ALPHA = 1
BETA = 0.5

In [37]:
def denosingLoss(output, target, known, masked):

    loss = 0
        
    for idx, out in enumerate(output):
        known_idx = np.where(known[idx] == 1)[0]
        masked_idx = np.where(masked[idx] == 1)[0]
        not_masked_idx = list(set(known_idx) - set(masked_idx))
        
        masked_output = output[idx][masked_idx]
        masked_target = target[idx][masked_idx]
        
        not_masked_output = output[idx][not_masked_idx]
        not_masked_target = target[idx][not_masked_idx]

        if len(masked_idx) > 0:
            loss += ALPHA * torch.sum(torch.square(torch.sub(masked_output, masked_target))) 
            
        if len(not_masked_idx) > 0:
            loss += BETA * torch.sum(torch.square(torch.sub(not_masked_output, not_masked_target)))
        
    return loss 

In [38]:
print(len(train_dataset))

3952


In [39]:
def init_weights(m):
    if type(m) == nn.Linear:
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0)

In [40]:
def train(model, inp, target, optimizer, known, masked):   
#     masked_ratings,target=masked_ratings.to(device),target.to(device)
    
    # Forward Pass
    output = model(inp)
    loss = denosingLoss(output, target, known, masked)
        
    #Backward Pass---------------------
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
   # scheduler.step()

    return loss, output


In [42]:
from torch.optim import Adam
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch import optim
from torch.utils.data import DataLoader

# if torch.cuda.is_available() == True:
#     device="cuda:0"
# else:
device ="cpu"

EPOCHS = 30
BATCH_SIZE = 35

train_loader = DataLoader(dataset = train_dataset, batch_size = BATCH_SIZE, shuffle = True)
model = Denoising_Model().to(device)
init_weights(model)

#criterion = nn.MSELoss()
#optimizer=Adam(model.parameters(),lr=0.01)
optimizer = optim.SGD(model.parameters(), lr = 0.03, weight_decay = 0.01)
#exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size = 1, gamma = 0.3)

epoch_loss = []

for epoch in range(EPOCHS): 
    acc_epoch_loss = 0
    
    for bidx, batch in enumerate(train_loader):
        
        x_train = batch['inp']
        y_train = batch['out']
        known = batch['known_indices']
        masked = batch['masked_indices']
        
        loss, predictions = train(model, x_train, y_train, optimizer, known, masked)
        acc_epoch_loss += loss
    
    epoch_loss.append(acc_epoch_loss / len(train_dataset))  
    
    print('Epoch {} Loss : {}'.format((epoch+1), epoch_loss[epoch] ))
    


Epoch 1 Loss : 106.9705581665039
Epoch 2 Loss : 103.97200012207031
Epoch 3 Loss : 98.64114379882812
Epoch 4 Loss : 95.55217742919922
Epoch 5 Loss : 93.20600128173828
Epoch 6 Loss : 91.5093765258789
Epoch 7 Loss : 90.0951156616211
Epoch 8 Loss : 88.74757385253906
Epoch 9 Loss : 87.6395263671875
Epoch 10 Loss : 86.55996704101562
Epoch 11 Loss : 85.7756576538086
Epoch 12 Loss : 85.02269744873047
Epoch 13 Loss : 84.36695861816406
Epoch 14 Loss : 83.59457397460938
Epoch 15 Loss : 83.2055435180664
Epoch 16 Loss : 82.6905517578125
Epoch 17 Loss : 82.09183502197266
Epoch 18 Loss : 81.7367935180664
Epoch 19 Loss : 81.39216613769531
Epoch 20 Loss : 81.13054656982422
Epoch 21 Loss : 80.73898315429688
Epoch 22 Loss : 80.50210571289062
Epoch 23 Loss : 80.28089141845703
Epoch 24 Loss : 80.04816436767578
Epoch 25 Loss : 79.84832763671875
Epoch 26 Loss : 79.61077117919922
Epoch 27 Loss : 79.36669158935547
Epoch 28 Loss : 79.15909576416016
Epoch 29 Loss : 78.98438262939453
Epoch 30 Loss : 78.8361663818

In [43]:
test_batch = []
train_batch = []
known_batch = []
batch_size = 35
minibatches = []

for idx, test in enumerate(test_dataset):
    known_ratings_cnt = len(np.where(test['known_indices'] == 1)[0])
    
    if known_ratings_cnt > 0:

        train_batch.append(train_dataset[idx]['out'])
        known_batch.append(torch.tensor(test['known_indices']))
        test_batch.append(test['out'])
        
    if len(test_batch) >= batch_size:
        minibatches.append((torch.stack(train_batch), torch.stack(test_batch), torch.stack(known_batch)))
        test_batch.clear()
        train_batch.clear()
        known_batch.clear()
        
if len(test_batch) > 0:
    minibatches.append((torch.stack(train_batch), torch.stack(test_batch), torch.stack(known_batch)))
    test_batch.clear()
    train_batch.clear()
    known_batch.clear()       

In [44]:
def denormalize(arr):
    arr = np.array(arr)

    target_from = 1
    target_to = 5

    arr_min = min(arr)
    arr_max = max(arr)

    area_length = arr_max - arr_min
    scale_coef = ( target_to - target_from ) / area_length

    arr *= scale_coef
    arr += target_from - arr.min()

    return arr


In [48]:

from sklearn.metrics import mean_squared_error
from math import sqrt

from torch.nn import MSELoss


def test():
    y_true_buffer = []
    y_pred_buffer = []

    with torch.no_grad():
        for bidx, batch in enumerate(minibatches):
                train_batch = batch[0]
                test_batch = batch[1]
                known_batch = batch[2]
                y_predictions = model(train_batch)

                for idx, test in enumerate(test_batch):
                    y_true = test[np.where(known_batch[idx].numpy() == 1)[0]]
                    y_pred = y_predictions[idx][np.where(known_batch[idx].numpy() == 1)[0]]

                    y_true_buffer += y_true.numpy().tolist()
                    y_pred_buffer += y_pred.numpy().tolist()


    #y_true_buffer = denormalize(y_true_buffer)
    #y_pred_buffer = denormalize(y_pred_buffer)

    mse = mean_squared_error(y_true_buffer, y_pred_buffer)
    rmse = sqrt(mse)
    print(rmse)

    
test()


0.9143430251864957
