# Dependencies

In [1]:
import torch
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split

# Dataset

In [3]:
MASK_RATIO = 0.25

In [5]:
movies = pd.read_csv('ml-1m/movies.dat',sep='::',header=None,names=["MovieID", "Title", "Genres"],engine='python')
ratings = pd.read_csv('ml-1m/ratings.dat',sep='::',header=None,names=["UserID", "MovieID", "Rating", "Timestamp"],engine='python')
users = pd.read_csv('ml-1m/users.dat',sep='::',header=None,names=["UserID", "Gender", "Age", "Occupation", "Zip-code"],engine='python')

In [6]:
movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
users.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [6]:
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [7]:
movies.shape, ratings.shape, users.shape

((3883, 3), (1000209, 4), (6040, 5))

In [8]:
ratings.groupby('UserID').count()[["MovieID"]].sort_values(by="MovieID")

Unnamed: 0_level_0,MovieID
UserID,Unnamed: 1_level_1
947,20
4068,20
2530,20
341,20
5258,20
...,...
1181,1521
1941,1595
4277,1743
1680,1850


In [9]:
ratings.groupby('MovieID').count()[["UserID"]].sort_values(by="UserID")

Unnamed: 0_level_0,UserID
MovieID,Unnamed: 1_level_1
402,1
2214,1
3382,1
2217,1
2218,1
...,...
480,2672
1210,2883
1196,2990
260,2991


In [11]:
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [10]:
USERS_CNT = 6040

In [7]:
train_ratings, test_ratings = train_test_split(ratings, test_size=0.1)

In [8]:
def generate_sparse_matrix(dataset):
    items_max_id = dataset.MovieID.max()

    items = np.zeros(shape = (items_max_id, USERS_CNT), dtype = np.float32)

    for index, row in dataset.iterrows():
        items[int(row['MovieID'] - 1), int(row['UserID'] - 1)] = row['Rating']

    return items

In [11]:
train_sparse_matrix = generate_sparse_matrix(train_ratings)


In [12]:
test_sparse_matrix = generate_sparse_matrix(test_ratings)
print(test_sparse_matrix.shape)

(3952, 6040)


In [13]:
print("Train Sparse Matrix: ", train_sparse_matrix.shape)
print("Test Sparse Matrix: ", test_sparse_matrix.shape)

Train Sparse Matrix:  (3952, 6040)
Test Sparse Matrix:  (3952, 6040)


In [14]:
train_ratings.shape, test_ratings.shape

((900188, 4), (100021, 4))

In [15]:
from torch.utils.data import Dataset

class RatingsDataset(Dataset):
    
    def __init__(self, ratings):
        self.ratings = ratings.copy()
        self.known_indices = self.get_known_indices()
        self.normalize()
        self.subtract_mean()
        
        self.masked_ratings, self.masked_indices = self.mask_ratings()
        
        
    def __len__(self):
        return len(self.ratings)
    
    def __getitem__(self, index):
        
        result = {
            'inp': torch.from_numpy(np.array(self.masked_ratings[index])).float(), 
            'out': torch.from_numpy(np.array(self.ratings[index])).float(),
            'known_indices': self.known_indices[index],
            'masked_indices': self.masked_indices[index]
        }
        
        return result
        
    
    def get_known_indices(self):
        known_indices = np.zeros(shape = self.ratings.shape)

        for index, rating in enumerate(self.ratings):
            known = np.where(rating > 0)[0].tolist()
            known_indices[index][known] = 1
            
        return known_indices
        
    def mask_ratings(self):
    
        masked_ratings = self.ratings.copy()
        masked_indices = np.zeros(shape = self.ratings.shape)
                
        for index, rating in enumerate(masked_ratings):
            
            known = np.where(self.known_indices[index] == 1)[0].tolist()
            known_cnt = len(known)
            masked_features_cnt = int(MASK_RATIO * known_cnt)
            masked = random.sample(known, masked_features_cnt)
            rating[masked] = 0
                        
            masked_indices[index][masked] = 1
            
            
        return masked_ratings, masked_indices
    
    def normalize(self):
        
        for index, rating in enumerate(self.ratings):
            known = np.where(self.known_indices[index] == 1)[0]
            
            if len(known) > 0:
                rating[known] -= 3
                rating[known] /= 2
    
    def subtract_mean(self):
        
        for index, rating in enumerate(self.ratings):
            
            known = np.where(self.known_indices[index] == 1)[0]
            
            if len(known) > 0:
                mean = rating[known].mean()
                rating[known] -= mean

In [16]:
train_dataset = RatingsDataset(train_sparse_matrix)
print("Input: ", train_dataset[0]['inp'], "Input Length: ", len(train_dataset[0]['inp']))
print("Output: ", train_dataset[0]['out'], "Output Length: ", len(train_dataset[5]['out']))

test_dataset = RatingsDataset(test_sparse_matrix)
print("Input: ", test_dataset[0]['inp'], "Input Length: ", len(test_dataset[5]['inp']))
print("Output: ", test_dataset[0]['out'], "Output Length: ", len(test_dataset[5]['out']))

Input:  tensor([ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000, -0.5718]) Input Length:  6040
Output:  tensor([ 0.4282,  0.0000,  0.0000,  ...,  0.0000,  0.0000, -0.5718]) Output Length:  6040
Input:  tensor([0., 0., 0.,  ..., 0., 0., 0.]) Input Length:  6040
Output:  tensor([0., 0., 0.,  ..., 0., 0., 0.]) Output Length:  6040


In [17]:
print(train_dataset[2])

{'inp': tensor([0., 0., 0.,  ..., 0., 0., 0.]), 'out': tensor([0., 0., 0.,  ..., 0., 0., 0.]), 'known_indices': array([0., 0., 0., ..., 0., 0., 0.]), 'masked_indices': array([0., 0., 0., ..., 0., 0., 0.])}


# Network

In [18]:
ALPHA = 1
BETA = 0.5

In [19]:
import torch.nn as nn


inputSize = 6040
class Denoising_Model(nn.Module):
    def __init__(self):
        super(Denoising_Model,self).__init__()
        self.encoder=nn.Sequential(
                      nn.Linear(inputSize, 770),  # There are 6040 users in movieLens-1M
                      nn.Tanh()
                      )

        self.decoder=nn.Sequential(
                      nn.Linear(770, inputSize),
                      nn.Tanh()
                      )


    def forward(self,x):
        x=self.encoder(x)
        x=self.decoder(x)
        return x
    
network = Denoising_Model()
print(network)

Denoising_Model(
  (encoder): Sequential(
    (0): Linear(in_features=6040, out_features=770, bias=True)
    (1): Tanh()
  )
  (decoder): Sequential(
    (0): Linear(in_features=770, out_features=6040, bias=True)
    (1): Tanh()
  )
)


In [20]:
def denosingLoss(output, target, known, masked):

        loss = 0

        for index, out in enumerate(output):
            out = output[index]

            known_indices = np.where(known[index] == 1)[0]
            masked_indices = np.where(masked[index] == 1)[0]
            known_masked_diff = list(set(known_indices) - set(masked_indices))        

            masked_output = output[index][masked_indices]
            masked_target = target[index][masked_indices]
            known_masked_diff_output = output[index][known_masked_diff]
            known_masked_diff_target = target[index][known_masked_diff]

            if len(masked_output) > 0 and len(known_masked_diff_output) > 0:
                loss += ALPHA * torch.sum(torch.square(torch.sub(masked_output, masked_target))) \
                        + BETA * torch.sum(torch.square(torch.sub(known_masked_diff_output, known_masked_diff_target)))

        return loss / BATCH_SIZE

In [21]:
print(len(train_dataset))

3952


In [22]:
def init_weights(m):
    if type(m) == nn.Linear:
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0)

In [23]:
def train(model, masked_ratings, target, optimizer, criterion, known, masked):   
#     masked_ratings,target=masked_ratings.to(device),target.to(device)
    
    # Forward Pass
    output = model(masked_ratings)
    loss = denosingLoss(output, target, known, masked)
        
    #Backward Pass---------------------
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
   # scheduler.step()

    return loss, output


In [24]:
from torch.optim import Adam
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch import optim
from torch.utils.data import DataLoader

# if torch.cuda.is_available() == True:
#     device="cuda:0"
# else:
device ="cpu"

EPOCHS = 20
BATCH_SIZE = 35

train_loader = DataLoader(dataset = train_dataset, batch_size = BATCH_SIZE, shuffle = True)
model = Denoising_Model().to(device)
init_weights(model)
criterion = nn.MSELoss()
# optimizer=Adam(model.parameters(),lr=0.07,weight_decay=0.05)
optimizer = optim.SGD(model.parameters(), lr = 0.07, weight_decay = 0.05)
#exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size = 1, gamma = 0.3)

epoch_loss = 0
epoch_loss = []

for epoch in range(EPOCHS): 
    acc_epoch_loss = 0
    
    for bidx, batch in enumerate(train_loader):
        
        x_train = batch['inp']
        y_train = batch['out']
        known = batch['known_indices']
        masked = batch['masked_indices']
        
        loss, predictions = train(model, x_train, y_train, optimizer, criterion, known, masked)
        acc_epoch_loss += loss
    
    epoch_loss.append(acc_epoch_loss / len(train_dataset))        
    print('Epoch {} Loss : {}'.format((epoch+1), epoch_loss[epoch] ))
    


Epoch 1 Loss : 0.8248584866523743
Epoch 2 Loss : 0.6867635846138
Epoch 3 Loss : 0.6472795009613037
Epoch 4 Loss : 0.6294000148773193
Epoch 5 Loss : 0.6098825335502625
Epoch 6 Loss : 0.5980748534202576
Epoch 7 Loss : 0.5899688601493835
Epoch 8 Loss : 0.5837532877922058
Epoch 9 Loss : 0.5784732103347778
Epoch 10 Loss : 0.5743650197982788
Epoch 11 Loss : 0.5706970691680908
Epoch 12 Loss : 0.5676262378692627
Epoch 13 Loss : 0.5658714771270752
Epoch 14 Loss : 0.5627363920211792
Epoch 15 Loss : 0.5616626143455505
Epoch 16 Loss : 0.5594909191131592
Epoch 17 Loss : 0.5623922348022461
Epoch 18 Loss : 0.5585438013076782
Epoch 19 Loss : 0.5576106309890747
Epoch 20 Loss : 0.5572994947433472


In [26]:
test_batch = []
train_batch = []
known_batch = []
all_ratings_cnt = 0
batch_size = 12
minibatches = []

for idx, test in enumerate(test_dataset):
    known_ratings_cnt = len(np.where(test['known_indices'] == 1)[0])
    
    if known_ratings_cnt > 0:
        
        print(idx)
        print(known_ratings_cnt, len(np.where(test['known_indices'] == 1)[0]), len(np.where(train_dataset[idx]['known_indices'] == 1)[0]))
        print(len(np.where(test['known_indices'] == 1)[0]) + len(np.where(train_dataset[idx]['known_indices'] == 1)[0]))
        
        all_ratings_cnt += known_ratings_cnt
        train_batch.append(train_dataset[idx]['inp'])
        known_batch.append(torch.tensor(test['known_indices']))
        test_batch.append(test['inp'])
        
    if len(test_batch) >= batch_size:
        minibatches.append((torch.stack(train_batch), torch.stack(test_batch), torch.stack(known_batch)))
        test_batch.clear()
        train_batch.clear()
        known_batch.clear()
        
print(minibatches[0])
print(len(minibatches))
       

0
217 217 1860
2077
1
68 68 633
701
2
55 55 423
478
3
14 14 156
170
4
32 32 264
296
5
94 94 846
940
6
35 35 423
458
7
7 7 61
68
8
5 5 97
102
9
75 75 813
888
10
107 107 926
1033
11
22 22 138
160
12
8 8 91
99
13
22 22 131
153
14
18 18 128
146
15
70 70 612
682
16
79 79 756
835
17
17 17 140
157
18
29 29 360
389
19
23 23 137
160
20
145 145 1211
1356
21
40 40 338
378
22
13 13 113
126
23
62 62 562
624
24
100 100 880
980
25
7 7 93
100
26
5 5 56
61
27
20 20 159
179
28
43 43 360
403
29
11 11 63
74
30
13 13 128
141
31
147 147 1364
1511
32
1 1 3
4
33
183 183 1568
1751
34
4 4 66
70
35
83 83 845
928
36
1 1 7
8
37
4 4 24
28
38
123 123 1239
1362
39
4 4 26
30
40
31 31 211
242
41
21 21 200
221
42
15 15 151
166
43
33 33 278
311
44
56 56 488
544
45
11 11 155
166
46
117 117 1020
1137
47
40 40 342
382
48
5 5 22
27
49
174 174 1609
1783
51
43 43 388
431
53
3 3 38
41
54
3 3 42
45
55
2 2 7
9
56
6 6 93
99
57
46 46 455
501
58
1 1 7
8
59
34 34 323
357
60
4 4 63
67
61
53 53 488
541
62
12 12 99
111
63
6 6 73
79
64
2

618
610
27 27 189
216
611
7 7 72
79
612
9 9 73
82
613
2 2 7
9
614
5 5 62
67
615
28 28 250
278
617
2 2 7
9
618
3 3 29
32
620
6 6 36
42
625
1 1 18
19
626
11 11 110
121
627
53 53 432
485
629
1 1 6
7
630
7 7 68
75
631
1 1 12
13
632
2 2 18
20
633
1 1 2
3
634
6 6 71
77
636
11 11 129
140
637
3 3 16
19
638
12 12 66
78
639
7 7 64
71
640
1 1 0
1
644
2 2 18
20
646
56 56 385
441
647
167 167 1360
1527
648
5 5 11
16
649
12 12 77
89
651
1 1 8
9
652
54 54 558
612
655
4 4 35
39
660
54 54 471
525
661
11 11 117
128
662
23 23 241
264
663
2 2 6
8
664
3 3 32
35
667
3 3 44
47
668
3 3 20
23
669
3 3 53
56
670
42 42 414
456
671
2 2 0
2
672
63 63 500
563
673
42 42 344
386
677
31 31 275
306
679
11 11 53
64
680
2 2 13
15
681
2 2 2
4
684
6 6 24
30
687
15 15 198
213
690
12 12 109
121
691
12 12 94
106
693
18 18 123
141
694
15 15 120
135
696
6 6 79
85
699
20 20 125
145
701
3 3 24
27
703
6 6 36
42
704
3 3 33
36
705
1 1 1
2
706
26 26 249
275
707
56 56 623
679
708
16 16 128
144
709
6 6 45
51
710
4 4 58
62
711
1 1 1
2
713

1419
1304
14 14 107
121
1305
13 13 123
136
1306
162 162 1406
1568
1309
8 8 35
43
1310
1 1 6
7
1311
2 2 27
29
1312
3 3 10
13
1316
3 3 37
40
1319
104 104 821
925
1320
45 45 504
549
1321
4 4 28
32
1322
6 6 37
43
1323
1 1 21
22
1324
3 3 20
23
1325
6 6 50
56
1326
27 27 328
355
1327
1 1 24
25
1328
2 2 34
36
1329
11 11 105
116
1330
6 6 44
50
1331
12 12 90
102
1332
75 75 658
733
1333
37 37 301
338
1334
1 1 20
21
1335
7 7 48
55
1336
5 5 59
64
1338
76 76 596
672
1339
21 21 195
216
1340
6 6 47
53
1341
34 34 228
262
1342
63 63 624
687
1343
31 31 330
361
1344
55 55 512
567
1345
23 23 151
174
1346
49 49 402
451
1347
27 27 211
238
1348
1 1 13
14
1349
34 34 375
409
1350
5 5 30
35
1351
11 11 83
94
1352
14 14 174
188
1353
31 31 236
267
1354
11 11 66
77
1355
123 123 1187
1310
1356
59 59 489
548
1357
118 118 1059
1177
1358
20 20 202
222
1360
13 13 146
159
1361
13 13 87
100
1362
16 16 96
112
1364
8 8 50
58
1365
13 13 95
108
1366
36 36 328
364
1368
2 2 3
5
1369
111 111 1024
1135
1370
74 74 718
792
1371
103 

795
1994
21 21 161
182
1995
12 12 100
112
1996
64 64 821
885
1997
12 12 101
113
1998
15 15 112
127
1999
158 158 1469
1627
2000
102 102 909
1011
2001
99 99 703
802
2002
65 65 711
776
2003
47 47 325
372
2004
74 74 642
716
2005
98 98 885
983
2006
4 4 61
65
2007
4 4 11
15
2008
50 50 372
422
2009
43 43 345
388
2010
124 124 1034
1158
2011
117 117 1031
1148
2012
65 65 557
622
2013
17 17 163
180
2014
37 37 343
380
2015
10 10 99
109
2016
10 10 152
162
2017
60 60 529
589
2018
66 66 562
628
2019
71 71 702
773
2020
96 96 693
789
2021
24 24 230
254
2022
83 83 691
774
2023
15 15 133
148
2024
19 19 202
221
2025
16 16 145
161
2026
15 15 114
129
2027
242 242 2411
2653
2028
10 10 66
76
2030
2 2 35
37
2031
5 5 35
40
2032
17 17 183
200
2033
25 25 270
295
2034
9 9 48
57
2035
12 12 84
96
2036
4 4 50
54
2037
18 18 180
198
2039
10 10 118
128
2040
13 13 132
145
2041
23 23 170
193
2042
24 24 134
158
2043
6 6 46
52
2044
4 4 51
55
2045
44 44 492
536
2047
9 9 122
131
2048
2 2 35
37
2049
13 13 91
104
2050
5 5 96
10

47 47 486
533
2598
158 158 1364
1522
2599
35 35 375
410
2604
57 57 720
777
2605
17 17 190
207
2606
5 5 63
68
2608
5 5 62
67
2609
3 3 38
41
2610
13 13 133
146
2611
20 20 116
136
2612
35 35 246
281
2613
7 7 37
44
2614
24 24 165
189
2615
58 58 566
624
2616
113 113 894
1007
2617
5 5 41
46
2619
4 4 32
36
2620
7 7 62
69
2621
34 34 381
415
2622
1 1 3
4
2623
17 17 85
102
2624
10 10 56
66
2625
4 4 31
35
2626
1 1 13
14
2627
197 197 2053
2250
2628
10 10 134
144
2629
7 7 34
41
2631
1 1 10
11
2632
19 19 143
162
2633
10 10 83
93
2634
4 4 27
31
2635
1 1 12
13
2636
4 4 29
33
2637
5 5 31
36
2638
15 15 148
163
2639
117 117 1105
1222
2640
81 81 765
846
2641
40 40 471
511
2642
37 37 295
332
2643
22 22 216
238
2644
16 16 86
102
2645
4 4 35
39
2646
6 6 46
52
2647
49 49 357
406
2648
9 9 56
65
2649
2 2 31
33
2650
10 10 72
82
2651
5 5 46
51
2652
4 4 38
42
2653
11 11 123
134
2654
5 5 42
47
2655
10 10 52
62
2656
106 106 1127
1233
2657
1 1 10
11
2658
3 3 43
46
2659
10 10 125
135
2660
9 9 85
94
2661
69 69 496
565


886
3210
4 4 62
66
3212
24 24 208
232
3213
11 11 86
97
3214
1 1 10
11
3215
1 1 2
3
3216
5 5 86
91
3217
4 4 28
32
3218
48 48 399
447
3220
3 3 26
29
3221
4 4 26
30
3222
2 2 27
29
3223
7 7 70
77
3224
11 11 111
122
3229
7 7 71
78
3231
1 1 4
5
3234
8 8 63
71
3235
1 1 1
2
3237
28 28 239
267
3238
7 7 30
37
3239
3 3 30
33
3240
9 9 48
57
3242
26 26 267
293
3243
42 42 280
322
3245
40 40 435
475
3246
79 79 587
666
3247
18 18 184
202
3248
37 37 303
340
3249
31 31 335
366
3250
39 39 237
276
3251
67 67 488
555
3252
106 106 1014
1120
3253
45 45 388
433
3254
121 121 978
1099
3255
100 100 952
1052
3256
63 63 563
626
3257
35 35 304
339
3258
38 38 365
403
3259
45 45 435
480
3260
49 49 380
429
3261
14 14 149
163
3262
60 60 607
667
3263
53 53 459
512
3264
34 34 293
327
3265
20 20 134
154
3266
44 44 439
483
3267
17 17 162
179
3268
41 41 436
477
3269
16 16 184
200
3270
35 35 293
328
3271
17 17 154
171
3272
67 67 510
577
3273
40 40 398
438
3274
10 10 82
92
3275
1 1 45
46
3280
5 5 44
49
3282
2 2 13
15
3284
46 

30
3843
38 38 370
408
3844
3 3 25
28
3845
11 11 127
138
3846
1 1 24
25
3848
5 5 38
43
3849
3 3 10
13
3850
3 3 37
40
3851
23 23 185
208
3852
1 1 12
13
3853
1 1 19
20
3854
4 4 35
39
3856
5 5 55
60
3857
4 4 65
69
3858
8 8 74
82
3859
2 2 14
16
3860
27 27 214
241
3861
2 2 18
20
3862
73 73 585
658
3863
14 14 129
143
3864
12 12 135
147
3866
1 1 11
12
3867
80 80 612
692
3868
47 47 384
431
3869
2 2 47
49
3870
36 36 269
305
3871
13 13 89
102
3872
30 30 250
280
3873
3 3 21
24
3874
1 1 4
5
3875
1 1 3
4
3876
12 12 170
182
3877
1 1 11
12
3878
8 8 136
144
3881
20 20 214
234
3882
2 2 13
15
3883
3 3 37
40
3884
1 1 25
26
3885
3 3 37
40
3888
11 11 124
135
3889
1 1 0
1
3892
51 51 508
559
3893
2 2 19
21
3894
5 5 89
94
3895
19 19 160
179
3896
101 101 893
994
3897
2 2 43
45
3899
4 4 29
33
3900
10 10 96
106
3901
1 1 23
24
3902
7 7 45
52
3904
2 2 6
8
3905
1 1 31
32
3907
5 5 87
92
3908
6 6 63
69
3909
24 24 176
200
3910
65 65 592
657
3911
1 1 39
40
3912
3 3 29
32
3913
6 6 64
70
3914
12 12 115
127
3915
41 41 396


In [32]:
from sklearn.metrics import mean_squared_error
from math import sqrt

from torch.nn import MSELoss


def test():
    criterion = nn.MSELoss()
    noRatings = 0
    input, target, minibatch = {}, {}, {}

    
    mse = 0        
    with torch.no_grad():
        for bidx, batch in enumerate(minibatches):
                train_batch = batch[0]
                test_batch = batch[1]
                known_batch = batch[2]
                y_predictions = model(train_batch)

                for idx, test in enumerate(test_batch):
                    y_true = test[np.where(known_batch[idx].numpy() == 1)[0]]
                    y_pred = y_predictions[idx][np.where(known_batch[idx].numpy() == 1)[0]]
                    
                   # print(torch.sum((y_pred - y_true)**2))
                    #print(torch.sum(y_pred - y_true)**2)
                    mse += torch.sum((y_pred - y_true)**2)

    print(all_ratings_cnt)  
    mse = mse / all_ratings_cnt
    rmse = sqrt(mse)
    print(rmse  * 2)
    
test()


100021
0.7592135976054041
