In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

In [2]:
movies = pd.read_csv('movies.dat',sep='::',header=None,engine='python',encoding = 'latin-1')

In [3]:
users = pd.read_csv('users.dat',sep='::',header=None,engine='python',encoding = 'latin-1')
print(users)

         0  1   2   3      4
0        1  F   1  10  48067
1        2  M  56  16  70072
2        3  M  25  15  55117
3        4  M  45   7  02460
4        5  M  25  20  55455
5        6  F  50   9  55117
6        7  M  35   1  06810
7        8  M  25  12  11413
8        9  M  25  17  61614
9       10  F  35   1  95370
10      11  F  25   1  04093
11      12  M  25  12  32793
12      13  M  45   1  93304
13      14  M  35   0  60126
14      15  M  25   7  22903
15      16  F  35   0  20670
16      17  M  50   1  95350
17      18  F  18   3  95825
18      19  M   1  10  48073
19      20  M  25  14  55113
20      21  M  18  16  99353
21      22  M  18  15  53706
22      23  M  35   0  90049
23      24  F  25   7  10023
24      25  M  18   4  01609
25      26  M  25   7  23112
26      27  M  25  11  19130
27      28  F  25   1  14607
28      29  M  35   7  33407
29      30  F  35   7  19143
...    ... ..  ..  ..    ...
6010  6011  M  35  15  80538
6011  6012  M  35  15  02871
6012  6013  F 

In [4]:
ratings = pd.read_csv('ratings.dat',sep='::',header=None,engine='python',encoding = 'latin-1')

In [5]:
#Preparing the training set and the test set
tr_set= pd.read_csv('u1.base',delimiter='\t')
tr_set=np.array(tr_set,dtype='int')
te_set= pd.read_csv('u1.test',delimiter='\t')
te_set=np.array(tr_set,dtype='int')

In [6]:
#getting the numbers of users and movies
nb_users=int(max(max(tr_set[:,0]),max(te_set[:,0])))
nb_movies=int(max(max(tr_set[:,1]),max(te_set[:,1])))

In [7]:
#Converting the data into an array with users in lines and movies in columns
def convert(data):
    new_data=[]
    for id_users in range(1,nb_users+1):
        id_movies=data[:,1][data[:,0]==id_users]
        id_ratings=data[:,2][data[:,0]==id_users]
        ratings=np.zeros(nb_movies)
        ratings[id_movies-1] = id_ratings
        new_data.append(list(ratings))
    return new_data
tr_set=convert(tr_set)
te_set=convert(te_set)

In [8]:
#Converting the data into torch sensors
tr_set = torch.FloatTensor(tr_set)
te_set=torch.FloatTensor(te_set)

In [9]:
#Creating the architecture of the neural network
class SAE(nn.Module):
    def __init__(self,):
        super(SAE, self).__init__()
        self.fc1 = nn.Linear(nb_movies, 20)
        self.fc2 = nn.Linear(20,10)
        self.fc3 = nn.Linear(10,20)
        self.fc4 = nn.Linear(20,nb_movies)
        self.activation = nn.Sigmoid()
    def forward(self, x):
        x = self.activation(self.fc1(x))
        x = self.activation(self.fc2(x))
        x = self.activation(self.fc3(x))
        x = self.fc4(x)
        return x
sae = SAE()
criterion = nn.MSELoss()
optimizer = optim.RMSprop(sae.parameters(), lr = 0.01, weight_decay = 0.5)

In [10]:
import torch.tensor as tf
#Training the SAE
nb_epoch = 200
for epoch in range (1, nb_epoch +1):
    train_loss = 0
    s=0.
    for id_user in range(nb_users):
        input = Variable(tr_set[id_user]).unsqueeze(0)
        target = input.clone()
        if torch.sum(target.data > 0) > 0:
            output = sae(input)
            target.require_grad = False
            output[target == 0] = 0
            loss = criterion(output, target)
            mean_corrector = nb_movies/float(torch.sum(target.data>0) + 1e-10)
            loss.backward()
            train_loss += np.sqrt(loss.data*mean_corrector)
            s += 1.
            optimizer.step()
    print('epoch: '+str(epoch)+' loss'+str(train_loss/s))
            

epoch: 1 losstensor(1.7712)
epoch: 2 losstensor(1.0967)
epoch: 3 losstensor(1.0534)
epoch: 4 losstensor(1.0386)
epoch: 5 losstensor(1.0310)
epoch: 6 losstensor(1.0266)
epoch: 7 losstensor(1.0239)
epoch: 8 losstensor(1.0219)
epoch: 9 losstensor(1.0207)
epoch: 10 losstensor(1.0196)
epoch: 11 losstensor(1.0189)
epoch: 12 losstensor(1.0184)
epoch: 13 losstensor(1.0178)
epoch: 14 losstensor(1.0173)
epoch: 15 losstensor(1.0172)
epoch: 16 losstensor(1.0171)
epoch: 17 losstensor(1.0168)
epoch: 18 losstensor(1.0164)
epoch: 19 losstensor(1.0165)
epoch: 20 losstensor(1.0160)
epoch: 21 losstensor(1.0163)
epoch: 22 losstensor(1.0160)
epoch: 23 losstensor(1.0158)
epoch: 24 losstensor(1.0159)
epoch: 25 losstensor(1.0156)
epoch: 26 losstensor(1.0157)
epoch: 27 losstensor(1.0155)
epoch: 28 losstensor(1.0150)
epoch: 29 losstensor(1.0124)
epoch: 30 losstensor(1.0118)
epoch: 31 losstensor(1.0094)
epoch: 32 losstensor(1.0090)
epoch: 33 losstensor(1.0055)
epoch: 34 losstensor(1.0056)
epoch: 35 losstensor(1.

In [13]:
#testing the SAE
test_loss = 0
s=0.
for id_user in range(nb_users):
    input = Variable(tr_set[id_user]).unsqueeze(0)
    target = Variable(te_set[id_user])
    if torch.sum(target.data > 0) > 0:
        output = sae(input)
        target.require_grad = False
        output[target == 0] = 0
        loss = criterion(output, target)
        mean_corrector = nb_movies/float(torch.sum(target.data>0) + 1e-10)
        test_loss += np.sqrt(loss.data*mean_corrector)
        s += 1.
        optimizer.step()
print('epoch: '+str(epoch)+' loss'+str(test_loss/s))

IndexError: The shape of the mask [1682] at index 0does not match the shape of the indexed tensor [1, 1682] at index 0