# AutoEncoder

## Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

## Importing the dataset

In [3]:
movies = pd.read_csv('ml-1m/movies.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')

In [5]:
movies.head()

Unnamed: 0,0,1,2
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
users = pd.read_csv('ml-1m/users.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')

In [9]:
users.head()

Unnamed: 0,0,1,2,3,4
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [11]:
ratings = pd.read_csv('ml-1m/ratings.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')

In [12]:
ratings.head()

Unnamed: 0,0,1,2,3
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


## Preparing the training set and test set

In [14]:
training_set = pd.read_csv('ml-100k/u1.base', delimiter = '\t')
training_set = np.array(training_set, dtype = 'int')

In [15]:
training_set

array([[        1,         2,         3, 876893171],
       [        1,         3,         4, 878542960],
       [        1,         4,         3, 876893119],
       ...,
       [      943,      1188,         3, 888640250],
       [      943,      1228,         3, 888640275],
       [      943,      1330,         3, 888692465]])

In [16]:
test_set = pd.read_csv('ml-100k/u1.test', delimiter = '\t')
test_set = np.array(test_set, dtype = 'int')

In [22]:
test_set

array([[        1,        10,         3, 875693118],
       [        1,        12,         5, 878542960],
       [        1,        14,         5, 874965706],
       ...,
       [      459,       934,         3, 879563639],
       [      460,        10,         3, 882912371],
       [      462,       682,         5, 886365231]])

## Getting the number of users and movies

In [25]:
num_users = len(np.unique(np.concatenate((training_set[:, 0], test_set[:, 0]), axis = 0)))
num_movies = len(np.unique(np.concatenate((training_set[:, 1], test_set[:, 1]), axis = 0)))

In [27]:
num_users, num_movies

(943, 1682)

## Converting the data into an array with users in rows and movies in columns

In [29]:
def convert(data):
    new_data = []
    for id_users in range(1, num_users+1):
        id_movies = data[:, 1][data[:, 0] == id_users]
        id_ratings = data[:, 2][data[:, 0] == id_users]
        ratings = np.zeros(num_movies)
        ratings[id_movies - 1] = id_ratings
        new_data.append(list(ratings))
    return new_data        

In [31]:
training_set = convert(training_set)
test_set = convert(test_set)

## Converting the data into Torch tensors

In [33]:
training_set = torch.FloatTensor(training_set)
test_set = torch.FloatTensor(test_set)

## Creating the architecture of the AutoEncoder NN

In [35]:
class SAE(nn.Module):
    def __init__(self, ):
        super(SAE, self).__init__()
        self.fc1 = nn.Linear(num_movies, 20)
        self.fc2 = nn.Linear(20, 10)
        self.fc3 = nn.Linear(10, 20)
        self.fc4 = nn.Linear(20, num_movies)
        self.activation = nn.Sigmoid()
    def forward(self, x):
        x = self.activation(self.fc1(x))
        x = self.activation(self.fc2(x))
        x = self.activation(self.fc3(x))
        x = self.fc4(x)
        return x

sae = SAE()
criterion = nn.MSELoss()
optimiser = optim.RMSprop(sae.parameters(), lr = 0.01, weight_decay = 0.5)

## Training the SAE

In [41]:
num_epoch = 200
for epoch in range(1, num_epoch+1):
    train_loss = 0
    s = 0.
    for id_user in range(num_users):
        input = Variable(training_set[id_user]).unsqueeze(0)
        target = input.clone()
        if torch.sum(target.data > 0) > 0:
            output = sae(input)
            target.require_grad = False
            output[target == 0] = 0
            loss = criterion(output, target)
            mean_corrector = num_movies/float(torch.sum(target.data > 0) + 1e-10)
            loss.backward()
            train_loss += np.sqrt(loss.item()*mean_corrector)
            s += 1.
            optimiser.step()
    print(f'epoch : {epoch}  train_loss : {train_loss/s}')        

epoch : 1  train_loss : 1.7593828671666991
epoch : 2  train_loss : 1.0964754504741405
epoch : 3  train_loss : 1.053321187933271
epoch : 4  train_loss : 1.0383069534206706
epoch : 5  train_loss : 1.0309720287670403
epoch : 6  train_loss : 1.0264438682527712
epoch : 7  train_loss : 1.023741447343222
epoch : 8  train_loss : 1.0221681979511397
epoch : 9  train_loss : 1.0204873573413666
epoch : 10  train_loss : 1.0195788959338699
epoch : 11  train_loss : 1.0187924639513155
epoch : 12  train_loss : 1.0182754884355214
epoch : 13  train_loss : 1.0178183236578386
epoch : 14  train_loss : 1.0177251280094095
epoch : 15  train_loss : 1.0171427900228798
epoch : 16  train_loss : 1.0166937614785676
epoch : 17  train_loss : 1.0166816171591397
epoch : 18  train_loss : 1.016477817393539
epoch : 19  train_loss : 1.016143568677288
epoch : 20  train_loss : 1.0162166526765168
epoch : 21  train_loss : 1.015962672929332
epoch : 22  train_loss : 1.0158967194070985
epoch : 23  train_loss : 1.0159081308846782
ep

## Testing the SAE

In [53]:
test_loss = 0
s = 0.
for id_user in range(num_users):
        input = Variable(training_set[id_user]).unsqueeze(0)
        target = Variable(test_set[id_user]).unsqueeze(0) 
        if torch.sum(target.data > 0) > 0:
            output = sae(input)
            target.require_grad = False
            output[target == 0] = 0
            loss = criterion(output, target)
            mean_corrector = num_movies/float(torch.sum(target.data > 0) + 1e-10)
            test_loss += np.sqrt(loss.item()*mean_corrector)
            s += 1.
print(f'test_loss : {test_loss/s}') 

test_loss : 0.9496223413206735


In [59]:
user_id = 1
movie_title = movies.iloc[:num_movies, 1:2]
user_rating = training_set.data.numpy()[user_id, :].reshape(-1,1)
user_target = test_set.data.numpy()[user_id, :].reshape(-1,1)
 
user_input = Variable(training_set[user_id]).unsqueeze(0)
predicted = sae(user_input)
predicted = predicted.data.numpy().reshape(-1,1)
 
# Join all info in one dataset
result_array = np.hstack([movie_title, user_target, predicted])
result_array = result_array[result_array[:, 1] > 0]
result_df = pd.DataFrame(data=result_array, columns=['Movie', 'Target Rating', 'Predicted'])

In [61]:
result_df

Unnamed: 0,Movie,Target Rating,Predicted
0,Balto (1995),4.0,3.34195
1,Ace Ventura: When Nature Calls (1995),3.0,4.214255
2,"Usual Suspects, The (1995)",5.0,4.393968
3,Interview with the Vampire (1994),5.0,4.45011
4,Kiss of Death (1995),4.0,3.868694
5,Nobody's Fool (1994),4.0,3.640558
6,Nell (1994),3.0,3.206133
7,New Jersey Drive (1995),3.0,3.595693
8,Outbreak (1995),3.0,3.068423
9,"Perez Family, The (1995)",4.0,3.617159
