# AutoEncoders 

Predict movie rating 1-5

## Import Libraries

In [1]:
import numpy as np                      # To work with arrays
import pandas as pd                     # To import the data set and create the train & test set
import torch
import torch.nn as nn                   # To implement neural networks.
import torch.nn.parallel                # For the parallel computations.
import torch.optim as optim             # For the optimizers
import torch.utils.data
from torch.autograd import Variable     # For stochastic gradient descent in the set - Autograd is an engine to calculate derivatives

# Import dataset

In [2]:
!cd '../../Part5 Boltzmann Machines (BM)/'
!dir 

The system cannot find the path specified.


 Volume in drive C is Windows
 Volume Serial Number is A2C9-9F3D

 Directory of C:\Users\Work Is Fun\Desktop\study\courses\Udemy-A_ZdeepLearning 2020\Excercise-Deep_Learning\Vol2-Unsupervised Deep Learning\Part6 AutoEncoders (AE)\Section 22 - Building an AutoEncoder

08/29/2020  11:58 AM    <DIR>          .
08/29/2020  11:58 AM    <DIR>          ..
10/20/2017  07:02 PM             6,148 .DS_Store
08/22/2020  01:05 PM    <DIR>          .ipynb_checkpoints
08/29/2020  11:58 AM            17,975 Autoencoder_Tut12.ipynb
               2 File(s)         24,123 bytes
               3 Dir(s)  596,839,075,840 bytes free


In [3]:
filepath = r'C:\Users\Work Is Fun\Desktop\study\courses\Udemy-A_ZdeepLearning 2020\Excercise-Deep_Learning\Vol2-Unsupervised Deep Learning\Part5 Boltzmann Machines (BM)\Sec19 - Building a BM\\'
movies = pd.read_csv(filepath+"ml-1m\\movies.dat", sep='::', header=None, engine='python', encoding='latin-1' )
users = pd.read_csv(filepath+"ml-1m\\users.dat", sep='::', header=None, engine='python', encoding='latin-1' )
ratings = pd.read_csv(filepath+"ml-1m\\ratings.dat", sep='::', header=None, engine='python', encoding='latin-1' )

## Preparing training & test set

In [4]:
training_set = pd.read_csv(filepath+'ml-100k\\u1.base', delimiter='\t', header=None)
test_set = pd.read_csv(filepath+'ml-100k\\u1.test', delimiter='\t', header=None)

In [6]:
# Convert the dataframes to array
#training_set = np.array(training_set, dtype=int)
training_set = np.array(training_set)
test_set = np.array(test_set)

In [5]:
type(training_set)

pandas.core.frame.DataFrame

In [16]:
## Getting total no. of users &movies
nb_users = int(max(max(training_set[:][0]), max(test_set[:][0])))
nb_movies = int(max(max(training_set[:][1]), max(test_set[:][1])))

In [20]:
# Prepare dataset where rows are the users,the columns are the movies, the cells are the ratings.
def prepare_dataset(df:np.ndarray) -> list:
    dataset = []
    
    for user_id in range(1,nb_users+1):
        movie_ids = df[:][1][df[:][0]== user_id]
        movie_ratings = df[:][2][df[:][0]== user_id]
        
        user_ratings = np.zeros(nb_movies)
        user_ratings[movie_ids-1] = movie_ratings 
        
        dataset.append(list(user_ratings))
    
    return dataset

In [21]:
training_set = prepare_dataset(training_set)  # Convert training_set into desired array/matrix format
test_set = prepare_dataset(test_set)          # Convert test_set into desired array/matrix format

In [22]:
# Convert the array/list to torch tensors
training_set = torch.FloatTensor(training_set)  # FloatTensor(list_of_list) class creates a multi dimensional array/matrix with element data type as float
test_set = torch.FloatTensor(test_set)

In [23]:
# Creating Architecture of Neural Network - Stacked AutoEncoder
class SAE(nn.Module):
    '''
    The parent class Module contains all the tools to make an auto-encoder - it contains an optimizer function, a criterion
    & tools to make full connections between the layers.
    In Stacked AutoEncoders, we have several hidden layers i.e. we will have several encodings of the input vector features.
    '''
    
    # Initialize the object of the class - defines the architecture of our auto encoders
    def __init__(self):
        
        # Get all the inherited classes and methods of the parent class
        super(SAE,self).__init__() 
        
        # Define the neural architecture by choosing the number of layers & the hidden neurons in each of these hidden layers.
        '''
        In auto-encoders, first We're encoding the input vector into a shorter vector. This will take place in the 
        first hidden layer.
        So we first establish a connection between the input vector features (the ratings of all the movies for one 
        specific user) & the first hidden layer, which is a shorter vector than the input vector.        
        '''
        # fcN = Fully Connected layer N
        # nn.Linear(in_features, out_features, bias=True) : Applies a linear transformation to the incoming data: :math:`y = xA^T + b`
        # ----- Encoding Layers -------
        self.fc1 = nn.Linear(nb_movies, 20)  # no. of neuron in input layer = nb_novies & no. of hidden neurons in first hidden layer = 20 (by r&d)
        self.fc2 = nn.Linear(20,10)  # no. of hidden neurons in first hidden layer = 20 & no. of hidden neurons in 2nd hidden layer = 10 (by r&d)
        
        # ----- Decoding Layers -------
        self.fc3 = nn.Linear(10,20)  # no. of hidden neurons in 2nd hidden layer = 10 & no. of hidden neurons in 3rd hidden layer = no. of hidden neurons in first hidden layer
        self.fc4 = nn.Linear(20,nb_movies)  # no. of hidden neurons in 3rd hidden layer = no. of hidden neurons in first hidden layer & no. of output neurons = no. of input neurons
        
        # Define activation function - that will, activate the neurons when the observation goes into the network.
        self.activation = nn.Sigmoid()   # You can use rectifier activation function or the sigmoid activation function
        
        
    # Perform the action of an auto-encoder - encoding and decoding.
    '''
    This function will not only perform the action of encoding & decoding, but also will apply to different activation 
    functions inside the full connections.
    The main purpose of making this function is that it will return in the end the vector of predicted ratings that 
    we will compare to the vector of real ratings, i.e. the input vector.
    '''
    def forward(self, vector_x):
        
        # Encoding
        vector_x = self.activation(self.fc1(vector_x))  # Update input vector to 1st hidden layer vector of neurons
        vector_x = self.activation(self.fc2(vector_x))  # Update 1st hidden layer vector of neurons to 2nd hidden layer vector of neurons
        
        # Decoding
        vector_x = self.activation(self.fc3(vector_x))  # Update 2nd hidden layer vector of neurons to 3rd hidden layer vector of neurons
        vector_x = self.fc4(vector_x)                   # Update 3rd hidden layer vector of neurons to output vector
        
        return vector_x

Custom/User-defined Objects are callable. Refer [link](https://dbader.org/blog/python-first-class-functions#:~:text=Object%27s%20aren%27t%20functions%20in,like%20functions%20in%20many%20cases.&text=Behind%20the%20scenes%2C%20%E2%80%9Ccalling%E2%80%9D,object%27s%20__call__%20method.)

In [24]:
sae = SAE()                  # Create autoencoder
creterion = nn.MSELoss()     # Get the creterion to measure mean squared error
optimizer = optim.RMSprop(sae.parameters(), lr=0.01, weight_decay=0.5)  # Define optimizer to apply stocastic gradient descent to update the 
                             # different weights in order to reduce the error at each epoch.
                             # U can use Atom class for the Atom optimizer or RMS prop class for the RMS prop optimizer.
'''
RMSprop class takes in 3 input arguments:
1) all the parameters of our AutoEncoders that define the architecture
     - no. of neurons in each layer (input, output, hidden) & the Sigmoid activation function.
   An attribute from our SAE object will get us all these parameters. -> sae.parameters()

2) learning rate/ LR 
3) decay/ weight_decay - used to reduce the learning rate after every few epochs in order to regulate the convergence.
   This parameter can improve your model even more.
'''

'\nRMSprop class takes in 3 input arguments:\n1) all the parameters of our AutoEncoders that define the architecture\n     - no. of neurons in each layer (input, output, hidden) & the Sigmoid activation function.\n   An attribute from our SAE object will get us all these parameters. -> sae.parameters()\n\n2) learning rate/ LR \n3) decay/ weight_decay - used to reduce the learning rate after every few epochs in order to regulate the convergence.\n   This parameter can improve your model even more.\n'

In [45]:
# Training the SAE
nb_epoch = 100    # No. of epochs

# Loop over all the epochs
for epoch in range(1,nb_epoch+1):
    
    train_loss = 0       # Initialize training error loss
    s = 0.               # Initialize a variable to save memory by ignoring the users who did not rate any movie
                         # Using this variable we keep track of no. of users who rated
                         # This is a float variable as we will use this to caculate the Root Mean Squared Error
    
    # Loop over all the observations i.e. users
    for id_user in range(nb_users):
        
        # Get input vector of features that contains all the ratings of all the movies given by a user
        # Here we also specify where we want add an axis.
        input_vector = Variable(training_set[id_user]).unsqueeze(0)   # unsqueeze(new_index) inserts singleton dim at position given as new_index & return the tensor
        '''
        training set[id_user] is a vector of 1D and a network in PyTorch or even on Kerras generally can not 
        accept a single vector of one dimension. Rather it accepts a batch of input vectors. So We add an additional 
        dimension like a fake dimension, which will correspond to a batch, by specifying the index of this new dimension.
        We put this new dimension in first position (at index zero).
        Variable().unsqueeze(index) will create a batch of, a single input vector. The batch can have several input vectors.
        But here since we are not doing batch learning but online learning (we'll update the weights after each observation
        going to the network), we create a batch of one input vector.
        '''
        
        # Before the input vector updates, We would need a clone of it for comparison at the target side
        target_vector = input_vector.clone()
        
        # Make Prediction
        # Ignore users who did not even vote 1 movie
        if torch.sum(target_vector.data >0) > 0:    
            
            # Get vector of our predicted ratings
            output_predicted = sae(input_vector)    # OR    sae.forward(input_vector)
        
            # Perform Optimization for optimizing the memory and the computations.
            ''' 
            1. When we apply stochastic gradient descent, we want to make sure the gradient is computed only with respect 
            to the input and not the target. This will same computation & memory.
            requires_grad: This member, if true starts tracking all the operation history and forms a backward graph for 
            gradient calculation.
            '''
            target_vector.require_grad = False
            
            '''
            2.  We also don't deal with the movies that the user didn't rate, where the ratings are equal to zero,
            but that is only for the output vector.
            So using the indexes for the movies that has not been rated by a user i.e. the input vector = 0, we set the  
            rating of these movies to 0 in the output vector.
            While updating the weights these values won't count.
            '''
            output_predicted[target_vector == 0] = 0
            
            
            # Loss Computation
            loss = creterion(output_predicted,target_vector)   # Find loss by using (predicted_outcome, expected_outcome)
            
            mean_corrector = nb_movies/float(torch.sum(target_vector.data > 0) +1e-10)  # (number of movies/ number of movies that have positive ratings)
            '''
            mean_corrector represents the average of the error, but by only considering the movies that were rated i.e. 
            the movies that at least got one to five ratings.
            Since, no. of movies that have positive ratings may be = 0, we add the demoniator to a very tiny value 1e-10, 
            to avoid divide by 0 error & also not create any bias to the calculation.
            '''
            
            # Backpropogation / Backward method for the loss - this will tell in which direction we need to update the
            # different weights. i.e. do we need to increase the weight or decrease the weight.
            loss.backward()
            
            # Compute RMSE                        
            train_loss += np.sqrt(loss.data*mean_corrector)  # Sum up the error (difference between predicted & expected rating)
                                                       # we're adjusting this loss with this mean corrector factor 
            
            s += 1.        # Increment s to denote user has rated atleast one movie
            
            # Apply optimizer to update the weights.
            '''            
            After we've measured the error, the weights are updated by the RMSprop Optimizer.
            Backward operation provides the direction in which the weights should be updated & optimizer decides the 
            intensity of the updates i.e. the amount by which the weights will be updated.
            '''            
            optimizer.step()

            
    # Check the loss at each epoch    
    print("epoch: "+str(epoch) + "\t Average Loss: "+str(train_loss.item()/s))
    '''
    To print value in a pytorch tensor, there are 2 ways:
    1) loss.numpy()[0]
    2) loss.item() -> Valid only for tensor with 1 dimension
    '''


epoch: 1	 Average Loss: 0.922285158838978
epoch: 2	 Average Loss: 0.9139579700134213
epoch: 3	 Average Loss: 0.9114530043536254
epoch: 4	 Average Loss: 0.9112882806286453
epoch: 5	 Average Loss: 0.9113469857046991
epoch: 6	 Average Loss: 0.9110108716363998
epoch: 7	 Average Loss: 0.9103412972022468
epoch: 8	 Average Loss: 0.9109248528424907
epoch: 9	 Average Loss: 0.9101167680855647
epoch: 10	 Average Loss: 0.9105088040785724
epoch: 11	 Average Loss: 0.9095769014489992
epoch: 12	 Average Loss: 0.9091301085713812
epoch: 13	 Average Loss: 0.9088167775094446
epoch: 14	 Average Loss: 0.9084796925743969
epoch: 15	 Average Loss: 0.9078022864818067
epoch: 16	 Average Loss: 0.9073882441758351
epoch: 17	 Average Loss: 0.9098802649456522
epoch: 18	 Average Loss: 0.9072815135579599
epoch: 19	 Average Loss: 0.906076667923184
epoch: 20	 Average Loss: 0.9065048848836824
epoch: 21	 Average Loss: 0.905307612009544
epoch: 22	 Average Loss: 0.9060476066451816
epoch: 23	 Average Loss: 0.9050386171957847


In [44]:
# Testing the SAE
test_loss = 0        # Initialize test loss
s = 0.               # Initialize a variable to save memory by ignoring the users who did not rate any movie
                     # Using this variable we keep track of no. of users who rated
                     # This is a float variable as we will use this to caculate the Root Mean Squared Error

# Loop over all the observations i.e. users
for id_user in range(nb_users):

    # Get input vector of features that contains the ratings of all the movies given by a user
    # Here we also specify where we want add an axis.
    input_vector = Variable(training_set[id_user]).unsqueeze(0)   # unsqueeze(new_index) inserts singleton dim at position given as new_index & return the tensor
    '''
    The test set contains those ratings which are not rated in the training set. Our motto is to predict these movies 
    which are not rated in the training set based on the ratings given by the user to other movies. Hence, here we
    pick the users from training set & not test set.
    
    So, the training set is like, the set that contains all the ratings of the movies by a specific user, 
    up to a certain point in time. Then we predict the ratings of the other movies that the user hasn't watched yet,
    and then we have the test set in the future that contains the real answers, like it contains the real ratings 
    for these movies that were not part of the training set.
    '''

    # Expected outcome 
    target_vector = Variable(test_set[id_user]).unsqueeze(0) 

    # Make Prediction
    # Ignore users who did not even vote 1 movie
    if torch.sum(target_vector.data >0) > 0:    

        # Get vector of our predicted ratings
        output_predicted = sae(input_vector)   # OR    sae.forward(input_vector)
        '''
        SAE contains the forward() that returns the vector of predicted ratings. Thus, by calling our object on the
        input here, we will get our vector of predicted ratings for the movies that the user hasn't watched yet,
        and this will go into output.
        '''

        # Perform Optimization for optimizing the memory and the computations.
        '''override the computations of the gradient with respect to the target '''
        target_vector.require_grad = False

        '''consider the ratings of the movies that are non-zero ratings in the test set 
         & we don't measure the future loss on the movies that didn't get any rating. '''
        output_predicted[target_vector == 0] = 0  


        # Loss Computation
        loss = creterion(output_predicted,target_vector)   # Find loss by using (predicted_outcome, expected_outcome)

        mean_corrector = nb_movies/float(torch.sum(target_vector.data > 0) +1e-10)  # (number of movies/ number of movies that have positive ratings)
        '''
        mean_corrector represents the average of the error, but by only considering the movies that were rated i.e. 
        the movies that at least got one to five ratings.
        Since, no. of movies that have positive ratings may be = 0, we add the demoniator to a very tiny value 1e-10, 
        to avoid divide by 0 error & also not create any bias to the calculation.
        '''       

        # Compute RMSE                        
        test_loss += np.sqrt(loss.data*mean_corrector)  # Sum up the error (difference between predicted & expected rating)
                                                   # we're adjusting this loss with this mean corrector factor 

        s += 1.        # Increment s to denote user has rated atleast one movie



# Check the test loss    
print("Average Test Loss: "+str(test_loss.item()/s))
'''
if, on average, our model predicts a rating that will be different from the real rating by less than one star.
Then, that means that our recommended system will be pretty powerful.
'''


Average Test Loss: 0.9588069209346065


'\nif, on average, our model predicts a rating that will be different from the real rating by less than one star.\nThen, that means that our recommended system will be pretty powerful.\n'