<a href="https://colab.research.google.com/github/Sandeep354/Movie-Ratings-RBM-AE/blob/main/AutoEncoders_RecommendSys.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#AutoEncoders

##Downloading the dataset

###ML-100K

In [41]:
!wget "http://files.grouplens.org/datasets/movielens/ml-100k.zip"
!unzip ml-100k.zip
!ls

--2020-10-31 16:48:34--  http://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip.2’


2020-10-31 16:48:35 (22.6 MB/s) - ‘ml-100k.zip.2’ saved [4924029/4924029]

Archive:  ml-100k.zip
replace ml-100k/allbut.pl? [y]es, [n]o, [A]ll, [N]one, [r]ename: ml-100k      ml-100k.zip.1  ml-1m      ml-1m.zip.1
ml-100k.zip  ml-100k.zip.2  ml-1m.zip  sample_data


###ML-1M

In [42]:
!wget "http://files.grouplens.org/datasets/movielens/ml-1m.zip"
!unzip ml-1m.zip
!ls

--2020-10-31 16:48:41--  http://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5917549 (5.6M) [application/zip]
Saving to: ‘ml-1m.zip.2’


2020-10-31 16:48:41 (25.5 MB/s) - ‘ml-1m.zip.2’ saved [5917549/5917549]

Archive:  ml-1m.zip
replace ml-1m/movies.dat? [y]es, [n]o, [A]ll, [N]one, [r]ename: ml-100k      ml-100k.zip.1  ml-1m      ml-1m.zip.1  sample_data
ml-100k.zip  ml-100k.zip.2  ml-1m.zip  ml-1m.zip.2


##Importing the libraries

In [43]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn 
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

## Importing the dataset


In [44]:
movies = pd.read_csv("ml-1m/movies.dat", 
                     sep = '::',
                     header = None,
                     engine = 'python',
                     encoding = 'latin-1')

In [45]:
movies.head()

Unnamed: 0,0,1,2
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [46]:
users = pd.read_csv("ml-1m/users.dat", 
                     sep = '::',
                     header = None,
                     engine = 'python',
                     encoding = 'latin-1')

In [47]:
users.head()

Unnamed: 0,0,1,2,3,4
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [48]:
ratings = pd.read_csv("ml-1m/ratings.dat", 
                     sep = '::',
                     header = None,
                     engine = 'python',
                     encoding = 'latin-1')
ratings.head()

Unnamed: 0,0,1,2,3
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [49]:
ratings.isna().sum()

0    0
1    0
2    0
3    0
dtype: int64

## Preparing the training set and the test set


In [50]:
training_set = pd.read_csv("ml-100k/u1.base", delimiter = '\t')

In [51]:
training_set.head()

Unnamed: 0,1,1.1,5,874965758
0,1,2,3,876893171
1,1,3,4,878542960
2,1,4,3,876893119
3,1,5,3,889751712
4,1,7,4,875071561


In [52]:
# Convert into Array
training_set = np.array(training_set, dtype = "int")

In [53]:
len(training_set)

79999

In [54]:
test_set = pd.read_csv("ml-100k/u1.test", delimiter = '\t')
test_set = np.array(test_set, dtype = "int")

## Getting the number of users and movies


In [55]:
nb_users = int(max(max(training_set[:,0]), max(test_set[:,0])))
nb_users

943

In [56]:
nb_movies = int(max(max(training_set[:,1]), max(test_set[:,1])))
nb_movies

1682

## Converting the data into an array with users in lines and movies in columns


In [57]:
def convert(data):
  new_data = []
  for id_user in range(1, nb_users + 1):
    id_movies = data[:,1][data[:,0] == id_user]
    id_ratings = data[:,2][data[:,0] == id_user]
    # Make the unrated movies as zeros
    ratings = np.zeros(nb_movies)
    ratings[id_movies - 1] = id_ratings
    new_data.append(list(ratings))
  return new_data

In [58]:
training_set = convert(training_set)
test_set = convert(test_set)

## Converting the data into Torch tensors


In [59]:
# NumpyArray --> TorchTensor
training_set = torch.FloatTensor(training_set)
test_set = torch.FloatTensor(test_set)

## Creating the architecture of the Neural Network


In [60]:
# SAE - Stacked Auto Encoders

class SAE(nn.Module):

  def __init__(self, ):
    # To inherit methods of `parent class : nn.Module`
    super(SAE, self).__init__()

    # Full Connection (associated with the parent class) with the Linear Class of torch.nn
    # FC1 --> Input to first encoded/hidden layer (**20 neurons taken for example - can be adjusted**)
    self.fc1 = nn.Linear(in_features = nb_movies, out_features = 20)
    self.fc2 = nn.Linear(in_features = 20, out_features = 10)
    self.fc3 = nn.Linear(in_features = 10, out_features = 20)
    # Last layer should have same dimensions as input vector : nb_movies
    self.fc4 = nn.Linear(in_features = 20, out_features = nb_movies)

    # Activation Function
    self.activation = nn.Sigmoid()

  # Encoding and Decoding the input vector
  def forward(self, x):
    # First encoded vector
    x = self.activation(self.fc1(x))
    # Second encoded vector
    x = self.activation(self.fc2(x))
    # Now decoding - part I
    x = self.activation(self.fc3(x))
    # Decoding - part II
    x = self.fc4(x)
    return x

sae = SAE()
criterion = nn.MSELoss()
optimizer = optim.RMSprop(sae.parameters(), lr = 0.01, weight_decay = 0.5)

## Training the SAE


In [62]:
nb_epoch = 200
for epoch in range(1, nb_epoch + 1):
  train_loss = 0
  s = 0.
  for id_user in range(nb_users):
    input = Variable(training_set[id_user]).unsqueeze(0)
    target = input.clone()
    if torch.sum(target.data > 0) > 0:
      output = sae(input)
      target.require_grad = False
      output[target == 0] = 0
      loss = criterion(output, target)
      mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10)
      loss.backward()
      train_loss += np.sqrt(loss.data*mean_corrector)
      s += 1.
      optimizer.step()

  print ('epoch: ' + str(epoch) + ' loss: ' + str(train_loss/s))

epoch: 1 loss: tensor(1.7658)
epoch: 2 loss: tensor(1.0965)
epoch: 3 loss: tensor(1.0533)
epoch: 4 loss: tensor(1.0384)
epoch: 5 loss: tensor(1.0309)
epoch: 6 loss: tensor(1.0267)
epoch: 7 loss: tensor(1.0238)
epoch: 8 loss: tensor(1.0221)
epoch: 9 loss: tensor(1.0206)
epoch: 10 loss: tensor(1.0197)
epoch: 11 loss: tensor(1.0189)
epoch: 12 loss: tensor(1.0182)
epoch: 13 loss: tensor(1.0180)
epoch: 14 loss: tensor(1.0173)
epoch: 15 loss: tensor(1.0173)
epoch: 16 loss: tensor(1.0169)
epoch: 17 loss: tensor(1.0169)
epoch: 18 loss: tensor(1.0166)
epoch: 19 loss: tensor(1.0163)
epoch: 20 loss: tensor(1.0162)
epoch: 21 loss: tensor(1.0162)
epoch: 22 loss: tensor(1.0162)
epoch: 23 loss: tensor(1.0161)
epoch: 24 loss: tensor(1.0157)
epoch: 25 loss: tensor(1.0157)
epoch: 26 loss: tensor(1.0157)
epoch: 27 loss: tensor(1.0154)
epoch: 28 loss: tensor(1.0152)
epoch: 29 loss: tensor(1.0130)
epoch: 30 loss: tensor(1.0115)
epoch: 31 loss: tensor(1.0101)
epoch: 32 loss: tensor(1.0072)
epoch: 33 loss: t

## Testing the SAE


In [65]:
test_loss = 0
s = 0.
for id_user in range(nb_users):
  input = Variable(training_set[id_user]).unsqueeze(0)
  target = Variable(test_set[id_user]).unsqueeze(0)
  if torch.sum(target.data > 0) > 0:
    output = sae(input)
    target.require_grad = False
    output[target == 0] = 0
    loss = criterion(output, target)
    mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10)
    test_loss += np.sqrt(loss.data*mean_corrector)
    s += 1.

print ('Loss: ' + str(test_loss/s))

Loss: tensor(0.9496)
