<a href="https://colab.research.google.com/github/TirilaPatricGabriel/ML/blob/main/StackedAutoencoderForMovieRatings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [88]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.parallel
import torch.utils.data

# Data Preprocessing

In [89]:
# userId - gender - age - userJob - zip code
users = pd.read_csv('users.dat', header=None, sep='::', engine='python', encoding='latin-1')

# userId - movieId - grade
ratings = pd.read_csv('ratings.dat', header=None, sep='::', engine='python', encoding='latin-1')

# movieId - title - genre
movies = pd.read_csv('movies.dat', header=None, sep='::', engine='python', encoding='latin-1')

print(users.values[:2, :])
print(ratings.values[:2, :])
print(movies.values[:2, :])

[[1 'F' 1 10 '48067']
 [2 'M' 56 16 '70072']]
[[        1      1193         5 978300760]
 [        1       661         3 978302109]]
[[1 'Toy Story (1995)' "Animation|Children's|Comedy"]
 [2 'Jumanji (1995)' "Adventure|Children's|Fantasy"]]


In [90]:
training_set = pd.read_csv('u1.base', delimiter='\t')
test_set = pd.read_csv('u1.test', delimiter='\t')
training_set = np.array(training_set, dtype='int')
test_set = np.array(test_set, dtype='int')

In [91]:
n_users = int(max(max(training_set[:,0]), max(test_set[:,0])))
n_movies = int(max(max(training_set[:,1]), max(test_set[:,1])))

def convert(data):
  matrix = []
  for user_id in range(n_users):
    movies_id = data[:, 1][data[:, 0] == user_id]
    reviews = data[:, 2][data[:, 0] == user_id]

    user_row = np.zeros(n_movies)
    user_row[movies_id-1] = reviews
    matrix.append(user_row)
  return matrix

training_set = torch.FloatTensor(np.array(convert(training_set)))
test_set = torch.FloatTensor(np.array(convert(test_set)))

# Defining model

In [92]:
class SAE(nn.Module):
  def __init__(self, ):
    super(SAE, self).__init__()
    self.fc1 = nn.Linear(n_movies, 20)
    self.fc2 = nn.Linear(20, 10)
    self.fc3 = nn.Linear(10, 20)
    self.fc4 = nn.Linear(20, n_movies)
    self.activation = nn.Sigmoid()

  def forward(self, x):
    x = self.activation(self.fc1(x))
    x = self.activation(self.fc2(x))
    x = self.activation(self.fc3(x))
    x = self.fc4(x)
    return x
sae = SAE()
criterion = nn.MSELoss()
optimizer = optim.RMSprop(sae.parameters(), lr=0.01, weight_decay=0.5)

# Training

In [93]:
n_epochs = 121
for epoch in range(n_epochs):
  total_loss = 0
  s = 0

  for user_id in range(n_users):
    input = training_set[user_id].unsqueeze(0)
    target = input.clone().detach()

    if torch.sum(target > 0) > 0:
      output = sae(input)
      output[target == 0] = 0

      loss = criterion(output, target)
      mean_corrector = n_movies / float(torch.sum(target > 0) + 1e-10)
      loss.backward()
      total_loss += np.sqrt(loss.data*mean_corrector)
      s += 1
      optimizer.step()

  print('epoch: ' + str(epoch) + ' loss: ' + str(total_loss/s))

epoch: 0 loss: tensor(1.7718)
epoch: 1 loss: tensor(1.0972)
epoch: 2 loss: tensor(1.0538)
epoch: 3 loss: tensor(1.0396)
epoch: 4 loss: tensor(1.0324)
epoch: 5 loss: tensor(1.0280)
epoch: 6 loss: tensor(1.0255)
epoch: 7 loss: tensor(1.0234)
epoch: 8 loss: tensor(1.0223)
epoch: 9 loss: tensor(1.0208)
epoch: 10 loss: tensor(1.0201)
epoch: 11 loss: tensor(1.0195)
epoch: 12 loss: tensor(1.0188)
epoch: 13 loss: tensor(1.0185)
epoch: 14 loss: tensor(1.0183)
epoch: 15 loss: tensor(1.0182)
epoch: 16 loss: tensor(1.0179)
epoch: 17 loss: tensor(1.0176)
epoch: 18 loss: tensor(1.0174)
epoch: 19 loss: tensor(1.0171)
epoch: 20 loss: tensor(1.0175)
epoch: 21 loss: tensor(1.0172)
epoch: 22 loss: tensor(1.0171)
epoch: 23 loss: tensor(1.0169)
epoch: 24 loss: tensor(1.0168)
epoch: 25 loss: tensor(1.0166)
epoch: 26 loss: tensor(1.0166)
epoch: 27 loss: tensor(1.0165)
epoch: 28 loss: tensor(1.0166)
epoch: 29 loss: tensor(1.0162)
epoch: 30 loss: tensor(1.0160)
epoch: 31 loss: tensor(1.0158)
epoch: 32 loss: te

# Testing

In [97]:
test_loss = 0
s = 0.
for user_id in range(n_users):
    input = training_set[user_id].unsqueeze(0)
    target = test_set[user_id].unsqueeze(0).detach()

    if torch.sum(target.data > 0) > 0:
        output = sae(input)
        output[target == 0] = 0

        loss = criterion(output, target)
        mean_corrector = n_movies / float(torch.sum(target.data > 0) + 1e-10)

        test_loss += np.sqrt(loss.data*mean_corrector)
        s += 1.
print('test loss: '+str(test_loss/s))


test loss: tensor(0.9580)


# Full Code

In [74]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
import torch.nn.parallel
from torch.autograd import Variable

users = pd.read_csv('users.dat', sep='::', header=None, engine='python', encoding='latin-1')
movies = pd.read_csv('movies.dat', sep='::', header=None, engine='python', encoding='latin-1')
reviews = pd.read_csv('ratings.dat', sep='::', header=None, engine='python', encoding='latin-1')

training_set = pd.read_csv('u1.base', delimiter='\t')
test_set = pd.read_csv('u1.test', delimiter='\t')
training_set = np.array(training_set, dtype='int')
test_set = np.array(test_set, dtype='int')

n_users = max(max(training_set[:, 0]), max(test_set[:, 0]))
n_movies = max(max(training_set[:, 1]), max(test_set[:, 1]))

def convert(data):
  matrix = []
  for user_id in range(n_users):
    movies_reviewed = data[:, 1][data[:, 0] == user_id]
    grades = data[:, 2][data[:, 0] == user_id]

    review = np.zeros(n_movies)
    review[movies_reviewed-1] = grades
    matrix.append(review)
  return matrix

training_set = torch.FloatTensor(convert(training_set))
test_set = torch.FloatTensor(convert(test_set))

class SAE(nn.Module):
  def __init__(self, ):
    super(SAE, self).__init__()
    self.fc1 = nn.Linear(n_movies, 20)
    self.fc2 = nn.Linear(20, 10)
    self.fc3 = nn.Linear(10, 20)
    self.fc4 = nn.Linear(20, n_movies)
    self.activation = nn.Sigmoid()

  def forward(self, x):
    x = self.activation(self.fc1(x))
    x = self.activation(self.fc2(x))
    x = self.activation(self.fc3(x))
    x = self.fc4(x)
    return x
sae = SAE()
criterion = nn.MSELoss()
optimizer = optim.RMSprop(sae.parameters(), lr=0.01, weight_decay=0.5)

n_epochs = 121
for epoch in range(n_epochs):
  total_loss = 0
  s = 0

  for user_id in range(n_users):
    input = training_set[user_id].unsqueeze(0)
    target = input.clone().detach()

    if torch.sum(target > 0) > 0:
      output = sae(input)
      output[target == 0] = 0

      loss = criterion(output, target)
      mean_corrector = n_movies / float(torch.sum(target > 0) + 1e-10)
      loss.backward()
      total_loss += np.sqrt(loss.data*mean_corrector)
      s += 1
      optimizer.step()

  print('epoch: ' + str(epoch) + ' loss: ' + str(total_loss/s))


test_loss = 0
s = 0.
for user_id in range(n_users):
    input = training_set[user_id].unsqueeze(0)
    target = test_set[user_id].unsqueeze(0).detach()

    if torch.sum(target.data > 0) > 0:
        output = sae(input)
        output[target == 0] = 0

        loss = criterion(output, target)
        mean_corrector = n_movies / float(torch.sum(target.data > 0) + 1e-10)

        test_loss += np.sqrt(loss.data*mean_corrector)
        s += 1.
print('test loss: '+str(test_loss/s))

epoch: 0 loss: tensor(1.7726)
epoch: 1 loss: tensor(1.0973)
epoch: 2 loss: tensor(1.0540)
epoch: 3 loss: tensor(1.0398)
epoch: 4 loss: tensor(1.0325)
epoch: 5 loss: tensor(1.0284)
epoch: 6 loss: tensor(1.0255)
epoch: 7 loss: tensor(1.0233)
epoch: 8 loss: tensor(1.0221)
epoch: 9 loss: tensor(1.0209)
epoch: 10 loss: tensor(1.0201)
epoch: 11 loss: tensor(1.0196)
epoch: 12 loss: tensor(1.0191)
epoch: 13 loss: tensor(1.0186)
epoch: 14 loss: tensor(1.0182)
epoch: 15 loss: tensor(1.0181)
epoch: 16 loss: tensor(1.0179)
epoch: 17 loss: tensor(1.0174)
epoch: 18 loss: tensor(1.0174)
epoch: 19 loss: tensor(1.0173)
epoch: 20 loss: tensor(1.0171)
epoch: 21 loss: tensor(1.0172)
epoch: 22 loss: tensor(1.0171)
epoch: 23 loss: tensor(1.0168)
epoch: 24 loss: tensor(1.0169)
epoch: 25 loss: tensor(1.0169)
epoch: 26 loss: tensor(1.0166)
epoch: 27 loss: tensor(1.0164)
epoch: 28 loss: tensor(1.0164)
epoch: 29 loss: tensor(1.0160)
epoch: 30 loss: tensor(1.0160)
epoch: 31 loss: tensor(1.0146)
epoch: 32 loss: te

In [81]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.regularizers import l2

users = pd.read_csv('users.dat', sep='::', header=None, engine='python', encoding='latin-1')
movies = pd.read_csv('movies.dat', sep='::', header=None, engine='python', encoding='latin-1')
reviews = pd.read_csv('ratings.dat', sep='::', header=None, engine='python', encoding='latin-1')

training_set = pd.read_csv('u1.base', delimiter='\t', header=None)
test_set = pd.read_csv('u1.test', delimiter='\t', header=None)

training_set = np.array(training_set, dtype=int)
test_set = np.array(test_set, dtype=int)

n_users = max(max(training_set[:, 0]), max(test_set[:, 0])) + 1
n_movies = max(max(training_set[:, 1]), max(test_set[:, 1])) + 1

def convert(data):
    matrix = np.zeros((n_users, n_movies), dtype=np.float32)
    for user_id in range(n_users):

        movies_reviewed = data[:, 1][data[:, 0] == user_id]
        grades = data[:, 2][data[:, 0] == user_id]

        if len(movies_reviewed) > 0:
            matrix[user_id, movies_reviewed - 1] = grades

    return matrix

training_data = convert(training_set)
test_data = convert(test_set)

batch_size = 64
train_dataset = tf.data.Dataset.from_tensor_slices((training_data, training_data))
train_dataset = train_dataset.shuffle(buffer_size=n_users).batch(batch_size).prefetch(tf.data.AUTOTUNE)

test_dataset = tf.data.Dataset.from_tensor_slices((test_data, test_data)).batch(batch_size)

class SAE(Model):
    def __init__(self):
        super(SAE, self).__init__()
        self.fc1 = Dense(20, activation='sigmoid', kernel_regularizer=l2(0.5))
        self.fc2 = Dense(10, activation='sigmoid', kernel_regularizer=l2(0.5))
        self.fc3 = Dense(20, activation='sigmoid', kernel_regularizer=l2(0.5))
        self.fc4 = Dense(n_movies, kernel_regularizer=l2(0.5))

    def call(self, x):
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)
        x = self.fc4(x)
        return x

sae = SAE()

sae.compile(optimizer=RMSprop(learning_rate=0.01),
            loss=keras.losses.MeanSquaredError(),
            metrics=['mse'])

n_epochs = 121
sae.fit(train_dataset, epochs=n_epochs, validation_data=test_dataset)

test_loss = sae.evaluate(test_dataset)
print(f"test Loss: {test_loss[0]:.4f}")


Epoch 1/121
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - loss: 1211.9124 - mse: 1.1160 - val_loss: 490.4383 - val_mse: 0.2509
Epoch 2/121
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 395.2484 - mse: 0.7213 - val_loss: 178.7442 - val_mse: 0.1786
Epoch 3/121
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 140.3985 - mse: 0.6587 - val_loss: 53.6859 - val_mse: 0.1786
Epoch 4/121
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 40.1968 - mse: 0.6408 - val_loss: 11.3989 - val_mse: 0.1870
Epoch 5/121
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 8.3325 - mse: 0.6110 - val_loss: 1.9406 - val_mse: 0.1936
Epoch 6/121
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 1.9414 - mse: 0.6130 - val_loss: 0.9872 - val_mse: 0.1978
Epoch 7/121
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8m