# Data preparation

We first load the data from our source. In this case we used Google Colab to train our model, so the path points to a folder in our shared Google Drive. When running the notebook on your own, make sure to set the right path here.

In [1]:
import pandas as pd
import numpy as np

# Folder that holds the competition CSV files. Adjust it when running outside our shared drive.
DATA_DIR = "drive/MyDrive/data_mining/netflix/data"


def _load_table(filename, columns):
  """Read one header-less, ';'-separated CSV file from DATA_DIR.

  Args:
    filename: Name of the file inside DATA_DIR.
    columns: Column names to give to the resulting DataFrame.

  Returns:
    The parsed pandas DataFrame.

  Malformed rows are skipped with a warning. `on_bad_lines="warn"` is the
  replacement for the deprecated `error_bad_lines=False` (requires pandas >= 1.3).
  """
  return pd.read_csv(DATA_DIR + "/" + filename, sep=';', on_bad_lines="warn", names=columns, header=None)


movies_df = _load_table("movies.csv", ["movie_id", "year", "name"])
users_df = _load_table("users.csv", ["user_id", "gender", "age", "work"])
ratings_df = _load_table("ratings.csv", ["user_id", "movie_id", "rating"])
output_df = _load_table("predictions.csv", ["user_id", "movie_id"])

Mounted at /content/drive




  exec(code_obj, self.user_global_ns, self.user_ns)


We convert the data to NumPy arrays.

In [2]:
# Turn every DataFrame into a plain NumPy array for the steps that follow.
movies_data, users_data, ratings_data, output_data = (
    frame.to_numpy() for frame in (movies_df, users_df, ratings_df, output_df)
)

We now create our train, valid and test matrices. We will use these to train and test our model.

For the best submission we have used a very small validation set leaving as much data as possible for the training. This gave us a bad estimate of performance so a larger validation set is recommended for reliable results.

Using the ratio below we can easily achieve an RMSE score of around 0.828.
Making the validation set smaller, we can achieve 0.825.
Both scores are from the public leaderboard.

In [3]:
import tensorflow as tf

# Dimensions of the rating matrices: one row per movie, one column per user.
N_MOVIES = 3706
N_USERS = 6040
# Split boundaries inside the shuffled reviews:
# [0, TRAIN_END) is training, [TRAIN_END, VALID_END) is validation, the rest is test.
TRAIN_END = 890190
VALID_END = 908190


def _ratings_to_matrix(reviews):
  """Build a dense (N_MOVIES, N_USERS) matrix from rows of (user_id, movie_id, rating).

  The ids are shifted by one when used as indices, exactly as the per-row loop did.
  Entries without a review stay 0, which means "missing review".

  Args:
    reviews: 2-D array whose columns are user_id, movie_id and rating.

  Returns:
    A float NumPy array of shape (N_MOVIES, N_USERS).
  """
  matrix = np.zeros((N_MOVIES, N_USERS))
  reviews = np.asarray(reviews)
  if len(reviews):
    # One vectorized scatter instead of a Python loop over every single review.
    # NOTE(review): assumes each (user, movie) pair occurs at most once - confirm for new data.
    movie_idx = reviews[:, 1].astype(int) - 1
    user_idx = reviews[:, 0].astype(int) - 1
    matrix[movie_idx, user_idx] = reviews[:, 2]
  return matrix


# Shuffling the data so we do not have the same validation set every time.
np.random.shuffle(ratings_data)

# The reviews we will use for training, validation and test.
train_reviews = ratings_data[:TRAIN_END]
valid_reviews = ratings_data[TRAIN_END:VALID_END]
test_reviews = ratings_data[VALID_END:]

# The 3 different review matrices, each holding only its own reviews.
train_matrix = _ratings_to_matrix(train_reviews)
valid_matrix = _ratings_to_matrix(valid_reviews)
test_matrix = _ratings_to_matrix(test_reviews)

# Model definition

**Model Hyper Parameters**

In [4]:
# How many dimensions every neuron position vector has.
kernel_dimensions = 5

# L2 regularization strengths: lambda_w is applied to the raw weights,
# lambda_w_kernelized to the kernel values.
lambda_w_kernelized = 0.0182
lambda_w = 84.

# Layer widths, from the input layer to the output layer.
architecture = [6040, 500, 500, 6040]

# One activation per layer transition; the last one is the identity.
activations = [tf.nn.sigmoid, tf.nn.sigmoid, tf.identity]

# Sanity check: exactly one activation for every pair of consecutive layers.
assert len(architecture) == len(activations) + 1

We will use TensorFlow to optimize our model. Each trainable parameter of our model is created as a `tf.Variable`. Below is the creation of said variables.



*   **weights** - will represent the classic neural network weights of our model. Each will have size (node_in, node_out).
*   **in_vectors** - will represent the vectors used for distance. Each entry in the list will represent the position of the input of the layer with index i. Of size (nodes_in, 1, kernel_dimensions).
*   **out_vectors** - will represent the vectors used for distance. Each entry in the list will represent the position of the output of the layer with index i. Of size (1, nodes_out, kernel_dimensions).
*   **biases** - will represent simple neural network biases that will be applied for each neuron.

Note: It's important to initialize the in and out vectors to small values as large values will produce a gradient that is hard to escape from using our optimization function.

In [5]:
# Every list below gets one entry per layer transition.
weights = []
in_vectors = []
out_vectors = []
biases = []

for nodes_in, nodes_out in zip(architecture[:-1], architecture[1:]):
  # The ordinary dense weight matrix of this layer, starting at zero.
  weights.append(tf.Variable(np.zeros((nodes_in, nodes_out))))

  # Positions of the layer's input and output nodes, used by the kernel.
  # They start at small values; the input vector is drawn before the output vector.
  in_vectors.append(tf.Variable(np.random.uniform(0, 1e-3, size=(nodes_in, 1, kernel_dimensions))))
  out_vectors.append(tf.Variable(np.random.uniform(0, 1e-3, size=(1, nodes_out, kernel_dimensions))))

  # One bias per output node, initialized to 0.
  biases.append(tf.Variable(np.zeros((nodes_out))))

# Loss function and helper methods.

In [6]:
def kernel(u, v):
    """Kernel used to weight the connection between two node positions.

    Computes max(0, 1 - ||u - v||^2), reducing over axis 2.
    Small values mean less importance and larger values mean more importance.

    Args:
        u: Tensor of position vectors, with the vector components on axis 2.
        v: Tensor of position vectors that broadcasts against `u`.

    Returns:
        Tensor of importances with axis 2 reduced away, in the dtype of `u - v`.
    """
    # Sum the squares directly instead of taking the norm and squaring it again:
    # this skips a square root whose derivative is undefined at zero distance.
    squared_distance = tf.reduce_sum(tf.square(u - v), axis=2)

    # The zero is cast to the dtype of the distances, so nothing is hard-wired to float64.
    zero = tf.cast(0., squared_distance.dtype)
    return tf.maximum(zero, 1. - squared_distance)

def layer(x, index):
  """Apply the layer with the given index to the output of the previous layer.

  Args:
    x: Input of this layer (output of the previous one).
    index: Position of the layer in `weights`, `in_vectors`, `out_vectors`,
      `biases` and `activations`.

  Returns:
    A pair (output, regularization), where regularization is the sum of the
    L2 penalty on the kernel values and the L2 penalty on the raw weights.
  """
  activation = activations[index]
  raw_weights = weights[index]

  # Kernel value between the positions of this layer's input and output nodes.
  importance = kernel(in_vectors[index], out_vectors[index])

  # Penalty on the kernel values (meant to push the model towards a small number
  # of really close neurons) plus the usual penalty on the weights.
  penalty = (
      tf.keras.regularizers.L2(lambda_w_kernelized)([importance])
      + tf.keras.regularizers.L2(lambda_w)([raw_weights])
  )

  # Scale the weights by their importance, then compute the output as usual.
  pre_activation = x @ (raw_weights * importance) + biases[index]
  return activation(pre_activation), penalty


def predict(matrix):
  """Run `matrix` through every layer and return the model output.

  The regularization value of each layer is discarded.
  """
  output = matrix
  for index in range(len(architecture) - 1):
    output = layer(output, index)[0]
  return output

def predict_with_loss(matrix):
  """Run `matrix` through every layer, also collecting the regularization loss.

  Returns:
    A pair (model output, sum of the regularization values of all layers).
  """
  total_regularization = 0
  for index in range(len(architecture) - 1):
    matrix, layer_regularization = layer(matrix, index)
    total_regularization = total_regularization + layer_regularization
  return matrix, total_regularization

def calc_loss(in_matrix = None, out_matrix = None):
  """Compute the RMSE of the model, without any regularization term.

  Args:
    in_matrix: Matrix fed to the model. Defaults to the current `train_matrix`.
    out_matrix: Matrix holding the target ratings. Only entries greater than 0
      are scored. Defaults to the current `train_matrix`.

  Returns:
    The root mean squared error over the rated entries of `out_matrix`.
  """
  # The defaults are looked up at call time, not when the function is defined.
  # Re-running the cell that rebuilds train_matrix therefore cannot leave this
  # function silently scoring against a stale copy.
  if in_matrix is None:
    in_matrix = train_matrix
  if out_matrix is None:
    out_matrix = train_matrix

  predictions = predict(in_matrix)
  # 1 where a rating exists, 0 elsewhere, in the same dtype as the predictions.
  mask = tf.cast(out_matrix > 0, predictions.dtype)
  squared_error = tf.reduce_sum(tf.pow(mask * (predictions - out_matrix), 2))
  return tf.sqrt(squared_error / tf.reduce_sum(mask))

def loss_train():
  """Loss minimized during training.

  It is the regularization of all layers plus the sum of squared errors
  over the entries of `train_matrix` that are greater than 0.1.
  """
  predictions, total = predict_with_loss(train_matrix)
  rated = train_matrix > 0.1
  total += tf.reduce_sum(tf.pow(rated * (predictions - train_matrix), 2))
  return total

This code is just to be able to save our model with every parameter and load it again later.

In [7]:
import os
def save_model(name):
  """Save every model parameter into the folder called `name`.

  Four files are written (weights, in_vectors, out_vectors, biases), each
  holding the arrays of all layers one after another.
  """
  folder = 'drive/MyDrive/data_mining/netflix/saved/' + name
  os.makedirs(folder, exist_ok=True)
  parameter_groups = (
      ('/weights', weights),
      ('/in_vectors', in_vectors),
      ('/out_vectors', out_vectors),
      ('/biases', biases),
  )
  for suffix, variables in parameter_groups:
    with open(folder + suffix, 'wb') as f:
      for i in range(len(architecture)-1):
        np.save(f, variables[i].numpy())
def load_model(name):
  """Load the parameters stored by `save_model(name)` back into the model.

  When a stored array has the same shape and dtype as the variable it belongs to,
  the variable is updated in place with `assign`. An optimizer or traced function
  that already holds a reference to the variable then keeps working on the live
  parameters. Otherwise the list entry is replaced by a new `tf.Variable`.
  """
  folder = 'drive/MyDrive/data_mining/netflix/saved/' + name
  parameter_groups = (
      ('/weights', weights),
      ('/in_vectors', in_vectors),
      ('/out_vectors', out_vectors),
      ('/biases', biases),
  )
  for suffix, variables in parameter_groups:
    with open(folder + suffix, 'rb') as f:
      for i in range(len(architecture)-1):
        value = np.load(f)
        current = variables[i]
        same_shape = tuple(current.shape) == value.shape
        same_dtype = value.dtype == current.dtype.as_numpy_dtype
        if same_shape and same_dtype:
          # Keep the variable object itself so existing references stay valid.
          current.assign(value)
        else:
          variables[i] = tf.Variable(value)

# Our Optimizer

As our optimizer we decided to go with **BFGS**. We opted for this algorithm as a lot of papers in the space use it and the creator of the paper this algorithm is based on also suggested it.

**BFGS** is a quasi-Newton optimization algorithm: it uses second-order (curvature) information, in the form of an approximation of the **Hessian matrix** built up from successive gradients, to calculate the next step in the optimization. **BFGS** uses line search to ensure the stability of the optimization. **Line search** requires higher precision, so we use **float64** throughout our model.

## L-BFGS adapter

This is some code we found on github to help us use the L-BFGS optimizer in tensorflow 2.0 with trainable variables. It packs and unpacks our parameters accordingly to satisfy the parameters of the optimizer function.

It uses the L-BFGS optimizer found in TensorFlow Probability.

In [8]:
from typing import Sequence, Union, List
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
from matplotlib import pyplot as plt

def pack_tensors(tensors: Sequence[Union[tf.Tensor, tf.Variable]]) -> tf.Tensor:
    """Flatten every tensor and concatenate the pieces into one 1-D vector."""
    flattened = []
    for tensor in tensors:
        flattened.append(tf.reshape(tensor, (-1,)))
    return tf.concat(flattened, axis=0)


def unpack_tensors(
    to_tensors: Sequence[Union[tf.Tensor, tf.Variable]], from_vector: tf.Tensor
) -> List[tf.Tensor]:
    """Split a flat vector back into tensors shaped like `to_tensors`.

    This is the inverse of `pack_tensors`: consecutive slices of `from_vector`
    are reshaped to the shape of each target, in order.

    Args:
        to_tensors: Tensors or variables that provide the shapes and dtypes.
        from_vector: 1-D tensor holding all values one after another.

    Returns:
        One tensor per entry of `to_tensors`, with that entry's shape and dtype.
    """
    offset = 0
    values = []
    for target_tensor in to_tensors:
        shape = tf.shape(target_tensor)
        tensor_size = tf.reduce_prod(shape)
        tensor_vector = from_vector[offset : offset + tensor_size]
        # Cast to the dtype of the target instead of a hard-coded float64, so the
        # values can always be assigned to the variable they belong to.
        tensor = tf.reshape(tf.cast(tensor_vector, target_tensor.dtype), shape)
        values.append(tensor)
        offset += tensor_size
    return values


def assign_tensors(to_tensors: Sequence[tf.Variable], values: Sequence[tf.Tensor]) -> None:
    """Assign `values[k]` to `to_tensors[k]` for every position k.

    Raises:
        ValueError: If the two sequences differ in length.
    """
    if len(values) != len(to_tensors):
        raise ValueError("to_tensors and values should have same length")
    for position, variable in enumerate(to_tensors):
        variable.assign(values[position])


def create_value_and_gradient_function(loss_closure, trainable_variables, verbose=1):
    """A factory to create a function required by tfp.optimizer.lbfgs_minimize.

    Args:
        loss_closure: Callable without arguments that returns the loss to minimize.
        trainable_variables: Sequence of variables the loss is differentiated
            with respect to. They are overwritten on every evaluation.
        verbose: When greater than 0, the call counter and the loss value are
            printed on every evaluation.

    Returns:
        A pair (f_value_and_gradients, assign_param_values_to_variables):
            * loss_value, gradients = f_value_and_gradients(model_parameters),
              where model_parameters and gradients are flat vectors.
            * assign_param_values_to_variables(model_parameters) writes a flat
              parameter vector into `trainable_variables`.
    """
    # Counts how many times the value-and-gradient function has been called.
    i = tf.Variable(0)

    def assign_param_values_to_variables(x):
        """Unpack the flat vector `x` and assign it to the trainable variables."""
        values = unpack_tensors(trainable_variables, x)
        assign_tensors(trainable_variables, values)

    @tf.function
    def f_value_and_gradients(x):
        """A function that can be used by tfp.optimizer.lbfgs_minimize"""
        # update params
        assign_param_values_to_variables(x)
        # compute loss value and gradients w.r.t. trainable variables
        # (automatic watching is off, so only trainable_variables are recorded)
        with tf.GradientTape(watch_accessed_variables=False) as tape:
            tape.watch(trainable_variables)
            loss_value = loss_closure()
        grads = tape.gradient(loss_value, trainable_variables)

        # The counter advances once per call of this function.
        i.assign_add(1)
        if verbose > 0:
            tf.print("Iter:", i, "loss:", loss_value)

        # return loss and flattened gradients
        return loss_value, pack_tensors(grads)

    return f_value_and_gradients, assign_param_values_to_variables

class LBFGSOptimizer():
    """Thin stateful wrapper around `tfp.optimizer.lbfgs_minimize`.

    Every call to `minimize` continues from the previous optimizer results and
    raises the iteration limit by `steps`, so training can proceed in small
    chunks with evaluation in between.
    """

    def __init__(self, loss_closure, trainable_variables, steps=50):
        """Prepare the optimizer.

        Args:
            loss_closure: Callable without arguments that returns the loss.
            trainable_variables: Variables to optimize. Their current values
                are used as the starting point.
            steps: By how much each `minimize` call raises the iteration limit.

        Note: this sets the Keras default float type to float64.
        """
        tf.keras.backend.set_floatx("float64")
        self.initial_position = pack_tensors(trainable_variables)
        self.results = None
        func, assign = create_value_and_gradient_function(
            loss_closure=loss_closure,
            trainable_variables=trainable_variables
        )
        self.func = func
        self.assign = assign
        self.steps = steps

    @property
    def epoch(self):
        """Number of optimizer iterations done so far (0 before the first call)."""
        if self.results is None:
            return 0
        return int(self.results.num_iterations.numpy())

    @property
    def loss(self):
        """Objective value of the latest results, or None before the first call."""
        if self.results is None:
            return None
        return float(self.results.objective_value.numpy())

    def minimize(self):
        """Run the optimizer up to `steps` iterations further and update the variables.

        Returns:
            The optimizer results of this call (also stored in `self.results`).
        """
        # Only the very first call starts from the initial position. Later calls
        # resume from the previous results, and exactly one of the two may be given.
        if self.results is None:
            initial_position = self.initial_position
        else:
            initial_position = None
        self.results = tfp.optimizer.lbfgs_minimize(
            value_and_gradients_function=self.func,
            initial_position=initial_position,
            previous_optimizer_results=self.results,
            max_iterations=tf.cast(self.epoch + self.steps, dtype=tf.int32)
        )
        # Write the final position of this run into the trainable variables.
        self.assign(self.results.position)
        return self.results

# Training the algorithm

Having defined all the necessary steps before, the training of the algorithm is very easy.

In [9]:
# To check with what gpu we are training
! nvidia-smi
# We define our optimizer. We will need to unpack every variable for the optimizer in a list. We will use 5 steps per minimize call.
opt = LBFGSOptimizer(loss_train, [*weights, *in_vectors, *out_vectors, *biases], steps = 5)

# We iterate for 300 epochs * 5 * 3 steps but we may and should probably break earlier as needed.
epochs = 300
for i in range(epochs):
  
  # We just call optimize here. The adapter will handle actually updating our variables. 
  optim_results = opt.minimize()
  
  # The rest of the code is to just output actual training score and validation.
  # By actual we mean without the regularization part.
  loss1 = calc_loss().numpy()
  loss2 = calc_loss(out_matrix = valid_matrix).numpy()
  print("Real training loss:", loss1)
  print("Validation loss:", loss2)

  # Once every 50 iterations we will also save the model.
  if i % 50 == 0:
    save_model("train"+("%.2f" % loss1) + " valid"+("%.2f" % loss2))

Fri Jan 13 15:14:23 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100-SXM4-40GB      Off  | 00000000:00:04.0 Off |                    0 |
| N/A   30C    P0    51W / 400W |    760MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

KeyboardInterrupt: ignored

Using this we just calculate the error on the test set

In [None]:
# RMSE on the held-out test reviews; the model input is the default one (the training matrix).
calc_loss(out_matrix = test_matrix)

# Outputting our solution

We will just predict on the train matrix, get each review into a numpy array and perform a clip on the values as values below 1 or above 5 make no sense.

In [None]:
# Run the model once and pull the result into NumPy, so that all requested
# entries can be read in a single indexing operation instead of one tensor
# lookup per prediction.
resulting_matrix = np.asarray(predict(train_matrix))

# Every row of output_data is (user_id, movie_id); the matrix is indexed as
# [movie_id - 1, user_id - 1].
requested = np.asarray(output_data).astype(int)
preds = resulting_matrix[requested[:, 1] - 1, requested[:, 0] - 1]

# Values below 1 or above 5 make no sense for a rating, so clamp them.
preds = np.clip(preds, 1, 5)

We now just output our predictions to submit.csv

In [None]:
# Pair every prediction with its 1-based submission id.
ids = range(1, 1 + len(preds))
result = np.array(list(zip(ids, preds)))

# Column types of the submission file.
data_types_dict = {'Id': int, 'Rating':float}
submission = pd.DataFrame(result, columns=['Id', 'Rating']).astype(data_types_dict)
submission.to_csv("submit.csv", index=False)