## Loading the necessary libraries for the Notebook

In [None]:
from __future__ import print_function

import numpy as np
import pandas as pd
import collections
from mpl_toolkits.mplot3d import Axes3D
from IPython import display
from matplotlib import pyplot as plt
import sklearn
import sklearn.manifold
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
tf.logging.set_verbosity(tf.logging.ERROR)


2023-06-02 13:23:58.757063: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-02 13:23:58.759775: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-06-02 13:23:58.813431: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-06-02 13:23:58.814426: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Instructions for updating:
non-resource variables are not supported in the long term


#### Add some convenience functions to Pandas DataFrame.

In [3]:
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:.3f}'.format
def mask(df, key, function):
  """Returns a filtered dataframe, by applying function to key"""
  return df[function(df[key])]

def flatten_cols(df):
  df.columns = [' '.join(col).strip() for col in df.columns.values]
  return df

pd.DataFrame.mask = mask
pd.DataFrame.flatten_cols = flatten_cols

### Load The datasets to be considered

In [4]:
Ratingsdf =  pd.read_csv ("Netflix_Dataset_Rating.csv")
Moviesdf = pd.read_csv ("Netflix_Dataset_Movie.csv")
MoviesInfo_df = pd.read_csv ("imdb_processed.csv")
# Create one merged DataFrame containing all the Netflix data.
MoviesDF = Moviesdf.merge(Ratingsdf, on='Movie_ID').merge(MoviesInfo_df, on=['Name','Year'])
MoviesDF.head()

Unnamed: 0,Movie_ID,Year,Name,User_ID,Rating,kind,genre,imdb_rating,vote,country,language,cast,director,composer,writer,runtime
0,17,2005,7 Seconds,2187374,4,video movie,"['Action', 'Comedy', 'Crime', 'Thriller']",4.8,7153.0,"['United Kingdom', 'Romania', 'Switzerland', '...",['English'],"['Wesley Snipes', 'Tamzin Outhwaite', 'Deobia ...",['Simon Fellows'],"['Barry Taylor', 'Neal Acree']",['Martin Wheeler'],118.0
1,17,2005,7 Seconds,2503129,5,video movie,"['Action', 'Comedy', 'Crime', 'Thriller']",4.8,7153.0,"['United Kingdom', 'Romania', 'Switzerland', '...",['English'],"['Wesley Snipes', 'Tamzin Outhwaite', 'Deobia ...",['Simon Fellows'],"['Barry Taylor', 'Neal Acree']",['Martin Wheeler'],118.0
2,17,2005,7 Seconds,263315,3,video movie,"['Action', 'Comedy', 'Crime', 'Thriller']",4.8,7153.0,"['United Kingdom', 'Romania', 'Switzerland', '...",['English'],"['Wesley Snipes', 'Tamzin Outhwaite', 'Deobia ...",['Simon Fellows'],"['Barry Taylor', 'Neal Acree']",['Martin Wheeler'],118.0
3,17,2005,7 Seconds,608309,3,video movie,"['Action', 'Comedy', 'Crime', 'Thriller']",4.8,7153.0,"['United Kingdom', 'Romania', 'Switzerland', '...",['English'],"['Wesley Snipes', 'Tamzin Outhwaite', 'Deobia ...",['Simon Fellows'],"['Barry Taylor', 'Neal Acree']",['Martin Wheeler'],118.0
4,17,2005,7 Seconds,2336678,2,video movie,"['Action', 'Comedy', 'Crime', 'Thriller']",4.8,7153.0,"['United Kingdom', 'Romania', 'Switzerland', '...",['English'],"['Wesley Snipes', 'Tamzin Outhwaite', 'Deobia ...",['Simon Fellows'],"['Barry Taylor', 'Neal Acree']",['Martin Wheeler'],118.0


#### Remove the columns that wont be of use during the recommendation algorithm 

In [5]:
new_MoviesDF = MoviesDF.drop(columns=['kind','vote','country','cast','composer','runtime'])
new_MoviesDF.head()

Unnamed: 0,Movie_ID,Year,Name,User_ID,Rating,genre,imdb_rating,language,director,writer
0,17,2005,7 Seconds,2187374,4,"['Action', 'Comedy', 'Crime', 'Thriller']",4.8,['English'],['Simon Fellows'],['Martin Wheeler']
1,17,2005,7 Seconds,2503129,5,"['Action', 'Comedy', 'Crime', 'Thriller']",4.8,['English'],['Simon Fellows'],['Martin Wheeler']
2,17,2005,7 Seconds,263315,3,"['Action', 'Comedy', 'Crime', 'Thriller']",4.8,['English'],['Simon Fellows'],['Martin Wheeler']
3,17,2005,7 Seconds,608309,3,"['Action', 'Comedy', 'Crime', 'Thriller']",4.8,['English'],['Simon Fellows'],['Martin Wheeler']
4,17,2005,7 Seconds,2336678,2,"['Action', 'Comedy', 'Crime', 'Thriller']",4.8,['English'],['Simon Fellows'],['Martin Wheeler']


### Helping Functions for the model 

### Sparse Represantation of the Rating Matrix

In [6]:
## This function splits the dataframe into training and testing dataframe.
def split_dataframe(df, holdout_fraction=0.1):
  test = df.sample(frac=holdout_fraction, replace=False)
  train = df[~df.index.isin(test.index)]
  return train, test


In [8]:
## This function results in a scalar Tensor representing the MSE between the true ratings and the
## model's predictions.
def sparse_mean_square_error(sparse_ratings, user_embeddings, movie_embeddings):
  
  predictions = tf.gather_nd(
      tf.matmul(user_embeddings, movie_embeddings, transpose_b=True),
      sparse_ratings.indices)
  loss = tf.losses.mean_squared_error(sparse_ratings.values, predictions)
  return loss

#### The reason for the sparse represantation is to have an efficient matrix represenation of the users. 

In [7]:
## This function builds the sparse tensor matrix of the movies dataframe.
def build_rating_sparse_tensor(new_MoviesDF):
  indices = new_MoviesDF[['User_ID', 'Movie_ID']].values
  values = new_MoviesDF['Rating'].values
  return tf.SparseTensor(
      indices=indices,
      values=values,
      dense_shape=[Ratingsdf.shape[0], Moviesdf.shape[0]])

## A Collaborative Filtering Helper Class 

In [9]:
class CFModel(object):
  """Simple class that represents a collaborative filtering model"""
  def __init__(self, embedding_vars, loss, metrics=None):
    """Initializes a CFModel.
    Args:
      embedding_vars: A dictionary of tf.Variables.
      loss: A float Tensor. The loss to optimize.
      metrics: optional list of dictionaries of Tensors. The metrics in each
        dictionary will be plotted in a separate figure during training.
    """
    self._embedding_vars = embedding_vars
    self._loss = loss
    self._metrics = metrics
    self._embeddings = {k: None for k in embedding_vars}
    self._session = None

  @property
  def embeddings(self):
    """The embeddings dictionary."""
    return self._embeddings

  def train(self, num_iterations=100, learning_rate=1.0, plot_results=True,
            optimizer=tf.train.GradientDescentOptimizer):
    """Trains the model.
    Args:
      iterations: number of iterations to run.
      learning_rate: optimizer learning rate.
      plot_results: whether to plot the results at the end of training.
      optimizer: the optimizer to use. Default to GradientDescentOptimizer.
    Returns:
      The metrics dictionary evaluated at the last iteration.
    """
    with self._loss.graph.as_default():
      opt = optimizer(learning_rate)
      train_op = opt.minimize(self._loss)
      local_init_op = tf.group(
          tf.variables_initializer(opt.variables()),
          tf.local_variables_initializer())
      if self._session is None:
        self._session = tf.Session()
        with self._session.as_default():
          self._session.run(tf.global_variables_initializer())
          self._session.run(tf.tables_initializer())
          tf.train.start_queue_runners()

    with self._session.as_default():
      local_init_op.run()
      iterations = []
      metrics = self._metrics or ({},)
      metrics_vals = [collections.defaultdict(list) for _ in self._metrics]

      # Train and append results.
      for i in range(num_iterations + 1):
        _, results = self._session.run((train_op, metrics))
        if (i % 10 == 0) or i == num_iterations:
          print("\r iteration %d: " % i + ", ".join(
                ["%s=%f" % (k, v) for r in results for k, v in r.items()]),
                end='')
          iterations.append(i)
          for metric_val, result in zip(metrics_vals, results):
            for k, v in result.items():
              metric_val[k].append(v)

      for k, v in self._embedding_vars.items():
        self._embeddings[k] = v.eval()

      if plot_results:
        # Plot the metrics.
        num_subplots = len(metrics)+1
        fig = plt.figure()
        fig.set_size_inches(num_subplots*10, 8)
        for i, metric_vals in enumerate(metrics_vals):
          ax = fig.add_subplot(1, num_subplots, i+1)
          for k, v in metric_vals.items():
            ax.plot(iterations, v, label=k)
          ax.set_xlim([1, num_iterations])
          ax.legend()
      return results

#### Building the Matrix Factorization model and training it

In [13]:
## The function takes the New Movie data frame, the dimensions for the embeddings and the random intial embedding`s standard deviation.
def build_model(new_MoviesDF, embedding_dim=3, init_stddev=1.):

  # Split the new_MoviesDF DataFrame into train and test.
  train_ratings, test_ratings = split_dataframe(new_MoviesDF)
  # SparseTensor representation of the train and test datasets.
  A_train = build_rating_sparse_tensor(train_ratings)
  A_test = build_rating_sparse_tensor(test_ratings)
  # Initialize the embeddings using a normal distribution.
  U = tf.Variable(tf.random_normal(
      [A_train.dense_shape[0], embedding_dim], stddev=init_stddev))
  V = tf.Variable(tf.random_normal(
      [A_train.dense_shape[1], embedding_dim], stddev=init_stddev))
  train_loss = sparse_mean_square_error(A_train, U, V)
  test_loss = sparse_mean_square_error(A_test, U, V)
  metrics = {
      'train_error': train_loss,
      'test_error': test_loss
  }
  embeddings = {
      "user_id": U,
      "movie_id": V
  }
  return CFModel(embeddings, train_loss, [metrics])

### Build the CF model and train it.

In [16]:
model = build_model(new_MoviesDF, embedding_dim=5, init_stddev=0.05)
model.train(num_iterations=10, learning_rate=2.)

2023-06-02 14:12:49.454922: W tensorflow/tsl/framework/bfc_allocator.cc:485] Allocator (mklcpu) ran out of memory trying to allocate 1.12TiB (rounded to 1232346514688)requested by op MatMul_6
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2023-06-02 14:12:49.454990: I tensorflow/tsl/framework/bfc_allocator.cc:1039] BFCAllocator dump for mklcpu
2023-06-02 14:12:49.455013: I tensorflow/tsl/framework/bfc_allocator.cc:1046] Bin (256): 	Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2023-06-02 14:12:49.455029: I tensorflow/tsl/framework/bfc_allocator.cc:1046] Bin (512): 	Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2023-06-02 14:12:49.455044: I tensorflow/tsl/framework/bfc_allocator.cc:1046] Bin (1024):

ResourceExhaustedError: Graph execution error:

Detected at node 'MatMul_6' defined at (most recent call last):
    File "/opt/anaconda3/envs/ds320/lib/python3.10/runpy.py", line 196, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/opt/anaconda3/envs/ds320/lib/python3.10/runpy.py", line 86, in _run_code
      exec(code, run_globals)
    File "/opt/anaconda3/envs/ds320/lib/python3.10/site-packages/ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "/opt/anaconda3/envs/ds320/lib/python3.10/site-packages/traitlets/config/application.py", line 1041, in launch_instance
      app.start()
    File "/opt/anaconda3/envs/ds320/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 724, in start
      self.io_loop.start()
    File "/opt/anaconda3/envs/ds320/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 215, in start
      self.asyncio_loop.run_forever()
    File "/opt/anaconda3/envs/ds320/lib/python3.10/asyncio/base_events.py", line 603, in run_forever
      self._run_once()
    File "/opt/anaconda3/envs/ds320/lib/python3.10/asyncio/base_events.py", line 1899, in _run_once
      handle._run()
    File "/opt/anaconda3/envs/ds320/lib/python3.10/asyncio/events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "/opt/anaconda3/envs/ds320/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 512, in dispatch_queue
      await self.process_one()
    File "/opt/anaconda3/envs/ds320/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 501, in process_one
      await dispatch(*args)
    File "/opt/anaconda3/envs/ds320/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 408, in dispatch_shell
      await result
    File "/opt/anaconda3/envs/ds320/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 731, in execute_request
      reply_content = await reply_content
    File "/opt/anaconda3/envs/ds320/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 417, in do_execute
      res = shell.run_cell(
    File "/opt/anaconda3/envs/ds320/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 540, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/opt/anaconda3/envs/ds320/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 2945, in run_cell
      result = self._run_cell(
    File "/opt/anaconda3/envs/ds320/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3000, in _run_cell
      return runner(coro)
    File "/opt/anaconda3/envs/ds320/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/opt/anaconda3/envs/ds320/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3203, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/opt/anaconda3/envs/ds320/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3382, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/opt/anaconda3/envs/ds320/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3442, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/tmp/ipykernel_1964062/1898073120.py", line 1, in <module>
      model = build_model(new_MoviesDF, embedding_dim=5, init_stddev=0.05)
    File "/tmp/ipykernel_1964062/949738005.py", line 14, in build_model
      train_loss = sparse_mean_square_error(A_train, U, V)
    File "/tmp/ipykernel_1964062/3768727053.py", line 6, in sparse_mean_square_error
      tf.matmul(user_embeddings, movie_embeddings, transpose_b=True),
Node: 'MatMul_6'
OOM when allocating tensor with shape[17337458,17770] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator mklcpu
	 [[{{node MatMul_6}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.

Original stack trace for 'MatMul_6':
  File "/opt/anaconda3/envs/ds320/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/opt/anaconda3/envs/ds320/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/opt/anaconda3/envs/ds320/lib/python3.10/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/opt/anaconda3/envs/ds320/lib/python3.10/site-packages/traitlets/config/application.py", line 1041, in launch_instance
    app.start()
  File "/opt/anaconda3/envs/ds320/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 724, in start
    self.io_loop.start()
  File "/opt/anaconda3/envs/ds320/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 215, in start
    self.asyncio_loop.run_forever()
  File "/opt/anaconda3/envs/ds320/lib/python3.10/asyncio/base_events.py", line 603, in run_forever
    self._run_once()
  File "/opt/anaconda3/envs/ds320/lib/python3.10/asyncio/base_events.py", line 1899, in _run_once
    handle._run()
  File "/opt/anaconda3/envs/ds320/lib/python3.10/asyncio/events.py", line 80, in _run
    self._context.run(self._callback, *self._args)
  File "/opt/anaconda3/envs/ds320/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 512, in dispatch_queue
    await self.process_one()
  File "/opt/anaconda3/envs/ds320/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 501, in process_one
    await dispatch(*args)
  File "/opt/anaconda3/envs/ds320/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 408, in dispatch_shell
    await result
  File "/opt/anaconda3/envs/ds320/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 731, in execute_request
    reply_content = await reply_content
  File "/opt/anaconda3/envs/ds320/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 417, in do_execute
    res = shell.run_cell(
  File "/opt/anaconda3/envs/ds320/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 540, in run_cell
    return super().run_cell(*args, **kwargs)
  File "/opt/anaconda3/envs/ds320/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 2945, in run_cell
    result = self._run_cell(
  File "/opt/anaconda3/envs/ds320/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3000, in _run_cell
    return runner(coro)
  File "/opt/anaconda3/envs/ds320/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
    coro.send(None)
  File "/opt/anaconda3/envs/ds320/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3203, in run_cell_async
    has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  File "/opt/anaconda3/envs/ds320/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3382, in run_ast_nodes
    if await self.run_code(code, result, async_=asy):
  File "/opt/anaconda3/envs/ds320/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3442, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_1964062/1898073120.py", line 1, in <module>
    model = build_model(new_MoviesDF, embedding_dim=5, init_stddev=0.05)
  File "/tmp/ipykernel_1964062/949738005.py", line 14, in build_model
    train_loss = sparse_mean_square_error(A_train, U, V)
  File "/tmp/ipykernel_1964062/3768727053.py", line 6, in sparse_mean_square_error
    tf.matmul(user_embeddings, movie_embeddings, transpose_b=True),
  File "/home/LC/chakth01/.local/lib/python3.10/site-packages/tensorflow/python/util/traceback_utils.py", line 150, in error_handler
    return fn(*args, **kwargs)
  File "/home/LC/chakth01/.local/lib/python3.10/site-packages/tensorflow/python/util/dispatch.py", line 1176, in op_dispatch_handler
    return dispatch_target(*args, **kwargs)
  File "/home/LC/chakth01/.local/lib/python3.10/site-packages/tensorflow/python/ops/math_ops.py", line 3766, in matmul
    return gen_math_ops.mat_mul(
  File "/home/LC/chakth01/.local/lib/python3.10/site-packages/tensorflow/python/ops/gen_math_ops.py", line 6034, in mat_mul
    _, _, _op, _outputs = _op_def_library._apply_op_helper(
  File "/home/LC/chakth01/.local/lib/python3.10/site-packages/tensorflow/python/framework/op_def_library.py", line 795, in _apply_op_helper
    op = g._create_op_internal(op_type_name, inputs, dtypes=None,
  File "/home/LC/chakth01/.local/lib/python3.10/site-packages/tensorflow/python/framework/ops.py", line 3814, in _create_op_internal
    ret = Operation(
