# User-Movie Mapping Sparse Matrix Dataset
**No need to execute this section if dataset_matrix.npz is already present**

In [None]:
import numpy as np
from scipy import sparse
def save_sparse_matrix(filename, x):
    """Write sparse matrix ``x`` to ``filename`` as an ``.npz`` archive.

    The matrix is converted to COO form and stored as four named arrays:
    ``row``, ``col``, ``data`` and ``shape``.  ``np.savez`` appends the
    ``.npz`` suffix when ``filename`` does not already end with it.
    """
    coo = x.tocoo()
    np.savez(
        filename,
        row=coo.row,
        col=coo.col,
        data=coo.data,
        shape=coo.shape,
    )

def load_sparse_matrix(filename):
    """Load a sparse matrix from an ``.npz`` archive.

    The archive must contain the arrays ``row``, ``col``, ``data`` and
    ``shape`` (the layout written by ``save_sparse_matrix``).

    Args:
        filename: Path of the archive to read, including the ``.npz`` suffix.

    Returns:
        A ``scipy.sparse.coo_matrix`` rebuilt from the stored arrays.
    """
    # np.load on an .npz returns an NpzFile holding an open file handle;
    # the context manager closes it once the arrays have been read.
    with np.load(filename) as archive:
        data = archive['data']
        row = archive['row']
        col = archive['col']
        # Hand scipy a plain tuple of ints rather than a NumPy array.
        shape = tuple(int(n) for n in archive['shape'])
    return sparse.coo_matrix((data, (row, col)), shape=shape)

In [None]:
import pandas as pd
# Ratings table; the matrix-building cell below reads its `userId` and
# `movieId` columns.
ratings_df = pd.read_csv('clean/rating_updated_clean.csv')

In [None]:
# Movie table; a movie's row position here is used as its column index in
# the user-movie matrix built below.
movies_df = pd.read_csv('clean/movies_clean.csv')
# Show the movieId of the first row as a quick sanity check.
movies_df.iloc[0].movieId

In [None]:
from scipy.sparse import coo_matrix
from scipy.sparse import lil_matrix
from tqdm import tqdm

# Build the binary user x movie interaction matrix.
#   rows    -> indexed directly by userId (one extra row is allocated, so
#              ids 0..n_unique_users are addressable)
#   columns -> positional index of the movie inside movies_df
#
# The previous version filtered the whole ratings table once per movie and
# wrote LIL elements one by one. A single merge gives the same
# (userId, column) pairs in one vectorised pass.
n_user_rows = ratings_df['userId'].unique().shape[0] + 1
n_movies = movies_df.shape[0]

# movieId -> column position; repeated movieIds in movies_df keep one
# column each, as in the original per-row loop.
movie_columns = pd.DataFrame({
    'movieId': movies_df['movieId'].values,
    'col': np.arange(n_movies),
})

# drop_duplicates is required: COO sums repeated coordinates on conversion,
# whereas the original assignment left the cell at exactly 1.
user_movie_pairs = (
    ratings_df[['userId', 'movieId']]
    .merge(movie_columns, on='movieId', how='inner')[['userId', 'col']]
    .drop_duplicates()
)

user_movie = coo_matrix(
    (
        np.ones(len(user_movie_pairs)),
        (user_movie_pairs['userId'].values, user_movie_pairs['col'].values),
    ),
    shape=(n_user_rows, n_movies),
).tolil()

In [None]:
# Persist the matrix; np.savez (used by save_sparse_matrix) appends ".npz",
# so the file on disk is tmp/dataset_matrix.npz.
save_sparse_matrix('tmp/dataset_matrix',user_movie)

In [None]:
# Round-trip check: reload the saved archive, convert it to LIL and display it.
z = load_sparse_matrix('tmp/dataset_matrix.npz').tolil()
z



---


#<h2>HERE LIES THE HVAE CODE THE ONE AND THE ONLY CODE</h2>

---
Download Required:

[PreTrained Weights](https://drive.google.com/file/d/1ikZOBA46TEZJLm88lsuXPG8bqtC7eJU4/view)

[User-Movie Sparse Matrix dataset.npz](https://drive.google.com/file/d/1onaqEkTF-Fo7iHTztcJUrep1l5Ht5rz6/view)

[Embed Matrix](https://drive.google.com/file/d/1YF4BGBIklBRso-7rAmYkccT9REVTNXbK/view)

[Movies Dataset for movie title and index](https://drive.google.com/file/d/1-BvShIGsXyWzvQ_ssXqp9E5wnbnxSXA7/view)

Instruction to Execute:
1. Download the necessary datasets and matrices, then update the file paths in the cell below to match where you saved them.
2. Execute the first cell to load the required datasets and matrices and to define the VAE model architecture and loss function.
3. Load PreSaved Weights from checkpoint
4. Generate Predictions 

### <h1>Complete HVAE CODE </h1>

In [1]:
import numpy as np
import os
import pandas as pd

# The bare string below acts as a block comment describing the three inputs
# of this notebook section.
# NOTE(review): "62000k" presumably means ~62k movies, to agree with the
# [162k,62K] matrix shape on the next line — confirm.
'''
  Load Movies Dataset (62000k)
  Load [User -> Movie Map] : [162k,62K] Sparse Matrix 
  Load [Embed Movie Feature Vector] : [62k, 3] Embedding generated from MVAE
'''
# Movie table; the prediction cell reads its movieId, title and genres columns.
movies_df = pd.read_csv('clean/movies_clean.csv')

# Pre-computed movie embedding matrix, later passed as the initial weights
# of the encoder's embedding layer.
embed_movie_feature = np.load('tmp/embed_movie.npy')

import numpy as np
from scipy import sparse
def save_sparse_matrix(filename, x):
    """Write sparse matrix ``x`` to ``filename`` as an ``.npz`` archive.

    The matrix is converted to COO form and stored as four named arrays:
    ``row``, ``col``, ``data`` and ``shape``.  ``np.savez`` appends the
    ``.npz`` suffix when ``filename`` does not already end with it.
    """
    coo = x.tocoo()
    np.savez(
        filename,
        row=coo.row,
        col=coo.col,
        data=coo.data,
        shape=coo.shape,
    )

def load_sparse_matrix(filename):
    """Load a sparse matrix from an ``.npz`` archive.

    The archive must contain the arrays ``row``, ``col``, ``data`` and
    ``shape`` (the layout written by ``save_sparse_matrix``).

    Args:
        filename: Path of the archive to read, including the ``.npz`` suffix.

    Returns:
        A ``scipy.sparse.coo_matrix`` rebuilt from the stored arrays.
    """
    # np.load on an .npz returns an NpzFile holding an open file handle;
    # the context manager closes it once the arrays have been read.
    with np.load(filename) as archive:
        data = archive['data']
        row = archive['row']
        col = archive['col']
        # Hand scipy a plain tuple of ints rather than a NumPy array.
        shape = tuple(int(n) for n in archive['shape'])
    return sparse.coo_matrix((data, (row, col)), shape=shape)



# Load the stored COO matrix and convert it straight to CSR, which supports
# the row slicing used below. The earlier COO -> LIL -> CSR round-trip
# produced the same matrix with an extra, costly conversion.
x_train = load_sparse_matrix('tmp/dataset_matrix.npz').tocsr()

# Drop row 0 (the "initial empty" row).
# NOTE(review): presumably empty because userIds start at 1 — confirm.
x_train = x_train[1:]


'''
  VAE Architecture Design
'''
import tensorflow as tf
from tensorflow.keras import layers
keras = tf.keras

class Sampling(layers.Layer):
    """Reparameterisation layer: draws z from the pair (z_mean, z_log_var)."""

    def call(self, inputs):
        """Return ``z_mean + exp(0.5 * z_log_var) * eps`` with normal noise ``eps``."""
        mean, log_var = inputs
        mean_shape = tf.shape(mean)
        # One noise value per (batch row, latent dimension).
        noise = tf.keras.backend.random_normal(shape=(mean_shape[0], mean_shape[1]))
        std_dev = tf.exp(0.5 * log_var)
        return mean + std_dev * noise


class Encoder(keras.Model):
    """Embeds the input, then maps it to a triplet (z_mean, z_log_var, z).

    Args:
        latent_dim: Width of ``z_mean``, ``z_log_var`` and ``z``.
        intermediate_dim: Number of units in the tanh hidden layer.
        vocab_size: ``input_dim`` of the embedding layer.
        embed_dim: ``output_dim`` of the embedding layer.
        seq_length: Forwarded to the embedding layer as ``input_length``.
        weights: Optional list of initial weights forwarded to the embedding
            layer. Defaults to ``None`` (no initial weights); the former
            default was a shared mutable ``[]``.
        name: Model name.
        **kwargs: Forwarded to ``keras.Model``.
    """

    def __init__(self, latent_dim=32, intermediate_dim=64, vocab_size=1000, embed_dim=3, seq_length=1000, weights=None, name="encoder", **kwargs):
        super().__init__(name=name, **kwargs)
        # Layers are created in the original order.
        # NOTE(review): loading previously saved HDF5 weights may depend on
        # this creation order — do not reorder without checking.
        self.dense_proj = layers.Dense(intermediate_dim, activation="tanh")
        self.dense_mean = layers.Dense(latent_dim)
        self.dense_log_var = layers.Dense(latent_dim)
        self.embedding_layer = layers.Embedding(
            vocab_size,
            embed_dim,
            weights=weights,
            input_length=seq_length,
            trainable=True,
        )
        self.flatten_layer = layers.Flatten()
        self.sampling = Sampling()

    def call(self, inputs):
        """Return ``(z_mean, z_log_var, z)`` for a batch of inputs."""
        # Embed every input position, then flatten so the dense projection
        # sees a single vector per sample.
        embed = self.embedding_layer(inputs)
        flat_embed = self.flatten_layer(embed)
        x = self.dense_proj(flat_embed)
        z_mean = self.dense_mean(x)
        z_log_var = self.dense_log_var(x)
        z = self.sampling((z_mean, z_log_var))
        return z_mean, z_log_var, z


class Decoder(keras.Model):
    """Maps a latent vector z back to a vector of width ``original_dim``.

    A tanh hidden layer of ``intermediate_dim`` units feeds a softmax output
    layer of ``original_dim`` units.
    """

    def __init__(self, original_dim, intermediate_dim=64, name="decoder", **kwargs):
        super().__init__(name=name, **kwargs)
        self.dense_proj = layers.Dense(intermediate_dim, activation="tanh")
        self.dense_output = layers.Dense(original_dim, activation="softmax")

    def call(self, inputs):
        """Return the softmax reconstruction for latent batch ``inputs``."""
        hidden = self.dense_proj(inputs)
        reconstruction = self.dense_output(hidden)
        return reconstruction


class VariationalAutoEncoder(keras.Model):
    """Combines the encoder and decoder into an end-to-end model for training.

    Args:
        original_dim: Width of the decoder output.
        intermediate_dim: Hidden width used by both encoder and decoder.
        latent_dim: Width of the latent vector.
        vocab_size: Passed to the encoder as both ``vocab_size`` and
            ``seq_length``.
        embed_dim: Embedding width passed to the encoder.
        weights: Optional list of initial embedding weights passed to the
            encoder. Defaults to ``None``; the former default was a shared
            mutable ``[]``.
        name: Model name.
        **kwargs: Forwarded to ``keras.Model``.
    """

    def __init__(
        self,
        original_dim,
        intermediate_dim=64,
        latent_dim=32,
        vocab_size=1000,
        embed_dim=3,
        weights=None,
        name="autoencoder",
        **kwargs
    ):
        super().__init__(name=name, **kwargs)
        self.original_dim = original_dim
        self.encoder = Encoder(
            latent_dim=latent_dim,
            intermediate_dim=intermediate_dim,
            vocab_size=vocab_size,
            embed_dim=embed_dim,
            seq_length=vocab_size,
            weights=weights,
        )
        self.decoder = Decoder(original_dim, intermediate_dim=intermediate_dim)

    def call(self, inputs):
        """Encode, sample, decode; register the KL term as an extra loss."""
        z_mean, z_log_var, z = self.encoder(inputs)
        reconstructed = self.decoder(z)
        # KL divergence regularisation: summed over latent dimensions,
        # averaged over the batch.
        kl_per_sample = -0.5 * tf.reduce_sum(
            z_log_var - tf.square(z_mean) - tf.exp(z_log_var) + 1,
            axis=1,
        )
        self.add_loss(tf.reduce_mean(kl_per_sample))
        return reconstructed
    

# custom loss function with tf.nn.sigmoid_cross_entropy_with_logits
def custom_sigmoid_cross_entropy_loss_with_logits(x_true, x_recons_logits):
    """Sigmoid cross-entropy summed over axis 1, then averaged over the batch.

    NOTE(review): ``tf.nn.sigmoid_cross_entropy_with_logits`` expects raw
    logits, while the ``Decoder`` in this file ends in a softmax activation —
    confirm that feeding probabilities here is intended.
    """
    elementwise_ce = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=x_true,
        logits=x_recons_logits,
    )
    per_sample_nll = tf.math.reduce_sum(elementwise_ce, axis=[1])
    return tf.math.reduce_mean(per_sample_nll)



# <h2>Training is expensive, especially for the current x_train dataset containing a [162k, 62k] matrix. Load the model with pre-saved weights for evaluation.</h2>

In [None]:
import multiprocessing

'''
  Multiprocessing for managing GPU RAM in Google Colab for deallocation of GPU Usage
  fit_generator(deprecated) generator useful when training large dataset when entire dataset cannot fit in RAM
  fit used for training small dataset that can fit in RAM
  IN TF2 fit function can take generator. fit_generator hence deprecated.
  Train Using model.fit(generator(x,y,batch_size), ... )
'''
def create_model_and_train():
    """Build the VAE, compile it and train it on the global ``x_train``.

    Reads the module-level ``x_train`` and ``embed_movie_feature``; writes
    weight checkpoints to ``checkpoint_path``. Returns ``None``.

    NOTE(review): ``nn_batch_generator`` is not defined in the visible part
    of this notebook — it must be defined before this function is called.
    """
    SAMPLES_PER_EPOCH = x_train.shape[0]
    INPUT_DIM = x_train.shape[1]
    INTERMEDIATE_DIM = 600
    LATENT_DIM = 200
    EPOCHS = 5
    BATCH_SIZE = 128
    LEARNING_RATE = 1e-3
    VOCAB_SIZE = x_train.shape[1]
    EMBED_DIM = 3
    # `np.math` was only an alias of the stdlib `math` module and is removed
    # in NumPy 2.0; np.ceil gives the same batch count.
    STEPS_PER_EPOCH = int(np.ceil(SAMPLES_PER_EPOCH / BATCH_SIZE))

    # Alternative TF-checkpoint location:
    # checkpoint_path = '/content/tmp/training/cp-{epoch:04d}.ckpt'
    checkpoint_path = '/content/drive/My Drive/Colab Notebooks/Data/tmp/training/cp-{epoch:04d}.h5'
    SAVE_PERIOD = 5

    # NOTE(review): `period` is a deprecated ModelCheckpoint argument; kept
    # unchanged so checkpointing behaves as before.
    cp_callback = keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_path,
        save_weights_only=True,
        verbose=1,
        save_freq='epoch',
        period=SAVE_PERIOD,
        monitor='loss',
        mode='auto',
        save_best_only=True,
    )

    optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)

    tf.keras.backend.set_floatx('float64')
    vae = VariationalAutoEncoder(
        original_dim=INPUT_DIM,
        intermediate_dim=INTERMEDIATE_DIM,
        latent_dim=LATENT_DIM,
        vocab_size=VOCAB_SIZE,
        embed_dim=EMBED_DIM,
        weights=[embed_movie_feature],
    )
    vae.compile(optimizer, loss=custom_sigmoid_cross_entropy_loss_with_logits)

    # The generator feeds x_train as both input and target (autoencoder).
    vae.fit(
        nn_batch_generator(x_train, x_train, BATCH_SIZE, SAMPLES_PER_EPOCH),
        steps_per_epoch=STEPS_PER_EPOCH,
        epochs=EPOCHS,
        callbacks=[cp_callback],
    )


# Run training in a child process so that its GPU memory is released when
# the process exits. `target` must be the function object itself: writing
# `create_model_and_train()` would run training in this (parent) process and
# hand its return value, None, to Process.
p = multiprocessing.Process(target=create_model_and_train)
p.start()
p.join()


In [None]:
# Save in H5 format for compact file
# NOTE(review): `new_vae` is created in the "Load PreSaved Weights" cell
# further down, so that cell must have been executed before this one.
new_vae.save_weights('/content/drive/My Drive/Colab Notebooks/Data/tmp/model/weights.h5')

# <h2>Load PreSaved Weights from checkpoint</h2>

In [2]:

# Hyper-parameters: must match the ones used when the weights were trained.
SAMPLES_PER_EPOCH = x_train.shape[0]
INPUT_DIM = x_train.shape[1]
INTERMEDIATE_DIM = 600
LATENT_DIM = 200
EPOCHS = 5
BATCH_SIZE = 128
LEARNING_RATE = 1e-3
VOCAB_SIZE = x_train.shape[1]
EMBED_DIM = 3
# `np.math` was only an alias of the stdlib `math` module and is removed in
# NumPy 2.0; np.ceil gives the same batch count.
STEPS_PER_EPOCH = int(np.ceil(SAMPLES_PER_EPOCH / BATCH_SIZE))

optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)

tf.keras.backend.set_floatx('float64')
new_vae = VariationalAutoEncoder(
    original_dim=INPUT_DIM,
    intermediate_dim=INTERMEDIATE_DIM,
    latent_dim=LATENT_DIM,
    vocab_size=VOCAB_SIZE,
    embed_dim=EMBED_DIM,
    weights=[embed_movie_feature],
)
new_vae.compile(optimizer, loss=custom_sigmoid_cross_entropy_loss_with_logits)

# The model creation above is the same for both training and loading.
# Below, the model weights are loaded from a saved checkpoint.

# A subclassed model has no Input layer, so it must be called once to create
# its variables before weights can be loaded. The row is reshaped with
# INPUT_DIM (x_train.shape[1]) instead of a hard-coded 62000.
new_vae.predict(np.array(x_train[0].todense()).reshape(1, INPUT_DIM))

# Alternative: load from a training checkpoint instead.
# checkpoint = new_vae.load_weights("/content/drive/My Drive/Colab Notebooks/Data/tmp/training/cp-0005.ckpt")

checkpoint = new_vae.load_weights("saved_model/weights.h5")

# <h2>Generate Predictions</h2>

In [None]:
# Number of recommendations to show.
k = 20
# The original loops printed at most 11 liked movies (break once count > 10).
MAX_LIKED_SHOWN = 11


def _print_movie_details(indices):
    """Print ``title->genres`` for each positional index into ``movies_df``."""
    for idx in indices:
        same_movie = movies_df[movies_df.movieId == movies_df.iloc[idx]['movieId']]
        print(same_movie['title'].values, end='->')
        print(same_movie['genres'].values)


# Draw the row index as a scalar: calling int() on a size-1 array is
# deprecated in recent NumPy releases.
# NOTE(review): the upper bound 610 is hard-coded; confirm whether
# x_train.shape[0] is what is intended.
user_ID_rand = int(np.random.randint(610))

test = np.array(x_train[user_ID_rand].todense()).reshape((1, INPUT_DIM))
_, _, z = new_vae.encoder(test)
test_reconstructed = new_vae.decoder(z)
test_reconstructed = test_reconstructed.numpy()

# Column positions of the movies this user interacted with.
top_rated_movies_idx = np.flatnonzero(test[0] == 1.0).tolist()
print(f'User liked {len(top_rated_movies_idx)} movies')

if not top_rated_movies_idx:
    print('Emptylist')

# The top-k list is computed unconditionally. Previously it was only set when
# the user had liked movies, so the printing code below raised NameError for
# users with an empty history.
sorted_ratings = test_reconstructed[0].tolist()
# Indices of the k highest scores, in ascending score order (best last).
top_predicted_movies_idx = sorted(
    range(len(sorted_ratings)), key=sorted_ratings.__getitem__
)[-k:]

print('\nLiked movies indices')
for i in top_rated_movies_idx[:MAX_LIKED_SHOWN]:
    print(movies_df.iloc[i]['movieId'], end=' ')
print()
print('\nPredicted Movies indices')
for i in top_predicted_movies_idx:
    print(movies_df.iloc[i]['movieId'], end=' ')
print()

print('\nLiked Movies\n')
_print_movie_details(top_rated_movies_idx[:MAX_LIKED_SHOWN])
print('*'*100)

print('\nPredicted Movies\n')
_print_movie_details(top_predicted_movies_idx)