<a href="https://colab.research.google.com/github/SupreethRao99/NeuRec/blob/main/Sony_NeuRec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NeuRec
Recommending content using Neural Collaborative Filtering.

Trained Model checkpoint can be found [here](https://drive.google.com/drive/folders/1--3T3Mn0L0UCAH0thAkINIL2I-hdKNNA?usp=sharing)

The model achieves a 72% training recall score and a 99.9% validation score.


## Setup

In [None]:
# installing required libraries
%%capture
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets
!pip install -q tensorflow-ranking
!pip install -q tf-nightly
!pip install -q ml_collections

In [None]:
# importing required libraries
import os
import random as rn
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_ranking as tfr
from keras.callbacks import EarlyStopping
from keras.callbacks import ReduceLROnPlateau
from keras import layers
from keras.layers import Dense, Dropout
from tensorflow import keras
import ml_collections

In [None]:
from google.colab import drive

# The dataset is stored on Google Drive for easy access from Google Colab.
# Mounting exposes the Drive contents under /content/drive, which is the
# prefix used by the `root_dir` entry of the config below.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# All hyperparameters and paths live in a single frozen config so that
# experiments and hyperparameter optimization only need to touch this cell.

def model_config():
  """Return the immutable experiment configuration.

  Returns:
    An `ml_collections.FrozenConfigDict` holding dataset locations, the split
    fraction, training hyperparameters, the random seed and the checkpoint
    name.
  """
  return ml_collections.FrozenConfigDict(
      dict(
          # Dataset location and file names.
          root_dir='/content/drive/MyDrive/Sony Dataset',
          relationships_file="relationship.csv",
          content_file="content.csv",
          # Despite its name, this value is used as the fraction of rows that
          # go to the TRAINING split (0.9 -> 90% train, 10% validation).
          validation_split=0.9,
          # Optimisation settings.
          epochs=10,
          batch_size=256,
          # Model width, reproducibility seed and where the model is saved.
          embedding_size=256,
          random_seed=42,
          model_checkpoint="NCF99",
      )
  )

cfg = model_config()

In [None]:
# Seed every random number generator the notebook relies on so that
# experiments are reproducible.
def set_seed(seed=cfg.random_seed):
    """Seed Python's `random`, NumPy and TensorFlow, and export PYTHONHASHSEED.

    Args:
        seed: Integer seed applied to every generator. Defaults to the value
            of `cfg.random_seed` at the time this function is defined.
    """
    for seed_fn in (rn.seed, np.random.seed, tf.random.set_seed):
        seed_fn(seed)
    # NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so this only
    # influences processes launched after this point.
    os.environ["PYTHONHASHSEED"] = str(seed)


set_seed(cfg.random_seed)

## Pre Processing

In [None]:
def preprocess_df(root_dir, relationships_file, content_file):
  """Load the interaction and content CSVs and return the joined rating table.

  Args:
    root_dir: Directory that contains both CSV files.
    relationships_file: File name of the CSV that is the left side of the join.
    content_file: File name of the CSV joined on `content_id`.

  Returns:
    A DataFrame with the columns `user_id`, `content_id`, `rating` and `date`,
    sorted by `date`, with `rating` cast to float.
  """
  relationships = pd.read_csv(os.path.join(root_dir, relationships_file))
  content_meta = pd.read_csv(os.path.join(root_dir, content_file))

  # Left join: every interaction row is kept even without matching content.
  merged = relationships.merge(content_meta, on="content_id", how="left")

  return (
      merged[["user_id", "content_id", "rating", "date"]]
      .sort_values("date")
      .astype({"rating": float})
  )

In [None]:
# Load the joined interaction table; re-running this cell rebuilds `df` from
# the CSV files, so the cell is idempotent.
df = preprocess_df(cfg.root_dir, cfg.relationships_file, cfg.content_file)

# Map raw ids to contiguous integer indices (order of first appearance) so
# they can be fed to the embedding layers.
user_ids = df["user_id"].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}

content_ids = df["content_id"].unique().tolist()
content2content_encoded = {x: i for i, x in enumerate(content_ids)}
content_encoded2content = {i: x for i, x in enumerate(content_ids)}
df["user"] = df["user_id"].map(user2user_encoded)
df["content"] = df["content_id"].map(content2content_encoded)

num_users = len(user2user_encoded)
num_content = len(content_encoded2content)
df["rating"] = df["rating"].values.astype(np.float32)

# Min and max ratings are used to normalize the ratings below.
# Series.min()/max() are vectorized and skip NaN, whereas the builtin
# min()/max() iterate in Python and give order-dependent results with NaN.
min_rating = float(df["rating"].min())
max_rating = float(df["rating"].max())
rating_range = max_rating - min_rating
# `not (> 0)` also catches NaN, which is what an empty table produces.
if not rating_range > 0:
    raise ValueError(
        "Cannot min-max normalize ratings: expected max rating > min rating, "
        f"got min={min_rating}, max={max_rating}"
    )

print(
    f"Number of users: {num_users}, Number of Movies: {num_content}, Min rating: {min_rating}, Max rating: {max_rating}"
)

# Shuffle with a fixed seed so the train/validation split is reproducible.
df = df.sample(frac=1, random_state=1490251)
x = df[["user", "content"]].values
# Min-max normalize ratings to [0, 1]. The arithmetic is done in float64 so
# the targets match the previous per-row Python computation, without
# shadowing `x` inside a lambda.
y = ((df["rating"].astype(np.float64) - min_rating) / rating_range).values

# `cfg.validation_split` is the fraction of rows used for TRAINING; the
# remaining rows form the validation set.
train_indices = int(cfg.validation_split * df.shape[0])
x_train, x_val, y_train, y_val = (
    x[:train_indices],
    x[train_indices:],
    y[:train_indices],
    y[train_indices:],
)

Number of users: 10923, Number of Movies: 44223, Min rating: 0.0, Max rating: 10.0


## Model

In [None]:
class RecommenderNet(keras.Model):
    """Collaborative-filtering scorer over (user index, content index) pairs.

    Each user and each content item gets an embedding vector and a scalar
    bias. The per-pair dot product of the two embeddings plus both biases is
    passed through a small dense head and squashed with a sigmoid.

    Args:
        num_users: Number of distinct encoded users (user embedding rows).
        num_content: Number of distinct encoded content items.
        embedding_size: Width of the user and content embedding vectors.
        **kwargs: Forwarded to `keras.Model`.
    """

    def __init__(self, num_users, num_content, embedding_size, **kwargs):
        super().__init__(**kwargs)
        self.num_users = num_users
        self.num_content = num_content
        self.embedding_size = embedding_size
        self.user_embedding = layers.Embedding(
            num_users,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        self.user_bias = layers.Embedding(num_users, 1)
        self.content_embedding = layers.Embedding(
            num_content,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        self.content_bias = layers.Embedding(num_content, 1)

        # Dense head applied on top of the interaction score.
        self.d1 = Dense(1024, activation="relu")
        self.d2 = Dense(512, activation="relu")
        self.d3 = Dense(64, activation="relu")
        self.d4 = Dense(1)

        # A single Dropout layer is reused between the dense layers.
        self.dr1 = Dropout(0.3)

    def call(self, inputs):
        """Score a batch of (user index, content index) pairs.

        Args:
            inputs: Integer tensor indexed as `inputs[:, 0]` (user index) and
                `inputs[:, 1]` (content index).

        Returns:
            Tensor with one sigmoid-activated score per input row.
        """
        user_vector = self.user_embedding(inputs[:, 0])
        user_bias = self.user_bias(inputs[:, 0])
        content_vector = self.content_embedding(inputs[:, 1])
        content_bias = self.content_bias(inputs[:, 1])

        # Per-row dot product, kept as a column so it lines up with the
        # (batch, 1) bias tensors. `tf.tensordot(a, b, 2)` would contract BOTH
        # the batch and the embedding axes into one scalar shared by every
        # row, making each prediction depend on the rest of the batch.
        # Models trained with the old batch-wide term should be retrained.
        dot_user_content = tf.reduce_sum(
            user_vector * content_vector, axis=1, keepdims=True
        )

        # Add all the components (including bias).
        x = dot_user_content + user_bias + content_bias
        x = self.d1(x)
        x = self.dr1(x)
        x = self.d2(x)
        x = self.dr1(x)
        x = self.d3(x)
        x = self.dr1(x)
        x = self.d4(x)

        # The sigmoid activation forces the score to lie between 0 and 1.
        return tf.nn.sigmoid(x)


## Training

In [None]:
# Build the recommender from the encoded vocabulary sizes and configured width.
model = RecommenderNet(
    num_users=num_users,
    num_content=num_content,
    embedding_size=cfg.embedding_size,
)

# NOTE(review): Recall is a classification metric, while the targets here are
# min-max-normalized ratings — confirm the reported recall is meaningful.
model.compile(
    optimizer="adam",
    loss=tfr.keras.losses.PairwiseHingeLoss(),
    metrics=[tf.keras.metrics.Recall()],
)

# Stop after 3 epochs without improvement (EarlyStopping is left on its
# default monitor) and lower the learning rate as soon as the validation loss
# stalls for one epoch.
early_stopping = EarlyStopping(patience=3)
reduce_lr = ReduceLROnPlateau(monitor="val_loss", patience=1)
callbacks = [early_stopping, reduce_lr]

model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=cfg.epochs,
    batch_size=cfg.batch_size,
    callbacks=callbacks,
    verbose=1,
)

# Persist the trained model under the configured checkpoint name.
model.save(cfg.model_checkpoint)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
INFO:tensorflow:Assets written to: NCF99/assets


## Inference

In [None]:
# Load the saved model from the same path the training cell saved it to
# (`cfg.model_checkpoint`), instead of a hard-coded absolute path that is only
# correct when the working directory happens to be /content.
model = tf.keras.models.load_model(cfg.model_checkpoint)

In [None]:
# Catalogue of all content; read through the config so the file name is
# defined in a single place.
content = pd.read_csv(os.path.join(cfg.root_dir, cfg.content_file))
# Maps user_id -> list of recommended content ids, best first.
recommendations = {}


def make_recommendations(user_id, top_k=10):
  """Store the `top_k` highest-scoring unseen content ids for `user_id`.

  Candidates are catalogue items the user has no interaction with in `df` and
  that have an encoding in `content2content_encoded`. The result is written
  to `recommendations[user_id]`, ordered from highest to lowest predicted
  score.

  Args:
    user_id: Raw user id; must be a key of `user2user_encoded`.
    top_k: Maximum number of recommendations to keep. Defaults to 10.

  Raises:
    KeyError: If `user_id` has no encoding, i.e. it never appears in `df`.
  """
  if user_id not in user2user_encoded:
    raise KeyError(f"user_id {user_id!r} has no encoding; cannot score content")
  user_encoder = user2user_encoded[user_id]

  watched_ids = df.loc[df["user_id"] == user_id, "content_id"]
  # Keep catalogue order and drop repeated ids so the candidate order is
  # deterministic (a set-based intersection is not guaranteed to be).
  candidates = content.loc[
      ~content["content_id"].isin(watched_ids), "content_id"
  ].drop_duplicates()
  # Ids without an encoding map to NaN and are discarded below.
  encoded = candidates.map(content2content_encoded)
  has_encoding = encoded.notna()
  candidate_ids = candidates[has_encoding].to_numpy()
  candidate_codes = encoded[has_encoding].to_numpy(dtype=np.int64)

  if len(candidate_ids) == 0:
    # Nothing left to recommend; record that explicitly rather than failing.
    recommendations[user_id] = []
    return

  # One (user index, content index) row per candidate.
  user_content_array = np.column_stack(
      (np.full(len(candidate_codes), user_encoder, dtype=np.int64), candidate_codes)
  )
  # verbose=0 keeps one progress bar per user out of the cell output.
  ratings = model.predict(user_content_array, verbose=0).flatten()
  top_ratings_indices = ratings.argsort()[::-1][:top_k]

  # Index the candidate ids directly: this keeps the predicted ranking.
  # Filtering the catalogue with `isin` would return catalogue order and
  # repeat any id that occurs more than once in the catalogue.
  # `.tolist()` yields plain Python values, which `json.dump` can serialise.
  recommendations[user_id] = candidate_ids[top_ratings_indices].tolist()

In [None]:
import tqdm

# Generate recommendations for every user listed in the test file.
test_df = pd.read_csv(os.path.join(cfg.root_dir, "test.csv"))
test_list = list(test_df["user_id"])

# Users whose recommendation step failed, mapped to the error that occurred.
failed_users = {}
for user in tqdm.tqdm(test_list):
  try:
    make_recommendations(user)
  except Exception as exc:
    # Best effort: keep going, but record the failure instead of hiding it.
    # Catching Exception (not a bare `except`) lets KeyboardInterrupt through
    # so the loop can still be stopped.
    failed_users[user] = repr(exc)

if failed_users:
  print(
      f"Skipped {len(failed_users)} of {len(test_list)} users; "
      "inspect `failed_users` for details"
  )

## Exporting Results 

In [None]:
import json

# Serialise the user -> recommended content ids mapping to the submission
# file in the current working directory.
with open("submission.json", "w") as submission_file:
    json.dump(recommendations, submission_file)

In [None]:
# Prints a fixed message once this last cell is reached.
print("Done Done!")

Done Done!
