# 2.0. Recommendation Engine using BERT

## 2.1. Install libraries

In [None]:
# Install the notebook's third-party dependencies into the running environment.
# NOTE(review): versions are not pinned; the recorded run below resolved transformers 3.4.0.
!pip install transformers
!pip install faiss # k-nn  to calculate semantic similarity (FACEBOOK AI SIMILARITY SEARCH)
# Presumably the GPU build is what provides the GPU index helpers used in section 2.5 - TODO confirm.
!pip install faiss-gpu

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/2c/4e/4f1ede0fd7a36278844a277f8d53c21f88f37f3754abf76a5d6224f76d4a/transformers-3.4.0-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 13.7MB/s 
[?25hCollecting tokenizers==0.9.2
[?25l  Downloading https://files.pythonhosted.org/packages/7c/a5/78be1a55b2ac8d6a956f0a211d372726e2b1dd2666bb537fea9b03abd62c/tokenizers-0.9.2-cp36-cp36m-manylinux1_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 56.1MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 55.4MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)


## 2.2. Mount Drive and read data

In [None]:
# Mount Google Drive; the dataset and the model directory used below live under /content/drive.
from google.colab import drive
import os 
# force_remount=True asks for a fresh mount even when the mount point is already in use.
drive.mount('/content/drive', force_remount=True)
# Flag marking a Colab run; it is set unconditionally in this cell.
COLAB = True
print("Note: using Google CoLab")

Mounted at /content/drive
Note: using Google CoLab


In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os 
import torch
from transformers import BertForSequenceClassification, AdamW, BertConfig # model
from transformers import BertTokenizer # tokenizer
from keras.preprocessing.sequence import pad_sequences # add padding
import torch
import time
import faiss # sentence similarity
import plotly.express as px #plots

# Project folder on the mounted Drive that contains the dataset file.
FOLDER_PATH = '/content/drive/My Drive/projects/netflix'
DATA_FILE_PATH = os.path.join(FOLDER_PATH, 'netflix_titles.csv') 
# 'NA' and '?' are passed as extra markers to be read as missing values.
df = pd.read_csv(DATA_FILE_PATH,na_values=['NA','?'])
# Last expression of the cell: shows the first rows as a quick sanity check of the load.
df.head()



Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,81145628,Movie,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","United States, India, South Korea, China","September 9, 2019",2019,TV-PG,90 min,"Children & Family Movies, Comedies",Before planning an awesome wedding for his gra...
1,80117401,Movie,Jandino: Whatever it Takes,,Jandino Asporaat,United Kingdom,"September 9, 2016",2016,TV-MA,94 min,Stand-Up Comedy,Jandino Asporaat riffs on the challenges of ra...
2,70234439,TV Show,Transformers Prime,,"Peter Cullen, Sumalee Montano, Frank Welker, J...",United States,"September 8, 2018",2013,TV-Y7-FV,1 Season,Kids' TV,"With the help of three human allies, the Autob..."
3,80058654,TV Show,Transformers: Robots in Disguise,,"Will Friedle, Darren Criss, Constance Zimmer, ...",United States,"September 8, 2018",2016,TV-Y7,1 Season,Kids' TV,When a prison ship crash unleashes hundreds of...
4,80125979,Movie,#realityhigh,Fernando Lebrija,"Nesta Cooper, Kate Walsh, John Michael Higgins...",United States,"September 8, 2017",2017,TV-14,99 min,Comedies,When nerdy high schooler Dani finally attracts...


## 2.3. Load model and tokenizer

In [None]:
# Select the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    # Report how many GPUs are visible and the name of the first one.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

# Directory that both the model and the tokenizer are loaded from.
output_dir = '/content/drive/My Drive/projects/netflix'
# output_hidden_states=True is requested so the forward pass also yields the
# per-layer hidden states that the embedding function in section 2.4 indexes into.
model = BertForSequenceClassification.from_pretrained(
     output_dir,#"bert-base-uncased"
    output_hidden_states = True)
tokenizer = BertTokenizer.from_pretrained(
     output_dir
    )
# Move the model to the selected device; as the last expression of the cell
# this also displays the model structure in the output.
model.to(device)

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

## 2.4. Getting embeddings of descriptions

In [None]:
def time_elapsed(sec):
  """Format a duration in seconds as ``H:MM:SS.ss``.

  Args:
    sec: Non-negative duration in seconds (int or float).

  Returns:
    str: Unpadded hours, two-digit minutes, and seconds with two decimals,
    e.g. ``3661.5`` -> ``'1:01:01.50'``.
  """
  # Take whole hours out first so the minutes field wraps at 60 instead of
  # reporting the total number of minutes.
  hours, remainder = divmod(sec, 3600)
  minutes, seconds = divmod(remainder, 60)
  return "{}:{:>02}:{:>05.2f}".format(int(hours), int(minutes), seconds)

def description_embedding(tokenizer,model,description,max_len=128):
  """Embed one description as the last-layer hidden state of its first token.

  Args:
    tokenizer: Object whose ``encode(text, add_special_tokens=True)`` returns
      the token ids of ``text`` including the special start/end tokens.
    model: ``torch.nn.Module`` called as
      ``model(input_ids, token_type_ids=None, attention_mask=mask)`` whose
      output holds the per-layer hidden states at position 1 (after the
      logits). It must have been configured to return hidden states.
    description: Text to embed.
    max_len: Fixed sequence length the ids are truncated/padded to.

  Returns:
    numpy.ndarray: 1-D vector taken from the last hidden-state layer, first
    batch item, first token.

  Raises:
    ValueError: If ``max_len`` is smaller than 1.
  """
  if max_len < 1:
    raise ValueError("max_len must be at least 1")

  # 1. Tokenize, adding the special start/end tokens.
  token_ids = list(tokenizer.encode(description, add_special_tokens=True))

  # 2. Bring the sequence to exactly max_len ids. When truncating, keep the
  #    final (end-of-sequence) token instead of cutting it off.
  if len(token_ids) > max_len:
    token_ids = token_ids[:max_len - 1] + token_ids[-1:]
  n_real = len(token_ids)
  n_pad = max_len - n_real
  padded_ids = token_ids + [0] * n_pad  # id 0 is used as the padding id

  # 3. Attention mask: 1 for real tokens, 0 for padding.
  mask = [1] * n_real + [0] * n_pad

  # 4. Build (1, max_len) tensors on the device the model lives on, so the
  #    function does not depend on a notebook-level `device` variable.
  first_param = next(model.parameters(), None)
  target_device = first_param.device if first_param is not None else torch.device("cpu")
  input_ids = torch.tensor([padded_ids], dtype=torch.long, device=target_device)
  attention_masks = torch.tensor([mask], dtype=torch.long, device=target_device)

  # Inference only: evaluation mode and no gradient tracking.
  model.eval()
  with torch.no_grad():
    outputs = model(input_ids,
                    token_type_ids=None,
                    attention_mask=attention_masks)

  # Index by position rather than tuple-unpacking: this works both for plain
  # tuple outputs and for dict-like output objects, where iteration yields keys.
  hidden_states = outputs[1]
  # Last layer, first (only) batch item, first token.
  emb_vec = hidden_states[-1][0][0]
  return emb_vec.detach().cpu().numpy()

In [None]:
# Embed every description one at a time and report progress along the way.
descriptions = df.description.values
n_descriptions = len(descriptions)
start_time = time.time()
embeddings = []
for i, desc in enumerate(descriptions):
  embedding = description_embedding(tokenizer,model,desc)
  embeddings.append(embedding)
  # Count finished items (i + 1) so the last report reads 100% rather than
  # stopping one item short.
  n_done = i + 1
  if n_done % 1000 == 0 or n_done == n_descriptions:
    elapsed_time = time_elapsed(time.time()-start_time)
    print(f'Progress: {round(n_done/n_descriptions*100,2)}%')
    print(f'Time elapsed: {elapsed_time}')
# Stack into one (n_descriptions, n_features) matrix; float32 is the dtype
# FAISS indexes work with, so make it explicit.
embeddings = np.asarray(embeddings, dtype=np.float32)


Progress: 16.04%
Time elapsed: 0:00:13.06
Progress: 32.08%
Time elapsed: 0:00:26.24
Progress: 48.12%
Time elapsed: 0:00:39.45
Progress: 64.16%
Time elapsed: 0:00:52.07
Progress: 80.21%
Time elapsed: 0:01:04.76
Progress: 96.25%
Time elapsed: 0:01:17.71
Progress: 99.98%
Time elapsed: 0:01:20.91


After computing the embeddings, every description is represented by a vector of 768 numbers. We now need to compare these vectors to see which descriptions are similar and which are not.

## 2.5. Initialize FAISS (Facebook AI Similarity Search)

In [None]:
# k-NN index used to measure similarity between description embeddings.
# Exact (brute-force) L2 index; embeddings.shape[1] is the number of features
# in each embedding vector.
cpu_index = faiss.IndexFlatL2(embeddings.shape[1])

# Use the GPU when this FAISS build can see one; otherwise fall back to the
# CPU index so the notebook still runs on CPU-only runtimes. The getattr guard
# covers builds that do not expose get_num_gpus at all.
if getattr(faiss, "get_num_gpus", lambda: 0)() > 0:
  co = faiss.GpuMultipleClonerOptions()
  co.shard = True
  gpu_index = faiss.index_cpu_to_all_gpus(cpu_index, co=co,ngpu=1)
else:
  # The name is kept so later cells work unchanged; in this branch the index
  # lives on the CPU.
  gpu_index = cpu_index

gpu_index.add(embeddings) # adding dataset


## 2.6. Function to find the 10 movies most similar to the chosen one

In [None]:
# Lookup from title to row id. Titles can repeat, so keep only the first row
# per title; Series.drop_duplicates() would de-duplicate the row ids (the
# values), not the titles (the index).
indices = pd.Series(df.index, index=df['title'])
indices = indices[~indices.index.duplicated(keep='first')]

def get_recomendations(title, k=10):
  """Return the titles most similar to ``title`` and a relative score for each.

  Args:
    title: Exact title to look up in ``indices``.
    k: Number of recommendations to return.

  Returns:
    tuple: ``(movie_titles, likehood)`` where ``movie_titles`` is a list of up
    to ``k`` titles ordered from nearest to farthest, and ``likehood`` is a
    NumPy array of the same length with scores scaled so the best match is 1.

  Raises:
    KeyError: If ``title`` is not present in ``indices``.
  """
  # Row ids double as positions in `embeddings` and `df`.
  # NOTE(review): this assumes df keeps its default 0..n-1 index - confirm.
  idx = indices[title]
  query = embeddings[idx].reshape(1, -1)

  # Ask for one extra neighbour because the query itself is part of the index.
  distances, movies = gpu_index.search(query, k + 1)
  neighbour_ids = movies[0]
  neighbour_dist = distances[0]

  # Drop the query explicitly (it is not guaranteed to be the first hit when
  # other rows share its embedding) and drop any -1 placeholder ids.
  keep = (neighbour_ids != idx) & (neighbour_ids >= 0)
  rec_ids = neighbour_ids[keep][:k]
  rec_dist = neighbour_dist[keep][:k]

  movie_titles = df['title'].iloc[rec_ids].tolist()
  if rec_dist.size == 0:
    return movie_titles, rec_dist

  # Map distances to scores: the farthest recommendation gets 0.1 before the
  # final rescaling, nearer ones get more. Guard the all-zero-distance case.
  farthest = rec_dist.max()
  if farthest > 0:
    likehood = 1 - rec_dist / farthest * 0.9
  else:
    likehood = np.ones_like(rec_dist)
  likehood = likehood / likehood.max()
  return movie_titles,likehood

## 2.7. Choose your movie here!

In [None]:
# Title to base the recommendations on; it is looked up by exact match in the
# title index built in section 2.6.
movie_you_watched = 'The Two Popes'
# recomendations: similar titles; likehood: one relative score per title.
recomendations, likehood = get_recomendations(movie_you_watched)

## 2.8. Create plot with recommendations

In [None]:
# Build the treemap of recommendations.
# Collect one row per recommended title, in recommendation order, and
# concatenate once. head(1) keeps the frame aligned with `likehood` when a
# title occurs more than once in df. pd.concat replaces the per-row
# DataFrame.append, which newer pandas versions no longer provide.
matching_rows = [df[df.title == t].head(1) for t in recomendations]
df_temp = pd.concat(matching_rows) if matching_rows else df.iloc[:0]
# Every tile gets the same blank parent, i.e. a flat, single-level treemap.
temp_tittle = [' '] * len(recomendations)

plot_title = 'Recomendations after watching ' + movie_you_watched 
fig = px.treemap(
    data_frame = df_temp,
    names = df_temp['title'],
    values = likehood,
    parents = temp_tittle,
    hover_name = df_temp['title'],
    hover_data=['director','release_year','rating','country','cast'],
    title = plot_title
)
fig.show()