### Part 4 - Load the chosen model & Extract its embeddings (latest changes on 25.03.2020)

#### Import the libraries

In [1]:
# For cleaning and preparing the dataset
# -> dataframe manipulation
# -> text manipulation
# -> Web Scrapping

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from tabulate import tabulate
import re
import os

import random

# Module to serialize the content produced from the execution of the code

import pickle

# Module to monitor the progress of a python for loop

from tqdm import tqdm_notebook

# Module to manipulate text in python - NLTK package

import nltk
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer

# Module to compute word vectorizers and compute the cosine distance

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_distances
from sklearn.feature_extraction import text

import string
import itertools

from scipy import stats

# from IPython.core.display import display, HTML
# display(HTML("<style>.container { width:70% !important; }</style>"))

In [2]:
%%time

from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

from time import time

#--------------------------------------------------------------

import tensorflow as tf
from tensorflow import keras

from tensorflow.keras import layers
from tensorflow.keras import models

from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

#---------------------------------------------------------------

%matplotlib inline
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
from pylab import rcParams

import pydot
import pydotplus
import graphviz

from IPython.display import SVG
from tensorflow.keras.utils import model_to_dot

from tensorflow.keras.models import load_model
from tensorflow.keras.models import model_from_json
import json

# Import ML FLow
import mlflow.tensorflow
import mlflow.pyfunc
from tensorflow.keras import regularizers
import datetime

# Import TensorBoard
import tensorflow_docs as tfdocs
import tensorflow_docs.plots as tfplots
import tensorflow_docs.modeling as tfmodel
from tensorflow.keras import regularizers
# from tensorboard import default
# from tensorboard import program

import tensorflow_hub as hub
import bert
from bert import tokenization
from bert.tokenization import FullTokenizer

#Visualize Model

def visualize_model(model):
    return SVG(model_to_dot(model, show_shapes= True, show_layer_names=True, dpi=65).create(prog='dot', format='svg'))

from tensorflow.keras.utils import plot_model

from packaging import version

print("TensorFlow version: ", tf.__version__)
assert version.parse(tf.__version__).release[0] >= 2, \
    "This notebook requires TensorFlow 2.0 or above."

print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
print("GPU is", "available" if tf.config.list_physical_devices('GPU') else "NOT AVAILABLE")

TensorFlow version:  2.1.0
Version:  2.1.0
Eager mode:  True
Hub version:  0.7.0
GPU is NOT AVAILABLE
Wall time: 3.86 s


#### Import the dataset from part 3.1

In [3]:
dataset = pd.read_pickle(os.path.join(os.getcwd(), "pickled_data_per_part\\dataset_part_3.1_25032020.pkl"))
dataset.shape

(48992, 38)

#### Load the already trained chosen model
This is the model that perfomed better than those trained on part 3. <br>
<i> The chosen model is the "Multi-input keras model". <i>

In [4]:
"""
Load the weights of the model saved with EarlyStopping
"""
with open(os.path.join(os.getcwd(), 'model_one\\multi_input_keras_model_{0}dim_{1}batchsize.json'.format(str(100), str(16))),'r') as f:
    model_json = json.load(f)

model = model_from_json(model_json)

model.load_weights(os.path.join(os.getcwd(), 'model_one\\multi_input_keras_model_{0}dim_{1}batchsize.h5'.format(str(100), str(16))))

print(type(model))
print("\nModel is loaded successfully")

<class 'tensorflow.python.keras.engine.training.Model'>

Model is loaded successfully


#### Load the saved tokenizers

In [6]:
"""
IMport the tokenizers of each input, fitted on part 3.1
"""
with open(os.path.join(os.getcwd(), '80-20 split_non-balanced\\20000_max_features\\actors_tokenizer_20000_25032020.pkl'),'rb') as f:
    actors_tokenizer = pickle.load(f)
    
with open(os.path.join(os.getcwd(), '80-20 split_non-balanced\\20000_max_features\\plot_tokenizer_20000_25032020.pkl'),'rb') as f:
    plot_tokenizer = pickle.load(f)
    
with open(os.path.join(os.getcwd(), '80-20 split_non-balanced\\20000_max_features\\features_tokenizer_20000_25032020.pkl'),'rb') as f:
    features_tokenizer = pickle.load(f)
    
with open(os.path.join(os.getcwd(), '80-20 split_non-balanced\\20000_max_features\\reviews_tokenizer_20000_25032020.pkl'),'rb') as f:
    reviews_tokenizer = pickle.load(f)
    
print("Tokenizers are loaded successfully!")

Tokenizers are loaded successfully!


In [7]:
len(actors_tokenizer.word_index)

20000

#### Two Fucntions have been assembled to complete the word embeddings extraction

In [8]:
def extract_word_embeddings(variable, model, tokenizer):
    
    if variable == "actors":
        
        embeddings_layer = model.layers[4].get_weights()[0]

        word_embeddings = {w:embeddings_layer[idx] for w, idx in tokenizer.word_index.items() if idx <= len(tokenizer.word_index)+1}
        
        print("\nActor's word embeddings length: {}\n".format(embeddings_layer.shape))
        
    elif variable == "plot":
        
        embeddings_layer = model.layers[5].get_weights()[0]

        word_embeddings = {w:embeddings_layer[idx] for w, idx in tokenizer.word_index.items() if idx <= len(tokenizer.word_index)+1}
        print("Plot Summary's word embeddings length: {}\n".format(embeddings_layer.shape))
        
    elif variable == "features":
        
        embeddings_layer = model.layers[6].get_weights()[0]

        word_embeddings = {w:embeddings_layer[idx] for w, idx in tokenizer.word_index.items() if idx <= len(tokenizer.word_index)+1}
        print("Movie's Features word embeddings length: {}\n".format(embeddings_layer.shape))
        
    elif variable == "reviews":
        
        embeddings_layer = model.layers[7].get_weights()[0]

        word_embeddings = {w:embeddings_layer[idx] for w, idx in tokenizer.word_index.items() if idx <= len(tokenizer.word_index)+1}
        print("Movie's Reviews word embeddings length: {}\n".format(embeddings_layer.shape))
        
    return embeddings_layer, word_embeddings

def assign_word_embeddings(variable, dataset, word_embeddings):
    
    if variable == "actors":
    
        average_vector_list_cast = []

        min_vector_list_cast = []

        max_vector_list_cast = []

        actors_embeddings_list = []

        for i in range(len(dataset.index)):
            
            actors = dataset.loc[:, "clean_actors"].iloc[i].split(",")
    
            assert [word.islower() for word in actors] # assert that all actors are present in lower case
    
            actors_embeddings_list.append([word_embeddings[word] if word in word_embeddings else word_embeddings['<OOV>'] for word in actors])
    
        dataset.loc[:, 'actors_embeddings_list'] = actors_embeddings_list

        for i in range(len(dataset.index)):
            
            actor_embeddings = dataset["actors_embeddings_list"].iloc[i]
    
            minimum = np.min([element for element in actor_embeddings], axis=0)
            maximum = np.max([element for element in actor_embeddings], axis=0)
            average = np.mean([element for element in actor_embeddings], axis=0)
    
            min_vector_list_cast.append(minimum)
            max_vector_list_cast.append(maximum)
            average_vector_list_cast.append(average)

        dataset['minimum_cast_vectors'] = min_vector_list_cast
        dataset['maximum_cast_vectors'] = max_vector_list_cast
        dataset['average_cast_vectors'] = average_vector_list_cast
        
    elif variable == "plot":
    
        average_vector_list_plot = []

        min_vector_list_plot = []

        max_vector_list_plot = []

        plot_embeddings_list = []

        for i in range(len(dataset.index)):
            
            plot = dataset["clean_plot_summary"].iloc[i]
    
            plot_split = plot.split(' ')
    
            assert [word.islower() for word in plot_split]
    
            plot_embeddings_list.append([word_embeddings[word] if word in word_embeddings else word_embeddings['<OOV>'] for word in plot_split])
    
        dataset['plot_embeddings_list'] = plot_embeddings_list

        for i in range(len(dataset.index)):
            
            plot_embeddings = dataset["plot_embeddings_list"].iloc[i]
    
            minimum = np.min([element for element in plot_embeddings], axis=0)
            maximum = np.max([element for element in plot_embeddings], axis=0)
            average = np.mean([element for element in plot_embeddings], axis=0)
    
            min_vector_list_plot.append(minimum)
            max_vector_list_plot.append(maximum)
            average_vector_list_plot.append(average)

        dataset['minimum_plot_vectors'] = min_vector_list_plot
        dataset['maximum_plot_vectors'] = max_vector_list_plot
        dataset['average_plot_vectors'] = average_vector_list_plot
        
    elif variable == "features":
    
        average_vector_list_combined_features = []

        min_vector_list_combined_features = []

        max_vector_list_combined_features = []

        combined_features_embeddings_list = []

        for i in range(len(dataset.index)):
    
            combined_features = dataset["clean_combined_features"].iloc[i]
    
            combined_features_split = combined_features.split(' ')
    
            assert [word.islower() for word in combined_features_split]
    
            combined_features_embeddings_list.append([word_embeddings[word] if word in word_embeddings else word_embeddings['<OOV>'] for word in combined_features_split])
    
        dataset['combined_features_embeddings_list'] = combined_features_embeddings_list

        for i in range(len(dataset.index)):
            
            combined_features_embeddings = dataset["combined_features_embeddings_list"].iloc[i]
    
            minimum = np.min([element for element in combined_features_embeddings], axis=0)
            maximum = np.max([element for element in combined_features_embeddings], axis=0)
            average = np.mean([element for element in combined_features_embeddings], axis=0)
    
            min_vector_list_combined_features.append(minimum)
            max_vector_list_combined_features.append(maximum)
            average_vector_list_combined_features.append(average)

        dataset['minimum_combined_features_vectors'] = min_vector_list_combined_features
        dataset['maximum_combined_features_vectors'] = max_vector_list_combined_features
        dataset['average_combined_features_vectors'] = average_vector_list_combined_features
        
    elif variable == "reviews":
    
        average_vector_list_reviews = []

        min_vector_list_reviews = []

        max_vector_list_reviews = []

        reviews_embeddings_list = []

        for i in range(len(dataset.index)):
    
            reviews = dataset["clean_reviews"].iloc[i]
    
            reviews_split = reviews.split(' ')
    
            assert [word.islower() for word in reviews_split]
    
            reviews_embeddings_list.append([word_embeddings[word] if word in word_embeddings else word_embeddings['<OOV>'] for word in reviews_split])
    
        dataset['reviews_embeddings_list'] = reviews_embeddings_list

        for i in range(len(dataset.index)):
            
            reviews_embeddings = dataset["reviews_embeddings_list"].iloc[i]
    
            minimum = np.min([element for element in reviews_embeddings], axis=0)
            maximum = np.max([element for element in reviews_embeddings], axis=0)
            average = np.mean([element for element in reviews_embeddings], axis=0)
    
            min_vector_list_reviews.append(minimum)
            max_vector_list_reviews.append(maximum)
            average_vector_list_reviews.append(average)

        dataset['minimum_reviews_vectors'] = min_vector_list_reviews
        dataset['maximum_reviews_vectors'] = max_vector_list_reviews
        dataset['average_reviews_vectors'] = average_vector_list_reviews

In [9]:
%%time
# Extract the word embeddings
print("\n---------------------------------------------------------------------------------")
print("\nExtract the word embeddings")

actors_embedding_layer, word_embeddings_actors = extract_word_embeddings("actors", model, actors_tokenizer)
print("\nWord embeddings for actors extracted\n")

plot_embedding_layer, word_embeddings_plot = extract_word_embeddings("plot", model, plot_tokenizer)
print("\nWord embeddings for plot summary extracted\n")

features_embedding_layer, word_embeddings_features = extract_word_embeddings("features", model, features_tokenizer)
print("\nWord embeddings for movie features extracted\n")

reviews_embedding_layer, word_embeddings_reviews = extract_word_embeddings("reviews", model, reviews_tokenizer)
print("\nWord embeddings for movie reviews extracted\n")


---------------------------------------------------------------------------------

Extract the word embeddings

Actor's word embeddings length: (20002, 100)


Word embeddings for actors extracted

Plot Summary's word embeddings length: (20002, 100)


Word embeddings for plot summary extracted

Movie's Features word embeddings length: (20002, 100)


Word embeddings for movie features extracted

Movie's Reviews word embeddings length: (20002, 100)


Word embeddings for movie reviews extracted

Wall time: 74.8 ms


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

In [None]:
features_embedding_layer = features_embedding_layer / np.linalg.norm(features_embedding_layer, axis = 1).reshape((-1, 1))
features_embedding_layer

In [None]:
features_embedding_layer.shape

In [None]:
from sklearn.manifold import TSNE

In [None]:
def reduce_dim(weights, components = 3, method = 'tsne'):
    """Reduce dimensions of embeddings"""
    if method == 'tsne':
        return TSNE(components, metric = 'cosine').fit_transform(weights)
    elif method == 'umap':
        # Might want to try different parameters for UMAP
        return UMAP(n_components=components, metric = 'cosine', 
                    init = 'random', n_neighbors = 5).fit_transform(weights)

In [None]:
movie_features = reduce_dim(features_embedding_layer, components = 2, method = 'tsne')
movie_features.shape

In [None]:
genres=dataset.columns[13:30].tolist()
ints, gen = pd.factorize(genres)
gen[:5]

In [None]:
idx_include = [range(20002)]

In [None]:
plt.figure(figsize = (10, 8))

# Plot embedding
plt.scatter(movie_features[idx_include, 0], movie_features[idx_include, 1], 
            c = ints, cmap = plt.cm.tab10)

# Add colorbar and appropriate labels
cbar = plt.colorbar()
cbar.set_ticks([])
for j, lab in enumerate(gen):
    cbar.ax.text(1, (2 * j + 1) / ((10) * 2), lab, ha='left', va='center')
cbar.ax.set_title('Genre', loc = 'left')


plt.xlabel('TSNE 1'); plt.ylabel('TSNE 2'); plt.title('TSNE Visualization of Book Embeddings');

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

In [10]:
%%time
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Assign the word embeddings to each different actor
print("\n---------------------------------------------------------------------------------")
print("\nAssign the word embeddings to each different word")

print("Assign word embeddings to actors")
assign_word_embeddings("actors", dataset, word_embeddings_actors)

print("Assign word embeddings to plot summary")
assign_word_embeddings("plot", dataset, word_embeddings_plot)

print("Assign word embeddings to movie features")
assign_word_embeddings("features", dataset, word_embeddings_features)

print("Assign word embeddings to movie reviews")
assign_word_embeddings("reviews", dataset, word_embeddings_reviews)


---------------------------------------------------------------------------------

Assign the word embeddings to each different word
Assign word embeddings to actors
Assign word embeddings to plot summary
Assign word embeddings to movie features
Assign word embeddings to movie reviews
Wall time: 29.6 s


In [11]:
print("\nThe word embedding vector of the actor 'tobey maguire' is:\n\n", word_embeddings_actors['tobey maguire'])


The word embedding vector of the actor 'tobey maguire' is:

 [-0.2724324  -0.24405827 -0.42759097 -0.488678   -0.37412882 -0.32075557
 -0.46312344 -0.22833727 -0.4938505  -0.23705494 -0.3767111  -0.31893927
 -0.29804748 -0.22008938 -0.37356162 -0.19021493 -0.3106991  -0.5046652
 -0.5067489  -0.13628903 -0.5100186  -0.4387468  -0.4045188  -0.42895505
 -0.17405698 -0.4196416  -0.32258913 -0.6304357  -0.5015337  -0.38341716
  0.20488445 -0.34742785 -0.39623627 -0.3919231  -0.351513   -0.48129812
 -0.27094525 -0.4885028  -0.30965883 -0.348081   -0.35770744 -0.32492277
 -0.36964163 -0.44122627 -0.3689114  -0.42301044 -0.70987767 -0.3271487
 -0.23144028 -0.5712749  -0.5027816  -0.3755143  -0.47165275 -0.3278778
 -0.39265478 -0.3043721  -0.4021658  -0.47702324 -0.4514447  -0.32985735
 -0.37301627 -0.4577185  -0.4199124  -0.27355987 -0.43950403 -0.32568955
 -0.3056007  -0.45140788 -0.45237774 -0.39233974 -0.39421234 -0.38454905
  1.4188163  -0.4648103  -0.31668818 -0.37490577 -0.44342712 -0.3

In [12]:
%%time
keras_embeddings_array_cast = np.hstack([dataset['average_cast_vectors'].apply(pd.Series).values,
                                         dataset['minimum_cast_vectors'].apply(pd.Series).values,
                                         dataset['maximum_cast_vectors'].apply(pd.Series).values])

keras_embeddings_array_plot = np.hstack([dataset['average_plot_vectors'].apply(pd.Series).values,
                                         dataset['minimum_plot_vectors'].apply(pd.Series).values,
                                         dataset['maximum_plot_vectors'].apply(pd.Series).values])

keras_embeddings_array_combined_features = np.hstack([dataset['average_combined_features_vectors'].apply(pd.Series).values,
                                                      dataset['minimum_combined_features_vectors'].apply(pd.Series).values,
                                                      dataset['maximum_combined_features_vectors'].apply(pd.Series).values])

keras_embeddings_array_reviews = np.hstack([dataset['average_reviews_vectors'].apply(pd.Series).values,
                                            dataset['minimum_reviews_vectors'].apply(pd.Series).values,
                                            dataset['maximum_reviews_vectors'].apply(pd.Series).values])

keras_embeddings_array_cast_plot_combined_features_reviews = np.hstack([keras_embeddings_array_cast, 
                                                                        keras_embeddings_array_plot, 
                                                                        keras_embeddings_array_combined_features,
                                                                        keras_embeddings_array_reviews])

print("Shape of the Actors embeddings: {}".format(keras_embeddings_array_cast.shape))
print("\nShape of the Plot Summary embeddings: {}".format(keras_embeddings_array_plot.shape))
print("\nShape of the Combined Features embeddings: {}".format(keras_embeddings_array_combined_features.shape))
print("\nShape of the Reviews embeddings: {}".format(keras_embeddings_array_reviews.shape))
print("\nShape of the concatenated embeddings(cast, plot, combined features): {}".format(keras_embeddings_array_cast_plot_combined_features_reviews.shape))

Shape of the Actors embeddings: (48992, 300)

Shape of the Plot Summary embeddings: (48992, 300)

Shape of the Combined Features embeddings: (48992, 300)

Shape of the Reviews embeddings: (48992, 300)

Shape of the concatenated embeddings(cast, plot, combined features): (48992, 1200)
Wall time: 1min 5s


In [13]:
"""
Since the chosen model is the "Model_1: Multi-Input Keras Model", we saved the relevant word embeddings to the folder "model_one"
"""
with open(os.path.join(os.getcwd(), 'model_one\\keras_embeddings_array_cast_{0}_{1}_25032020.pkl'.format(str(100), str(16))), 'wb') as f:
    pickle.dump(keras_embeddings_array_cast, f)
    
with open(os.path.join(os.getcwd(), 'model_one\\keras_embeddings_array_plot_{0}_{1}_25032020.pkl'.format(str(100), str(16))), 'wb') as f:
    pickle.dump(keras_embeddings_array_plot, f)
    
with open(os.path.join(os.getcwd(), 'model_one\\keras_embeddings_array_combined_features_{0}_{1}_25032020.pkl'.format(str(100), str(16))), 'wb') as f:
    pickle.dump(keras_embeddings_array_combined_features, f)
    
with open(os.path.join(os.getcwd(), 'model_one\\keras_embeddings_array_reviews_{0}_{1}_25032020.pkl'.format(str(100), str(16))), 'wb') as f:
    pickle.dump(keras_embeddings_array_combined_features, f)
    
with open(os.path.join(os.getcwd(), 'model_one\\keras_embeddings_array_concatenated_{0}_{1}_25032020.pkl'.format(str(100), str(16))), 'wb') as f:
    pickle.dump(keras_embeddings_array_cast_plot_combined_features_reviews, f)

#### Pickle the dataset with the word embeddings

In [14]:
dataset.to_pickle(os.path.join(os.getcwd(), 'pickled_data_per_part\\dataset_part_4_29032020.pkl'))

### END OF Part 4 - Load the chosen model & Extract Word Embeddings