### Part 3.2 - Model 2: GloVe embeddings model (latest changes on 08.03.2020)

#### Import the libraries

In [None]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from tabulate import tabulate
import re
import os

import random

# Module to serialize the content produced from the execution of the code

import pickle

# Module to monitor the progress of a python for loop

from tqdm import tqdm_notebook, tqdm

# Module to manipulate text in python - NLTK package

import nltk
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

# Module to compute word vectorizers and compute the cosine distance

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_distances

import string
import itertools

# from IPython.core.display import display, HTML
# display(HTML("<style>.container { width:70% !important; }</style>"))

#### Keras Text Classification (For creating the word embeddings)

In [None]:
from nltk.stem import WordNetLemmatizer

from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

from time import time

#--------------------------------------------------------------

import tensorflow as tf
from tensorflow import keras

from tensorflow.keras import layers
from tensorflow.keras import models

from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

#---------------------------------------------------------------

%matplotlib inline
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
from pylab import rcParams

import pydot
import pydotplus
import graphviz

from IPython.display import SVG
from tensorflow.keras.utils import model_to_dot

from tensorflow.keras.models import load_model
from tensorflow.keras.models import model_from_json
import json

# Import ML FLow
import mlflow.tensorflow
import mlflow.pyfunc
from tensorflow.keras import regularizers
import datetime

# Import TensorBoard
import tensorflow_docs as tfdocs
import tensorflow_docs.plots as tfplots
import tensorflow_docs.modeling as tfmodel
from tensorflow.keras import regularizers
# from tensorboard import default
# from tensorboard import program

import tensorflow_hub as hub
import bert
from bert import tokenization
from bert.tokenization import FullTokenizer

#Visualize Model

def visualize_model(model):
    """
    Render the layer graph of a Keras model as an SVG for notebook display.

    :param model: Keras model to draw.
    :return: ``IPython.display.SVG`` built from the dot graph, with layer shapes and names shown.
    """
    dot_graph = model_to_dot(model, show_shapes=True, show_layer_names=True, dpi=65)
    svg_markup = dot_graph.create(prog='dot', format='svg')
    return SVG(svg_markup)

from tensorflow.keras.utils import plot_model

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import hamming_loss, zero_one_loss, f1_score

from packaging import version

# Report the TensorFlow version and stop early if the major version is below 2.
print("TensorFlow version: ", tf.__version__)
assert version.parse(tf.__version__).release[0] >= 2, \
    "This notebook requires TensorFlow 2.0 or above."

# Print the runtime environment: TF version, eager mode, TF-Hub version and GPU visibility.
print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
# An empty list of physical GPU devices is falsy, so this prints "NOT AVAILABLE" on CPU-only machines.
print("GPU is", "available" if tf.config.list_physical_devices('GPU') else "NOT AVAILABLE")

#### Import the dataset

In [None]:
def _load_split_array(filename):
    """Load one saved ``.npy`` array from the 80-20 non-balanced split folder under the working directory."""
    return np.load(os.path.join(os.getcwd(), "80-20 split_non-balanced\\" + filename))


# Training inputs: one array per text field (actors, plot, features, reviews)
X_train_seq_actors = _load_split_array("x_train_seq_actors_80-20_non-balanced_06032020.npy")
X_train_seq_plot = _load_split_array("x_train_seq_plot_80-20_non-balanced_06032020.npy")
X_train_seq_features = _load_split_array("x_train_seq_features_80-20_non-balanced_06032020.npy")
X_train_seq_reviews = _load_split_array("x_train_seq_reviews_80-20_non-balanced_06032020.npy")

print("X_train data inputs have been loaded!\n")

# Test inputs, same four fields
X_test_seq_actors = _load_split_array("x_test_seq_actors_80-20_non-balanced_06032020.npy")
X_test_seq_plot = _load_split_array("x_test_seq_plot_80-20_non-balanced_06032020.npy")
X_test_seq_features = _load_split_array("x_test_seq_features_80-20_non-balanced_06032020.npy")
X_test_seq_reviews = _load_split_array("x_test_seq_reviews_80-20_non-balanced_06032020.npy")

print("X_test data inputs have been loaded!\n")

# Target arrays
y_train = _load_split_array("y_train_80-20_non-balanced_06032020.npy")
y_test = _load_split_array("y_test_80-20_non-balanced_06032020.npy")

print("y_train & y_test have been loaded!\n")

# Every input of a split must have the same number of rows as its targets
assert (X_train_seq_actors.shape[0]
        == X_train_seq_plot.shape[0]
        == X_train_seq_features.shape[0]
        == X_train_seq_reviews.shape[0]
        == y_train.shape[0])

assert (X_test_seq_actors.shape[0]
        == X_test_seq_plot.shape[0]
        == X_test_seq_features.shape[0]
        == X_test_seq_reviews.shape[0]
        == y_test.shape[0])

#### Import the saved tokenizers

In [None]:
"""
Import the tokenizers of each input, fitted in part 3.1
"""
def _load_tokenizer(filename):
    """Unpickle one fitted tokenizer from the 80-20 non-balanced split folder under the working directory."""
    # NOTE: pickle can execute arbitrary code while loading; only open files produced by this project.
    with open(os.path.join(os.getcwd(), '80-20 split_non-balanced\\' + filename), 'rb') as f:
        return pickle.load(f)


actors_tokenizer = _load_tokenizer('actors_tokenizer_06032020.pkl')
plot_tokenizer = _load_tokenizer('plot_tokenizer_06032020.pkl')
features_tokenizer = _load_tokenizer('features_tokenizer_06032020.pkl')
reviews_tokenizer = _load_tokenizer('reviews_tokenizer_06032020.pkl')

print("Tokenizers are loaded successfully!")

# <b>- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -  </b>

### MODEL 2: GloVe Pre-trained embeddings Model

In [None]:
### Load the pretrained word embeddings

import os

def load_glove_embeddings(dim: int = 300) -> dict:
    """
    Read the pre-trained GloVe 6B vectors of the requested size into a dictionary.

    The file ``glove.6B.<dim>d.txt`` is read from the ``glove.6B`` folder under the
    current working directory. Each non-empty line holds a word followed by the
    values of its embedding, separated by whitespace.

    :param dim: The embeddings size (dimensions)
    :return: Dictionary mapping each word to its ``float32`` numpy embedding vector.
    :raises OSError: If the embeddings file cannot be opened.

    """
    # To switch to the 42B vectors, point both the folder and the file name at 'glove.42B'.
    glove_dir = os.path.join(os.getcwd(), 'glove.6B')  # This is the folder with the embeddings

    print('Loading word vectors')

    embed_index = dict()  # We create a dictionary of word -> embedding

    fname = os.path.join(glove_dir, 'glove.6B.{}d.txt'.format(dim))

    # The context manager closes the file even when a malformed line raises while parsing.
    with open(fname, 'r', errors='ignore', encoding='utf8') as f:
        # In the dataset, each line represents a new word embedding
        # The line starts with the word and the embedding values follow
        for line in tqdm(f, desc='Loading Embeddings', unit='word'):
            values = line.split()
            if not values:
                # Blank line (e.g. a trailing newline at the end of the file): nothing to load
                continue
            # The first value is the word, the rest are the values of the embedding
            embed_index[values[0]] = np.asarray(values[1:], dtype='float32')

    print('Found %s word vectors.' % len(embed_index))

    return embed_index

In [None]:
# Size of the GloVe vectors to load; also passed as emb_dim when the embedding matrices are built.
embedding_dim = 300  # We now use larger embeddings

# word -> vector dictionary read from the GloVe text file
embeddings_index = load_glove_embeddings(dim=embedding_dim)

In [None]:
"""
Based on the imported embedding index of GloVe embeddings, below we present the embedding vector of the word "adventure"
"""
# Notebook display only: raises KeyError if 'adventure' is not in the loaded vocabulary.
embeddings_index['adventure']

In [None]:
def create_embeddings_matrix(emb_index: dict,
                             tokenizer: "Tokenizer",
                             emb_dim: int = 300) -> np.ndarray:
    """
    Build the initial weight matrix of an embedding layer from pre-trained vectors.

    The matrix is first filled with random values drawn from a normal distribution
    that has the same mean and standard deviation as all pre-trained vectors taken
    together. Row ``i`` is then overwritten with the pre-trained vector of the word
    whose tokenizer index is ``i``; rows of words without a pre-trained vector keep
    their random values.

    :param emb_index: Embeddings Index (word -> vector of length ``emb_dim``).
    :param tokenizer: Keras fitted tokenizer; only its ``word_index`` mapping is read.
    :param emb_dim: Embeddings dimension.
    :return: A matrix of shape (len(word_index) + 2, emb_dim) containing the GloVe embeddings.
    :raises ValueError: If ``emb_index`` is empty.

    """
    assert emb_dim in [50, 100, 200, 300]

    if not emb_index:
        raise ValueError("emb_index is empty: there are no embedding vectors to build the matrix from")

    # Create a matrix of all embeddings (stacking=concatenating all the vectors).
    # np.stack needs a real sequence: a dict view is rejected by current NumPy releases.
    all_embs = np.stack(list(emb_index.values()))

    # Mean and standard deviation over every value of every vector
    emb_mean = all_embs.mean()
    emb_std = all_embs.std()

    print("Embeddings AVG: {} | STD: {}".format(emb_mean, emb_std))

    word_index = tokenizer.word_index

    # Two rows more than the vocabulary, matching the "+2" vocabulary size the
    # embedding layers of this notebook are created with.
    nb_words = len(word_index) + 2

    print(nb_words)

    # Create a random matrix with the same mean and std as the embeddings
    embedding_matrix = np.random.normal(emb_mean,  # mean
                                        emb_std,  # std
                                        (nb_words, emb_dim)  # shape of the matrix
                                        )

    # The vectors need to be in the same position as their index.
    # Meaning a word with token 1 needs to be in the second row (rows start with zero) and so on
    for word, i in word_index.items():

        # Defensive guard: skip any index that would fall outside the matrix
        if i >= nb_words:
            continue

        embedding_vector = emb_index.get(word)

        # If there is an embedding vector, put it in the embedding matrix
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

In [None]:
# One embedding matrix per text input, built in the order:
# actors, plot summary, movie features, movie reviews.
(embedding_matrix_actors,
 embedding_matrix_plot,
 embedding_matrix_features,
 embedding_matrix_reviews) = (
    create_embeddings_matrix(emb_index=embeddings_index,
                             tokenizer=fitted_tokenizer,
                             emb_dim=embedding_dim)
    for fitted_tokenizer in (actors_tokenizer,
                             plot_tokenizer,
                             features_tokenizer,
                             reviews_tokenizer)
)

In [None]:
# Load the TensorBoard notebook extension (IPython magic: only valid inside Jupyter/IPython).
%load_ext tensorboard
# %reload_ext tensorboard  (use instead when the extension is already loaded)

# Timestamped log directory name, written with Windows-style path separators.
logdir=".\\logs\\fit\\" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

# Callback function with early stopping to avoid overfitting

class Callback_Configurations:
    """Constants read by ``callback`` when it configures EarlyStopping and ModelCheckpoint."""

    # Quantity watched by both callbacks
    MONITOR_METRIC = 'val_loss'
    # 'min': the monitored quantity is expected to decrease
    MODE = 'min'
    # Smallest change of the monitored quantity that counts as an improvement
    MINIMUM_DELTA = 0.1
    # Number of epochs without improvement tolerated before stopping
    PATIENCE = 2
    # Verbosity level handed to the callbacks
    VERBOSE = 1
    
def callback(saved_model, model, logdir):
    """
    Build the Keras callbacks used while fitting, resuming from saved weights when possible.

    When both the architecture file (``<saved_model>.json``) and the weights file
    (``<saved_model>.h5``) exist in the ``model_two`` folder under the working
    directory, the weights are loaded into ``model`` in place so training continues
    from them. Otherwise, or when the weights cannot be loaded, the model is left
    untouched and fitting starts from scratch.

    :param saved_model: Base file name (without extension) of the stored model; also
        the name of the checkpoint file written by ModelCheckpoint.
    :param model: Keras model that is about to be fitted.
    :param logdir: Directory where TensorBoard logs are written.
    :return: List with the EpochDots, EarlyStopping, ModelCheckpoint and TensorBoard callbacks.
    """
    weights_fname = os.path.join(os.getcwd(), 'model_two\\{}.h5'.format(saved_model))
    architecture_fname = os.path.join(os.getcwd(), 'model_two\\{}.json'.format(saved_model))

    resumed = False
    if os.path.isfile(architecture_fname) and os.path.isfile(weights_fname):
        try:
            # Load into the caller's model: a model rebuilt locally would be thrown
            # away when this function returns and the saved weights never used.
            model.load_weights(weights_fname)
            resumed = True
        except (OSError, ValueError) as err:
            # Unreadable file, or weights that do not fit this architecture
            print('\nSaved weights could not be loaded: {}'.format(err))

    if not resumed:
        print('\nPre-trained weights not found. Fitting from start')

    monitor_metric = Callback_Configurations.MONITOR_METRIC

    callbacks = [
        tfmodel.EpochDots(),

        EarlyStopping(monitor=monitor_metric,
                      min_delta=Callback_Configurations.MINIMUM_DELTA,
                      patience=Callback_Configurations.PATIENCE,
                      verbose=Callback_Configurations.VERBOSE,
                      mode=Callback_Configurations.MODE,
                      restore_best_weights=True),

        # Keeps only the best weights seen so far, in the same file the resume step reads
        ModelCheckpoint(filepath=weights_fname,
                        monitor=monitor_metric,
                        verbose=Callback_Configurations.VERBOSE,
                        save_best_only=True,
                        save_weights_only=True),

        tf.keras.callbacks.TensorBoard(logdir)
    ]
    return callbacks

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

# Function to fit the Keras multi-input model

def fitting_model(method,
                  model,
                  x_train_seq_actors, 
                  x_train_seq_plot, 
                  x_train_seq_features,
                  x_train_seq_reviews,
                  x_test_seq_actors, 
                  x_test_seq_plot, 
                  x_test_seq_features,
                  x_test_seq_reviews,
                  y_train, 
                  y_test,
                  callbacks,
                  epoch,
                  verbose_fit,
                  batch_size_fit,
                  val_split):
    """
    Fit the four-input model and report how long training took.

    :param method: "validation_split" to hold out a fraction of the training data
        (``val_split``), or "validation_data" to validate on the given test arrays.
    :param model: Compiled model exposing a Keras-style ``fit`` method.
    :param x_train_seq_actors: Training input for the actors branch.
    :param x_train_seq_plot: Training input for the plot branch.
    :param x_train_seq_features: Training input for the features branch.
    :param x_train_seq_reviews: Training input for the reviews branch.
    :param x_test_seq_actors: Test input (actors); only used with "validation_data".
    :param x_test_seq_plot: Test input (plot); only used with "validation_data".
    :param x_test_seq_features: Test input (features); only used with "validation_data".
    :param x_test_seq_reviews: Test input (reviews); only used with "validation_data".
    :param y_train: Training targets.
    :param y_test: Test targets; only used with "validation_data".
    :param callbacks: Callbacks forwarded to ``fit``.
    :param epoch: Number of epochs.
    :param verbose_fit: Verbosity forwarded to ``fit``.
    :param batch_size_fit: Batch size forwarded to ``fit``.
    :param val_split: Validation fraction; only used with "validation_split".
    :return: Whatever ``model.fit`` returns (the training history for Keras models).
    :raises ValueError: If ``method`` is not one of the two supported values.
    """
    if method == "validation_split":
        validation_kwargs = {"validation_split": val_split}
    elif method == "validation_data":
        validation_kwargs = {
            "validation_data": ([x_test_seq_actors, x_test_seq_plot, x_test_seq_features, x_test_seq_reviews],
                                y_test)
        }
    else:
        # Fail before training instead of hitting an unbound variable at return time
        raise ValueError(
            "method must be 'validation_split' or 'validation_data', got {!r}".format(method))

    s = time()

    fit_model = model.fit([x_train_seq_actors, x_train_seq_plot, x_train_seq_features, x_train_seq_reviews],
                          y_train,
                          epochs=epoch,
                          verbose=verbose_fit,
                          batch_size=batch_size_fit,
                          callbacks=callbacks,
                          **validation_kwargs)

    duration = time() - s
    print("\nTraining time finished. Duration {} secs".format(duration))

    return fit_model

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

def save_model(model, model_name):
    """
    Write a model's architecture (JSON) and weights (HDF5) under the working directory.

    The architecture string returned by ``model.to_json()`` is itself dumped with
    ``json.dump``, so the file must be read back with ``json.load`` to recover it.

    :param model: Model exposing ``to_json`` and ``save_weights``.
    :param model_name: Base file name (without extension) used for both files.
    """
    architecture = model.to_json()

    architecture_path = os.path.join(os.getcwd(), "model_two\\{}.json".format(model_name))
    with open(architecture_path, "w") as json_file:
        json.dump(architecture, json_file)

    weights_path = os.path.join(os.getcwd(), "model_two\\{}.h5".format(model_name))
    model.save_weights(weights_path)

    print("\nModel's weights are saved")
    
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

# function to plot the model metrics (deprecated)

def plot_model_metrics(fit_model):
    """
    Show two line charts: accuracy per epoch, then loss per epoch.

    Each chart draws the training curve in green and the validation curve
    (``val_`` prefixed entry, labelled 'test' in the legend) in blue.

    :param fit_model: Object whose ``history`` dict holds the 'accuracy',
        'val_accuracy', 'loss' and 'val_loss' series.
    """
    for metric in ('accuracy', 'loss'):

        rcParams['figure.figsize'] = 12, 6

        plt.plot(fit_model.history[metric], 'g')
        plt.plot(fit_model.history['val_' + metric], 'b')
        plt.title('model ' + metric)
        plt.ylabel(metric)
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
        plt.grid(True)
        plt.show()

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

# alternative function to plot the model metrics (used)

def plot_keras_history(history): #where history =  model.fit()
    """
    Draw one figure per training metric and save the last figure as a PNG.

    For every key of ``history.history`` that does not start with ``val_`` the
    training curve is plotted in blue; when a matching ``val_<metric>`` series
    exists and is non-empty it is added in green. The legend shows the value
    reached in the last epoch.

    :param history: Object returned by ``model.fit`` (its ``history`` dict is read).
    :return: None. The current figure is written to ``glove_embeddings_model.png``.
    """
    recorded = history.history

    # Keys without the 'val_' prefix are the metrics measured on the training data
    train_metric_names = [name for name in recorded.keys() if not name.startswith('val_')]

    for figure_number, metric in enumerate(train_metric_names):

        train_curve = recorded.get(metric, [])
        val_curve = recorded.get("val_{}".format(metric), [])

        # One x position per recorded epoch, starting at 1
        epochs = range(1, len(train_curve) + 1)

        # Leading spaces align this label with the (longer) validation label
        training_text = "   Training {}: {:.5f}".format(metric, train_curve[-1])

        plt.figure(figure_number, figsize=(12, 6))
        plt.plot(epochs, train_curve, 'b', label=training_text)

        # Validation series are optional
        if val_curve:
            validation_text = "Validation {}: {:.5f}".format(metric, val_curve[-1])
            plt.plot(epochs, val_curve, 'g', label=validation_text)

        plt.title('Model Metric: {}'.format(metric))
        plt.xlabel('Epochs')
        plt.ylabel(metric.title())
        plt.legend()

    # Keep a handle on the current (last created) figure so it can be saved after display
    last_figure = plt.gcf()
    plt.show()
    plt.draw()
    last_figure.savefig(os.path.join(os.getcwd(), 'model_two\\glove_embeddings_model.png'), dpi=100)

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

# Create a function that takes as input a dataframe with the metrics (validation, accuracy) and creates a plot per epoch
# Proposed modules: seaborn, plotly, matplotlib.

In [None]:
# Neural Network Logging parameters

# Settings read when the neural network is created and compiled
neural_network_parameters = {
    'embedding_dimension': 300,
    'pool_size': None,
    'padding': 'valid',
    'batch_size': 16,
    'l2_regularization': 0.01,
    'dropout_rate': 0.0,
    'dense_activation': 'relu',
    'output_activation': 'sigmoid',
    # One output unit per target column
    'number_target_variables': len(y_train[0]),
    'model_loss': "binary_crossentropy",  # alternative: 'sparse_categorical_crossentropy'
    'model_metric': "accuracy",  # alternative: 'sparse_categorical_accuracy'
}

# Filled in by the optimizer sections below
optimizer_parameters = {}

#--------------------------------------------------------------------------------------

# Settings used when fitting the neural network
fit_parameters = {
    "steps_per_epoch": len(X_train_seq_actors)//neural_network_parameters['batch_size'],
    "epoch": 150,
    "verbose_fit": 0,
    "batch_size_fit": 16,
}

#---------------------------------------------------------------------------------------

# Optimize the neural network

# Optimizer: ADAM (version_1)
optimizer_parameters.update({
    'adam_learning_rate': 0.001,
    'adam_beta_1': 0.99,
    'adam_beta_2': 0.999,
    'adam_amsgrad': False,
})

def optimizer_adam_v1():
    """
    Create an Adam optimizer with a fixed learning rate.

    :return: ``keras.optimizers.Adam`` configured from the ``adam_*`` entries
        of ``optimizer_parameters`` (read at call time).
    """
    adam_settings = {
        'learning_rate': optimizer_parameters['adam_learning_rate'],
        'beta_1': optimizer_parameters['adam_beta_1'],
        'beta_2': optimizer_parameters['adam_beta_2'],
        'amsgrad': optimizer_parameters['adam_amsgrad'],
    }
    return keras.optimizers.Adam(**adam_settings)
#---------------------------------------------------------------------------------------

# Optimizer: ADAM (version_2)
# Number of batches in one pass over the training data
optimizer_parameters['steps_per_epoch'] = len(X_train_seq_actors)//neural_network_parameters['batch_size']
optimizer_parameters.update({
    'lr_schedule_learning_rate': 0.01,
    'lr_schedule_decay_steps': optimizer_parameters['steps_per_epoch']*1000,
    'lr_schedule_decay_rate': 1,
    'staircase': False,
})

# Learning rate that decays with the training step (inverse time decay)
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
    optimizer_parameters['lr_schedule_learning_rate'],
    decay_steps=optimizer_parameters['lr_schedule_decay_steps'],
    decay_rate=optimizer_parameters['lr_schedule_decay_rate'],
    staircase=optimizer_parameters['staircase'],
)

def optimizer_adam_v2():
    """
    Create an Adam optimizer driven by the module-level ``lr_schedule``.

    :return: ``keras.optimizers.Adam`` whose learning rate follows ``lr_schedule``.
    """
    return keras.optimizers.Adam(learning_rate=lr_schedule)
#---------------------------------------------------------------------------------------

# Optimizer: SGD (version 1)

optimizer_parameters['SGD_learning_rate'] = 0.01
optimizer_parameters['SGD_decay'] = 1e-6
optimizer_parameters['SGD_momentum'] = 0.9
optimizer_parameters['SGD_nesterov'] = True

def optimizer_SDG_v1():
    """
    Create an SGD optimizer with Nesterov momentum and a decaying learning rate.

    The deprecated ``lr`` / ``decay`` constructor arguments are replaced by
    ``learning_rate`` and an inverse-time-decay schedule. With ``decay_steps=1``
    the schedule yields ``lr / (1 + decay * step)``, the same per-step decay the
    legacy ``decay`` argument applied.

    :return: ``keras.optimizers.SGD`` configured from the ``SGD_*`` entries
        of ``optimizer_parameters`` (read at call time).
    """
    decayed_learning_rate = tf.keras.optimizers.schedules.InverseTimeDecay(
        optimizer_parameters['SGD_learning_rate'],
        decay_steps=1,
        decay_rate=optimizer_parameters['SGD_decay'])

    return keras.optimizers.SGD(learning_rate=decayed_learning_rate,
                                momentum=optimizer_parameters['SGD_momentum'],
                                nesterov=optimizer_parameters['SGD_nesterov'])

#---------------------------------------------------------------------------------------

In [None]:
def _glove_embedding_branch(branch_input,
                            vocab_size,
                            sequence_length,
                            embedding_matrix,
                            embedding_name,
                            pooling_name):
    """Apply a frozen, pre-initialised embedding layer followed by global max pooling to one input."""
    embedded = layers.Embedding(input_dim=vocab_size,
                                output_dim=neural_network_parameters['embedding_dimension'],
                                input_length=sequence_length,
                                weights=[embedding_matrix],
                                trainable=False,
                                name=embedding_name)(branch_input)

    return layers.GlobalMaxPooling1D(name=pooling_name)(embedded)


def keras_glove_embeddings_v1(maxlen_actors,
                              maxlen_plot,
                              maxlen_features,
                              maxlen_reviews,
                              actors_max_words, 
                              plot_max_words,
                              features_max_words,
                              reviews_max_words,
                              embedding_matrix_actors,
                              embedding_matrix_plot,
                              embedding_matrix_features,
                              embedding_matrix_reviews,
                              optimizer_version = None):
    """
    Build and compile the four-input model that uses frozen pre-trained embeddings.

    Each input (actors, plot, features, reviews) goes through its own non-trainable
    embedding layer initialised from the given matrix and a global max pooling layer.
    The four pooled vectors are concatenated and passed through one dense layer,
    dropout and the output layer. Hyper-parameters are read from the module-level
    ``neural_network_parameters`` dictionary. A picture of the architecture is
    written to the ``model_two`` folder.

    :param maxlen_actors: Sequence length of the actors input.
    :param maxlen_plot: Sequence length of the plot input.
    :param maxlen_features: Sequence length of the features input.
    :param maxlen_reviews: Sequence length of the reviews input.
    :param actors_max_words: Vocabulary size of the actors tokenizer (2 is added internally).
    :param plot_max_words: Vocabulary size of the plot tokenizer (2 is added internally).
    :param features_max_words: Vocabulary size of the features tokenizer (2 is added internally).
    :param reviews_max_words: Vocabulary size of the reviews tokenizer (2 is added internally).
    :param embedding_matrix_actors: Initial weights of the actors embedding layer.
    :param embedding_matrix_plot: Initial weights of the plot embedding layer.
    :param embedding_matrix_features: Initial weights of the features embedding layer.
    :param embedding_matrix_reviews: Initial weights of the reviews embedding layer.
    :param optimizer_version: Optimizer (instance or Keras optimizer name) to compile
        with. ``None`` selects ``optimizer_adam_v2()``.
    :return: The compiled ``keras.Model``.
    """
    actors = keras.Input(shape=(maxlen_actors,), name='actors_input')
    plot = keras.Input(shape=(maxlen_plot,), name='plot_input')
    features = keras.Input(shape=(maxlen_features,), name='features_input')
    reviews = keras.Input(shape=(maxlen_reviews,), name='reviews_input')

    # Vocabulary sizes get +2 so they match the number of rows of the embedding
    # matrices, which are built with len(word_index) + 2 rows.
    encoded_layer1 = _glove_embedding_branch(actors, actors_max_words + 2, maxlen_actors,
                                             embedding_matrix_actors,
                                             "actors_embedding_layer", "globalmaxpooling1")

    encoded_layer2 = _glove_embedding_branch(plot, plot_max_words + 2, maxlen_plot,
                                             embedding_matrix_plot,
                                             "plot_embedding_layer", "globalmaxpooling2")

    encoded_layer3 = _glove_embedding_branch(features, features_max_words + 2, maxlen_features,
                                             embedding_matrix_features,
                                             "features_embedding_layer", "globalmaxpooling3")

    encoded_layer4 = _glove_embedding_branch(reviews, reviews_max_words + 2, maxlen_reviews,
                                             embedding_matrix_reviews,
                                             "reviews_embedding_layer", "globalmaxpooling4")

    merged = layers.concatenate([encoded_layer1, encoded_layer2, encoded_layer3, encoded_layer4], axis=-1)

    # The number of units of the dense layer is taken from the 'batch_size' setting
    dense_layer_1 = layers.Dense(neural_network_parameters['batch_size'], 
                                 kernel_regularizer=regularizers.l2(neural_network_parameters['l2_regularization']),
                                 activation=neural_network_parameters['dense_activation'],
                                 name="1st_dense_layer")(merged)

    # The dropout output must feed the next layer, otherwise dropout is never applied
    dropout_layer_1 = layers.Dropout(neural_network_parameters['dropout_rate'])(dense_layer_1)

    output_layer = layers.Dense(neural_network_parameters['number_target_variables'],
                                activation=neural_network_parameters['output_activation'],
                                name='output_layer')(dropout_layer_1)

    model = keras.Model(inputs=[actors, plot, features, reviews], outputs=output_layer)

    print(model.output_shape)

    # summary() prints by itself; wrapping it in print() only adds a stray "None"
    model.summary()

    # Default to the scheduled Adam optimizer; otherwise use what the caller supplied
    optimizer = optimizer_adam_v2() if optimizer_version is None else optimizer_version

    model.compile(optimizer=optimizer,
                  loss=neural_network_parameters['model_loss'],
                  metrics=[neural_network_parameters['model_metric']])

    plot_model(model, to_file=os.path.join(os.getcwd(), 'model_two\\structure_glove_pretrained_embeddings.png'))

    return model

<b>- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -  </b>

In [None]:
"""
For a long time we had the following error:

InvalidArgumentError: indices[26,0] = 10084 is not in [0, 10083)

This error was fixed by increasing the vocabulary size by +2. This increase was also observed in part 4, where each tokenizer
has its vocabulary size increased by +2 more embedding vector arrays.

This is a common error and we should be cautious about it. Otherwise the keras input layers won't be crafted correctly.
"""

# RUN MLFLOW program
# One MLflow run: build the model, fit it, save it, plot the history, evaluate on
# the test set and log parameters, metrics and the model itself.
mlflow.set_experiment("/glove_embeddings")
with mlflow.start_run():
    
    model_repository = {}
    # Sequence lengths come from the second dimension of the padded training arrays,
    # vocabulary sizes from the fitted tokenizers.
    model_repository['glove_model'] = keras_glove_embeddings_v1(X_train_seq_actors.shape[1],
                                                                X_train_seq_plot.shape[1],
                                                                X_train_seq_features.shape[1],
                                                                X_train_seq_reviews.shape[1],
                                                                len(actors_tokenizer.word_index),
                                                                len(plot_tokenizer.word_index),
                                                                len(features_tokenizer.word_index),
                                                                len(reviews_tokenizer.word_index),
                                                                embedding_matrix_actors,
                                                                embedding_matrix_plot,
                                                                embedding_matrix_features,
                                                                embedding_matrix_reviews,
                                                                optimizer_version = None)
    
    # TensorBoard log directory for this run (rebinds the module-level logdir)
    logdir = ".\\logs_test\\fit\\" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    #callbacks = callback("glove_model", model_repository['glove_model'], logdir)
    model_history = {}
    
    # Fit
    # NOTE(review): the callbacks use the name "glove_embeddings" while save_model below
    # writes "model_glove_embeddings" - the two sets of files are different; confirm this is intended.
    model_history['experiment'] = fitting_model("validation_split",
                                                model_repository['glove_model'],
                                                X_train_seq_actors, #input_1
                                                X_train_seq_plot, #input_2
                                                X_train_seq_features, #input_3
                                                X_train_seq_reviews, #input4
                                                X_test_seq_actors,
                                                X_test_seq_plot,
                                                X_test_seq_features,
                                                X_test_seq_reviews,
                                                y_train, #output
                                                y_test,
                                                callback("glove_embeddings", model_repository['glove_model'], logdir), #callback function
                                                fit_parameters["epoch"],
                                                fit_parameters["verbose_fit"],
                                                fit_parameters["batch_size_fit"],
                                                0.2)  # validation_split: 20% of the training data

    # Per-epoch metrics as a table, printed and pickled
    hist = pd.DataFrame(model_history['experiment'].history)
    hist['epoch'] = model_history['experiment'].epoch
    print("\nTable of training the keras text classification model\n")
    print(tabulate(hist, headers='keys', tablefmt='psql'))
    
    hist.to_pickle(os.path.join(os.getcwd(), "model_two\\metrics_histogram_glove_embeddings.pkl"))
    
    save_model(model_repository['glove_model'], "model_glove_embeddings")
    
    #version_1 of plot model
    #plot_model_metrics(model_history['experiment'])
    
    #version_2 of plot model
    plot_keras_history(model_history['experiment'])
    
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    
    # Evaluate
    # evaluate() returns [loss, metric] for the single metric the model was compiled with
    model_evaluation = model_repository['glove_model'].evaluate([X_test_seq_actors, X_test_seq_plot, X_test_seq_features, X_test_seq_reviews], 
                                                                 y_test,
                                                                 batch_size=fit_parameters["batch_size_fit"],
                                                                 verbose=2)
    print('\nTest Score:', model_evaluation[0])

    print('\nTest Accuracy:', model_evaluation[1])
    
    #neural_model params
    mlflow.log_param("embedding_dimension", neural_network_parameters['embedding_dimension'] )
    mlflow.log_param("pool_size", neural_network_parameters['pool_size'])
    mlflow.log_param("padding", neural_network_parameters['padding'])
    mlflow.log_param("batch_size", neural_network_parameters['batch_size'])
    mlflow.log_param("l2_regularization", neural_network_parameters['l2_regularization'])
    mlflow.log_param("dropout_rate", neural_network_parameters['dropout_rate'])
    mlflow.log_param("dense_activation", neural_network_parameters['dense_activation'])
    mlflow.log_param("output_activation",neural_network_parameters['output_activation'])
    mlflow.log_param("model_loss",neural_network_parameters['model_loss']) #takes any data type
    mlflow.log_param("model_metric",neural_network_parameters['model_metric'])
    
    #optimizer params
    # NOTE(review): with optimizer_version=None the model is compiled with optimizer_adam_v2()
    # (learning-rate schedule), so these adam_* (version_1) values may not describe the
    # optimizer actually used - confirm what should be logged.
    mlflow.log_param("adam_learning_rate",optimizer_parameters['adam_learning_rate'])
    mlflow.log_param("adam_beta_1",optimizer_parameters['adam_beta_1'])
    mlflow.log_param("adam_beta_2",optimizer_parameters['adam_beta_2'])
    mlflow.log_param("adam_amsgrad",optimizer_parameters['adam_amsgrad'])
    
    #fit_model params
    mlflow.log_param("steps_per_epoch",fit_parameters['steps_per_epoch'])
    mlflow.log_param("fit_epoch",fit_parameters['epoch'])
    mlflow.log_param("verbose_fit",fit_parameters['verbose_fit'])
    mlflow.log_param("batch_size_fit",fit_parameters['batch_size_fit']) #in general batch_size_fit = neurons batch size
    
    #logging the model metrics
    # The values come from evaluate() on the test arrays, although the metric names say "validation"
    mlflow.log_metric("model_validation_loss",model_evaluation[0]) #take only floats/integers
    mlflow.log_metric("model_validation_accuracy",model_evaluation[1])
    
    mlflow.keras.log_model(model_repository['glove_model'], "Glove-model")

In [None]:
# Use to yield probability distribution over the categories
"""
Why are all of my predictions 0? Because the 0.5 probability threshold is too high! I have to increase the callback patience and the number of fitting epochs.
"""
# Model outputs for the test inputs (the output layer uses the 'output_activation' setting, sigmoid)
y_test_pred_probs = model_repository['glove_model'].predict([X_test_seq_actors, X_test_seq_plot, X_test_seq_features, X_test_seq_reviews])
# Not the last expression of the cell, so this line displays nothing in the notebook
y_test_pred_probs[0]

# Threshold at 0.5: 1 where the output is above 0.5, else 0
y_test_predictions = (y_test_pred_probs>0.5).astype(int)
# y_predicted probabilities for each class
# y_predicted probabilities for each class

In [None]:
# Notebook display: thresholded prediction vector of the first test sample
y_test_predictions[0]

In [None]:
# Notebook display: true label vector of the first test sample, for comparison with the prediction
y_test[0]

In [None]:
# Store the thresholded predictions and the true test labels as .npy files
for _file_stem, _array in (("y_predictions_glove_embeddings_08032020", y_test_predictions),
                           ("y_true_glove_embeddings_08032020", y_test)):
    np.save(os.path.join(os.getcwd(), "model_two//" + _file_stem), _array)

In [None]:
# Evaluation metrics for the thresholded predictions.
# hamming_loss and zero_one_loss are imported here too, so this cell does not
# rely on an import made in some earlier cell of the notebook.
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    hamming_loss,
    zero_one_loss,
)


classification_table = classification_report(y_true=y_test,
                                             y_pred=y_test_predictions)
print("Classification report\n" + str(classification_table))

# Hamming loss: fraction of individual labels that are predicted wrongly.
print("Hamming loss: ", hamming_loss(y_test, y_test_predictions))

# Zero-one loss with normalize=False: the NUMBER of misclassified samples
# (a count, not a fraction).
print("Zero one loss: ", zero_one_loss(y_test, y_test_predictions, normalize=False))

In [None]:
# Confusion matrix over genres.
# Each label row is first collapsed to the index of its largest entry via
# argmax(axis=1). NOTE(review): for multi-hot rows this keeps only the first
# maximal column, so the matrix is a single-label approximation -- confirm this
# is the intended reading.
#
# `labels` pins the matrix to one row/column per entry of genres_list. Without
# it, sklearn builds the matrix only from the classes that actually occur, and
# the DataFrame constructor below fails with a shape mismatch whenever a genre
# index is absent from both argmax vectors.
# Assumes the label arrays have one column per genre in genres_list -- the
# original DataFrame construction already relied on this.
genre_label_ids = np.arange(len(genres_list))

conf_mat = confusion_matrix(y_test.argmax(axis=1),
                            y_test_predictions.argmax(axis=1),
                            labels=genre_label_ids)

conf_matrix = pd.DataFrame(conf_mat,
                           columns=genres_list,
                           index=genres_list)
conf_matrix.to_pickle(os.path.join(os.getcwd(), "model_two\\confusion_matrix_08032020.pkl"))
conf_matrix

In [None]:
# One-row summary table with the evaluation metrics of this model; it is
# pickled under model_two and then displayed.
test_loss, test_accuracy = model_evaluation[0], model_evaluation[1]

score_columns = {
    'Keras Model': pd.Series("GloVe pre-trained embeddings", dtype='str'),
    'Test Loss': pd.Series([test_loss], dtype='float'),
    'Test Accuracy': pd.Series([test_accuracy], dtype='float'),
    'Hamming Loss': pd.Series([hamming_loss(y_test, y_test_predictions)], dtype='float'),
    # normalize=False -> number of misclassified samples rather than a fraction
    'Zero_one Loss': pd.Series([zero_one_loss(y_test, y_test_predictions, normalize=False)], dtype='float'),
    'F1_score': pd.Series([f1_score(y_test, y_test_predictions, average="micro")], dtype='float'),
}
df_scores = pd.DataFrame(score_columns)

scores_pickle_path = os.path.join(os.getcwd(), "model_two\\df_metrics_glove_embeddings_08032020.pkl")
df_scores.to_pickle(scores_pickle_path)

df_scores

In [None]:
def predict_genre_tags(indx, model, genres_list, top_k=3, sequences=None):
    """Return the ``top_k`` most probable genre tags for one test sample.

    Parameters
    ----------
    indx : int
        Row position of the sample inside each of the test sequence arrays.
    model : object
        Fitted model exposing ``predict``. It is called with a list of four
        single-row inputs ordered (actors, plot, features, reviews).
    genres_list : sequence of str
        Genre names; entry ``i`` is matched with output ``i`` of the model.
    top_k : int, optional
        How many of the highest-probability genres to return. Defaults to 3,
        the value that was previously hard-coded.
    sequences : sequence of four array-likes, optional
        The (actors, plot, features, reviews) inputs to slice the sample from.
        When omitted, the module-level ``X_test_seq_actors``,
        ``X_test_seq_plot``, ``X_test_seq_features`` and ``X_test_seq_reviews``
        are used, which is the original behaviour.

    Returns
    -------
    list of str
        The selected genre names, in the order they appear in ``genres_list``
        (NOT sorted by probability).
    """
    if sequences is None:
        sequences = [X_test_seq_actors, X_test_seq_plot,
                     X_test_seq_features, X_test_seq_reviews]

    # Slicing with indx:indx+1 (instead of indexing) keeps a leading dimension
    # of size one, so the model still receives a batch.
    sample_inputs = [sequence[indx:indx + 1] for sequence in sequences]

    # predict returns one row per input sample; only one sample was supplied.
    genre_probabilities = model.predict(sample_inputs)[0]

    # Positions of the top_k largest probabilities; a set gives O(1) membership
    # tests in the comprehension below.
    top_indexes = set(np.argsort(genre_probabilities)[::-1][:top_k].tolist())

    return [tag for position, tag in enumerate(genres_list) if position in top_indexes]

In [None]:
# Load the pickled list of genre names from the working directory.
genres_pickle_path = os.path.join(os.getcwd(), "pickled_data_per_part\\genres_list_06032020.pkl")

with open(genres_pickle_path, 'rb') as genres_file:
    genres_list = pickle.load(genres_file)

In [None]:
# Sample indexes that were drawn at random once and are now hard-coded, so
# that the different models are compared on the same test rows.
# Both names refer to the same list object.
save_index_of_numbers = random_numbers = [2596, 9824, 839, 9664, 7137]

print("Randomly saved numbers to make predictions: {}".format(save_index_of_numbers))

In [None]:
# Reload the pickled test DataFrame (the cell that builds the prediction table
# reads its 'title' and 'reduced_genres' columns).
x_test_pickle_path = os.path.join(os.getcwd(), "80-20 split_non-balanced\\x_test_06032020.pkl")
X_test = pd.read_pickle(x_test_pickle_path)

In [None]:
# Comparison table for the fixed sample indexes: movie title, the genre tags
# predicted by the GloVe model, and the real genre tags.
#
# The rows are collected in a list and converted to a DataFrame in one step.
# The previous implementation seeded the frame with a throw-away first row
# (costing an extra model.predict call), grew it with DataFrame.append -- which
# was deprecated in pandas 1.4 and removed in pandas 2.0 -- and finally dropped
# the seed row again.
prediction_rows = [
    {
        'Movie Title': X_test['title'].iloc[sample_index],
        'Predicted Genre tags': predict_genre_tags(sample_index, model_repository['glove_model'], genres_list),
        'Real Genre tags': X_test['reduced_genres'].iloc[sample_index],
    }
    for sample_index in save_index_of_numbers
]

df_predictions = pd.DataFrame(
    prediction_rows,
    columns=['Movie Title', 'Predicted Genre tags', 'Real Genre tags'],
    # Index starts at 1 to reproduce the labels the old code ended up with
    # after dropping its seed row, so the pickled output keeps the same index.
    index=range(1, len(prediction_rows) + 1),
)
df_predictions.to_pickle("model_two\\model_two_df_predictions_08032020.pkl")
df_predictions

### END OF FILE