### Part 3.2 - Model 7: Universal-Sentence-Encoder (latest changes on 08.03.2020)

#### Import the libraries

In [None]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from tabulate import tabulate
import re
import os

import random

# Module to serialize the content produced from the execution of the code

import pickle

# Module to monitor the progress of a python for loop

from tqdm import tqdm_notebook

# Module to manipulate text in python - NLTK package

import nltk
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

# Module to compute word vectorizers and compute the cosine distance

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_distances

import string
import itertools

# from IPython.core.display import display, HTML
# display(HTML("<style>.container { width:70% !important; }</style>"))

#### Keras Text Classification (For creating the word embeddings)

In [None]:
from nltk.stem import WordNetLemmatizer

from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import confusion_matrix, classification_report, hamming_loss, zero_one_loss, f1_score

from time import time

#--------------------------------------------------------------

import tensorflow as tf
from tensorflow import keras

from tensorflow.keras import layers
from tensorflow.keras import models

from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

#---------------------------------------------------------------

%matplotlib inline
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
from pylab import rcParams

import pydot
import pydotplus
import graphviz

from IPython.display import SVG
from tensorflow.keras.utils import model_to_dot

from tensorflow.keras.models import load_model
from tensorflow.keras.models import model_from_json
import json

# Import ML FLow
import mlflow.tensorflow
import mlflow.pyfunc
from tensorflow.keras import regularizers
import datetime

# Import TensorBoard
import tensorflow_docs as tfdocs
import tensorflow_docs.plots as tfplots
import tensorflow_docs.modeling as tfmodel
from tensorflow.keras import regularizers
# from tensorboard import default
# from tensorboard import program

import tensorflow_hub as hub
import bert
from bert import tokenization
from bert.tokenization import FullTokenizer

#Visualize Model

def visualize_model(model):
    """Render a Keras model's layer graph as an SVG for inline display.

    The model is converted to a dot graph (layer shapes and names shown,
    65 dpi), laid out with the ``dot`` program and wrapped in an IPython
    ``SVG`` object.
    """
    layer_graph = model_to_dot(model, show_shapes=True, show_layer_names=True, dpi=65)
    svg_markup = layer_graph.create(prog='dot', format='svg')
    return SVG(svg_markup)

from tensorflow.keras.utils import plot_model

from packaging import version

# Environment report: stop early on TensorFlow 1.x, then show the runtime setup.
print("TensorFlow version: ", tf.__version__)
assert version.parse(tf.__version__).release[0] >= 2, (
    "This notebook requires TensorFlow 2.0 or above."
)

print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
print("GPU is", "NOT AVAILABLE" if not tf.config.list_physical_devices('GPU') else "available")

#### Import the data already tokenized and transformed from Part 3.1

* 80-20 split - Non-balanced data
* In contrast to the input used to train the multi-input Keras model, for tensorflow-hub we encode the data only in binary (bytes) format. Thus, we import the data already split into train and test sets, not in a tokenized format but as plain text.

In [None]:
# Import the X_train, X_test, y_train & y_test data pickled from dataset part 3.1.
# The relative paths use Windows separators and are resolved against the
# current working directory.
X_train, X_test, y_train, y_test = (
    pd.read_pickle(os.path.join(os.getcwd(), relative_path))
    for relative_path in (
        "pickled_data_per_part\\X_train_all_inputs_06032020.pkl",
        "pickled_data_per_part\\X_test_all_inputs_06032020.pkl",
        "pickled_data_per_part\\y_train_all_inputs_06032020.pkl",
        "pickled_data_per_part\\y_test_all_inputs_06032020.pkl",
    )
)

# Features and labels must describe the same number of rows.
assert y_train.shape[0] == X_train.shape[0]
assert y_test.shape[0] == X_test.shape[0]

#### Multiple Input

In [None]:
def _encode_texts(texts):
    """Encode every string in *texts* to bytes and return them as a NumPy array."""
    return np.asarray([str.encode(text) for text in texts])


def _encode_actor_lists(texts):
    """Split each comma-separated actor string and encode every name to bytes.

    Returns a 1-D object array holding one list of bytes per row. The array
    is filled element by element because the lists have different lengths:
    ``np.asarray`` on such ragged input was deprecated and raises a
    ``ValueError`` on recent NumPy releases.
    """
    encoded = [[str.encode(name) for name in text.split(',')] for text in texts]
    actors = np.empty(len(encoded), dtype=object)
    for position, names in enumerate(encoded):
        actors[position] = names
    return actors


# Raw text columns, one Python list per model input.
train_text_features = X_train['clean_combined_features'].tolist() #input 1
test_text_features = X_test['clean_combined_features'].tolist()

train_text_plot = X_train['clean_plot_summary'].tolist() #input 2
test_text_plot = X_test['clean_plot_summary'].tolist()

train_text_actors = X_train['clean_actors'].tolist() #input 3
test_text_actors = X_test['clean_actors'].tolist()

train_text_reviews = X_train['clean_reviews'].tolist() #input 4
test_text_reviews = X_test['clean_reviews'].tolist()

train_label = y_train.values
test_label = y_test.values

# Byte-encoded versions of every input (the hub layer is declared with dtype=tf.string).
train_bytes_list_features = _encode_texts(train_text_features)
train_bytes_list_plot = _encode_texts(train_text_plot)
train_bytes_list_actors = _encode_actor_lists(train_text_actors)
train_bytes_list_reviews = _encode_texts(train_text_reviews)

test_bytes_list_features = _encode_texts(test_text_features)
test_bytes_list_plot = _encode_texts(test_text_plot)
test_bytes_list_actors = _encode_actor_lists(test_text_actors)
test_bytes_list_reviews = _encode_texts(test_text_reviews)

# A single call splits all inputs and the labels with the same shuffled
# indices, so the train/validation rows are aligned by construction instead
# of relying on four separate calls sharing the same random_state.
(partial_x_train_features, x_val_features,
 partial_x_train_plot, x_val_plot,
 partial_x_train_actors, x_val_actors,
 partial_x_train_reviews, x_val_reviews,
 partial_y_train, y_val) = train_test_split(train_bytes_list_features,
                                            train_bytes_list_plot,
                                            train_bytes_list_actors,
                                            train_bytes_list_reviews,
                                            train_label,
                                            test_size=0.20,
                                            random_state=42)

In [None]:
# Show the first validation sample of every input as a quick sanity check.
for _first_sample in (x_val_features[0], x_val_plot[0], x_val_actors[0], x_val_reviews[0]):
    print(_first_sample, "\n")

#### Token based text embedding trained on English Google News 130GB corpus. (without OOV tokens)

#### Python Cell no.1
--------------------------

In the Python cell below I create the functions that save my model and stop the training early when the results do not improve any further. More specifically, the <b>callback function</b> is very useful when someone wants to test the <i>overfitting boundaries</i> of a neural network. Each time the fitting of the model achieves a better value of the monitored metric (e.g. val_loss), the model is automatically saved. Conversely, if the monitored metric does not improve for a set number of consecutive epochs (the patience), the training of the model stops automatically.

In [None]:
%load_ext tensorboard
# %reload_text tensorboard

# TensorBoard log directory: one sub-folder per run, named after the launch time.
logdir = "".join((".\\logs\\fit\\", datetime.datetime.now().strftime("%Y%m%d-%H%M%S")))

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

# Callback function with early stopping to avoid overfitting

class Callback_Configurations:
    """Constants shared by the early-stopping and checkpoint callbacks."""

    # Quantity watched by both EarlyStopping and ModelCheckpoint.
    MONITOR_METRIC = 'val_loss'
    # Direction in which the monitored quantity counts as an improvement.
    MODE = 'min'

    # Smallest change of the monitored quantity that EarlyStopping accepts
    # as an improvement.
    # NOTE(review): 1 looks very large for a val_loss value; with it almost
    # no epoch would qualify as an improvement - confirm this is intended.
    MINIMUM_DELTA = 1
    # Number of epochs without improvement tolerated before stopping.
    PATIENCE = 9

    # Verbosity level handed to the callbacks.
    VERBOSE = 0
    
def callback(saved_model, model, logdir):
    """Build the Keras callbacks for a training run, warm-starting if possible.

    If a weights file from a previous run exists under ``model_seven`` it is
    loaded into ``model`` in place, so training resumes from those weights;
    delete the file to force a fresh start.

    The previous implementation never loaded anything: it formatted the JSON
    path with the ``save_model`` function instead of ``saved_model``, called
    ``.format`` on the result of ``load_weights`` and hid both errors behind a
    bare ``except``. It also rebuilt the model from JSON into a local variable
    that was discarded, so that step is dropped.

    Args:
        saved_model: Base file name (without extension) of the weights file.
        model: The built Keras model that is about to be fitted.
        logdir: Directory handed to the TensorBoard callback.

    Returns:
        A list with the EpochDots, EarlyStopping, ModelCheckpoint and
        TensorBoard callbacks.
    """
    weights_fname = os.path.join(os.getcwd(), 'model_seven\\{}.h5'.format(saved_model))

    if os.path.isfile(weights_fname):
        try:
            model.load_weights(weights_fname)
            print('\nPre-trained weights loaded from {}'.format(weights_fname))
        except (OSError, ValueError) as load_error:
            # e.g. unreadable file or weights saved for another architecture
            print('\nPre-trained weights could not be loaded ({}). Fitting from start'.format(load_error))
    else:
        print('\nPre-trained weights not found. Fitting from start')

    monitor_metric = Callback_Configurations.MONITOR_METRIC

    callbacks = [
        # compact per-epoch progress output
        tfmodel.EpochDots(),

        # stop once the monitored metric stops improving, keeping the best weights
        EarlyStopping(monitor=monitor_metric,
                      min_delta=Callback_Configurations.MINIMUM_DELTA,
                      patience=Callback_Configurations.PATIENCE,
                      verbose=Callback_Configurations.VERBOSE,
                      mode=Callback_Configurations.MODE,
                      restore_best_weights=True),

        # write the weights to disk whenever the monitored metric improves
        ModelCheckpoint(filepath=weights_fname,
                        monitor=monitor_metric,
                        verbose=Callback_Configurations.VERBOSE,
                        save_best_only=True,
                        save_weights_only=True),

        tf.keras.callbacks.TensorBoard(logdir),
    ]
    return callbacks

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

def save_model(model, model_name):
    """Persist a model's architecture (JSON) and weights (HDF5).

    Both files are written under ``model_seven`` relative to the current
    working directory, named ``<model_name>.json`` and ``<model_name>.h5``.
    """
    base_dir = os.getcwd()
    architecture_path = os.path.join(base_dir, "model_seven\\{}.json".format(model_name))
    weights_path = os.path.join(base_dir, "model_seven\\{}.h5".format(model_name))

    # to_json() already returns a JSON string; it is dumped as-is, so the
    # file holds a JSON-encoded string and json.load() gives that string back.
    architecture = model.to_json()
    with open(architecture_path, "w") as json_file:
        json.dump(architecture, json_file)

    model.save_weights(weights_path)

    print("\nModel's weights are saved")
    
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

# function to plot the model metrics (deprecated)

def plot_model_metrics(fit_model):
    """Plot training vs. validation accuracy, then training vs. validation loss.

    ``fit_model`` is the object returned by ``model.fit()``; its ``history``
    dict must contain 'accuracy', 'val_accuracy', 'loss' and 'val_loss'.
    Training curves are green, validation curves are blue.
    """
    panels = (
        ('accuracy', 'val_accuracy', 'model accuracy'),
        ('loss', 'val_loss', 'model loss'),
    )

    for train_key, val_key, title in panels:
        rcParams['figure.figsize'] = 12, 6

        plt.plot(fit_model.history[train_key], 'g')
        plt.plot(fit_model.history[val_key], 'b')
        plt.title(title)
        plt.ylabel(train_key)
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
        plt.grid(True)
        plt.show()

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

# alternative function to plot the model metrics (used)

def plot_keras_history(history): #where history =  model.fit()
    """Plot every training metric (and its validation twin) per epoch.

    One figure is drawn per metric found in ``history.history`` whose key
    does not start with 'val_'. The legend shows the last recorded value of
    each curve. The figure that is current after the loop is also written
    to ``model_seven\\universal_sentence_encoder.png``.
    """
    recorded = history.history

    # keys without the 'val_' prefix are the training-side metrics
    training_metrics = [name for name in recorded.keys() if not name.startswith('val_')]

    for figure_number, metric in enumerate(training_metrics):

        train_curve = recorded.get(metric, [])
        val_curve = recorded.get("val_{}".format(metric), [])

        # x axis: epochs numbered from 1
        epochs = range(1, len(train_curve) + 1)

        # the leading spaces line the label up with the validation label
        training_text = "   Training {}: {:.5f}".format(metric, train_curve[-1])

        plt.figure(figure_number, figsize=(12, 6))
        plt.plot(epochs, train_curve, 'b', label=training_text)

        # the validation curve is drawn only when one was recorded
        if val_curve:
            validation_text = "Validation {}: {:.5f}".format(metric, val_curve[-1])
            plt.plot(epochs, val_curve, 'g', label=validation_text)

        plt.title('Model Metric: {}'.format(metric))
        plt.xlabel('Epochs')
        plt.ylabel(metric.title())
        plt.legend()

    # grab the current figure before showing so it can still be saved afterwards
    fig1 = plt.gcf()
    #plt.savefig('multi-input-keras.png')
    plt.show()
    plt.draw()
    fig1.savefig(os.path.join(os.getcwd(), 'model_seven\\universal_sentence_encoder.png'), dpi=100)

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

# Create function that will get as input a dataframe with the metrics (validation, accuracy) and create a plot per epoch
# Proposed modules: seaborn, plotly, matplolib.

#### Python Cell no.2
------------------------------

In this below python cell I keep track of the model parameters used to:

* create the neural network model,
* to fit the neural network,
* to optimize the neural network.

By storing the values of the parameters in dictionaries, I can dynamically change the value of a parameter, rerun the neural model and then monitor the difference in the results.

In [None]:
# Neural Network Logging parameters
# Three dictionaries keep every tunable value in one place so it can be
# changed between runs and logged afterwards.

# Create the neural network
neural_network_parameters = {
    'embedding_dimension': 50,
    'pool_size': None,
    'padding': 'valid',
    'batch_size': 64,
    'l2_regularization': 0.01,
    'dropout_rate': 0.0,
    'dense_activation': 'relu',
    'output_activation': 'sigmoid',
    'model_loss': "binary_crossentropy", #'sparse_categorical_crossentropy'
    'model_metric': "accuracy", #'sparse_categorical_accuracy'
}
#--------------------------------------------------------------------------------------

# Filled in by the optimizer sections further down.
optimizer_parameters = {}

# Fit the neural network
fit_parameters = {
    "steps_per_epoch": len(partial_x_train_features) // neural_network_parameters['batch_size'],
    "epoch": 150,
    "verbose_fit": 0,
    "batch_size_fit": 64,
}

#---------------------------------------------------------------------------------------

# Optimize the neural network

# Optimizer: ADAM (version_1)
optimizer_parameters.update({
    'adam_learning_rate': 0.001,
    'adam_beta_1': 0.99,
    'adam_beta_2': 0.999,
    'adam_amsgrad': False,
})

def optimizer_adam_v1():
    """Return an Adam optimizer configured from the 'adam_*' entries of ``optimizer_parameters``."""
    adam_settings = {
        'learning_rate': optimizer_parameters['adam_learning_rate'],
        'beta_1': optimizer_parameters['adam_beta_1'],
        'beta_2': optimizer_parameters['adam_beta_2'],
        'amsgrad': optimizer_parameters['adam_amsgrad'],
    }
    return keras.optimizers.Adam(**adam_settings)
#---------------------------------------------------------------------------------------

# Optimizer: ADAM (version_2)
# Adam driven by an inverse-time-decay learning-rate schedule.
optimizer_parameters['steps_per_epoch'] = len(partial_x_train_features) // neural_network_parameters['batch_size']
optimizer_parameters.update({
    'lr_schedule_learning_rate': 0.01,
    'lr_schedule_decay_steps': optimizer_parameters['steps_per_epoch'] * 1000,
    'lr_schedule_decay_rate': 1,
    'staircase': False,
})

#STEPS_PER_EPOCH = len(X_train_seq_features)//neural_network_parameters['batch_size'] #(512 = BATCH SIZE)
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
    optimizer_parameters['lr_schedule_learning_rate'],
    staircase=optimizer_parameters['staircase'],
    decay_rate=optimizer_parameters['lr_schedule_decay_rate'],
    decay_steps=optimizer_parameters['lr_schedule_decay_steps'])

def optimizer_adam_v2():
    """Return an Adam optimizer whose learning rate follows ``lr_schedule``."""
    return keras.optimizers.Adam(learning_rate=lr_schedule)
#---------------------------------------------------------------------------------------

# Optimizer: SGD (version 1)

optimizer_parameters['SGD_learning_rate'] = 0.01
optimizer_parameters['SGD_decay'] = 1e-6
optimizer_parameters['SGD_momentum'] = 0.9
optimizer_parameters['SGD_nesterov'] = True

def optimizer_SDG_v1():
    """Return an SGD optimizer configured from the 'SGD_*' entries of ``optimizer_parameters``.

    The function name keeps its historical spelling ("SDG") so existing
    callers continue to work.
    """
    # `learning_rate` replaces the deprecated `lr` alias used previously.
    # NOTE(review): `decay` is kept unchanged; confirm the installed Keras
    # optimizer still accepts it.
    return keras.optimizers.SGD(learning_rate=optimizer_parameters['SGD_learning_rate'],
                                decay=optimizer_parameters['SGD_decay'],
                                momentum=optimizer_parameters['SGD_momentum'],
                                nesterov=optimizer_parameters['SGD_nesterov'])

#---------------------------------------------------------------------------------------

#### Python Cell no.3
------------------------------

In the Python cell below, I run MLflow to train, fit, save and log the parameters and weights of the neural network.
The code below is split into different parts, each indicated by a short subtitle (<i>e.g. import the pre-trained model</i>, <i>create the model structure</i>, <i>fit the model, etc.</i>).

In [None]:
%%time
# One MLflow run: build the Universal-Sentence-Encoder classifier, fit it,
# evaluate it on the test set, save it and log parameters and metrics.
mlflow.set_experiment("/universal_sentence_encoder")
with mlflow.start_run():
    
    # import the pre-trained model
    # (the URL has its own name so that `model` only ever refers to the Keras model)
    use_module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
    hub_layer = hub.KerasLayer(use_module_url, output_shape=[512], input_shape=[], dtype=tf.string, trainable=True)
    
    # create the model structure
    model = tf.keras.Sequential(name="universal_sentence_encoder")
    model.add(hub_layer)
    model.add(tf.keras.layers.Flatten())
    # NOTE(review): the number of units is read from the 'batch_size' entry (64);
    # confirm this is intended and not a separate layer-width setting.
    model.add(tf.keras.layers.Dense(neural_network_parameters['batch_size'],
                                    kernel_regularizer=regularizers.l2(neural_network_parameters['l2_regularization']),
                                    activation=neural_network_parameters['dense_activation']))
    model.add(tf.keras.layers.Dropout(neural_network_parameters['dropout_rate']))
    model.add(tf.keras.layers.Dense(y_val.shape[1], activation=neural_network_parameters['output_activation']))

    # summary() prints the table itself and returns None, so it is not wrapped in print()
    model.summary()

    optimizer = optimizer_adam_v2()

    model.compile(optimizer=optimizer,
                  loss=neural_network_parameters['model_loss'],
                  metrics=[neural_network_parameters['model_metric']])
    
    plot_model(model, to_file=os.path.join(os.getcwd(), 'model_seven\\structure_universal_sentence_encoder.png'))
    
    logdir = ".\\logs_test\\fit\\" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
    
    # fit the model
    # NOTE(review): the Sequential model above has a single string input, yet a
    # list of four arrays is passed to fit() and evaluate() - confirm this works
    # on the target TensorFlow version, or feed one text input / build a
    # functional multi-input model instead.
    history = model.fit([partial_x_train_features, partial_x_train_plot, partial_x_train_actors, partial_x_train_reviews],
                        partial_y_train,
                        epochs=fit_parameters["epoch"],
                        batch_size=fit_parameters["batch_size_fit"],
                        validation_data=([x_val_features, x_val_plot, x_val_actors, x_val_reviews], y_val),
                        verbose=fit_parameters["verbose_fit"],
                        callbacks=callback("universal_sentence_encoder", model, logdir))
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
    
    # plot the model's progress per epoch
    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch
    hist['epoch']+= 1
    hist.index += 1
    print("\nTable of training the keras text classification model\n")
    print(tabulate(hist, headers='keys', tablefmt='psql'))
    
    hist.to_pickle(os.path.join(os.getcwd(), "model_seven\\metrics_histogram_universal_sentence_encoder_08032020.pkl"))

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
    
    # evaluate the model
    model_evaluation = model.evaluate([test_bytes_list_features, test_bytes_list_plot, test_bytes_list_actors, test_bytes_list_reviews], 
                                      test_label,
                                      batch_size=fit_parameters["batch_size_fit"],
                                      verbose=2)
    
    print('\nTest Score:', model_evaluation[0])

    print('\nTest Accuracy:', model_evaluation[1])

    print(model_evaluation)
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
    
    # save the model
    save_model(model, "universal_sentence_encoder")
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

    #plot_model_metrics(history)
    plot_keras_history(history)
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
    
    # Log model parameters and metrics
    #neural_model params
    mlflow.log_param("batch_size", neural_network_parameters['batch_size'])
    mlflow.log_param("l2_regularization", neural_network_parameters['l2_regularization'])
    mlflow.log_param("dropout_rate", neural_network_parameters['dropout_rate'])
    mlflow.log_param("dense_activation", neural_network_parameters['dense_activation'])
    mlflow.log_param("output_activation",neural_network_parameters['output_activation'])
    mlflow.log_param("model_loss",neural_network_parameters['model_loss']) #takes any data type
    mlflow.log_param("model_metric",neural_network_parameters['model_metric'])
    
    #optimizer params
    mlflow.log_param("lr_schedule_learning_rate",optimizer_parameters['lr_schedule_learning_rate'])
    mlflow.log_param("lr_schedule_decay_steps",optimizer_parameters['lr_schedule_decay_steps'])
    mlflow.log_param("lr_schedule_decay_rate",optimizer_parameters['lr_schedule_decay_rate'])
    # the staircase flag was previously logged under the unrelated name "adam_amsgrad"
    mlflow.log_param("lr_schedule_staircase",optimizer_parameters['staircase'])
    
    #fit_model params
    #mlflow.log_param("steps_per_epoch",fit_parameters['steps_per_epoch'])
    mlflow.log_param("fit_epoch",fit_parameters['epoch'])
    mlflow.log_param("verbose_fit",fit_parameters['verbose_fit'])
    mlflow.log_param("batch_size_fit",fit_parameters['batch_size_fit']) #in general batch_size_fit = neurons batch size
    
    #logging the model metrics
    # NOTE(review): these values come from model.evaluate() on the TEST set even
    # though the metric names say "validation"; the names are kept so runs stay
    # comparable with earlier ones.
    mlflow.log_metric("model_validation_loss",model_evaluation[0]) #take only floats/integers
    mlflow.log_metric("model_validation_accuracy",model_evaluation[1])
    
    mlflow.keras.log_model(model, "universal_sentence_encoder")

#     mlflow.tensorflow.save_model(model, model_dir_path)

In [None]:
# Per-genre probabilities predicted for every test sample
y_test_pred_probs = model.predict([test_bytes_list_features,
                                   test_bytes_list_plot,
                                   test_bytes_list_actors,
                                   test_bytes_list_reviews])

# A genre counts as predicted when its probability exceeds 0.5
y_test_predictions = np.greater(y_test_pred_probs, 0.5).astype(int)

# From here on y_test is the plain NumPy label matrix
y_test = test_label

In [None]:
# Store the thresholded predictions and the true labels side by side on disk
# (np.save appends the .npy extension).
np.save(file=os.path.join(os.getcwd(), "model_seven//y_predictions_universal_sentence_encoder_08032020"),
        arr=y_test_predictions)
np.save(file=os.path.join(os.getcwd(), "model_seven//y_true_universal_sentence_encoder_08032020"),
        arr=y_test)

In [None]:
# Per-genre precision / recall / F1 on the test set
classification_table = classification_report(y_test, y_test_predictions)
print("Classification report\n" + str(classification_table))

# Hamming Loss
print("Hamming loss: ", hamming_loss(y_true=y_test, y_pred=y_test_predictions))

# Zero_one loss (normalize=False reports a count instead of a fraction)
print("Zero one loss: ", zero_one_loss(y_true=y_test, y_pred=y_test_predictions, normalize=False))

In [None]:
# Load the list of genre names pickled earlier.
# NOTE: pickle must only be used on trusted, locally produced files.
with open(os.path.join(os.getcwd(), "pickled_data_per_part\\genres_list_06032020.pkl"), mode='rb') as handle:
    genres_list = pickle.load(handle)

In [None]:
# Confusion matrix over the single most likely genre per movie: each
# multi-label row is reduced to the column index of its largest value.
# `labels` pins the matrix to one row/column per genre; without it the
# matrix is smaller whenever some genre never appears as an argmax, and the
# DataFrame below could not be labelled with genres_list.
conf_mat = confusion_matrix(y_test.argmax(axis=1),
                            y_test_predictions.argmax(axis=1),
                            labels=list(range(len(genres_list))))

conf_matrix = pd.DataFrame(conf_mat,
                           columns=genres_list,
                           index=genres_list)
conf_matrix.to_pickle(os.path.join(os.getcwd(), "model_seven\\confusion_matrix_08032020.pkl"))
conf_matrix

#### Python Cell no.4
------------------------------

Store to dataframe the training and validation loss of the neural model. The result of the cell below is a dataframe which is then pickled locally. Having stored the dataframe locally I can then import all the dataframes related to the neural models and compare them to each other.

In [None]:
# One-row summary of this model's test metrics, pickled so it can be
# compared with the summaries of the other models.
# Each metric is computed once; the model label typo ("Sentece") is fixed.
test_hamming_loss = hamming_loss(y_test, y_test_predictions)
test_zero_one_loss = zero_one_loss(y_test, y_test_predictions, normalize=False)
test_f1_micro = f1_score(y_test, y_test_predictions, average="micro")

df_scores = pd.DataFrame({'Keras Model':pd.Series(["Universal Sentence Encoder"], dtype='str'),
                         'Test Loss':pd.Series([model_evaluation[0]], dtype='float'),
                         'Test Accuracy':pd.Series([model_evaluation[1]], dtype='float'),
                         'Hamming Loss':pd.Series([test_hamming_loss], dtype='float'),
                         'Zero_one Loss':pd.Series([test_zero_one_loss], dtype='float'),
                         'F1_score':pd.Series([test_f1_micro], dtype='float')})

df_scores.to_pickle(os.path.join(os.getcwd(), "model_seven\\df_metrics_universal_sentence_encoder_08032020.pkl"))

df_scores

#### Python Cell no.5
------------------------------

Predict the genre tags on data the model has never seen before.

In [None]:
def predict_genre_tags(indx, model, genres_list):
    """Return the three most probable genre tags for one test-set movie.

    The inputs for row ``indx`` are sliced from the module-level
    ``test_bytes_list_*`` arrays and handed to ``model.predict`` in the order
    features, plot, actors, reviews - the same order used for fitting and
    evaluating the model (previously the actors were passed first).

    Background on the actors input: ``model.predict()`` failed with
    "ValueError: Failed to convert a NumPy array to a Tensor (Unsupported
    object type list)" because the actors are stored as an array of lists.
    The slice is therefore converted to a flat array first. The same
    transformation is planned for models 4, 5, 6 and 7.

    Args:
        indx: Row position of the movie inside the test arrays.
        model: Object exposing ``predict``.
        genres_list: Genre names, ordered like the model's output columns.

    Returns:
        The names of the three highest-scoring genres, listed in
        ``genres_list`` order (not ranked by probability).
    """
    test_sequence_actors = test_bytes_list_actors[indx:indx+1]
    # array of lists -> flat array of byte strings (see docstring)
    test_sequence_actors_array = np.array(list(test_sequence_actors)).reshape(-1,)

    test_sequence_features = test_bytes_list_features[indx:indx+1]
    test_sequence_plot = test_bytes_list_plot[indx:indx+1]
    test_sequence_reviews = test_bytes_list_reviews[indx:indx+1]

    text_prediction = model.predict([test_sequence_features,
                                     test_sequence_plot,
                                     test_sequence_actors_array,
                                     test_sequence_reviews])

    # positions of the three largest probabilities in the first output row
    top_indexes = set(np.argsort(text_prediction[0])[::-1][:3].tolist())

    return [tag for position, tag in enumerate(genres_list) if position in top_indexes]

In [None]:
# Fixed test-set positions, generated randomly once and then frozen so that
# every model is compared on the same movies. Both names refer to one list.
save_index_of_numbers = random_numbers = [2596, 9824, 839, 9664, 7137]

print("Randomly saved numbers to make predictions: {}".format(save_index_of_numbers))

In [None]:
# Title, predicted tags and real tags for every selected test movie.
# The rows are collected first and the frame is built in one go:
# DataFrame.append was removed in pandas 2.0, and the former
# "seed row, append everything, drop the seed" pattern also ran one
# redundant prediction.
prediction_rows = [
    {'Movie Title': X_test['title'].iloc[movie_index],
     'Predicted Genre tags': predict_genre_tags(movie_index, model, genres_list),
     'Real Genre tags': X_test['reduced_genres'].iloc[movie_index]}
    for movie_index in save_index_of_numbers
]

df_predictions = pd.DataFrame(prediction_rows,
                              columns=['Movie Title', 'Predicted Genre tags', 'Real Genre tags'])
# keep the 1-based row labels the earlier implementation produced
df_predictions.index = range(1, len(df_predictions) + 1)

df_predictions.to_pickle("model_seven\\model_seven_df_predictions_08032020.pkl")
df_predictions

### END OF FILE