### Part 3.2.1 - Compare the different models trained on part 3.2 for the model 1 (latest development on 29.03.2020)

The purpose of the notebook is to select the best model among the 16 models trained with the multi-input keras approach.
The final model selected will be then compared to the rest of the models trained.

To select the best model we used the following guidelines:

* 1) The model with the lowest hamming loss & zero one loss
* 2) The model with the lowest test score and the highest test accuracy values
* 3) The model with the most accurate predictions among the 17 labels. It is of high importance the best model to identify correctly the most of the genre tags. Models that cannot identify more than 2 genre tags will not be prefered.
* 4) Compare model predictions on movie never seen before.
* 5) Training-Validation metrics comparison.

#### Import standard libraries

In [264]:
import collections

try:
    collectionsAbc = collections.abc
except AttributeError:
    collectionsAbc = collections

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from tabulate import tabulate
import re
import os

import random

# Module to serialize the content produced from the execution of the code

import pickle

# Module to monitor the progress of a python for loop

from tqdm import tqdm_notebook

# Module to manipulate text in python - NLTK package

import nltk
from nltk import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords

# Module to compute word vectorizers and compute the cosine distance

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_distances
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import confusion_matrix, classification_report, hamming_loss, zero_one_loss, f1_score, roc_auc_score

import string
import itertools

from time import time

import plotly.express as px
import plotly.graph_objects as go

# from IPython.core.display import display, HTML
# display(HTML("<style>.container { width:70% !important; }</style>"))

#### Tensorflow - Keras, Mlflow libraries

In [99]:
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

import tensorflow as tf
from tensorflow import keras

import keras.backend as K

from tensorflow.keras import layers
from tensorflow.keras import models

from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

import tensorflow_addons as tfa

#---------------------------------------------------------------

%matplotlib inline
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
from pylab import rcParams

import pydot
import pydotplus
import graphviz

from IPython.display import SVG
from tensorflow.keras.utils import model_to_dot

from tensorflow.keras.models import load_model
from tensorflow.keras.models import model_from_json
import json

# Import ML FLow
import mlflow.tensorflow
import mlflow.pyfunc
from tensorflow.keras import regularizers
import datetime

# Import TensorBoard
import tensorflow_docs as tfdocs
import tensorflow_docs.plots as tfplots
import tensorflow_docs.modeling as tfmodel
from tensorflow.keras import regularizers
# from tensorboard import default
# from tensorboard import program

#Visualize Model

def visualize_model(model):
    return SVG(model_to_dot(model, show_shapes= True, show_layer_names=True, dpi=65).create(prog='dot', format='svg'))

from tensorflow.keras.utils import plot_model

from packaging import version

print("TensorFlow version: ", tf.__version__)
assert version.parse(tf.__version__).release[0] >= 2, \
    "This notebook requires TensorFlow 2.0 or above."

print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
print("GPU is", "available" if tf.config.list_physical_devices('GPU') else "NOT AVAILABLE")

TensorFlow version:  2.1.0
Version:  2.1.0
Eager mode:  True
Hub version:  0.7.0
GPU is NOT AVAILABLE


#### Import the data already tokenized and transformed from Part 3.1

* 80-20 split - Non-balanced data

In [3]:
X_train_seq_actors=np.load(os.path.join(os.getcwd(), "80-20 split_non-balanced\\20000_max_features\\x_train_seq_actors_80-20_non-balanced_20000_25032020.npy"))
X_train_seq_plot=np.load(os.path.join(os.getcwd(), "80-20 split_non-balanced\\20000_max_features\\x_train_seq_plot_80-20_non-balanced_20000_25032020.npy"))
X_train_seq_features=np.load(os.path.join(os.getcwd(), "80-20 split_non-balanced\\20000_max_features\\x_train_seq_features_80-20_non-balanced_20000_25032020.npy"))
X_train_seq_reviews=np.load(os.path.join(os.getcwd(), "80-20 split_non-balanced\\20000_max_features\\x_train_seq_reviews_80-20_non-balanced_20000_25032020.npy"))

print("X_train data inputs have been loaded!\n")

X_test_seq_actors=np.load(os.path.join(os.getcwd(), "80-20 split_non-balanced\\20000_max_features\\x_test_seq_actors_80-20_non-balanced_20000_25032020.npy"))
X_test_seq_plot=np.load(os.path.join(os.getcwd(), "80-20 split_non-balanced\\20000_max_features\\x_test_seq_plot_80-20_non-balanced_20000_25032020.npy"))
X_test_seq_features=np.load(os.path.join(os.getcwd(), "80-20 split_non-balanced\\20000_max_features\\x_test_seq_features_80-20_non-balanced_20000_25032020.npy"))
X_test_seq_reviews=np.load(os.path.join(os.getcwd(), "80-20 split_non-balanced\\20000_max_features\\x_test_seq_reviews_80-20_non-balanced_20000_25032020.npy"))

print("X_test data inputs have been loaded!\n")

y_train=np.load(os.path.join(os.getcwd(), "80-20 split_non-balanced\\20000_max_features\\y_train_80-20_non-balanced_20000_25032020.npy"))
y_test=np.load(os.path.join(os.getcwd(), "80-20 split_non-balanced\\20000_max_features\\y_test_80-20_non-balanced_20000_25032020.npy"))

print("y_train & y_test have been loaded!\n")

X_train data inputs have been loaded!

X_test data inputs have been loaded!

y_train & y_test have been loaded!



In [4]:
X_train_seq_actors.shape

(39193, 17)

#### Import the saved tokenizers

In [5]:
"""
IMport the tokenizers of each input, fitted on part 3.1
"""
with open(os.path.join(os.getcwd(), '80-20 split_non-balanced\\20000_max_features\\actors_tokenizer_20000_25032020.pkl'),'rb') as f:
    actors_tokenizer = pickle.load(f)
    
with open(os.path.join(os.getcwd(), '80-20 split_non-balanced\\20000_max_features\\plot_tokenizer_20000_25032020.pkl'),'rb') as f:
    plot_tokenizer = pickle.load(f)
    
with open(os.path.join(os.getcwd(), '80-20 split_non-balanced\\20000_max_features\\features_tokenizer_20000_25032020.pkl'),'rb') as f:
    features_tokenizer = pickle.load(f)
    
with open(os.path.join(os.getcwd(), '80-20 split_non-balanced\\20000_max_features\\reviews_tokenizer_20000_25032020.pkl'),'rb') as f:
    reviews_tokenizer = pickle.load(f)
    
print("Tokenizers are loaded successfully!")

Tokenizers are loaded successfully!


In [None]:
# """
# KFold cross validation

# Maybe I could run a Kfold for with k=1 to reshuffle the data.
# """
# X_seq_actors=np.load("C:\\Users\\spano\\Desktop\\GitHub-Thesis\\models_text_classification\\80-20 split_non-balanced\\x_seq_actors_80-20_non-balanced_28022020.npy")
# X_seq_plot=np.load("C:\\Users\\spano\\Desktop\\GitHub-Thesis\\models_text_classification\\80-20 split_non-balanced\\x_seq_plot_80-20_non-balanced_28022020.npy")
# X_seq_features=np.load("C:\\Users\\spano\\Desktop\\GitHub-Thesis\\models_text_classification\\80-20 split_non-balanced\\x_seq_features_80-20_non-balanced_28022020.npy")
# X_seq_reviews=np.load("C:\\Users\\spano\\Desktop\\GitHub-Thesis\\models_text_classification\\80-20 split_non-balanced\\x_seq_reviews_80-20_non-balanced_28022020.npy")

# print("X featurs data inputs have been loaded!\n")

# y=np.load("C:\\Users\\spano\\Desktop\\GitHub-Thesis\\models_text_classification\\80-20 split_non-balanced\\y_80-20_non-balanced_28022020.npy")

# print("y variable has been loaded!\n")

# print("X_seq_actors shape:{}".format(X_seq_actors.shape))
# print("X_seq_plot shape:{}".format(X_seq_plot.shape))
# print("X_seq_features shape:{}".format(X_seq_features.shape))
# print("X_seq_reviews shape:{}\n".format(X_seq_reviews.shape))

# print("y shape:{}".format(y.shape))

#### Import the trained and saved models of Part 3.2 for model 1

Initialise some predefined values first:

* Adam optimizer
* Model loss
* Model metric

In [267]:
neural_network_parameters={}
optimizer_parameters={}
neural_network_parameters['model_loss'] = 'binary_crossentropy'
neural_network_parameters['model_metric'] = 'accuracy'
validation_split_ratio=0.8

# Function 1
def optimizer_adam_v2(batch_size_value):
    
    optimizer_parameters['steps_per_epoch'] = int(np.ceil((X_train_seq_features.shape[0]*validation_split_ratio)//batch_size_value))
    optimizer_parameters['lr_schedule_learning_rate'] = 0.01
    optimizer_parameters['lr_schedule_decay_steps'] = optimizer_parameters['steps_per_epoch']*1000
    optimizer_parameters['lr_schedule_decay_rate'] = 1
    optimizer_parameters['staircase'] = False
    
    lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
        optimizer_parameters['lr_schedule_learning_rate'],
        decay_steps=optimizer_parameters['lr_schedule_decay_steps'],
        decay_rate=optimizer_parameters['lr_schedule_decay_rate'],
        staircase=optimizer_parameters['staircase'])
    
    return keras.optimizers.Adam(lr_schedule)

#----------------------------------------------------------------------

# Function 2
def hamming_loss(y_true, y_pred, mode='multilabel'):
    if mode not in ['multiclass', 'multilabel']:
        raise TypeError('mode must be: [multiclass, multilabel])')

    if mode == 'multiclass':
        nonzero = tf.cast(tf.math.count_nonzero(y_true * y_pred, axis=-1), tf.float32)
        print(nonzero)
        return 1.0 - nonzero

    else:
        nonzero = tf.cast(tf.math.count_nonzero(y_true - y_pred, axis=-1), 
            tf.float32)
        return nonzero / y_true.shape[-1]


class HammingLoss(tfa.metrics.MeanMetricWrapper):
    def __init__(self, name='hamming_loss', dtype=None, mode='multilabel'):
        super(HammingLoss, self).__init__(
                hamming_loss, name, dtype=dtype, mode=mode)

#### Model 1 - 50 embedding dimenstion & 16 batch size

In [32]:
with open(os.path.join(os.getcwd(), 'model_one\\multi_input_keras_model_{0}dim_{1}batchsize.json'.format(str(50), str(16))),'r') as f:
    model_json = json.load(f)

model_one = model_from_json(model_json)

model_one.load_weights(os.path.join(os.getcwd(), 'model_one\\multi_input_keras_model_{0}dim_{1}batchsize.h5'.format(str(50), str(16))))

model_one.compile(optimizer=optimizer_adam_v2(16),
                  loss=neural_network_parameters['model_loss'],
                  metrics=[neural_network_parameters['model_metric']])

print(type(model_one))
print("\nModel is loaded successfully")

<class 'tensorflow.python.keras.engine.training.Model'>

Model is loaded successfully


#### Model 2 - 50 embedding dimenstion & 32 batch size

In [39]:
with open(os.path.join(os.getcwd(), 'model_one\\multi_input_keras_model_{0}dim_{1}batchsize.json'.format(str(50), str(32))),'r') as f:
    model_json = json.load(f)

model_two = model_from_json(model_json)

model_two.load_weights(os.path.join(os.getcwd(), 'model_one\\multi_input_keras_model_{0}dim_{1}batchsize.h5'.format(str(50), str(32))))

neural_network_parameters['batch_size'] = 32

model_two.compile(optimizer=optimizer_adam_v2(32),
                  loss=neural_network_parameters['model_loss'],
                  metrics=[neural_network_parameters['model_metric']])
print(type(model_two))
print("\nModel is loaded successfully")

<class 'tensorflow.python.keras.engine.training.Model'>

Model is loaded successfully


#### Model 3 - 50 embedding dimenstion & 64 batch size

In [40]:
with open(os.path.join(os.getcwd(), 'model_one\\multi_input_keras_model_{0}dim_{1}batchsize.json'.format(str(50), str(64))),'r') as f:
    model_json = json.load(f)

model_three = model_from_json(model_json)

model_three.load_weights(os.path.join(os.getcwd(), 'model_one\\multi_input_keras_model_{0}dim_{1}batchsize.h5'.format(str(50), str(64))))

model_three.compile(optimizer=optimizer_adam_v2(64),
                    loss=neural_network_parameters['model_loss'],
                    metrics=[neural_network_parameters['model_metric']])

print(type(model_three))
print("\nModel is loaded successfully")

<class 'tensorflow.python.keras.engine.training.Model'>

Model is loaded successfully


#### Model 4 - 50 embedding dimenstion & 128 batch size

In [41]:
with open(os.path.join(os.getcwd(), 'model_one\\multi_input_keras_model_{0}dim_{1}batchsize.json'.format(str(50), str(128))),'r') as f:
    model_json = json.load(f)

model_four = model_from_json(model_json)

model_four.load_weights(os.path.join(os.getcwd(), 'model_one\\multi_input_keras_model_{0}dim_{1}batchsize.h5'.format(str(50), str(128))))

model_four.compile(optimizer=optimizer_adam_v2(128),
                   loss=neural_network_parameters['model_loss'],
                   metrics=[neural_network_parameters['model_metric']])

print(type(model_four))
print("\nModel is loaded successfully")

<class 'tensorflow.python.keras.engine.training.Model'>

Model is loaded successfully


#### Model 5 - 100 embedding dimenstion & 16 batch size

In [42]:
with open(os.path.join(os.getcwd(), 'model_one\\multi_input_keras_model_{0}dim_{1}batchsize.json'.format(str(100), str(16))),'r') as f:
    model_json = json.load(f)

model_five = model_from_json(model_json)

model_five.load_weights(os.path.join(os.getcwd(), 'model_one\\multi_input_keras_model_{0}dim_{1}batchsize.h5'.format(str(100), str(16))))

model_five.compile(optimizer=optimizer_adam_v2(16),
                   loss=neural_network_parameters['model_loss'],
                   metrics=[neural_network_parameters['model_metric']])

print(type(model_five))
print("\nModel is loaded successfully")

<class 'tensorflow.python.keras.engine.training.Model'>

Model is loaded successfully


#### Model 6 - 100 embedding dimenstion & 32 batch size

In [43]:
with open(os.path.join(os.getcwd(), 'model_one\\multi_input_keras_model_{0}dim_{1}batchsize.json'.format(str(100), str(32))),'r') as f:
    model_json = json.load(f)

model_six = model_from_json(model_json)

model_six.load_weights(os.path.join(os.getcwd(), 'model_one\\multi_input_keras_model_{0}dim_{1}batchsize.h5'.format(str(100), str(32))))

model_six.compile(optimizer=optimizer_adam_v2(32),
                  loss=neural_network_parameters['model_loss'],
                  metrics=[neural_network_parameters['model_metric']])

print(type(model_six))
print("\nModel is loaded successfully")

<class 'tensorflow.python.keras.engine.training.Model'>

Model is loaded successfully


#### Model 7 - 100 embedding dimenstion & 64 batch size

In [44]:
with open(os.path.join(os.getcwd(), 'model_one\\multi_input_keras_model_{0}dim_{1}batchsize.json'.format(str(100), str(64))),'r') as f:
    model_json = json.load(f)

model_seven = model_from_json(model_json)

model_seven.load_weights(os.path.join(os.getcwd(), 'model_one\\multi_input_keras_model_{0}dim_{1}batchsize.h5'.format(str(100), str(64))))

model_seven.compile(optimizer=optimizer_adam_v2(64),
                    loss=neural_network_parameters['model_loss'],
                    metrics=[neural_network_parameters['model_metric']])

print(type(model_seven))
print("\nModel is loaded successfully")

<class 'tensorflow.python.keras.engine.training.Model'>

Model is loaded successfully


#### Model 8 - 100 embedding dimenstion & 128 batch size

In [45]:
with open(os.path.join(os.getcwd(), 'model_one\\multi_input_keras_model_{0}dim_{1}batchsize.json'.format(str(100), str(128))),'r') as f:
    model_json = json.load(f)

model_eight = model_from_json(model_json)

model_eight.load_weights(os.path.join(os.getcwd(), 'model_one\\multi_input_keras_model_{0}dim_{1}batchsize.h5'.format(str(100), str(128))))

model_eight.compile(optimizer=optimizer_adam_v2(128),
                    loss=neural_network_parameters['model_loss'],
                    metrics=[neural_network_parameters['model_metric']])

print(type(model_eight))
print("\nModel is loaded successfully")

<class 'tensorflow.python.keras.engine.training.Model'>

Model is loaded successfully


#### Model 9 - 200 embedding dimenstion & 16 batch size

In [46]:
with open(os.path.join(os.getcwd(), 'model_one\\multi_input_keras_model_{0}dim_{1}batchsize.json'.format(str(200), str(16))),'r') as f:
    model_json = json.load(f)

model_nine = model_from_json(model_json)

model_nine.load_weights(os.path.join(os.getcwd(), 'model_one\\multi_input_keras_model_{0}dim_{1}batchsize.h5'.format(str(200), str(16))))

model_nine.compile(optimizer=optimizer_adam_v2(16),
                   loss=neural_network_parameters['model_loss'],
                   metrics=[neural_network_parameters['model_metric']])

print(type(model_nine))
print("\nModel is loaded successfully")

<class 'tensorflow.python.keras.engine.training.Model'>

Model is loaded successfully


#### Model 10 - 200 embedding dimenstion & 32 batch size

In [47]:
with open(os.path.join(os.getcwd(), 'model_one\\multi_input_keras_model_{0}dim_{1}batchsize.json'.format(str(200), str(32))),'r') as f:
    model_json = json.load(f)

model_ten = model_from_json(model_json)

model_ten.load_weights(os.path.join(os.getcwd(), 'model_one\\multi_input_keras_model_{0}dim_{1}batchsize.h5'.format(str(200), str(32))))

model_ten.compile(optimizer=optimizer_adam_v2(32),
                  loss=neural_network_parameters['model_loss'],
                  metrics=[neural_network_parameters['model_metric']])

print(type(model_ten))
print("\nModel is loaded successfully")

<class 'tensorflow.python.keras.engine.training.Model'>

Model is loaded successfully


#### Model 11 - 200 embedding dimenstion & 64 batch size

In [48]:
with open(os.path.join(os.getcwd(), 'model_one\\multi_input_keras_model_{0}dim_{1}batchsize.json'.format(str(200), str(64))),'r') as f:
    model_json = json.load(f)

model_eleven = model_from_json(model_json)

model_eleven.load_weights(os.path.join(os.getcwd(), 'model_one\\multi_input_keras_model_{0}dim_{1}batchsize.h5'.format(str(200), str(64))))

model_eleven.compile(optimizer=optimizer_adam_v2(64),
                     loss=neural_network_parameters['model_loss'],
                     metrics=[neural_network_parameters['model_metric']])

print(type(model_eleven))
print("\nModel is loaded successfully")

<class 'tensorflow.python.keras.engine.training.Model'>

Model is loaded successfully


#### Model 12 - 200 embedding dimenstion & 128 batch size

In [49]:
with open(os.path.join(os.getcwd(), 'model_one\\multi_input_keras_model_{0}dim_{1}batchsize.json'.format(str(200), str(128))),'r') as f:
    model_json = json.load(f)

model_twelve = model_from_json(model_json)

model_twelve.load_weights(os.path.join(os.getcwd(), 'model_one\\multi_input_keras_model_{0}dim_{1}batchsize.h5'.format(str(200), str(128))))

model_twelve.compile(optimizer=optimizer_adam_v2(128),
                     loss=neural_network_parameters['model_loss'],
                     metrics=[neural_network_parameters['model_metric']])

print(type(model_twelve))
print("\nModel is loaded successfully")

<class 'tensorflow.python.keras.engine.training.Model'>

Model is loaded successfully


#### Model 13 - 300 embedding dimenstion & 16 batch size

In [50]:
with open(os.path.join(os.getcwd(), 'model_one\\multi_input_keras_model_{0}dim_{1}batchsize.json'.format(str(300), str(16))),'r') as f:
    model_json = json.load(f)

model_thirteen = model_from_json(model_json)

model_thirteen.load_weights(os.path.join(os.getcwd(), 'model_one\\multi_input_keras_model_{0}dim_{1}batchsize.h5'.format(str(300), str(16))))

model_thirteen.compile(optimizer=optimizer_adam_v2(16),
                       loss=neural_network_parameters['model_loss'],
                       metrics=[neural_network_parameters['model_metric']])

print(type(model_thirteen))
print("\nModel is loaded successfully")

<class 'tensorflow.python.keras.engine.training.Model'>

Model is loaded successfully


#### Model 14 - 300 embedding dimenstion & 32 batch size

In [51]:
with open(os.path.join(os.getcwd(), 'model_one\\multi_input_keras_model_{0}dim_{1}batchsize.json'.format(str(300), str(32))),'r') as f:
    model_json = json.load(f)

model_fourteen = model_from_json(model_json)

model_fourteen.load_weights(os.path.join(os.getcwd(), 'model_one\\multi_input_keras_model_{0}dim_{1}batchsize.h5'.format(str(300), str(32))))

model_fourteen.compile(optimizer=optimizer_adam_v2(32),
                       loss=neural_network_parameters['model_loss'],
                       metrics=[neural_network_parameters['model_metric']])

print(type(model_fourteen))
print("\nModel is loaded successfully")

<class 'tensorflow.python.keras.engine.training.Model'>

Model is loaded successfully


#### Model 15 - 300 embedding dimenstion & 64 batch size

In [52]:
with open(os.path.join(os.getcwd(), 'model_one\\multi_input_keras_model_{0}dim_{1}batchsize.json'.format(str(300), str(64))),'r') as f:
    model_json = json.load(f)

model_fifteen = model_from_json(model_json)

model_fifteen.load_weights(os.path.join(os.getcwd(), 'model_one\\multi_input_keras_model_{0}dim_{1}batchsize.h5'.format(str(300), str(64))))

model_fifteen.compile(optimizer=optimizer_adam_v2(64),
                      loss=neural_network_parameters['model_loss'],
                      metrics=[neural_network_parameters['model_metric']])

print(type(model_fifteen))
print("\nModel is loaded successfully")

<class 'tensorflow.python.keras.engine.training.Model'>

Model is loaded successfully


#### Model 16 - 300 embedding dimenstion & 128 batch size

In [53]:
with open(os.path.join(os.getcwd(), 'model_one\\multi_input_keras_model_{0}dim_{1}batchsize.json'.format(str(300), str(128))),'r') as f:
    model_json = json.load(f)

model_sixteen = model_from_json(model_json)

model_sixteen.load_weights(os.path.join(os.getcwd(), 'model_one\\multi_input_keras_model_{0}dim_{1}batchsize.h5'.format(str(300), str(128))))

model_sixteen.compile(optimizer=optimizer_adam_v2(128),
                      loss=neural_network_parameters['model_loss'],
                      metrics=[neural_network_parameters['model_metric']])

print(type(model_sixteen))
print("\nModel is loaded successfully")

<class 'tensorflow.python.keras.engine.training.Model'>

Model is loaded successfully


# <b>- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -  </b>

**Create a scoring dataframe for each model**

In [268]:
def create_df_scoring_table(model_tag, model, embeddings_number, batch_size_number):
    
    model_evaluation = model.evaluate([X_test_seq_actors, X_test_seq_plot, X_test_seq_features, X_test_seq_reviews], 
                                      y_test,
                                      batch_size=batch_size_number,
                                      verbose=2)

    y_test_pred_probs = model.predict([X_test_seq_actors, X_test_seq_plot, X_test_seq_features, X_test_seq_reviews])
    y_test_predictions = (y_test_pred_probs>0.5).astype(int)

    hamming_loss_value = HammingLoss(mode='multilabel')
    hamming_loss_value.update_state(y_test, y_test_predictions)

    df_scores=pd.DataFrame({'Tag Name':pd.Series(model_tag, dtype='str'),
                            'Embedding tag':pd.Series(embeddings_number, dtype='int'),
                            'Batch tag':pd.Series(batch_size_number, dtype='int'),
                            'Keras Model':pd.Series("multi_input_model_{0}dim_{1}batchsize".format(str(embeddings_number), str(batch_size_number)), dtype='str'),
                            'Test Loss':pd.Series([model_evaluation[0]], dtype='float'),
                            'Test Accuracy':pd.Series([model_evaluation[1]], dtype='float'),
                            'Hamming Loss':pd.Series([hamming_loss_value.result().numpy()], dtype='float'),
                            'Zero_one Loss':pd.Series([zero_one_loss(y_test, y_test_predictions, normalize=False)], dtype='float'),
                            'F1_score':pd.Series([f1_score(y_test, y_test_predictions, average="micro")], dtype='float'),
                            'ROC_score':pd.Series([roc_auc_score(y_test, y_test_predictions, average="micro", multi_class="ovr")], dtype='float'),
                           })

    df_scores.to_pickle(os.path.join(os.getcwd(), "model_one\\df_metrics_multy_input_keras_{0}dim_{1}batchsize_29032020.pkl".format(str(embeddings_number), str(batch_size_number))))
    return df_scores

In [269]:
#Model 1
df_scores_one=create_df_scoring_table("model one", model_one, 50, 16)
#Model 2
df_scores_two=create_df_scoring_table("model two", model_two, 50, 32)
#Model 3
df_scores_three=create_df_scoring_table("model three", model_three, 50, 64)
#Model 4
df_scores_four=create_df_scoring_table("model four", model_four, 50, 128)
#Model 5
df_scores_five=create_df_scoring_table("model five", model_five, 100, 16)
#Model 6
df_scores_six=create_df_scoring_table("model six", model_six, 100, 32)
#Model 7
df_scores_seven=create_df_scoring_table("model seven", model_seven, 100, 64)
#Model 8
df_scores_eight=create_df_scoring_table("model eight", model_eight, 100, 128)
#Model 9
df_scores_nine=create_df_scoring_table("model nine", model_nine, 200, 16)
#Model 10
df_scores_ten=create_df_scoring_table("model ten", model_ten, 200, 32)
#Model 11
df_scores_eleven=create_df_scoring_table("model eleven", model_eleven, 200, 64)
#Model 12
df_scores_twelve=create_df_scoring_table("model twelve", model_twelve, 200, 128)
#Model 13
df_scores_thirteen=create_df_scoring_table("model thirteen", model_thirteen, 300, 16)
#Model 14
df_scores_fourteen=create_df_scoring_table("model fourteen", model_fourteen, 300, 32)
#Model 15
df_scores_fifteen=create_df_scoring_table("model fifteen", model_fifteen, 300, 64)
#Model 16
df_scores_sixteen=create_df_scoring_table("model sixteen", model_sixteen, 300, 128)

9799/9799 - 1s - loss: 0.0436 - accuracy: 0.9915
9799/9799 - 0s - loss: 0.0407 - accuracy: 0.9935
9799/9799 - 0s - loss: 0.0307 - accuracy: 0.9947
9799/9799 - 0s - loss: 0.0246 - accuracy: 0.9957
9799/9799 - 1s - loss: 0.0638 - accuracy: 0.9911
9799/9799 - 1s - loss: 0.0378 - accuracy: 0.9944
9799/9799 - 0s - loss: 0.0328 - accuracy: 0.9955
9799/9799 - 0s - loss: 0.0364 - accuracy: 0.9945
9799/9799 - 1s - loss: 0.0514 - accuracy: 0.9936
9799/9799 - 1s - loss: 0.0383 - accuracy: 0.9950
9799/9799 - 1s - loss: 0.0459 - accuracy: 0.9943
9799/9799 - 1s - loss: 0.0397 - accuracy: 0.9951
9799/9799 - 1s - loss: 0.0588 - accuracy: 0.9914
9799/9799 - 1s - loss: 0.0430 - accuracy: 0.9953
9799/9799 - 1s - loss: 0.0415 - accuracy: 0.9954
9799/9799 - 1s - loss: 0.0424 - accuracy: 0.9949


In [270]:
# All frames together
frames = [df_scores_one, df_scores_two, df_scores_three, df_scores_four, df_scores_five, df_scores_six, df_scores_seven,
          df_scores_eight, df_scores_nine, df_scores_ten, df_scores_eleven, df_scores_twelve, df_scores_thirteen,
          df_scores_fourteen, df_scores_fifteen, df_scores_sixteen]

# Frames ber batch
# 16-batch size
frames_16batch = [df_scores_one, df_scores_five, df_scores_nine, df_scores_thirteen]

# 32-batch size
frames_32batch = [df_scores_two, df_scores_six, df_scores_ten, df_scores_fourteen]

# 64-batch size
frames_64batch = [df_scores_three, df_scores_seven, df_scores_eleven, df_scores_fifteen]

# 128-batch size
frames_128batch = [df_scores_four, df_scores_eight, df_scores_twelve, df_scores_sixteen]

# Frames ber embedding dimension
# 50-embedding dimenstion
frames_50dim = [df_scores_one, df_scores_two, df_scores_three, df_scores_four]

# 100-embedding dimenstion
frames_100dim = [df_scores_five, df_scores_six, df_scores_seven, df_scores_eight]

# 200-embedding dimenstion
frames_200dim = [df_scores_nine, df_scores_ten, df_scores_eleven, df_scores_twelve]

# 300-embedding dimenstion
frames_300dim = [df_scores_thirteen, df_scores_fourteen, df_scores_fifteen, df_scores_sixteen]

In [271]:
result = pd.concat(frames)
result.reset_index(drop=True)

Unnamed: 0,Tag Name,Embedding tag,Batch tag,Keras Model,Test Loss,Test Accuracy,Hamming Loss,Zero_one Loss,F1_score,ROC_score
0,model one,50,16,multi_input_model_50dim_16batchsize,0.043602,0.991539,0.008464,1216.0,0.959941,0.970311
1,model two,50,32,multi_input_model_50dim_32batchsize,0.040737,0.993548,0.006453,990.0,0.970021,0.983666
2,model three,50,64,multi_input_model_50dim_64batchsize,0.030692,0.994736,0.005265,824.0,0.975436,0.984847
3,model four,50,128,multi_input_model_50dim_128batchsize,0.024604,0.99569,0.00431,691.0,0.97993,0.988206
4,model five,100,16,multi_input_model_100dim_16batchsize,0.063827,0.991119,0.008884,1318.0,0.958064,0.97037
5,model six,100,32,multi_input_model_100dim_32batchsize,0.037757,0.994376,0.005625,869.0,0.973655,0.982239
6,model seven,100,64,multi_input_model_100dim_64batchsize,0.032798,0.995516,0.004484,714.0,0.979163,0.988673
7,model eight,100,128,multi_input_model_100dim_128batchsize,0.036412,0.994453,0.005547,860.0,0.974206,0.985573
8,model nine,200,16,multi_input_model_200dim_16batchsize,0.051377,0.993622,0.006381,967.0,0.970463,0.985449
9,model ten,200,32,multi_input_model_200dim_32batchsize,0.038296,0.994995,0.005007,784.0,0.976832,0.989141


In [272]:
result.sort_values(by=['Hamming Loss', 'Zero_one Loss'])

Unnamed: 0,Tag Name,Embedding tag,Batch tag,Keras Model,Test Loss,Test Accuracy,Hamming Loss,Zero_one Loss,F1_score,ROC_score
0,model four,50,128,multi_input_model_50dim_128batchsize,0.024604,0.99569,0.00431,691.0,0.97993,0.988206
0,model seven,100,64,multi_input_model_100dim_64batchsize,0.032798,0.995516,0.004484,714.0,0.979163,0.988673
0,model fifteen,300,64,multi_input_model_300dim_64batchsize,0.041491,0.995384,0.004616,716.0,0.978617,0.989728
0,model fourteen,300,32,multi_input_model_300dim_32batchsize,0.042978,0.995277,0.004724,743.0,0.977985,0.986819
0,model twelve,200,128,multi_input_model_200dim_128batchsize,0.039669,0.995071,0.004928,771.0,0.977087,0.987294
0,model ten,200,32,multi_input_model_200dim_32batchsize,0.038296,0.994995,0.005007,784.0,0.976832,0.989141
0,model sixteen,300,128,multi_input_model_300dim_128batchsize,0.042408,0.994867,0.005133,804.0,0.976125,0.986517
0,model three,50,64,multi_input_model_50dim_64batchsize,0.030692,0.994736,0.005265,824.0,0.975436,0.984847
0,model eight,100,128,multi_input_model_100dim_128batchsize,0.036412,0.994453,0.005547,860.0,0.974206,0.985573
0,model six,100,32,multi_input_model_100dim_32batchsize,0.037757,0.994376,0.005625,869.0,0.973655,0.982239


In [186]:
"""
Create a correlation dataframe
"""
correlation_result = result.corr()
print(correlation_result)

               Embedding tag  Batch tag  Test Loss  Test Accuracy  \
Embedding tag       1.000000   0.000000   0.390836       0.131006   
Batch tag           0.000000   1.000000  -0.568288       0.599367   
Test Loss           0.390836  -0.568288   1.000000      -0.782746   
Test Accuracy       0.131006   0.599367  -0.782746       1.000000   
Hamming Loss       -0.130931  -0.599622   0.782821      -1.000000   
Zero_one Loss      -0.152241  -0.595379   0.786742      -0.998241   
F1_score            0.135840   0.596771  -0.778140       0.999817   

               Hamming Loss  Zero_one Loss  F1_score  
Embedding tag     -0.130931      -0.152241  0.135840  
Batch tag         -0.599622      -0.595379  0.596771  
Test Loss          0.782821       0.786742 -0.778140  
Test Accuracy     -1.000000      -0.998241  0.999817  
Hamming Loss       1.000000       0.998238 -0.999815  
Zero_one Loss      0.998238       1.000000 -0.997813  
F1_score          -0.999815      -0.997813  1.000000  


**Comment:** From the correlation dataframe above, we spotted that Hamming Loss and Test Accuracy have a perfect negative correlation.
Thus, we can conclude that the model with the lowest Hamming Loss will also be the model with the highest Test Accuracy.
To validate this we checked the dataframe with the model results, and indeed model four what the highest accuracy on the test data.

**Comparison 1: Hamming Loss & Zero-One Loss**

In [147]:
colormin = 'red'
colorother = 'black'
clrs  = [colormin if result['Hamming Loss'].iloc[row]== result['Hamming Loss'].min() else colorother for row in range(len(result['Hamming Loss']))]

x=result['Tag Name'].values.tolist()
y=result['Hamming Loss'].values.tolist()
fig1 = go.Figure()
fig1.add_trace(go.Scatter(x=x, y=y,
                          mode='markers',
                          marker=dict(color=clrs)
                        ))
fig1.update_layout(title="Hamming loss per model (the less is better)",
                   xaxis_title="Model number",
                   yaxis_title="Hamming loss value/model")
fig1.show()

fig2 = go.Figure()
fig2.add_trace(go.Scatter(x=result['Tag Name'].values.tolist(), 
                          y=result['Zero_one Loss'].values.tolist(),
                          mode='markers',
                          marker=dict(color=clrs)
                         ))

fig2.update_layout(title="Zero-one loss per model (the less is better)",
                  xaxis_title="Model number",
                  yaxis_title="Zero-one loss value/model")
fig2.show()

In [187]:
colormin = 'red'
clrgrn = 'black'
clrs  = [clrred if result['Hamming Loss'].iloc[row]== result['Hamming Loss'].min() else clrgrn for row in range(len(result['Hamming Loss']))]

x=result['Embedding tag'].values.tolist()
y=result['Hamming Loss'].values.tolist()
fig3 = go.Figure()
fig3.add_trace(go.Scatter(x=x, 
                          y=y,
                          mode='markers',
                          marker=dict(color=clrs),
                          text=result['Batch tag'].tolist()
                        ))
fig3.update_layout(title="Hamming loss per embedding dimension",
                   xaxis_type='category',
                   xaxis_title="Model number",
                   yaxis_title="Hamming loss value/model",
                   showlegend=True)

fig3.update_layout(showlegend=False,
                   annotations=[dict(x=50,y=result['Hamming Loss'].min(),xref="x",yref="y",text="Lowest value on 50-dim",showarrow=True,arrowhead=5,ax=0,ay=-40)])

fig3.add_annotation(x=100,y=result['Hamming Loss'][result['Embedding tag']==100].min(),text="Lowest value on 100-dim", showarrow=True,arrowhead=5,ax=0,ay=-40)
fig3.add_annotation(x=200,y=result['Hamming Loss'][result['Embedding tag']==200].min(),text="Lowest value on 200-dim", showarrow=True,arrowhead=5,ax=80,ay=-20)
fig3.add_annotation(x=300,y=result['Hamming Loss'][result['Embedding tag']==300].min(),text="Lowest value on 300-dim", showarrow=True,arrowhead=5,ax=110,ay=0)

fig3.show()

#--------------------------------------------------------------------------

fig4 = go.Figure()
fig4.add_trace(go.Scatter(x=result['Batch tag'].values.tolist(), 
                          y=result['Hamming Loss'].values.tolist(),
                          mode='markers',
                          marker=dict(color=clrs),
                          text=result['Embedding tag'].tolist()
                         ))

fig4.update_layout(title="Hamming loss per batch size",
                   xaxis_type='category',
                   xaxis_title="Model number",
                   yaxis_title="Hamming loss value/model")
fig4.show()

**Comparison 2: Test Accuracy - Test Score/Loss**

In [207]:
colormin = 'red'
colorother = 'black'
clrs_acc = [colormin if result['Test Accuracy'].iloc[row]== result['Test Accuracy'].max() else colorother for row in range(len(result['Test Accuracy']))]
clrs_loss = [colormin if result['Test Loss'].iloc[row]== result['Test Loss'].min() else colorother for row in range(len(result['Test Loss']))]

x=result['Tag Name'].values.tolist()
y=result['Test Accuracy'].values.tolist()
fig5 = go.Figure()
fig5.add_trace(go.Scatter(x=x, y=y,
                          mode='markers',
                          marker=dict(color=clrs_acc)
                        ))
fig5.update_layout(title="Accuracy on test set per model",
                   xaxis_title="Model number",
                   yaxis_title="Accuracy value/model")
fig5.show()

#--------------------------------------------------

fig6 = go.Figure()
fig6.add_trace(go.Scatter(x=result['Tag Name'].values.tolist(), 
                          y=result['Test Loss'].values.tolist(),
                          mode='markers',
                          marker=dict(color=clrs_loss)
                         ))

fig6.update_layout(title="Score on test set per model",
                  xaxis_title="Model number",
                  yaxis_title="Test loss/model")
fig6.show()

In [188]:
# colormin = 'red'
# clrgrn = 'black'
# clrs  = [clrred if result['Hamming Loss'].iloc[row]== result['Hamming Loss'].min() else clrgrn for row in range(len(result['Hamming Loss']))]

# x=result['Batch tag'].values.tolist()
# y=result['Hamming Loss'].values.tolist()
# fig5 = go.Figure()
# fig5.add_trace(go.Scatter(x=x, y=y,
#                           mode='markers',
#                           marker=dict(color=clrs),
#                           text=result['Embedding tag'].tolist()
#                         ))
# fig5.update_layout(title="Hamming loss per model (the less is better)",
#                    xaxis_type='category',
#                    xaxis_title="Model number",
#                    yaxis_title="Hamming loss value/model",
#                    showlegend=True)
# fig5.show()

# fig6 = go.Figure()
# fig6.add_trace(go.Scatter(x=result['Batch tag'].values.tolist(), 
#                           y=result['Zero_one Loss'].values.tolist(),
#                           mode='markers',
#                           marker=dict(color=clrs)
#                          ))

# fig6.update_layout(title="Zero-one loss per model (the less is better)",
#                    xaxis_type='category',
#                    xaxis_title="Model number",
#                    yaxis_title="Zero-one loss value/model")
# fig6.show()

**Comparison 3: Create a classification report and a confusion matrix for the two closest models (model four, model seven)**

In [196]:
with open(os.path.join(os.getcwd(), "pickled_data_per_part\\genres_list_06032020.pkl"), 'rb') as handle:
    genres_list = pickle.load(handle)

In [197]:
def create_classification_table(model):
    
    y_test_pred_probs = model.predict([X_test_seq_actors, X_test_seq_plot, X_test_seq_features, X_test_seq_reviews])
    y_test_predictions = (y_test_pred_probs>0.5).astype(int)
    
    classification_table = classification_report(y_true=y_test, y_pred=y_test_predictions)
    
    return classification_table

def create_confusion_matrix(model, embeddings_dim, batch_size_value):
    
    y_test_pred_probs = model.predict([X_test_seq_actors, X_test_seq_plot, X_test_seq_features, X_test_seq_reviews])
    y_test_predictions = (y_test_pred_probs>0.5).astype(int)
    
    conf_mat=confusion_matrix(y_test.argmax(axis=1), y_test_predictions.argmax(axis=1))

    conf_matrix=pd.DataFrame(conf_mat,
                 columns=genres_list,
                 index=genres_list)
    
    conf_matrix.to_pickle(os.path.join(os.getcwd(), "model_one\\confusion_matrix_{0}dim_{1}batchsize_29032020.pkl".format(str(embeddings_dim), str(batch_size_value))))
    return conf_matrix

In [216]:
classification_table_one=create_classification_table(model_four)
print("Classification report of model four\n" + str(classification_table_one))


Precision and F-score are ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior.



Classification report of model four
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1353
           1       0.96      0.95      0.95       768
           2       0.99      1.00      1.00       423
           3       0.94      0.92      0.93       453
           4       1.00      1.00      1.00      2857
           5       0.94      0.98      0.96       970
           6       0.98      1.00      0.99       778
           7       1.00      1.00      1.00      4551
           8       1.00      0.97      0.98       408
           9       0.98      1.00      0.99      1027
          10       0.90      0.89      0.90       208
          11       0.94      0.97      0.95       487
          12       0.98      0.98      0.98      1287
          13       1.00      1.00      1.00       539
          14       0.99      1.00      1.00      1256
          15       0.74      0.35      0.47       302
          16       0.97      0.96      0.97  

In [202]:
confusion_matrix_one=create_confusion_matrix(model_four, 50, 128)
confusion_matrix_one

Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
Action,1351,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0
Adventure,0,405,17,4,5,1,2,3,0,1,0,3,1,0,0,0,1
Animation,0,5,271,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Children,5,7,1,167,5,0,2,16,1,1,0,0,1,0,0,0,0
Comedy,5,12,0,8,2322,0,0,0,0,0,0,0,0,0,0,0,0
Crime,4,0,0,0,0,539,1,13,0,0,0,0,0,0,0,0,0
Documentary,0,1,1,0,5,2,704,0,0,0,0,0,0,0,0,0,0
Drama,6,2,0,11,7,19,6,2678,0,0,0,0,0,0,0,0,0
Fantasy,2,0,0,1,0,0,0,0,55,0,0,0,0,0,0,0,0
Horror,1,1,0,0,0,3,4,0,0,583,0,0,0,0,0,0,0


In [204]:
classification_table_two=create_classification_table(model_seven)
print("Classification report of model seven\n\n" + str(classification_table_two))

Classification report of model seven

              precision    recall  f1-score   support

           0       1.00      0.99      0.99      1353
           1       0.93      0.99      0.96       768
           2       0.99      1.00      1.00       423
           3       0.93      0.93      0.93       453
           4       1.00      1.00      1.00      2857
           5       0.94      0.97      0.95       970
           6       0.98      1.00      0.99       778
           7       1.00      1.00      1.00      4551
           8       0.96      0.96      0.96       408
           9       0.97      1.00      0.99      1027
          10       0.96      0.94      0.95       208
          11       0.98      0.88      0.93       487
          12       0.96      1.00      0.98      1287
          13       1.00      1.00      1.00       539
          14       0.99      1.00      0.99      1256
          15       0.69      0.48      0.57       302
          16       0.98      0.97      0.97

In [203]:
confusion_matrix_two=create_confusion_matrix(model_seven, 100, 64)
confusion_matrix_two

Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
Action,1335,1,1,0,7,2,0,5,0,0,0,0,0,1,0,1,0
Adventure,0,437,0,0,3,1,1,0,0,0,0,1,0,0,0,0,0
Animation,0,15,261,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Children,2,12,1,169,5,0,2,12,1,1,0,0,0,1,0,0,0
Comedy,1,19,0,11,2307,0,3,3,1,2,0,0,0,0,0,0,0
Crime,1,0,0,0,0,540,0,3,0,8,0,0,1,1,3,0,0
Documentary,0,1,1,0,3,5,702,1,0,0,0,0,0,0,0,0,0
Drama,1,2,0,7,6,27,6,2680,0,0,0,0,0,0,0,0,0
Fantasy,1,0,0,1,0,0,0,0,54,0,2,0,0,0,0,0,0
Horror,0,1,0,0,0,1,3,0,1,586,0,0,0,0,0,0,0


#### Comparison 4: Predicted vs Actual Genre Tags

In [None]:
X_test=pd.read_pickle(os.path.join(os.getcwd(), "80-20 split_non-balanced\\20000_max_features\\x_test_20000_25032020.pkl"))

In [211]:
def predict_genre_tags(indx, model, genres_list):
        
    test_sequence_actors = X_test_seq_actors[indx:indx+1]
    
    test_sequence_plot = X_test_seq_plot[indx:indx+1]
    
    test_sequence_features = X_test_seq_features[indx:indx+1]
    
    test_sequence_reviews = X_test_seq_reviews[indx:indx+1]
    
    text_prediction = model.predict([test_sequence_actors, test_sequence_plot, test_sequence_features, test_sequence_reviews])
    
    [float(i) for i in text_prediction[0]]
    
    tag_probabilities = text_prediction[0][np.argsort(text_prediction[0])[-3:]]
    
    indexes = np.argsort(text_prediction[0])[::-1][:3]

    predicted_tags = []
    
    for i, tag in enumerate(genres_list):
        if i in indexes:
            predicted_tags.append(genres_list[i])
    
    return predicted_tags

def create_predictions_df(model, random_numbers_list, embeddings_dim, batch_size_value):
    
    
    df_predictions = pd.DataFrame({'Movie Title':pd.Series([X_test['title'].iloc[random_numbers_list[0]]], dtype='str'),
                                   'Predicted Genre tags':pd.Series([predict_genre_tags(random_numbers_list[0], model, genres_list)], dtype='str'),
                                   'Real Genre tags':pd.Series([X_test['reduced_genres'].iloc[random_numbers_list[0]]], dtype='str')})

    for i in range(len(random_numbers_list)):

        df_predictions = df_predictions.append({'Movie Title':X_test['title'].iloc[random_numbers_list[i]], 
                                                'Predicted Genre tags':predict_genre_tags(random_numbers_list[i], model, genres_list),
                                                'Real Genre tags':X_test['reduced_genres'].iloc[random_numbers_list[i]]} , ignore_index=True)

    df_predictions = df_predictions.drop(df_predictions.index[0])
    df_predictions.to_pickle("model_one\\model_one_df_predictions_{0}dim_{1}batchsize_29032020.pkl".format(str(embeddings_dim), str(batch_size_value)))
    
    return df_predictions

In [209]:
random_numbers = random.sample(range(1, y_test.shape[0]), 20)

save_index_of_numbers = random_numbers

print("Randomly saved numbers to make predictions: {}".format(save_index_of_numbers))

Randomly saved numbers to make predictions: [5503, 2411, 661, 1584, 9625, 2469, 8393, 8680, 9096, 3150, 5575, 3796, 7468, 483, 7407, 7003, 2055, 1540, 9757, 8260]


In [212]:
predictions_dataframe_one=create_predictions_df(model_four, save_index_of_numbers, 50, 128)
predictions_dataframe_two=create_predictions_df(model_seven, save_index_of_numbers, 100, 64)

In [213]:
predictions_dataframe_one

Unnamed: 0,Movie Title,Predicted Genre tags,Real Genre tags
1,Three Days of the Condor (3 Days of the Condor),"[Drama, Mystery, Romance]","[Drama, Mystery, Romance]"
2,Nazi Pop Twins,"[Comedy, Documentary, Musical]",[Documentary]
3,The Mark of the Hawk,"[Comedy, Drama, Romance]",[Drama]
4,Land of Plenty (Angst and Alienation in America),"[Adventure, Drama, War]",[Drama]
5,The Hobbit: An Unexpected Journey,"[Adventure, Animation, Fantasy]","[Adventure, Fantasy]"
6,The Good Lie,"[Comedy, Crime, Drama]",[Drama]
7,"Cherry, Harry & Raquel!","[Crime, Drama, Thriller]","[Crime, Drama, Thriller]"
8,The Desert Rats,"[Drama, Romance, War]","[Drama, War]"
9,Blood Creek (a.k.a. Town Creek),"[Comedy, Drama, Horror]",[Horror]
10,The River,"[Comedy, Drama, Romance]","[Drama, Romance]"


In [214]:
predictions_dataframe_two

Unnamed: 0,Movie Title,Predicted Genre tags,Real Genre tags
1,Three Days of the Condor (3 Days of the Condor),"[Drama, Mystery, Romance]","[Drama, Mystery, Romance]"
2,Nazi Pop Twins,"[Crime, Documentary, Musical]",[Documentary]
3,The Mark of the Hawk,"[Drama, Thriller, War]",[Drama]
4,Land of Plenty (Angst and Alienation in America),"[Drama, Thriller, War]",[Drama]
5,The Hobbit: An Unexpected Journey,"[Adventure, Drama, Fantasy]","[Adventure, Fantasy]"
6,The Good Lie,"[Drama, Romance, Thriller]",[Drama]
7,"Cherry, Harry & Raquel!","[Crime, Drama, Thriller]","[Crime, Drama, Thriller]"
8,The Desert Rats,"[Drama, Thriller, War]","[Drama, War]"
9,Blood Creek (a.k.a. Town Creek),"[Adventure, Horror, Mystery]",[Horror]
10,The River,"[Action, Drama, Romance]","[Drama, Romance]"


#### Comparison 5: Training and Validation plots

In [219]:
history_dataframe_one=pd.read_pickle(os.path.join(os.getcwd(), "model_one\\metrics_histogram_multi_input_keras_{0}dim_{1}batchsize.pkl".format(str(50), str(128))))
history_dataframe_two=pd.read_pickle(os.path.join(os.getcwd(), "model_one\\metrics_histogram_multi_input_keras_{0}dim_{1}batchsize.pkl".format(str(100), str(64))))

In [262]:
colormin = 'black'
colormax = 'black'
colorother = 'rgb(252, 141, 98)'
clrs_acc_model_four = [colormax if history_dataframe_one.val_accuracy.iloc[row]==history_dataframe_one.val_accuracy.max() else colorother for row in range(len(history_dataframe_one.val_accuracy))]
clrs_acc_model_seven = [colormax if history_dataframe_two.val_accuracy.iloc[row]==history_dataframe_two.val_accuracy.max() else colorother for row in range(len(history_dataframe_two.val_accuracy))]

clrs_loss=[colormin if history_dataframe_one.val_loss.iloc[row]==history_dataframe_one.val_loss.min() else colorother for row in range(len(history_dataframe_one.val_loss))]

#Accuracy of model four
fig1=go.Figure()

fig1.add_trace(go.Scatter(x=history_dataframe_one.epoch.tolist(), 
                          y=history_dataframe_one.accuracy.tolist(),
                          mode='lines+markers',
                          name='Training Accuracy',
                          line=dict(color='rgb(102, 194, 165)')))

fig1.add_trace(go.Scatter(x=history_dataframe_one.epoch.tolist(), 
                          y=history_dataframe_one.val_accuracy.tolist(),
                          mode='lines+markers',
                          name='Validation Accuracy',
                          line=dict(color='rgb(252, 141, 98)'),
                          marker=dict(color=clrs_acc_model_four)))

fig1.update_layout(template="simple_white",
                   title="Accuracy score on train & validation sets (model four)",
                   xaxis_title="Number of epochs",
                   yaxis_title="Accuracy/epoch")

fig1.update_layout(showlegend=True,
                   annotations=[dict(x=history_dataframe_one.epoch[history_dataframe_one.val_accuracy==history_dataframe_one.val_accuracy.max()].tolist()[0],
                                     y=history_dataframe_one.val_accuracy.max(),
                                     xref="x",yref="y",
                                     text="Epoch with the highest validation accuracy",
                                     showarrow=True,
                                     arrowhead=5,
                                     ax=0,ay=40)])

fig1.show()

#---------------------------------------------------------

#Accuracy of model seven

fig2=go.Figure()

fig2.add_trace(go.Scatter(x=history_dataframe_two.epoch.tolist(), 
                          y=history_dataframe_two.accuracy.tolist(),
                          mode='lines+markers',
                          name='Training Accuracy',
                          line=dict(color='rgb(102, 194, 165)')))

fig2.add_trace(go.Scatter(x=history_dataframe_two.epoch.tolist(), 
                          y=history_dataframe_two.val_accuracy.tolist(),
                          mode='lines+markers',
                          name='Validation Accuracy',
                          line=dict(color='rgb(252, 141, 98)'),
                          marker=dict(color=clrs_acc_model_seven)))

fig2.update_layout(template="simple_white",
                   title="Accuracy score on train & validation sets (model seven)",
                   xaxis_title="Number of epochs",
                   yaxis_title="Accuracy/epoch")

fig2.update_layout(showlegend=True,
                   annotations=[dict(x=history_dataframe_two.epoch[history_dataframe_two.val_accuracy==history_dataframe_two.val_accuracy.max()].tolist()[0],
                                     y=history_dataframe_two.val_accuracy.max(),
                                     xref="x",yref="y",
                                     text="Epoch with the highest validation accuracy",
                                     showarrow=True,
                                     arrowhead=5,
                                     ax=0,ay=40)])

fig2.show()

In [263]:
colormin = 'black'
colorother = 'rgb(252, 141, 98)'

clrs_loss_model_four=[colormin if history_dataframe_one.val_loss.iloc[row]==history_dataframe_one.val_loss.min() else colorother for row in range(len(history_dataframe_one.val_loss))]
clrs_loss_model_seven=[colormin if history_dataframe_two.val_loss.iloc[row]==history_dataframe_two.val_loss.min() else colorother for row in range(len(history_dataframe_two.val_loss))]

#Loss of model four

fig3=go.Figure()

fig3.add_trace(go.Scatter(x=history_dataframe_one.epoch.tolist(), 
                          y=history_dataframe_one.loss.tolist(),
                          mode='lines+markers',
                          name='Training Loss',
                          line=dict(color='rgb(102, 194, 165)')))

fig3.add_trace(go.Scatter(x=history_dataframe_one.epoch.tolist(), 
                          y=history_dataframe_one.val_loss.tolist(),
                          mode='lines+markers',
                          name='Validation Loss',
                          line=dict(color='rgb(252, 141, 98)'),
                          marker=dict(color=clrs_loss_model_four)))

fig3.update_layout(template="simple_white",
                   title="Loss score on train & validation sets (model four)",
                   xaxis_title="Number of epochs",
                   yaxis_title="Loss/epoch")

fig3.update_layout(showlegend=True,
                   annotations=[dict(x=history_dataframe_one.epoch[history_dataframe_one.val_loss==history_dataframe_one.val_loss.min()].tolist()[0],
                                     y=history_dataframe_one.val_loss.min(),
                                     xref="x",yref="y",
                                     text="Epoch with the lowest validation loss",
                                     showarrow=True,
                                     arrowhead=5,
                                     ax=0,ay=-40)])
fig3.show()

#---------------------------------------------------------

#Loss of model seven

fig4=go.Figure()

fig4.add_trace(go.Scatter(x=history_dataframe_two.epoch.tolist(), 
                          y=history_dataframe_two.loss.tolist(),
                          mode='lines+markers',
                          name='Training Loss',
                          line=dict(color='rgb(102, 194, 165)')))

fig4.add_trace(go.Scatter(x=history_dataframe_two.epoch.tolist(), 
                          y=history_dataframe_two.val_loss.tolist(),
                          mode='lines+markers',
                          name='Validation Loss',
                          line=dict(color='rgb(252, 141, 98)'),
                          marker=dict(color=clrs_loss_model_seven)))

fig4.update_layout(template="simple_white",
                   title="Loss score on train & validation sets (model four)",
                   xaxis_title="Number of epochs",
                   yaxis_title="Loss/epoch")

fig4.update_layout(showlegend=True,
                   annotations=[dict(x=history_dataframe_two.epoch[history_dataframe_two.val_loss==history_dataframe_two.val_loss.min()].tolist()[0],
                                     y=history_dataframe_two.val_loss.min(),
                                     xref="x",yref="y",
                                     text="Epoch with the lowest validation loss",
                                     showarrow=True,
                                     arrowhead=5,
                                     ax=0,ay=-65)])
fig4.show()

<b>- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -  </b>