### Part 3.3: Model comparisons (latest changes on 08.03.2020)

* Classification report -> (dataframe)
* Metrics (Accuracy, Validation_loss) -> (dataframe)
* Metrics per epoch -> (plot)

#### Import the libraries

In [None]:
# For cleaning and preparing the dataset
# -> dataframe manipulation
# -> text manipulation
# -> Web Scraping

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from tabulate import tabulate
import re
import os

import random

# Module to serialize the content produced from the execution of the code

import pickle

# Module to monitor the progress of a python for loop

from tqdm import tqdm_notebook

# Module to manipulate text in python - NLTK package

import nltk
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

import plotly.express as px

from sklearn.metrics import confusion_matrix, classification_report

# Module to compute word vectorizers and compute the cosine distance

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_distances

# from IPython.core.display import display, HTML
# display(HTML("<style>.container { width:70% !important; }</style>"))

#### Keras Text Classification (For creating the word embeddings)

In [None]:
%%time

from nltk.stem import WordNetLemmatizer

from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

from time import time

#--------------------------------------------------------------

import tensorflow as tf
from tensorflow import keras

from tensorflow.keras import layers
from tensorflow.keras import models

from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

#---------------------------------------------------------------

%matplotlib inline
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
from pylab import rcParams

import pydot
import pydotplus
import graphviz

from IPython.display import SVG
from tensorflow.keras.utils import model_to_dot

from tensorflow.keras.models import load_model
from tensorflow.keras.models import model_from_json
import json

# Import MLflow
import mlflow.tensorflow
import mlflow.pyfunc
from tensorflow.keras import regularizers
import datetime

# Import TensorBoard
import tensorflow_docs as tfdocs
import tensorflow_docs.plots as tfplots
import tensorflow_docs.modeling as tfmodel
from tensorflow.keras import regularizers
# from tensorboard import default
# from tensorboard import program

import tensorflow_hub as hub
import bert
# from bert import tokenization
# from bert.tokenization import FullTokenizer

#Visualize Model

def visualize_model(model):
    """Render a model's layer graph as an inline SVG.

    The model is converted with ``model_to_dot`` (layer shapes and layer
    names shown, dpi 65), the resulting graph is laid out with the ``dot``
    program, and the SVG output is wrapped in an IPython ``SVG`` object.

    Args:
        model: The model object handed to ``model_to_dot``.

    Returns:
        An ``IPython.display.SVG`` wrapping the rendered graph.
    """
    dot_graph = model_to_dot(
        model,
        show_shapes=True,
        show_layer_names=True,
        dpi=65,
    )
    svg_markup = dot_graph.create(prog='dot', format='svg')
    return SVG(svg_markup)

from tensorflow.keras.utils import plot_model

from packaging import version

# Report the installed TensorFlow version and stop early on TensorFlow 1.x.
tf_version = tf.__version__
print("TensorFlow version: ", tf_version)

tf_major = version.parse(tf_version).release[0]
assert tf_major >= 2, "This notebook requires TensorFlow 2.0 or above."

# Summarise the runtime: version, eager execution, TF-Hub version, GPU presence.
print("Version: ", tf_version)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)

gpu_devices = tf.config.list_physical_devices('GPU')
print("GPU is", "available" if gpu_devices else "NOT AVAILABLE")

#### #1 Import & compare each different model's score-dataframe

* Test Accuracy
* Test Loss
* Hamming Loss
* Zero-one Loss
* F1 score

In [None]:
# Load the pickled score table of every model and stack them into one frame.
#
# Each path component is passed to os.path.join separately so that the
# separator is chosen by the operating system instead of being hard-coded as
# a Windows backslash (which POSIX treats as part of the file name).
#
# NOTE: pd.read_pickle must only be used on trusted files. These are
# presumably written by the per-model training notebooks -- confirm before
# loading pickles obtained from anywhere else.

# model 1: Multi-input
dataframe_1 = pd.read_pickle(os.path.join(os.getcwd(), "model_one", "df_metrics_multy_input_keras_08032020.pkl"))

# model 2: GloVe
dataframe_2 = pd.read_pickle(os.path.join(os.getcwd(), "model_two", "df_metrics_glove_embeddings_08032020.pkl"))

# model 3: Google news 130GB (without OOV tokens)
dataframe_3 = pd.read_pickle(os.path.join(os.getcwd(), "model_three", "df_metrics_google_news_130_without_OOV_tokens_08032020.pkl"))

# model 4: Google news 130GB (with OOV tokens)
dataframe_4 = pd.read_pickle(os.path.join(os.getcwd(), "model_four", "df_metrics_google_news_130_with_OOV_tokens_08032020.pkl"))

# model 5: Google news 7B
dataframe_5 = pd.read_pickle(os.path.join(os.getcwd(), "model_five", "df_metrics_english_google_news_7b_corpus_08032020.pkl"))

# model 6: Google news 200B
# NOTE(review): the file name says "7b_corpus" while the other model_six
# artefacts in this notebook use "200b_corpus". Kept unchanged because the
# name on disk cannot be verified from here -- confirm it is the right file.
dataframe_6 = pd.read_pickle(os.path.join(os.getcwd(), "model_six", "df_metrics_english_google_news_7b_corpus_08032020.pkl"))

# model 7: Universal Sentence Encoder
dataframe_7 = pd.read_pickle(os.path.join(os.getcwd(), "model_seven", "df_metrics_universal_sentence_encoder_08032020.pkl"))

# model 8:
# To be done

# model 9:
# To be done

# Union the dataframes!
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat performs the same row-wise union with a fresh 0..n-1 index.
dataframe_union = pd.concat(
    [
        dataframe_1,
        dataframe_2,
        dataframe_3,
        dataframe_4,
        dataframe_5,
        dataframe_6,
        dataframe_7,
    ],
    ignore_index=True,
)

dataframe_union

#### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

#### #2 Import & plot each different model's accuracy-loss per epoch

In [None]:
# Load the pickled per-epoch training history of every model.
#
# Each path component is passed to os.path.join separately so that the
# separator is chosen by the operating system instead of being hard-coded as
# a Windows backslash.

# model 1: Multi-input
metrics_dataframe_1 = pd.read_pickle(os.path.join(os.getcwd(), "model_one", "metrics_histogram_multi_input_keras.pkl"))

# model 2: GloVe
metrics_dataframe_2 = pd.read_pickle(os.path.join(os.getcwd(), "model_two", "metrics_histogram_glove_embeddings.pkl"))

# model 3: Google news 130GB (without OOV tokens)
metrics_dataframe_3 = pd.read_pickle(os.path.join(os.getcwd(), "model_three", "metrics_histogram_english_google_news_without_oovtokens_08032020.pkl"))

# model 4: Google news 130GB (with OOV tokens)
metrics_dataframe_4 = pd.read_pickle(os.path.join(os.getcwd(), "model_four", "metrics_histogram_english_google_news_with_oovtokens_08032020.pkl"))

# model 5: Google news 7B
metrics_dataframe_5 = pd.read_pickle(os.path.join(os.getcwd(), "model_five", "metrics_histogram_english_google_news_7b_corpus_08032020.pkl"))

# model 6: Google news 200B
metrics_dataframe_6 = pd.read_pickle(os.path.join(os.getcwd(), "model_six", "metrics_histogram_english_google_news_200b_corpus_08032020.pkl"))

# model 7: Universal Sentence Encoder
# Stored under its own name: assigning it to metrics_dataframe_6 would
# overwrite the model 6 history, so anything labelled "model 6" would
# silently show model 7 data.
metrics_dataframe_7 = pd.read_pickle(os.path.join(os.getcwd(), "model_seven", "metrics_histogram_universal_sentence_encoder_08032020.pkl"))

# model 8:
# To be done

# model 9:
# To be done

In [None]:
# Preview the per-epoch metrics table loaded for model 1.
metrics_dataframe_1

In [None]:
# Validation loss per epoch: one line+marker trace per (frame, legend label).

val_loss_traces = [
    (metrics_dataframe_1, 'model 1: multi-input keras'),
    (metrics_dataframe_3, 'model 3: google news 130GB (no OOV tokens)'),
    (metrics_dataframe_4, 'model 4: google news 130GB (with OOV tokens)'),
    (metrics_dataframe_5, 'model 5: google news 7B'),
    (metrics_dataframe_6, 'model 6: google news 200B (no OOV tokens)'),
]

fig = px.line()
for history, trace_name in val_loss_traces:
    fig.add_scatter(
        x=history["epoch"],
        y=history["val_loss"],
        mode="lines+markers",
        name=trace_name,
    )

# Titles and a horizontal legend for the finished figure.
fig.update_layout(
    title='Loss Function comparison (per model)',
    xaxis_title='Epoch',
    yaxis_title='Test Loss (model loss)',
    legend_title='<b> Model name </b>',
    legend_orientation="h",
)
fig.show()

In [None]:
# Validation accuracy per epoch: one line+marker trace per (frame, legend label).

val_accuracy_traces = [
    (metrics_dataframe_1, 'model 1: multi-input keras'),
    (metrics_dataframe_3, 'model 3: google news 130GB (no OOV tokens)'),
    (metrics_dataframe_4, 'model 4: google news 130GB (with OOV tokens)'),
    (metrics_dataframe_5, 'model 5: google news 7B'),
    (metrics_dataframe_6, 'model 6: google news 200B (no OOV tokens)'),
]

fig = px.line()
for history, trace_name in val_accuracy_traces:
    fig.add_scatter(
        x=history["epoch"],
        y=history["val_accuracy"],
        mode="lines+markers",
        name=trace_name,
    )

# Titles and a horizontal legend for the finished figure.
fig.update_layout(
    title='Accuracy comparison (per model)',
    xaxis_title='Epoch',
    yaxis_title='Test Accuracy (model metric)',
    legend_title='<b> Model name </b>',
    legend_orientation="h",
)
fig.show()

#### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

#### #3 Import & compare each different model's classification report

In [None]:
# For every model: load the saved predictions and ground-truth arrays, build
# the scikit-learn classification report and print it under a heading.
#
# Each path component is passed to os.path.join separately so that the
# separator is chosen by the operating system instead of being hard-coded as
# a Windows backslash.

# model 1: Multi-input
y_predictions_model_1 = np.load(os.path.join(os.getcwd(), "model_one", "y_predictions_multi_input_keras_07032020.npy"))
y_true_model_1 = np.load(os.path.join(os.getcwd(), "model_one", "y_true_multi_input_keras_07032020.npy"))
classification_table_1 = classification_report(y_true=y_true_model_1, y_pred=y_predictions_model_1)
print("Model 1: Multi-input Keras\n")
print(classification_table_1)

# model 2: Glove Embeddings
y_predictions_model_2 = np.load(os.path.join(os.getcwd(), "model_two", "y_predictions_glove_embeddings_08032020.npy"))
y_true_model_2 = np.load(os.path.join(os.getcwd(), "model_two", "y_true_glove_embeddings_08032020.npy"))
classification_table_2 = classification_report(y_true=y_true_model_2, y_pred=y_predictions_model_2)
print("\nModel 2: Glove Embeddings\n")
print(classification_table_2)

# model 3: Google news 130GB (without OOV tokens)
y_predictions_model_3 = np.load(os.path.join(os.getcwd(), "model_three", "y_predictions_english_google_news_without_oovtokens_08032020.npy"))
y_true_model_3 = np.load(os.path.join(os.getcwd(), "model_three", "y_true_english_google_news_without_oovtokens_08032020.npy"))
classification_table_3 = classification_report(y_true=y_true_model_3, y_pred=y_predictions_model_3)
print("\nModel 3: Google news 130GB (without OOV tokens)\n")
print(classification_table_3)

# model 4: Google news 130GB (with OOV tokens)
y_predictions_model_4 = np.load(os.path.join(os.getcwd(), "model_four", "y_predictions_english_google_news_with_oovtokens_08032020.npy"))
y_true_model_4 = np.load(os.path.join(os.getcwd(), "model_four", "y_true_english_google_news_with_oovtokens_08032020.npy"))
classification_table_4 = classification_report(y_true=y_true_model_4, y_pred=y_predictions_model_4)
print("\nModel 4: Google news 130GB (with OOV tokens)\n")
print(classification_table_4)

# model 5: Google news 7B
y_predictions_model_5 = np.load(os.path.join(os.getcwd(), "model_five", "y_predictions_english_google_news_7b_corpus_08032020.npy"))
y_true_model_5 = np.load(os.path.join(os.getcwd(), "model_five", "y_true_english_google_news_7b_corpus_08032020.npy"))
classification_table_5 = classification_report(y_true=y_true_model_5, y_pred=y_predictions_model_5)
print("\nModel 5: Google news 7B\n")
print(classification_table_5)

# model 6: Google news 200B
y_predictions_model_6 = np.load(os.path.join(os.getcwd(), "model_six", "y_predictions_english_google_news_200b_corpus_08032020.npy"))
y_true_model_6 = np.load(os.path.join(os.getcwd(), "model_six", "y_true_english_google_news_200b_corpus_08032020.npy"))
classification_table_6 = classification_report(y_true=y_true_model_6, y_pred=y_predictions_model_6)
print("\nModel 6: Google news 200B\n")
print(classification_table_6)

# model 7: Universal Sentence Encoder
y_predictions_model_7 = np.load(os.path.join(os.getcwd(), "model_seven", "y_predictions_universal_sentence_encoder_08032020.npy"))
y_true_model_7 = np.load(os.path.join(os.getcwd(), "model_seven", "y_true_universal_sentence_encoder_08032020.npy"))
classification_table_7 = classification_report(y_true=y_true_model_7, y_pred=y_predictions_model_7)
print("\nModel 7: Universal Sentence Encoder\n")
print(classification_table_7)

# model 8:
# To be done

# model 9:
# To be done

#### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -