# __Step 2c: BERT models__

## __Setup__

### _Imports_

In [1]:
'''
For building text classification model based on embedding of Word2Vec and BERT
'''

## for data
import argparse
import json
import pandas as pd
import numpy as np
import pickle
import sys
import itertools
from pathlib import Path

from sklearn import model_selection, metrics

## for word embedding with w2v
import gensim

## for deep learning
from tensorflow.keras import models, layers, callbacks, preprocessing

### _Functions_

In [2]:
def read_configs(config_file):
  """Read a ``key=value`` configuration file and return a config_dict.

  Blank lines and lines starting with "#" are skipped. Only the required
  keys listed below are kept; any other key in the file is ignored. Each
  value is evaluated as a Python expression, so strings must be quoted in
  the file.

  Args:
    config_file (str or Path): path to the configuration file
  Return:
    config_dict (dict): required key -> evaluated value
  Raises:
    SystemExit: with status 1 if any required key is absent from the file
  """
  # required keys; 0 is only a placeholder until the file supplies a value
  config_dict = {'lang_model':0,
                 'proj_dir':0,
                 'work_dir':0,
                 'corpus_combo_file':0,
                 'rand_state':0,
                 'bert_param':0,}

  # Keys actually seen in the file. Tracking them separately lets a
  # legitimate value of 0 (e.g. rand_state=0) count as present.
  found = set()

  # Read config file and fill in the dictionary
  with open(config_file, 'r') as f:
    for line in f:
      line = line.strip()
      if line == "" or line[0] == "#":
        continue

      # Split on the first "=" only so values may themselves contain "="
      key, sep, value = line.partition("=")
      key = key.strip()
      if sep and key in config_dict:
        # NOTE(review): eval() executes arbitrary code from the config file.
        # Only use trusted config files, or switch to ast.literal_eval if all
        # values are plain literals.
        config_dict[key] = eval(value.strip())
        found.add(key)

  # Check if any config missing
  missing = 0
  for key in config_dict:
    if key in found:
      print("  ", key, "=", config_dict[key])
    else:
      print("  missing:", key)
      missing += 1

  if missing == 0:
    print("  all config available")
  else:
    print("  missing config, QUIT!")
    # Non-zero status so callers/shells can tell this is a failure
    sys.exit(1)

  return config_dict


def split_train_validate_test(corpus_combo_file, rand_state):
  '''Load data and split train, validation, test subsets for the cleaned texts

  The data is split 80/20 into train/test, then the training part is split
  75/25 into train/validation, both stratified on the 'label' column.

  Args:
    corpus_combo_file (pathlib.Path): path to the json data file; the file
      holds a JSON-encoded string that pandas can read back as a dataframe
    rand_state (int): for reproducibility
  Return:
    list: [X_train, X_valid, X_test, y_train, y_valid, y_test], where the X
      items are the 'txt_clean' column and the y items the 'label' column of
      the respective subset (pandas Series)
  '''
  from io import StringIO

  # Load json file; read-only access is enough
  with corpus_combo_file.open("r") as f:
      corpus_combo_json = json.load(f)

  # Convert json back to dataframe. Passing a literal JSON string to
  # read_json is deprecated in recent pandas, so wrap it in a file-like.
  if isinstance(corpus_combo_json, str):
    corpus_combo_json = StringIO(corpus_combo_json)
  corpus_combo = pd.read_json(corpus_combo_json)

  # Cleaned corpus
  corpus = corpus_combo[['label','txt_clean']]

  # Split train test
  train, test = model_selection.train_test_split(corpus,
      test_size=0.2, stratify=corpus['label'], random_state=rand_state)

  # Split train validate
  train, valid = model_selection.train_test_split(train,
      test_size=0.25, stratify=train['label'], random_state=rand_state)

  X_train = train['txt_clean']
  X_valid = valid['txt_clean']
  X_test  = test['txt_clean']
  y_train = train['label']
  y_valid = valid['label']
  y_test  = test['label']

  print(f"    size: train={X_train.shape}, valid={X_valid.shape}," +\
        f" test={X_test.shape}")

  return [X_train, X_valid, X_test, y_train, y_valid, y_test]
  
def get_hyperparameters(w2v_param):
  ''' Return every combination of the hyperparameter values in a dictionary
  Adopted from:
    https://stackoverflow.com/questions/38721847/how-to-generate-all-combination-from-values-in-dict-of-lists-in-python
  Args:
    w2v_param (dict): hyperparameter name -> iterable of candidate values,
      as specified in the config file.
  Return:
    keys (tuple): the hyperparameter names, in dictionary order
    param_list (list): one tuple per combination, values in the same order
      as keys. An empty dictionary yields keys=() and param_list=[()],
      i.e. a single combination with no parameters.
  '''
  print(w2v_param)
  # Built directly from the dict so an empty dict does not fail on unpacking
  keys = tuple(w2v_param)
  param_list = list(itertools.product(*w2v_param.values()))

  return keys, param_list

def get_unigram(corpus):
  '''Split every text on whitespace
  Args:
    corpus (iterable of str): texts to split
  Return:
    list: one list of words per text, in the same order as corpus
  '''
  return [text.split() for text in corpus]

def get_ngram(X_train, X_valid, X_test, ngram):
  '''Turn texts into lists of uni-, bi-, or tri-gram tokens
  The phrase detectors are fitted on the training texts only and then applied
  to all three subsets.
  Args:
    X_train, X_valid, X_test (iterables of str): texts of each subset
    ngram (int): 1, 2, or 3
  Return:
    tuple: (train, valid, test), each a list of token lists
  Raises:
    SystemExit: with status 1 if ngram is not 1, 2, or 3
  '''
  # Validate first so no detector is trained for an unusable setting
  if ngram not in (1, 2, 3):
    print('ERR: ngram must be 1, 2, or 3. QUIT!')
    sys.exit(1)

  uni_train = get_unigram(X_train)
  uni_valid = get_unigram(X_valid)
  uni_test  = get_unigram(X_test)

  if ngram == 1:
    return uni_train, uni_valid, uni_test

  # Get bigrams
  bigrams_detector  = gensim.models.phrases.Phrases(
                        uni_train, delimiter=" ", min_count=5, threshold=10)
  bigrams_detector  = gensim.models.phrases.Phraser(bigrams_detector)
  bi_train = list(bigrams_detector[uni_train])
  bi_valid = list(bigrams_detector[uni_valid])
  bi_test  = list(bigrams_detector[uni_test])

  if ngram == 2:
    return bi_train, bi_valid, bi_test

  # Get trigrams: the trigram detector is fitted on the bigram output, which
  # is already materialized in bi_train
  trigrams_detector = gensim.models.phrases.Phrases(
                        bi_train, delimiter=" ", min_count=5, threshold=10)
  trigrams_detector = gensim.models.phrases.Phraser(trigrams_detector)
  tri_train = list(trigrams_detector[bi_train])
  tri_valid = list(trigrams_detector[bi_valid])
  tri_test  = list(trigrams_detector[bi_test])
  return tri_train, tri_valid, tri_test


def get_w2v_model(X_train, X_valid, X_test, param, rand_state):
  '''Get ngram lists and w2v model
  The model is cached as a pickle file in the module-level work_dir; it is
  loaded if the file exists, otherwise trained on the training ngrams and
  saved.
  Args:
    X_train, X_valid, X_test (iterables of str): texts of each subset
    param (sequence): [min_count, window, ngram]
    rand_state (int): seed passed to Word2Vec
  Return:
    model_w2v (gensim.models.Word2Vec): loaded or newly trained model
    model_w2v_name (pathlib.Path): path of the pickled model file
    ngram_train, ngram_valid, ngram_test (list): token lists of each subset
  '''
  min_count, window, ngram = param

  ngram_train, ngram_valid, ngram_test = get_ngram(X_train, X_valid, X_test,
                                                  ngram)

  # Check if w2v model is already generated
  model_w2v_name = work_dir / f"model_cln_w2v_{min_count}-{window}-{ngram}"

  if model_w2v_name.is_file():
    print("  load the w2v model")
    # model_w2v_name already includes work_dir; joining it again would point
    # at the wrong file whenever work_dir is a relative path.
    # NOTE(review): pickle.load can execute arbitrary code; only load model
    # files this pipeline wrote itself.
    with open(model_w2v_name, "rb") as f:
      model_w2v = pickle.load(f)
  else:
    print("  geneate and save w2v model")
    model_w2v = gensim.models.Word2Vec(ngram_train, vector_size=300,
                                      window=window, min_count=min_count,
                                      sg=1, epochs=30, seed=rand_state)

    with open(model_w2v_name, "wb") as f:
      pickle.dump(model_w2v, f)

  return model_w2v, model_w2v_name, ngram_train, ngram_valid, ngram_test


def train_tokenizer(corpus):
  '''Fit a Keras tokenizer on a corpus
  Args:
    corpus (list): a nested list of word lists
  Return:
    tokenizer (keras.preprocessing.text.Tokenizer): the fitted tokenizer
    dic_vocab_token (dict): token as key, index number as value
  '''

  # Legacy tokenizer; tf.keras.layers.TextVectorization is its replacement:
  #   https://www.tensorflow.org/api_docs/python/tf/keras/layers/TextVectorization
  char_filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
  tokenizer = preprocessing.text.Tokenizer(filters=char_filters, lower=True,
                                           split=' ', oov_token="NaN")

  # learn the vocabulary from the corpus
  tokenizer.fit_on_texts(corpus)

  # word_index is the token -> index dictionary built while fitting
  return tokenizer, tokenizer.word_index


def get_embeddings(corpus, model_w2v, tokenizer, dic_vocab_token):
  '''Encode a corpus as padded index sequences and build the embedding matrix
  Args:
    corpus (list): a nested list of token lists
    model_w2v (gensim.models.Word2Vec): trained w2v model
    tokenizer (keras.preprocessing.text.Tokenizer): fitted tokenizer
    dic_vocab_token (dict): token as key, index number as value
  Return:
    embeddings (numpy.ndarray): (len(dic_vocab_token)+1) x vector size; row
      idx holds the w2v vector of the token with index idx, and rows of
      tokens absent from the w2v model stay all 0s
    X_w2v (numpy.ndarray): index sequences padded/truncated to length 500
  '''

  # Transforms each text in texts to a sequence of integers.
  lst_text2seq = tokenizer.texts_to_sequences(corpus)

  # pad or truncate sequence
  X_w2v = preprocessing.sequence.pad_sequences(
                    lst_text2seq,      # List of sequences, each a list of ints
                    maxlen=500,        # maximum length of all sequences
                    padding="post",    # 'pre' or 'post'
                    truncating="post") # remove values from sequences > maxlen

  ## start the matrix (length of vocabulary x vector size) with all 0s
  #  The width comes from the model itself so it cannot disagree with the
  #  vector size the model was trained with.
  word_vectors = model_w2v.wv
  embeddings = np.zeros((len(dic_vocab_token)+1, word_vectors.vector_size))
  for word, idx in dic_vocab_token.items():
    ## update the row with vector
    try:
      embeddings[idx] = word_vectors[word]
    ## if word not in model then skip and the row stays all 0s
    except KeyError:
      continue

  return embeddings, X_w2v


def get_w2v_emb_model(embeddings):
  '''Build and compile the classifier on top of fixed Word2Vec embeddings
  Layers: frozen embedding -> attention -> two bidirectional LSTMs -> dense
  (64, relu) -> dense (2, softmax).
  Args:
    embeddings (numpy.ndarray): embedding matrix, vocabulary size x vector
      size, used as the non-trainable weights of the embedding layer
  Return:
    model (keras Model): compiled with sparse categorical crossentropy loss,
      adam optimizer, and accuracy metric
  '''
  seq_len = 500
  vocab_size, emb_dim = embeddings.shape

  def attention_layer(inputs, neurons):
    # softmax-weight the sequence axis, then rescale the inputs with it
    weights = layers.Permute((2,1))(inputs)
    weights = layers.Dense(neurons, activation="softmax")(weights)
    weights = layers.Permute((2,1), name="attention")(weights)
    return layers.multiply([inputs, weights])

  ## input sequences and their (frozen) embeddings
  x_in = layers.Input(shape=(seq_len,))
  embedding = layers.Embedding(input_dim=vocab_size,
                               output_dim=emb_dim,
                               weights=[embeddings],
                               input_length=seq_len,
                               trainable=False)
  hidden = embedding(x_in)

  ## attention over the sequence
  hidden = attention_layer(hidden, neurons=seq_len)

  ## two stacked bidirectional LSTMs; only the first returns sequences
  lstm_seq = layers.LSTM(units=15, dropout=0.2, return_sequences=True)
  hidden = layers.Bidirectional(lstm_seq)(hidden)
  lstm_last = layers.LSTM(units=15, dropout=0.2)
  hidden = layers.Bidirectional(lstm_last)(hidden)

  ## dense head ending in class probabilities
  hidden = layers.Dense(64, activation='relu')(hidden)
  y_out = layers.Dense(2, activation='softmax')(hidden)

  model = models.Model(x_in, y_out)
  model.compile(optimizer='adam',
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])

  return model


def run_main_function():
  '''Placeholder that does nothing and returns None'''
  return None


def _get_f1_score(model, X, y, classes):
  '''Predict X with model and return the f1 score against y
  classes[i] is the label assigned when output column i has the highest
  predicted probability.
  '''
  pred_prob = model.predict(X)
  y_pred = [classes[np.argmax(prob)] for prob in pred_prob]
  return metrics.f1_score(y, y_pred)


def run_pipeline(param, subsets):
  '''Carry out the major steps for one hyperparameter combination
  Builds the ngram lists and w2v model, trains the tokenizer, builds the
  embeddings, loads the checkpointed classifier if present (trains it
  otherwise), and scores it on the validation and testing sets. Reads
  rand_state from the module-level config_dict.
  Args:
    param (sequence): [min_count, window, ngram]
    subsets (list): [X_train, X_valid, X_test, y_train, y_valid, y_test]
  Return:
    best_score (float): validation f1 score
    cp_filepath (pathlib.Path): model checkpoint path
    test_score (float): testing f1 score
  '''

  rand_state = config_dict['rand_state']

  [X_train, X_valid, X_test, y_train, y_valid, y_test] = subsets

  # Get list of ngrams and w2v model
  print("  get list of ngrams and w2v model")
  model_w2v, model_w2v_name, ngram_train, ngram_valid, ngram_test = \
                      get_w2v_model(X_train, X_valid, X_test, param, rand_state)

  # Train tokenizer
  print("  train tokenizer")
  tokenizer, dic_vocab_token = train_tokenizer(ngram_train)

  # Get embeddings
  print("  get embeddings")
  embeddings, X_train_w2v = get_embeddings(ngram_train, model_w2v,
                                                    tokenizer, dic_vocab_token)
  _, X_valid_w2v = get_embeddings(ngram_valid, model_w2v,
                                                    tokenizer, dic_vocab_token)
  _ , X_test_w2v  = get_embeddings(ngram_test , model_w2v,
                                                    tokenizer, dic_vocab_token)

  # Model checkpoint path and output model file name
  cp_filepath  = Path(str(model_w2v_name) + "_dnn")

  # Load model if exists
  if cp_filepath.is_dir():
    print("  load model in:", cp_filepath)
    model_emb = models.load_model(cp_filepath)

  # Train and save model if not
  else:
    print("  train model")
    model_emb    = get_w2v_emb_model(embeddings)

    # setup check points
    callback_es  = callbacks.EarlyStopping(monitor='val_loss', patience=5)
    callback_mcp = callbacks.ModelCheckpoint(filepath=cp_filepath, mode='max',
            save_weights_only=False, monitor='val_accuracy', save_best_only=True)

    # Train model
    # NOTE(review): after fit() model_emb holds the last-epoch weights, while
    # cp_filepath holds the best-val_accuracy checkpoint; the scores below can
    # therefore differ between a training run and a later reload run.
    model_emb.fit(x=X_train_w2v, y=y_train, batch_size=256,
                  epochs=20, shuffle=True, verbose=1,
                  validation_data=(X_valid_w2v, y_valid),
                  callbacks=[callback_es, callback_mcp])

  # Output column -> label mapping comes from the training labels so that it
  # is the same for every evaluated subset, even one lacking a class.
  classes = np.unique(y_train)

  print("  get validation f1 score")
  best_score = _get_f1_score(model_emb, X_valid_w2v, y_valid, classes)
  print("    ", best_score)

  print("  get testing f1 score")
  test_score = _get_f1_score(model_emb, X_test_w2v, y_test, classes)
  print("    ", test_score)

  # provide some space between runs
  print('\n')

  return best_score, cp_filepath, test_score

### _Get training/testing split_

In [4]:
# Configuration file, given relative to the current working directory
config_file = Path("config_bert.txt")

print("\nRead configuration file...")
# read_configs prints each setting and exits if a required one is missing
config_dict = read_configs(config_file)

# Set up working directory and corpus file location
#   work_dir is resolved under proj_dir, the corpus file under work_dir
proj_dir          = Path(config_dict['proj_dir'])
work_dir          = proj_dir / config_dict['work_dir']
corpus_combo_file = work_dir / config_dict['corpus_combo_file']

# For reproducibility
rand_state = config_dict['rand_state']


Read configuration file...
   lang_model = bert
   proj_dir = /home/shius/projects/plant_sci_hist
   work_dir = 2_text_classify
   corpus_combo_file = corpus_combo
   rand_state = 20220609
   bert_param = {}
  all config available


In [5]:
# Split train/validate/test for cleaned text
#   Will not focus on original due to issues with non-alphanumeric characters
#   and stop words.
print("\nRead file and split train/validate/test...")
subsets = split_train_validate_test(corpus_combo_file, rand_state)
# Unpack the six subsets so later cells can use them by name
[X_train, X_valid, X_test, y_train, y_valid, y_test] = subsets


Read file and split train/validate/test...
    size: train=(51987,), valid=(17329,), test=(17330,)


In [None]:

  # NOTE(review): this cell cannot run as written. It is indented as if it
  # were a function body, and w2v_param and lang_model are not assigned
  # anywhere in the visible part of this file (config_dict has 'bert_param'
  # and 'lang_model' keys) -- confirm the intended sources before running.

  # get w2v parameter list
  #   [min_count, window, ngram]
  param_keys, param_list  = get_hyperparameters(w2v_param)

  # iterate through different parameters, writing one tab-delimited row of
  # scores per parameter combination
  with open(work_dir / f"scores_cln_w2v", "w") as f:
    f.write("run\ttxt_flag\tlang_model\tparameters\tvalidate_f1\t" +\
            "test_f1\tmodel_dir\n")
    run_num = 0
    for param in param_list:
      print(f"\n## param: {param}")
      best_score, model_dir, test_score = run_pipeline(param, subsets)

      f.write(f"{run_num}\tcln\t{lang_model}\t{str(param)}\t"+\
              f"{best_score}\t{test_score}\t{model_dir}\n")

      run_num += 1

In [None]:
# Split train/test for original and cleaned text
# NOTE(review): split_train_test is not defined in the visible part of this
# file; the splitter defined here is split_train_validate_test, which returns
# six items rather than four. Later cells rely on train_cln and test_cln from
# this call -- confirm where they should come from.
print("\nRead file and split train/test...")
train_ori, test_ori, train_cln, test_cln = split_train_test(
                                              corpus_combo_file, rand_state)

## __Set up the corpus__

### _Create a list of unigram lists_

A nested list whose first dimension is the number of training instances:
- 69316

Q: Why not use tokenizer now instead of later?
- Need to get the unigrams so a tokenizer can be trained.

Q: Why are the bi- and tri-gram detectors not used?

In [None]:
# Define the cleaned text as corpus
corpus = train_cln['txt_clean'] # pandas Series
# Show the type and the first two texts as the cell output
type(corpus), corpus[:2]

In [None]:
## create list of lists of unigrams
#  Each text is split on whitespace. An explicit per-word n-gram join was
#  tried before and produced exactly the same lists, so it is not used.
lst_corpus_test = []
lst_corpus = [string.split() for string in corpus]

len(lst_corpus)

In [None]:
# Check if lst_words and lst_grams are the same. They are EXACTLY the same, so
# the lst_grams part was not used in the cell above.
# The check itself is kept inside a string literal, so it does not run.
'''
count_not_the_same = 0
for i in range(len(lst_corpus)):
    gram = lst_corpus[i]
    word = lst_corpus_test[i]
    if gram != word:
        count_not_the_same += 1
print(count_not_the_same)
'''

In [None]:
## detect bigrams: fit the phrase model on the unigram lists, then wrap it
## in a Phraser
bigrams_detector = gensim.models.phrases.Phraser(
    gensim.models.phrases.Phrases(lst_corpus, delimiter=" ", min_count=5,
                                  threshold=10))

In [None]:
# The trigram detector is fitted on the output of the bigram detector, then
# wrapped in a Phraser
trigrams_detector = gensim.models.phrases.Phraser(
    gensim.models.phrases.Phrases(bigrams_detector[lst_corpus], delimiter=" ",
                                  min_count=5, threshold=10))

### _Initialize Word2Vec model_

- `sequences`: lst_corpus
- `vector_size`: dimension of word embeddings
- `window`: max distance between the current and the predicted words in a sentence
- `min_count`: ignore all words with total frequency lower than this.
  - [Discussion on setting min_count](https://stackoverflow.com/questions/50723303/how-is-word2vec-min-count-applied)
- `sg`: training algorithm, 1: skip-gram, otherwise CBOW.

Following [this](https://www.kaggle.com/code/pierremegret/gensim-word2vec-tutorial/notebook).
- Q: Why don't we train the w2v model using bi and tri-grams?
  - See [this article](https://www.kaggle.com/code/hamishdickson/training-and-plotting-word2vec-with-bigrams/notebook)

In [None]:
# Create the model without a corpus; the vocabulary is built and the model is
# trained in the following cells. sg=1 selects skip-gram, and seed=rand_state
# is set for reproducibility.
model_w2v = gensim.models.Word2Vec(vector_size=300, window=8, 
                                   min_count=20, sg=1, epochs=30, workers=16,
                                   seed=rand_state)

In [None]:
# Building the Vocabulary Table from the unigram lists
model_w2v.build_vocab(lst_corpus, progress_per=10000)

In [None]:
# Fit the w2v model on the unigram lists
model_w2v.train(
    lst_corpus,
    total_examples=model_w2v.corpus_count,
    epochs=30,
    report_delay=1,
)

# Pickle the fitted model into the working directory
with (work_dir / "model_cln_w2v").open("wb") as f:
    pickle.dump(model_w2v, f)

In [None]:
# Restore the pickled w2v model from the working directory
with (work_dir / "model_cln_w2v").open("rb") as f:
    model_w2v = pickle.load(f)

In [None]:
# Testing the w2v model
# With the original text there are problems with punctuation and stop words,
# e.g. 'jasmonate..', '(MeJA)', and other variants, so the cleaned text
# should be used.
example = "jasmonate"
# Length of the word vector, then the 20 most similar words
print(len(model_w2v.wv[example]))
print(model_w2v.wv.most_similar(example, topn=20))

## __Feature engineering__

#### _Train tokenizer_

In [None]:
# Initialize and fit the tokenizer with the shared helper. The bare `keras`
# name used here before is not imported in this notebook; train_tokenizer
# builds the tokenizer with the same settings (lowercase, split on ' ',
# "NaN" out-of-vocabulary token, punctuation filters) from the imported
# `preprocessing` module.
#   dic_vocab_token: token dictionary, with token as key, index number as value
tokenizer, dic_vocab_token = train_tokenizer(lst_corpus)
len(dic_vocab_token)

#### _Turn texts into index numbers_

Transforms each text in texts to a sequence of integers.

In [None]:
# Encode every token list as a list of integer indices
lst_text2seq = tokenizer.texts_to_sequences(lst_corpus)
# Compare the first five tokens of the first text with their indices
print(lst_corpus[0][:5])
print(lst_text2seq[0][:5])

In [None]:
# The index numbers are the values from the token dictionary
# Note that the keys are lowercased (the tokenizer is created with lower=True)
dic_vocab_token['update'], dic_vocab_token['exertional']

#### _Pad or truncate sequences_

In [None]:
# Use the imported `preprocessing` module; the bare `keras` name is not
# imported in this notebook.
X_train_w2v = preprocessing.sequence.pad_sequences(
                    lst_text2seq,      # List of sequences, each a list of ints
                    maxlen=500,        # maximum length of all sequences
                    padding="post",    # 'pre' or 'post'
                    truncating="post") # remove values from sequences > maxlen
X_train_w2v.shape

#### _Create embedding matrix_

In [None]:
## Embedding matrix: one row per token index (plus row 0), 300 columns,
## initialised with 0s
embeddings = np.zeros((len(dic_vocab_token)+1, 300))
not_in_emb = {}
for token, row in dic_vocab_token.items():
    try:
        vector = model_w2v.wv[token]
    except KeyError:
        ## token unknown to the w2v model: record it, the row stays all 0s
        not_in_emb[token] = 1
        continue
    ## copy the token's vector into its row
    embeddings[row] = vector

len(not_in_emb) # Q: How did these get into the corpus??

#### _Set up ANN_

The model contains:
- An embedding layer:
  - Sequences as input (500 tokens, including padding)
  - The Word2Vec embedding matrix as (frozen) weights
  - Embedding as output (500x300).
- An attention layer
  - Captures the weights of each instance for building an explainer.
  - Not needed for the predictions.
- Two layers of bidirectional LSTM.
- Two final dense layers to predict the probabilities of the classes

In [None]:
def get_w2v_emb_model(embeddings):
    '''Build and compile the classifier on top of fixed Word2Vec embeddings
    Layers: frozen embedding -> attention -> two bidirectional LSTMs -> dense
    (64, relu) -> dense (2, softmax).
    Args:
      embeddings (numpy.ndarray): embedding matrix, vocabulary size x vector
        size, used as the non-trainable weights of the embedding layer
    Return:
      model (keras Model): compiled with sparse categorical crossentropy
        loss, adam optimizer, and accuracy metric
    '''
    seq_len = 500
    vocab_size, emb_dim = embeddings.shape

    def attention_layer(inputs, neurons):
        # softmax-weight the sequence axis, then rescale the inputs with it
        weights = layers.Permute((2,1))(inputs)
        weights = layers.Dense(neurons, activation="softmax")(weights)
        weights = layers.Permute((2,1), name="attention")(weights)
        return layers.multiply([inputs, weights])

    ## input sequences and their (frozen) embeddings
    x_in = layers.Input(shape=(seq_len,))
    embedding = layers.Embedding(input_dim=vocab_size,
                                 output_dim=emb_dim,
                                 weights=[embeddings],
                                 input_length=seq_len,
                                 trainable=False)
    hidden = embedding(x_in)

    ## attention over the sequence
    hidden = attention_layer(hidden, neurons=seq_len)

    ## two stacked bidirectional LSTMs; only the first returns sequences
    lstm_seq = layers.LSTM(units=15, dropout=0.2, return_sequences=True)
    hidden = layers.Bidirectional(lstm_seq)(hidden)
    lstm_last = layers.LSTM(units=15, dropout=0.2)
    hidden = layers.Bidirectional(lstm_last)(hidden)

    ## dense head ending in class probabilities
    hidden = layers.Dense(64, activation='relu')(hidden)
    y_out = layers.Dense(2, activation='softmax')(hidden)

    model = models.Model(x_in, y_out)
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    return model


In [None]:
# Build the classifier from the embedding matrix and print its layer summary
model_emb = get_w2v_emb_model(embeddings)
model_emb.summary()

#### _Convert text labels to numeric ones_

In [None]:
## encode y
#  y_train holds the class labels.
#  dic_y_mapping: index -> label, in the sorted order returned by np.unique
#  inverse_dic:   label -> index, the reverse lookup
y_train       = train_cln['label']
dic_y_mapping = {n:label for n,label in enumerate(np.unique(y_train))}
inverse_dic   = {v:k for k,v in dic_y_mapping.items()}
inverse_dic

In [None]:
# Convert text labels to numeric ones.
#   The conversion through inverse_dic is disabled; the labels are used as is.
#y_train_label = np.array([inverse_dic[y] for y in y_train])
y_train_label = y_train
# The number of sequences should equal the number of labels
X_train_w2v.shape, len(y_train_label)

#### _Train model_

In [None]:
## train
#  Stop early once val_loss has not improved for 3 epochs
callback = callbacks.EarlyStopping(monitor='val_loss', patience=3)

#  validation_split=0.3 holds out 30% of the training data for validation
history = model_emb.fit(x=X_train_w2v, y=y_train_label, batch_size=256, 
                        epochs=20, shuffle=True, verbose=1, 
                        validation_split=0.3, callbacks=[callback])

In [None]:
# Save model
#   The path is relative, so the file goes to the current working directory
#   rather than work_dir.
model_emb.save('model_cln_w2v_dnn.h5')

In [None]:
# Show the interactive help for the model object
help(model_emb)

In [None]:
# Show the class of the model object
type(model_emb)

#### _Plot loss and accuracy_


In [None]:
# Names of the metrics recorded per epoch during training
his_keys = history.history.keys()
his_keys

In [None]:
# Plot the training/validation loss (left) and accuracy (right) per epoch
# NOTE(review): `plt` is not imported in the visible part of this notebook;
# matplotlib.pyplot has to be imported as plt before this cell can run.
df_history = pd.DataFrame(history.history)
df_history_loss = df_history[['loss','val_loss']]
df_history_accu = df_history[['accuracy','val_accuracy']]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10,5))
df_history_loss.plot(ax=ax1)
df_history_accu.plot(ax=ax2)
ax1.grid(True); ax1.set_xlabel('Epoch'); ax1.set_ylabel('Loss')
# The right panel shows accuracy, so label it as such
ax2.grid(True); ax2.set_xlabel('Epoch'); ax2.set_ylabel('Accuracy')
plt.show()

#### _Get prediction f1_

In [None]:
# Predicted class probabilities for the training sequences
y_train_pred_prob = model_emb.predict(X_train_w2v)

In [None]:
# Most probable output column of each prediction, mapped back to its label
y_train_pred = [dic_y_mapping[np.argmax(pred)] for pred in y_train_pred_prob]

In [None]:
# f1 score on the training data
metrics.f1_score(y_train_label, y_train_pred)

#### _Evaluate model_

In [None]:
## Test texts and labels
X_test = test_cln['txt_clean']
y_test = test_cln['label']

## Whitespace-split every test text into a list of unigrams
lst_corpus_test = [text.split() for text in X_test]

## Apply the fitted bigram detector, then the fitted trigram detector
lst_corpus_test_bi = list(bigrams_detector[lst_corpus_test])
lst_corpus_test_tr = list(trigrams_detector[lst_corpus_test_bi])

len(lst_corpus_test_bi), len(lst_corpus_test_tr)

In [None]:
## text to sequence with the fitted tokenizer
#  The tokenizer, the w2v model, and the training sequences were all built
#  from unigram lists (lst_corpus), so the test set is encoded from its
#  unigram lists as well. Feeding the trigram-merged lists would turn every
#  merged phrase into an out-of-vocabulary token.
lst_text2seq_test = tokenizer.texts_to_sequences(lst_corpus_test)

## padding sequence
#  Use the imported `preprocessing` module; the bare `keras` name is not
#  imported in this notebook.
X_test_w2v = preprocessing.sequence.pad_sequences(lst_text2seq_test,
                                maxlen=500, padding="post", truncating="post")
X_test_w2v.shape

In [None]:
# Predicted class probabilities for the test sequences
y_pred_prob_w2v = model_emb.predict(X_test_w2v)

In [None]:
# Rows 1-3 of the predicted probabilities. The final Dense layer built in
# get_w2v_emb_model has 2 units, so 2 columns are expected here; the question
# below presumably dates from an earlier 3-class model -- TODO confirm.
y_pred_prob_w2v[1:4,] # Why are there 3 columns??

In [None]:
# Q: Why use dic_y_mapping instead of inverse_dic?
# A: np.argmax yields an output column index, and dic_y_mapping maps
#    index -> label; inverse_dic maps the other way (label -> index).
y_pred_w2v      = [dic_y_mapping[np.argmax(pred)] for pred in y_pred_prob_w2v]

In [None]:
# f1 score on the test data
test_score = metrics.f1_score(y_test, y_pred_w2v)
print(test_score)

# __NOT USED__

#### _Check min, max, avg len_

In [None]:
### NOT Run ###
# Minimum, maximum, and average sequence length, plus the indices of the
# empty sequences. Kept inside a string literal so it does not execute.
# NOTE(review): tqdm is not imported in the visible part of this notebook.
'''
minlen = 100; maxlen = 0; totlen = 0
lst_0  = []   # index of sequences with zero lengths

for idx in tqdm(range(len(lst_text2seq))):
    slen   = len(lst_text2seq[idx])
    totlen +=slen
    if slen > maxlen: maxlen = slen
    if slen < minlen: 
        if slen == 0: lst_0.append(idx)
        else: minlen = slen
print(f'Min:{minlen}, Max:{maxlen}, Avg:{totlen/len(lst_text2seq)}')
print('Zero length:', lst_0)
'''

In [None]:
## evaluation
def eval_model(y_test, y_pred, y_pred_prob, plot_auc=1):
    '''Print AUC and a classification report, and plot evaluation figures
    Always draws a confusion matrix heatmap. If plot_auc is true, also draws
    one ROC curve and one precision-recall curve per class and shows the
    figures.
    NOTE(review): `plt` and `sns` are not imported in the visible part of
    this notebook; matplotlib.pyplot and seaborn have to be imported under
    these names before this function can run.
    Args:
      y_test: true labels
      y_pred: predicted labels
      y_pred_prob: 2-D array of predicted probabilities with one column per
        class; assumes the columns follow the order of np.unique(y_test)
        -- TODO confirm
      plot_auc: draw the ROC and precision-recall curves if true
    Return:
      None
    '''
    
    classes = np.unique(y_test)

    # pd.get_dummies: Convert categorical variable into dummy/indicator 
    # variables.
    y_test_array = pd.get_dummies(y_test, drop_first=False).values
    
    ## Accuracy, Precision, Recall
    #accuracy = metrics.accuracy_score(y_test, y_pred)
    # AUC is computed from the second probability column only
    auc = metrics.roc_auc_score(y_test, y_pred_prob[:, 1])
    #print("Accuracy:",  round(accuracy,2))
    print("Auc:", round(auc,2))
    print("Detail:")
    print(metrics.classification_report(y_test, y_pred))
        
    ## Plot confusion matrix
    cm = metrics.confusion_matrix(y_test, y_pred)
    _, ax = plt.subplots()
    sns.heatmap(cm, annot=True, fmt='d', ax=ax, cmap=plt.cm.Blues, 
                cbar=False)
    ax.set(xlabel="Pred", ylabel="True", xticklabels=classes, 
        yticklabels=classes, title="Confusion matrix")
    # plt.yticks acts on the current figure, i.e. the confusion matrix
    plt.yticks(rotation=0)

    # Setup subplots
    if plot_auc:
        _, ax = plt.subplots(nrows=1, ncols=2, figsize=(10,5))

        ## Plot roc
        #  One curve per class: indicator column i against probability
        #  column i
        for i in range(len(classes)):
                fpr, tpr, thresholds = metrics.roc_curve(y_test_array[:,i],  
                                y_pred_prob[:,i])
                ax[0].plot(fpr, tpr, lw=3, 
                        label='{0} (area={1:0.2f})'.format(classes[i], 
                                        metrics.auc(fpr, tpr))
                        )
        # Diagonal reference line
        ax[0].plot([0,1], [0,1], color='navy', lw=2, linestyle='--')
        ax[0].set(xlim=[-0.05,1.0], ylim=[0.0,1.05], 
                xlabel='False Positive Rate', 
                ylabel="True Positive Rate (Recall)", 
                title="Receiver operating characteristic")
        ax[0].legend(loc="lower right")
        ax[0].grid(True)
                
        ## Plot precision-recall curve
        for i in range(len(classes)):
                precision, recall, thresholds = metrics.precision_recall_curve(
                        y_test_array[:,i], y_pred_prob[:,i])
                ax[1].plot(recall, precision, lw=3, 
                        label='{0} (area={1:0.2f})'.format(classes[i], 
                                        metrics.auc(recall, precision))
                        )
        ax[1].set(xlim=[0.0,1.05], ylim=[0.0,1.05], xlabel='Recall', 
                ylabel="Precision", title="Precision-Recall curve")
        # NOTE(review): the dashed reference line is hard-coded at 1/3, which
        # presumably assumes three balanced classes -- confirm for this data.
        ax[1].plot([0,1], [1/3,1/3], color='navy', lw=2, linestyle='--')
        ax[1].legend(loc="best")
        ax[1].grid(True)
        plt.show()

In [None]:
# Evaluate the test predictions, including the ROC and precision-recall plots
eval_model(y_test, y_pred_w2v, y_pred_prob_w2v, 1)