# __Step 2c: BERT models - pytorch__

Follow:
- [How to train a BERT model from scratch](https://towardsdatascience.com/how-to-train-a-bert-model-from-scratch-72cfce554fc6)
- [Huggintface ByteLevelBPETokenizer tutorial](https://github.com/huggingface/blog/blob/main/how-to-train.md)

## __Setup__

### _Establish the environment_

This is the pytorch version.


```bash
conda create -n bert_classify python=3.9
conda activate bert_classify
conda install ipykernel --update-deps --force-reinstall
```

In addition, install:
- numpy, pandas, scikit-learn, ,transformers, tokenizers, datasets, pytorch, tensorflow-gpu

Issues:
- datasets cannot be installed through conda successfully. Did pip.
- Python 3.10 as of 6/16/22 is not fully supported by pytorch and lead to the following error when calling `transfomer.Trainer`:
  - TypeError: Instance and class checks can only be used with @runtime_checkable protocols
- pytorch installed by default when transformers is installed but without CUDA support. following instructions from [here](https://pytorch.org/):

```bash
conda install pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch
```


### _Imports_

In [35]:
'''
For building text classification model based on embedding of Word2Vec and BERT
'''

## for data
#import argparse
import json
import pandas as pd
import numpy as np
import pickle
import sys
import itertools
from pathlib import Path

from sklearn import model_selection, metrics

#import torch
import transformers
from datasets import Dataset
from tokenizers import BertWordPieceTokenizer

import tensorflow as tf


### _Functions_

In [2]:
def read_configs(config_file):
  """Read configuration file and return a config_dict"""
  # required
  config_dict = {'lang_model':0,
                 'proj_dir':0,
                 'work_dir':0,
                 'corpus_combo_file':0,
                 'rand_state':0,
                 'bert_param':0,}

  # Read config file and fill in the dictionary
  with open(config_file, 'r') as f:
    configs     = f.readlines()
    for config in configs:
      if config.strip() == "" or config[0] == "#":
        pass
      else:
        config = config.strip().split("=")
        if config[0] in config_dict:
          config_dict[config[0]] = eval(config[1])

  # Check if any config missing
  missing = 0
  for config in config_dict:
    if config_dict[config] == 0:
      print("  missing:", config)
      missing += 1
    else:
      print("  ", config, "=", config_dict[config])

  if missing == 0:
    print("  all config available")
  else:
    print("  missing config, QUIT!")
    sys.exit(0)

  return config_dict


def split_train_validate_test(corpus_combo_file, rand_state):
  '''Load data and split train, validation, test subsets for the cleaned texts
  Args:
    corpus_combo_file (str): path to the json data file
    rand_state (int): for reproducibility
  Return:
    train, valid, test (pandas dataframes): training, validation, testing sets
  '''
  # Load json file
  with corpus_combo_file.open("r+") as f:
      corpus_combo_json = json.load(f)

  # Convert json back to dataframe
  corpus_combo = pd.read_json(corpus_combo_json)

  corpus = corpus_combo[['label','txt','txt_clean']]

  # Split train test
  train, test = model_selection.train_test_split(corpus, 
      test_size=0.2, stratify=corpus['label'], random_state=rand_state)

  # Split train validate
  train, valid = model_selection.train_test_split(train, 
      test_size=0.25, stratify=train['label'], random_state=rand_state)

  print(f"    train={train.shape}, valid={valid.shape}," +\
        f" test={test.shape}")

  return [train, valid, test]
  


In [3]:
def get_hyperparameters(w2v_param):
  ''' Return a list with hyperparameters based on the passed dictionary
  Adopted from:
    https://stackoverflow.com/questions/38721847/how-to-generate-all-combination-from-values-in-dict-of-lists-in-python
  Args:
    param (dict): a dictionary specified in the config.txt file.
  Return:
    param_list (list): a nested list of hyperparameters in the order of
      max_feature, ngram_range, and p_threshold
  '''
  print(w2v_param)
  keys, values = zip(*w2v_param.items())
  param_list = [v for v in itertools.product(*values)]
  
  return keys, param_list

def get_unigram(corpus):
  unigram = []
  for txt in corpus:
    lst_words = txt.split()
    unigram.append(lst_words)

  return unigram

def get_ngram(X_train, X_valid, X_test, ngram):

  uni_train = get_unigram(X_train)
  uni_valid = get_unigram(X_valid)
  uni_test  = get_unigram(X_test)

  if ngram == 1:
    return uni_train, uni_valid, uni_test
  # ngram >1
  else:
    # Get bigrams
    bigrams_detector  = gensim.models.phrases.Phrases(
                          uni_train, delimiter=" ", min_count=5, threshold=10)
    bigrams_detector  = gensim.models.phrases.Phraser(bigrams_detector)
    bi_train = list(bigrams_detector[uni_train])
    bi_valid = list(bigrams_detector[uni_valid])
    bi_test  = list(bigrams_detector[uni_test])

    # Return bigrams
    if ngram == 2:
      return bi_train, bi_valid, bi_test

    # Get trigrams and return them
    elif ngram == 3:
      trigrams_detector = gensim.models.phrases.Phrases(
                          bigrams_detector[uni_train], delimiter=" ", 
                          min_count=5, threshold=10)
      trigrams_detector = gensim.models.phrases.Phraser(trigrams_detector)
      tri_train = list(trigrams_detector[bi_train])
      tri_valid = list(trigrams_detector[bi_valid])
      tri_test  = list(trigrams_detector[bi_test])
      return tri_train, tri_valid, tri_test
    
    else:
      print('ERR: ngram cannot be larger than 3. QUIT!')
      sys.exit(0)


def get_w2v_model(X_train, X_valid, X_test, param, rand_state):
  '''Get ngram lists and w2v model
  Args:
  Return:
  '''
  [min_count, window, ngram] = param

  ngram_train, ngram_valid, ngram_test = get_ngram(X_train, X_valid, X_test, 
                                                  ngram)

  # Check if w2v model is already generated
  model_w2v_name = work_dir / f"model_cln_w2v_{min_count}-{window}-{ngram}"

  if model_w2v_name.is_file():
    print("  load the w2v model")
    with open(work_dir / model_w2v_name, "rb") as f:
        model_w2v = pickle.load(f)
  else:
    print("  geneate and save w2v model")
    model_w2v = gensim.models.Word2Vec(ngram_train, vector_size=300, 
                                      window=window, min_count=min_count, 
                                      sg=1, epochs=30, seed=rand_state)
    
    with open(model_w2v_name, "wb") as f:
      pickle.dump(model_w2v, f)

  return model_w2v, model_w2v_name, ngram_train, ngram_valid, ngram_test


def train_tokenizer(corpus):
  '''Train a tokenizer
  Args:
    corpus (list): a nested list of word lists
  Return:
    tokenizer (keras.preprocessing.text.Tokenizer): trained tokenizer
    dic_vocab_token (dict): token as key, index as value
  '''

  # intialize tokenizer
  # See: https://www.tensorflow.org/api_docs/python/tf/keras/layers/TextVectorization
  # This is replaced by tf.keras.layers.TextVectorization
  tokenizer = preprocessing.text.Tokenizer(lower=True, split=' ', 
                oov_token="NaN", filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')

  # tokenize corpus 
  tokenizer.fit_on_texts(corpus)

  # get token dictionary, with token as key, index number as value
  dic_vocab_token = tokenizer.word_index

  return tokenizer, dic_vocab_token


def get_embeddings(corpus, model_w2v, tokenizer, dic_vocab_token):

  # Transforms each text in texts to a sequence of integers.
  lst_text2seq = tokenizer.texts_to_sequences(corpus)

  # pad or trucate sequence
  X_w2v = preprocessing.sequence.pad_sequences(
                    lst_text2seq,      # List of sequences, each a list of ints 
                    maxlen=500,        # maximum length of all sequences
                    padding="post",    # 'pre' or 'post' 
                    truncating="post") # remove values from sequences > maxlen

  ## start the matrix (length of vocabulary x vector size) with all 0s

  embeddings = np.zeros((len(dic_vocab_token)+1, 300))
  not_in_emb = {}
  for word, idx in dic_vocab_token.items():
      ## update the row with vector
      try:
          embeddings[idx] =  model_w2v.wv[word]
      ## if word not in model then skip and the row stays all 0s
      except KeyError:
          not_in_emb[word] = 1

  return embeddings, X_w2v


def get_w2v_emb_model(embeddings):
  '''Build a deep learning model with Word2Vec embeddings
  Args:
    embeddings
  '''

  ## code attention layer
  def attention_layer(inputs, neurons):
    x = layers.Permute((2,1))(inputs)
    x = layers.Dense(neurons, activation="softmax")(x)
    x = layers.Permute((2,1), name="attention")(x)
    x = layers.multiply([inputs, x])
    return x

  ## input
  x_in = layers.Input(shape=(500,)) ## embedding
  x = layers.Embedding(input_dim=embeddings.shape[0],  
                      output_dim=embeddings.shape[1], 
                      weights=[embeddings],
                      input_length=500, trainable=False)(x_in)

  ## apply attention
  x = attention_layer(x, neurons=500)

  ## 2 layers of bidirectional lstm
  x = layers.Bidirectional(layers.LSTM(units=15, dropout=0.2, 
                          return_sequences=True))(x)
  x = layers.Bidirectional(layers.LSTM(units=15, dropout=0.2))(x)

  ## final dense layers
  x = layers.Dense(64, activation='relu')(x)
  y_out = layers.Dense(2, activation='softmax')(x)

  ## Initialize and compile model
  model = models.Model(x_in, y_out)
  model.compile(loss='sparse_categorical_crossentropy',
                optimizer='adam', 
                metrics=['accuracy'])

  return model


def run_main_function():

  # get w2c parameter list
  #   [min_count, window, ngram]
  param_keys, param_list  = get_hyperparameters(w2v_param)

  # iterate through different parameters
  with open(work_dir / f"scores_cln_w2v", "w") as f:
    f.write("run\ttxt_flag\tlang_model\tparameters\tvalidate_f1\t" +\
            "test_f1\tmodel_dir\n")
    run_num = 0
    for param in param_list:
      print(f"\n## param: {param}")
      best_score, model_dir, test_score = run_pipeline(param, subsets)

      f.write(f"{run_num}\tcln\t{lang_model}\t{str(param)}\t"+\
              f"{best_score}\t{test_score}\t{model_dir}\n")

      run_num += 1


def run_pipeline(param, subsets):
  '''Carry out the major steps'''

  rand_state = config_dict['rand_state']

  [X_train, X_valid, X_test, y_train, y_valid, y_test] = subsets

  # Get list of ngrams and w2v model
  print("  get list of ngrams and w2v model")
  model_w2v, model_w2v_name, ngram_train, ngram_valid, ngram_test = \
                      get_w2v_model(X_train, X_valid, X_test, param, rand_state)
  
  # Train tokenizer
  print("  train tokenizer")
  tokenizer, dic_vocab_token = train_tokenizer(ngram_train)

  # Get embeddings
  print("  get embeddings")
  embeddings, X_train_w2v = get_embeddings(ngram_train, model_w2v, 
                                                    tokenizer, dic_vocab_token)
  _, X_valid_w2v = get_embeddings(ngram_valid, model_w2v, 
                                                    tokenizer, dic_vocab_token)
  _ , X_test_w2v  = get_embeddings(ngram_test , model_w2v, 
                                                    tokenizer, dic_vocab_token)

  # Model checkpoint path and output model file name
  cp_filepath  = Path(str(model_w2v_name) + "_dnn")

  # Load model if exists
  if cp_filepath.is_dir():
    print("  load model in:", cp_filepath)
    model_emb = models.load_model(cp_filepath)

  # Train and save model if not
  else:
    print("  train model")
    model_emb    = get_w2v_emb_model(embeddings)

    # setup check points
    callback_es  = callbacks.EarlyStopping(monitor='val_loss', patience=5)
    callback_mcp = callbacks.ModelCheckpoint(filepath=cp_filepath, mode='max', 
            save_weights_only=False, monitor='val_accuracy', save_best_only=True)

    # Train model
    history = model_emb.fit(x=X_train_w2v, y=y_train, batch_size=256, 
                            epochs=20, shuffle=True, verbose=1, 
                            validation_data=(X_valid_w2v, y_valid), 
                            callbacks=[callback_es, callback_mcp])

  print("  get validation f1 score")
  y_valid_pred_prob = model_emb.predict(X_valid_w2v)
  dic_y_mapping = {n:label for n,label in enumerate(np.unique(y_valid))}
  y_valid_pred = [dic_y_mapping[np.argmax(pred)] for pred in y_valid_pred_prob]
  best_score = metrics.f1_score(y_valid, y_valid_pred)
  print("    ", best_score)

  print("  get testing f1 score")
  y_test_pred_prob = model_emb.predict(X_test_w2v)
  dic_y_mapping = {n:label for n,label in enumerate(np.unique(y_test))}
  y_test_pred = [dic_y_mapping[np.argmax(pred)] for pred in y_test_pred_prob]
  test_score = metrics.f1_score(y_test, y_test_pred)
  print("    ", test_score)

  # provide some space between runs
  print('\n')

  return best_score, cp_filepath, test_score

### _Get training/testing split_

In [4]:
config_file = Path("config_bert.txt")

print("\nRead configuration file...")
config_dict = read_configs(config_file)

# Set up working directory and corpus file location
proj_dir          = Path(config_dict['proj_dir'])
work_dir          = proj_dir / config_dict['work_dir']
corpus_combo_file = work_dir / config_dict['corpus_combo_file']

# For reproducibility
rand_state = config_dict['rand_state']


Read configuration file...
   lang_model = bert
   proj_dir = /home/shius/projects/plant_sci_hist
   work_dir = 2_text_classify
   corpus_combo_file = corpus_combo
   rand_state = 20220609
   bert_param = {}
  all config available


In [5]:
# Split train/validate/test for cleaned text
#   Will not focus on original due to issues with non-alphanumeric characters
#   and stop words.
print("\nRead file and split train/validate/test...")
[train, valid, test] = split_train_validate_test(corpus_combo_file, rand_state)


Read file and split train/validate/test...
    train=(51987, 3), valid=(17329, 3), test=(17330, 3)


In [6]:
# Convert dataframes to Datasets
dataset_train = Dataset.from_pandas(train)
dataset_valid = Dataset.from_pandas(valid)
dataset_test  = Dataset.from_pandas(test)

In [7]:
dataset_train

Dataset({
    features: ['label', 'txt', 'txt_clean', '__index_level_0__'],
    num_rows: 51987
})

## __Tokenization__

### _Parameters_

In [8]:
# Parameters
vocab_size = 52_000

# maximum sequence length, lowering will result to faster training (when 
# increasing batch size)
max_length = 512

min_frequency=2

# whether to truncate
truncate_longer_samples = True

### _Write training texts to files_

In [9]:
# Write training texts to a folder where each file has 5000 entries.
corpus_train_path = work_dir / "corpus_train"
corpus_train_path.mkdir(parents=True, exist_ok=True)

# Note that I use the original text for training tokenizer
txts  = train['txt'].values

# list of training corpus files
files = []             
for idx in range(0,len(txts),5000):
  subset = txts[idx:idx+5000]
  subset_file = corpus_train_path / f"txt_{idx}"

  # force posix path to be string, otherwize the training step below will fail
  files.append(str(subset_file))
  with open(subset_file, "w") as f:
    subset_txts = '\n'.join(subset)
    f.write(subset_txts)

### _Train tokeinzier_

In [10]:
# Intialize and train tokenizer
special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "<S>", "<T>"]

tokenizer = BertWordPieceTokenizer()

tokenizer.train(files=files, vocab_size=vocab_size, min_frequency=min_frequency, 
                special_tokens=special_tokens)







In [11]:
# enable truncation up to the maximum 512 tokens
tokenizer.enable_truncation(max_length=max_length)

### _Save tokenizer_

In [12]:
model_path = work_dir / "model_bert"
model_path.mkdir(parents=True, exist_ok=True)

# save the tokenizer  
tokenizer.save_model(str(model_path))

['/home/shius/projects/plant_sci_hist/2_text_classify/model_bert/vocab.txt']

In [13]:

# dumping some of the tokenizer config to config file, 
# including special tokens, whether to lower case and the maximum sequence length
with open(model_path / "config.json", "w") as f:
    tokenizer_cfg = {
        "do_lower_case": True,
        "unk_token": "[UNK]",
        "sep_token": "[SEP]",
        "pad_token": "[PAD]",
        "cls_token": "[CLS]",
        "mask_token": "[MASK]",
        "model_max_length": max_length,
        "max_len": max_length,}
    json.dump(tokenizer_cfg, f)

In [14]:
# This step is critical: the trained tokenizer object cannot be called directly.
tokenizer_loaded = transformers.BertTokenizerFast.from_pretrained(model_path)

## __Text classification with transfer learning__

### _Set up train, valid, and test data_

In [33]:
# Here use pre-processed text data for classification
X_train = dataset_train['txt_clean']
y_train = dataset_train['label']
X_valid = dataset_valid['txt_clean']
y_valid = dataset_valid['label']
X_test  = dataset_test['txt_clean']
y_test  = dataset_test['label']

### _Encode corpus_

In [36]:
# Define function to encode text data in batches
def batch_encode(tokenizer, texts, batch_size=256):
  """""""""
  A function that encodes a batch of texts and returns the texts'
  corresponding encodings and attention masks that are ready to be fed 
  into a pre-trained transformer model.
  
  Input:
  - tokenizer:   Tokenizer object from the PreTrainedTokenizer Class
  - texts:       List of strings where each string represents a text
  - batch_size:  Integer controlling number of texts in a batch
  - max_length:  Integer controlling max number of words to tokenize in a
    given text
  Output:
  - input_ids:       sequence of texts encoded as a tf.Tensor object
  - attention_mask: the texts' attention mask encoded as a tf.Tensor obj
  """""""""
  # Define the maximum number of words to tokenize (up to 512)
  max_length = 50
  input_ids = []
  attention_mask = []
  
  for i in range(0, len(texts), batch_size):
    batch = texts[i:i+batch_size]
    inputs = tokenizer.batch_encode_plus(batch,
                                          max_length=max_length,
                                          padding='max_length',
                                          truncation=True,
                                          return_attention_mask=True,
                                          return_token_type_ids=False
                                          )
    input_ids.extend(inputs['input_ids'])
    attention_mask.extend(inputs['attention_mask'])
  
  return tf.convert_to_tensor(input_ids), tf.convert_to_tensor(attention_mask)

In [37]:
# Encode corpus
X_train_ids, X_train_attn = batch_encode(tokenizer_loaded, X_train)
X_valid_ids, X_valid_attn = batch_encode(tokenizer_loaded, X_valid)
X_test_ids , X_test_attn  = batch_encode(tokenizer_loaded, X_test)

X_train_bert = [np.asarray(X_train_ids, dtype='int32'),
                np.asarray(X_train_attn, dtype='int32')]
X_valid_bert = [np.asarray(X_valid_ids, dtype='int32'),
                np.asarray(X_valid_attn, dtype='int32')]
X_test_bert  = [np.asarray(X_test_ids, dtype='int32'),
                np.asarray(X_test_attn, dtype='int32')]

2022-06-17 08:38:03.700413: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-06-17 08:38:03.700834: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-06-17 08:38:03.702798: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


### _Classification model training_


In [None]:
## inputs
idx = layers.Input((50), dtype="int32", name="input_idx")
masks = layers.Input((50), dtype="int32", name="input_masks")

## pre-trained bert with config
config = transformers.DistilBertConfig(dropout=0.2, 
           attention_dropout=0.2)
config.output_hidden_states = False
nlp = transformers.TFDistilBertModel.from_pretrained(
                                    'distilbert-base-uncased', config=config)
bert_out = nlp(idx, attention_mask=masks)[0]

## fine-tuning
x = layers.GlobalAveragePooling1D()(bert_out)
x = layers.Dense(64, activation="relu")(x)
y_out = layers.Dense(len(np.unique(y_train)), 
                     activation='softmax')(x)
                     
## compile
model = models.Model([idx, masks], y_out)
for layer in model.layers[:3]:
    layer.trainable = False
model.compile(loss='sparse_categorical_crossentropy', 
              optimizer='adam', metrics=['accuracy'])
              
model.summary()

In [None]:
## encode y
dic_y_mapping = {n:label for n,label in 
                 enumerate(np.unique(y_train_bert))}
inverse_dic = {v:k for k,v in dic_y_mapping.items()}
y_train_label_bert = np.array([inverse_dic[y] for y in y_train_bert])

## train
history = model.fit(x=X_train_bert, y=y_train_label_bert, batch_size=64, 
                     epochs=10, shuffle=True, verbose=1, 
                     validation_split=0.3)

### _Batch encode texts_

In [32]:
X_train = train['txt_clean']
y_train = train['label']
X_valid = valid['txt_clean']
y_valid = valid['label']
X_test  = train['txt_clean']
y_test = train['label']


AttributeError: 'Dataset' object has no attribute 'head'

## __Text classification with retrained model__

### _Tokenize dataset_

In [None]:
def encode(examples):
    """Mapping function to tokenize the sentences passed with truncation"""
    return tokenizer_loaded(examples["txt"], truncation=True, padding="max_length", 
                            max_length=max_length, return_special_tokens_mask=True)

In [None]:
# tokenizing the subsets
train_tokenized = dataset_train.map(encode, batched=True)
valid_tokenized = dataset_valid.map(encode, batched=True)
test_tokenized  = dataset_test.map(encode, batched=True)

100%|██████████| 52/52 [00:17<00:00,  2.94ba/s]
100%|██████████| 18/18 [00:05<00:00,  3.16ba/s]
100%|██████████| 18/18 [00:05<00:00,  3.28ba/s]


### _Reformat columns_

In [None]:
# Reformat two columns into type `torch`:
#  Setting to torch leda to trucation of longer samples
train_tokenized.set_format(type="torch", columns=["input_ids", "attention_mask"])
valid_tokenized.set_format(type="torch", columns=["input_ids", "attention_mask"])
test_tokenized.set_format(type="torch", columns=["input_ids", "attention_mask"])

### _Load existing model_

In [18]:
# initialize the model with the config
model_config = transformers.BertConfig(vocab_size=vocab_size, 
                                       max_position_embeddings=max_length)

#  Bert Model with a `language modeling` head on top
model = transformers.BertForMaskedLM(config=model_config)

### _Randomly mask tokens_

In [19]:
# initialize the data collator, randomly masking 20% (default is 15%) of the 
# tokens for the Masked Language Modeling (MLM) task
data_collator = transformers.DataCollatorForLanguageModeling(
                                                tokenizer=tokenizer_loaded, 
                                                mlm=True, 
                                                mlm_probability=0.2)

### _Initialize training arguments_

In [28]:
# Changed:
# per_device_train_batch_size from 8 to lower, like 2
training_args = transformers.TrainingArguments(
    output_dir=model_path,          # where to save model checkpoint
    evaluation_strategy="steps",    # evaluate each `logging_steps` steps
    overwrite_output_dir=True,      
    num_train_epochs=10,            # number of training epochs
    per_device_train_batch_size=8, # the training batch size
    gradient_accumulation_steps=8,  # accumulae gradients before weight update
    per_device_eval_batch_size=64,  # evaluation batch size
    logging_steps=500,              # steps before valuate/log/sav checkpoints
    save_steps=500,
    load_best_model_at_end=True,    # load the best model (loss) at the end
    # save_total_limit=3,           # save 3 model weights saved in the disk
)

using `logging_steps` to initialize `eval_steps` to 500
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


### _Training_

In [29]:
device = 'cuda'
model = model.to(device)

In [30]:
# initialize the trainer and pass everything to it
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_tokenized,
    eval_dataset=valid_tokenized,
)

In [31]:
# train the model
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: txt_clean, special_tokens_mask, txt, __index_level_0__. If txt_clean, special_tokens_mask, txt, __index_level_0__ are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 51987
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 8
  Total optimization steps = 8120


Step,Training Loss,Validation Loss


The following columns in the evaluation set  don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: txt_clean, special_tokens_mask, txt, __index_level_0__. If txt_clean, special_tokens_mask, txt, __index_level_0__ are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 17329
  Batch size = 64


RuntimeError: CUDA out of memory. Tried to allocate 6.35 GiB (GPU 0; 24.00 GiB total capacity; 10.27 GiB already allocated; 4.04 GiB free; 17.02 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

# __FROM PREVIOUS__

#### _Get prediction f1_

In [None]:
y_train_pred_prob = model_emb.predict(X_train_w2v)

In [None]:
y_train_pred = [dic_y_mapping[np.argmax(pred)] for pred in y_train_pred_prob]

In [None]:
metrics.f1_score(y_train_label, y_train_pred)

#### _Evaluate model_

In [None]:
## Create list of n-grams for test set
X_test = test_cln['txt_clean']
y_test = test_cln['label']

lst_corpus_test = []
for text in X_test:
    lst_words = text.split()
    lst_corpus_test.append(lst_words)

## Detect common bigram and trigram with fitted detectors
lst_corpus_test_bi = list(bigrams_detector[lst_corpus_test])
lst_corpus_test_tr = list(trigrams_detector[lst_corpus_test_bi])

len(lst_corpus_test_bi), len(lst_corpus_test_tr)

In [None]:
## text to sequence with the fitted tokenizer
lst_text2seq_test = tokenizer.texts_to_sequences(lst_corpus_test_tr)

## padding sequence
X_test_w2v = keras.preprocessing.sequence.pad_sequences(lst_text2seq_test, 
                                maxlen=500, padding="post", truncating="post")
X_test_w2v.shape

In [None]:
y_pred_prob_w2v = model_emb.predict(X_test_w2v)

In [None]:
y_pred_prob_w2v[1:4,] # Why are there 3 columns??

In [None]:
# q: Why use dic_y_mapping instead of inverse_dic???
y_pred_w2v      = [dic_y_mapping[np.argmax(pred)] for pred in y_pred_prob_w2v]

In [None]:
test_score = metrics.f1_score(y_test, y_pred_w2v)
print(test_score)