The code below shows how to train a model for that purpose with the help of the Hugging Face `transformers` library.

In [1]:
from kaggle_datasets import KaggleDatasets
GCS_PATH = KaggleDatasets().get_gcs_path('coleridgetfrecs')

# Install packages

In [2]:
%%capture
import networkx as nx
MAX_SAMPLE = None # set a small number (e.g. 50) for experimentation, set None for production.
!pip install datasets --no-index --find-links=../input/coleridge-packages/packages/datasets
!pip install ../input/coleridge-packages/seqeval-1.2.2-py3-none-any.whl
!pip install ../input/coleridge-packages/tokenizers-0.10.1-cp37-cp37m-manylinux1_x86_64.whl
!pip install ../input/coleridge-packages/transformers-4.5.0.dev0-py3-none-any.whl

# Import

In [3]:
import os
import re
import math
import json
import time
import datetime
import copy
import random
import glob
import importlib
#import nlpaug as A

import numpy as np
import pandas as pd

from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold

import tensorflow as tf
import tensorflow.keras as keras
!pip install tensorflow_addons
import tensorflow_addons as tfa
#from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, \
TFAutoModel, AutoConfig

import transformers
import nltk
import pickle
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
  
sns.set()
random.seed(123)
np.random.seed(456)

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [4]:
# Pick the distribution strategy: a TPUStrategy when a TPU can be reached,
# otherwise TensorFlow's default (single-device) strategy.
try:
    # TPU detection. No parameters are necessary if the TPU_NAME environment
    # variable is set. On Kaggle this is always the case.
    TPU = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(TPU)
    tf.tpu.experimental.initialize_tpu_system(TPU)
    strategy = tf.distribute.experimental.TPUStrategy(TPU)
    #tf.keras.mixed_precision.set_global_policy('mixed_bfloat16' if TPU else 'float32')
    tf.config.optimizer.set_jit(True)
except Exception as tpu_error:
    # `Exception` (not a bare `except`) so that KeyboardInterrupt/SystemExit
    # still propagate; the reason for the fallback is reported instead of
    # being silently discarded.
    print(f"TPU unavailable ({tpu_error!r}); using the default strategy.")
    TPU = None
    strategy = tf.distribute.get_strategy()
N_REPLICAS = strategy.num_replicas_in_sync
bs = 2  # per-replica batch size
BATCH_SIZE = bs * N_REPLICAS  # global batch size across all replicas
TARGET_DTYPE = tf.float32#tf.bfloat16 if TPU else tf.float32

In [5]:
AUTO = tf.data.experimental.AUTOTUNE


def seed_it_all(seed=7):
    """Seed Python hashing, `random`, NumPy and TensorFlow with `seed`.

    This is a best-effort attempt at reproducibility; it only sets the
    seeds and does not change any determinism options.
    """
    os.environ.update(PYTHONHASHSEED=str(seed))
    for set_seed in (random.seed, np.random.seed, tf.random.set_seed):
        set_seed(seed)


seed_it_all()

In [6]:
# Run-level switches and constants.
LOAD_FROM_PREV = True  # not read anywhere in the visible part of the notebook
SAVE_PATH = './'  # string prefix for every saved artifact; paths are built as f'{SAVE_PATH}name', so keep the trailing slash
MAX_LENGTH = 512 # fixed token length of each encoder input; load_file parses input_ids/attention_mask/token_type_ids with this shape


# Placeholders; nothing in the visible part of the notebook assigns or reads them.
train_corpus = None
val_corpus = None

DATASET_SYMBOL = '$' # this symbol represents a dataset name
NONDATA_SYMBOL = '#' # this symbol represents a non-dataset name

# Load data

# Model Config:

In [7]:
class ModelConfig:
  """Static model configuration, evaluated once when the class body runs.

  Running the class body has side effects: it fetches the tokenizer and the
  config for `model_checkpoint` and writes copies of them under SAVE_PATH.
  """
  model_checkpoint = 'roberta-base'

  # Decoder hyper-parameters. These are the roberta-base values.
  # Notes kept from the original author for roberta-large:
  #   decoder dim 1024, intermediate (FFN) dim 4096, 16 attention heads,
  #   12 decoder layers.
  encoder_dim = 768 #if model_checkpoint == 'roberta-large' else 768
  decoder_dim = 768 #if model_checkpoint == 'roberta-large' else 768
  num_att_heads = 12 #if model_checkpoint == 'roberta-large' else 12
  decoder_layers = 6 #if model_checkpoint == 'roberta-large' else 6
  intermediate_dim = 3072 #if model_checkpoint == 'roberta-large' else 3072
  dropout_rate = 0.1

  # Load the tokenizer and config exactly once (the original loaded each of
  # them twice and threw the first tokenizer away), then save them right away.
  config = AutoConfig.from_pretrained(model_checkpoint)
  tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
  tokenizer.save_pretrained(f'{SAVE_PATH}model_tokenizer')
  config.save_pretrained(f'{SAVE_PATH}model_tokenizer')
  config.save_pretrained(f'{SAVE_PATH}')

  # CONFIG:
  model_head = 'linear'
  label_smoothing = 0.1

  # SPECIAL TOKS
  # The indexing below relies on encode("<pad>") returning three ids in the
  # order [start, pad, end], i.e. the tokenizer wrapping the single pad token
  # in its start/end special tokens.
  values = tokenizer.encode("<pad>")
  PAD_TOKEN = tf.constant(values[1], tf.int64)
  PAD_TOKEN_INT = PAD_TOKEN.numpy().item()
  START_TOKEN = tf.constant(values[0], tf.int64)
  START_TOKEN_INT = START_TOKEN.numpy().item()
  END_TOKEN = tf.constant(values[2], tf.int64)
  END_TOKEN_INT = END_TOKEN.numpy().item()
  # Id of the " |" token; the first ValF1Score uses it as the split marker in labels.
  SPLIT_TOKEN = tf.constant(tokenizer.encode(" |")[1], tf.int64)
  SPLIT_TOKEN_INT = SPLIT_TOKEN.numpy().item()
  del values  # scratch list, not meant to become a class attribute
  

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=481.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355863.0, style=ProgressStyle(descript…




# Larger Sentences:
- No duplicates of Sentences, just all tokens at once.
- We have multi class labels, why not use them.

# Load the preprocessed datasets from TFRecord files
- The dataset is pre-padded to the maximum length.


In [8]:
MAX_LENGTH_LABEL = 60  # fixed token length of every label sequence (see load_file)
# Maximum number of dataset names split out of one label by the first
# ValF1Score. Original author's note: smaller is faster for validation
# (O(NM)), the real amount is 6, and graph compilation cost is quadratic in it.
MAX_NUM_WORDS = 12
MAX_LENGTH_WORD = 32  # Max Dataset Name Length(Real Amount = 22)
AUTO = tf.data.experimental.AUTOTUNE


# Count the records of the shards that load_tf_dataset actually reads.
# load_tf_dataset takes the first *sorted* train shard, so sort here as well;
# otherwise the count may come from a different file when several shards exist.
val = tf.io.gfile.glob(f"{GCS_PATH}/train_tfrecords/*")
fn = sorted(val)[0]
TRAIN_NUMBER = sum(1 for _ in tf.data.TFRecordDataset(fn))

# load_tf_dataset takes the first (unsorted) validation shard; mirror that.
val = tf.io.gfile.glob(f"{GCS_PATH}/test_tfrecords/*")
fn = val[0]
VAL_NUMBER = sum(1 for _ in tf.data.TFRecordDataset(fn))

In [9]:
def load_file(example):
  """Parse one serialized example into model inputs and its label.

  Every feature is a fixed-length int64 vector: MAX_LENGTH entries for the
  three encoder inputs and MAX_LENGTH_LABEL entries for the label. Missing
  features fall back to zeros (masks / type ids) or the pad token id.

  Returns:
    ((input_ids, attention_mask, token_type_ids), label)
  """
  pad_id = ModelConfig.tokenizer.pad_token_id

  def int64_vector(length, fill):
    # Fixed-length int64 feature whose default is `fill` repeated `length` times.
    return tf.io.FixedLenFeature(shape = [length], dtype = tf.int64, default_value = [fill] * length)

  schema = {
        'attention_mask': int64_vector(MAX_LENGTH, 0),
        'input_ids': int64_vector(MAX_LENGTH, pad_id),
        'label': int64_vector(MAX_LENGTH_LABEL, pad_id),
        'token_type_ids': int64_vector(MAX_LENGTH, 0),
  }
  parsed = tf.io.parse_single_example(example, features=schema)

  model_inputs = (parsed['input_ids'], parsed['attention_mask'], parsed['token_type_ids'])
  return model_inputs, parsed['label']

In [10]:
def load_tf_dataset(fold_idx):
  """Build the training and validation tf.data pipelines.

  Args:
    fold_idx: currently unused; kept so existing callers keep working.

  Returns:
    (train_dataset, val_dataset). The training dataset is shuffled, batched
    and repeated indefinitely; the validation dataset is batched once. Both
    drop the last incomplete batch and yield
    ((input_ids, attention_mask, token_type_ids), label).
  """
  # COLLECT THE TFRECORDS
  ALL_TFRECS = sorted(tf.io.gfile.glob(f"{GCS_PATH}/train_tfrecords/*"), reverse = False)

  VAL_TFRECS_PATH = tf.io.gfile.glob(f"{GCS_PATH}/test_tfrecords/*")[0]
  TRAIN_TFRECS_PATH = ALL_TFRECS[0]
  # Create TF Datasets
  train_dataset = tf.data.TFRecordDataset(TRAIN_TFRECS_PATH, num_parallel_reads = AUTO)
  val_dataset = tf.data.TFRecordDataset(VAL_TFRECS_PATH, num_parallel_reads = AUTO)
  # Allow non-deterministic element order for throughput
  options = tf.data.Options()
  options.experimental_deterministic = False

  train_dataset = train_dataset.with_options(options)
  val_dataset = val_dataset.with_options(options)

  # Parse the serialized examples
  train_dataset = train_dataset.map(load_file, num_parallel_calls = AUTO, deterministic = False)
  val_dataset = val_dataset.map(load_file, num_parallel_calls = AUTO, deterministic = False)

  # Shuffle individual examples BEFORE batching. Shuffling after batching
  # only reorders fixed batches (the same examples always travel together)
  # and keeps 4096 whole batches in the shuffle buffer instead of 4096 examples.
  train_dataset = train_dataset.shuffle(4096)
  train_dataset = train_dataset.batch(BATCH_SIZE, drop_remainder = True)
  train_dataset = train_dataset.repeat()

  val_dataset = val_dataset.batch(BATCH_SIZE, drop_remainder = True)

  # Prefetch Dataset
  train_dataset = train_dataset.prefetch(AUTO)
  val_dataset = val_dataset.prefetch(AUTO)

  return train_dataset, val_dataset

### Load pre-trained model and fine-tune

In [11]:
def get_model(model_name):
  """Load a pretrained backbone and freeze its embeddings and first half of the encoder.

  Original author's notes: a RoBERTa model has three parts, namely the
  embeddings, the main encoder layers and a pooler layer (which can be
  skipped). The lower layers are frozen because they are pretrained and
  already give decent embeddings (transfer learning).
  """
  backbone = TFAutoModel.from_pretrained(model_name)
  roberta = backbone.roberta
  roberta.embeddings.trainable = False

  encoder_layers = roberta.encoder.layer
  frozen_count = len(encoder_layers) // 2
  for frozen_layer in encoder_layers[:frozen_count]:
    frozen_layer.trainable = False
  return backbone

# Custom Layer Model

In [12]:
class Encoder(keras.Model):
  """Pretrained backbone wrapper that returns the final hidden states."""
  def __init__(self):
    super().__init__()
    self.model_checkpoint = ModelConfig.model_checkpoint
    # get_model already freezes the embeddings and the lower encoder layers;
    # the pooler is frozen here as well.
    self.frozen_backbone = get_model(self.model_checkpoint)
    self.frozen_backbone.roberta.pooler.trainable = False
    #self.frozen_backbone.config.use_bfloat16 = True

  def call(self, input_ids, attention_mask, token_type_ids, training):
    """Run the backbone and return its `last_hidden_state` output."""
    backbone_inputs = dict(
      input_ids = input_ids,
      attention_mask = attention_mask,
      token_type_ids = token_type_ids,
    )
    outputs = self.frozen_backbone(**backbone_inputs, training = training)
    return outputs['last_hidden_state']

# Custom Decoder Layers

In [13]:
class EncoderMultiHeadAttention(keras.layers.Layer):
  """Self-attention sub-layer: attention -> residual add -> LayerNorm -> Dropout.

  The dropout rate is read from ModelConfig.dropout_rate rather than passed in.
  """
  def __init__(self, encoder_dim, num_att_heads):
    super().__init__()
    self.encoder_dim, self.num_att_heads = encoder_dim, num_att_heads
    self.drop_prob = ModelConfig.dropout_rate

    # Each head gets an equal slice of the model width.
    per_head_dim = self.encoder_dim // self.num_att_heads
    self.MultiHeadAttention = keras.layers.MultiHeadAttention(
      num_heads = self.num_att_heads,
      key_dim = per_head_dim,
      value_dim = per_head_dim,
      dropout = self.drop_prob,
    )
    self.LayerNorm = keras.layers.LayerNormalization(epsilon = 1e-6)
    self.Dropout = keras.layers.Dropout(self.drop_prob)

  def call(self, x, attention_mask, training):
    """Attend `x` to itself under `attention_mask` and return the normalized result."""
    attended = self.MultiHeadAttention(query = x, key = x, value = x, attention_mask = attention_mask, training = training)
    residual = attended + x
    normalized = self.LayerNorm(residual, training = training)
    return self.Dropout(normalized, training = training)

class DecoderMultiHeadAttention(keras.layers.Layer):
  """Cross-attention sub-layer: decoder queries attend over encoder states.

  The attention output is added to the decoder input, layer-normalized and
  passed through dropout. Encoder and decoder widths must be equal.
  """
  def __init__(self, encoder_dim, decoder_dim, num_att_heads, drop_prob):
    super().__init__()
    self.encoder_dim, self.decoder_dim = encoder_dim, decoder_dim
    assert self.encoder_dim == self.decoder_dim
    self.num_att_heads = num_att_heads
    self.drop_prob = drop_prob

    per_head_dim = self.encoder_dim // self.num_att_heads
    self.dec_enc_attention = keras.layers.MultiHeadAttention(
      num_heads = self.num_att_heads,
      key_dim = per_head_dim,
      value_dim = per_head_dim,
      dropout = self.drop_prob,
    )
    self.LayerNorm = keras.layers.LayerNormalization(epsilon = 1e-6)
    self.Dropout = keras.layers.Dropout(self.drop_prob)

  def call(self, encoder, decoder, padding_mask, training):
    """Return the decoder states after attending to `encoder`.

    encoder / decoder: Tensor(B, L, C) each (sequence lengths may differ).
    padding_mask is forwarded unchanged as the attention mask.
    """
    attended = self.dec_enc_attention(query = decoder, key = encoder, value = encoder, attention_mask = padding_mask, training = training)
    residual = attended + decoder
    normalized = self.LayerNorm(residual, training = training)
    return self.Dropout(normalized, training = training)

class FFN(keras.layers.Layer):
  """Position-wise feed-forward sub-layer with residual add, LayerNorm and dropout."""
  def __init__(self, decoder_dim, feedforward_dim, dropout_rate):
    super().__init__()
    self.decoder_dim, self.feedforward_dim = decoder_dim, feedforward_dim
    self.drop_prob = dropout_rate

    # Expand to the feed-forward width with ReLU, then project back.
    expand = keras.layers.Dense(self.feedforward_dim, activation = 'relu')
    project = keras.layers.Dense(self.decoder_dim)
    self.FFN = keras.Sequential([expand, project])
    self.LayerNorm = keras.layers.LayerNormalization(epsilon = 1e-6)
    self.Dropout = keras.layers.Dropout(self.drop_prob)

  def call(self, x, training):
    """Apply the two dense layers to `x`, add `x` back, normalize and drop out."""
    transformed = self.FFN(x, training = training)
    normalized = self.LayerNorm(transformed + x, training = training)
    return self.Dropout(normalized, training = training)


In [14]:
class TransformerDecoder(keras.layers.Layer):
  """One decoder layer: self-attention, encoder-decoder attention, feed-forward.

  Note that the self-attention sub-layer takes its dropout rate from
  ModelConfig, while the other two sub-layers use `dropout_rate`.
  """
  def __init__(self, encoder_dim, decoder_dim, feedforward_dim, num_att_heads, dropout_rate):
    super().__init__()
    self.encoder_dim, self.decoder_dim = encoder_dim, decoder_dim
    self.feedforward_dim = feedforward_dim
    self.num_att_heads = num_att_heads
    self.dropout_rate = dropout_rate

    self.DecoderAttention = EncoderMultiHeadAttention(self.decoder_dim, self.num_att_heads)
    self.EncoderDecoderAttention = DecoderMultiHeadAttention(
      self.encoder_dim, self.decoder_dim, self.num_att_heads, self.dropout_rate
    )
    self.FFN = FFN(self.decoder_dim, self.feedforward_dim, self.dropout_rate)

  def call(self, encoder, decoder, attention_mask, padding_mask, training):
    """Return the decoder states after the three sub-layers.

    attention_mask is used by the decoder self-attention, padding_mask by
    the encoder-decoder attention.
    """
    self_attended = self.DecoderAttention(decoder, attention_mask = attention_mask, training = training)
    cross_attended = self.EncoderDecoderAttention(encoder, self_attended, padding_mask = padding_mask, training = training)
    return self.FFN(cross_attended, training = training)

class TransformerDecoderModel(keras.Model):
  """Stack of TransformerDecoder layers on top of the backbone's token embeddings.

  Decoder token ids are embedded with the embedding layer taken from a
  freshly loaded backbone, summed with precomputed sinusoidal position
  encodings and passed through ModelConfig.decoder_layers decoder layers.
  The model returns decoder hidden states; projecting them onto the
  vocabulary is left to the caller.

  `call` is the fixed-length path and relies on masks and position encodings
  precomputed for batch size `bs` and length MAX_LENGTH_LABEL - 1.
  `call_val` rebuilds the causal mask for the current decoder length and is
  used for step-by-step decoding.
  """
  def __init__(self):
    super().__init__()
    # ----------------------PROCESS---------------------
    # 1) GET EMBEDDINGS
    # 2) ADD POSITIONAL EMBEDDINGS
    # 3) RUN THROUGH THE DECODERS
    # 4) FINAL FFN
    # ---------------------PRETRAINING PARTS------------------------------
    # EMBEDDINGS
    self.model_checkpoint = ModelConfig.model_checkpoint
    # Load a full backbone only to take its pretrained embedding layer
    # (get_model marks that layer as non-trainable).
    tmp_model = get_model(self.model_checkpoint)
    self.embeddings = tmp_model.roberta.embeddings # call(input_ids, token_type_ids)
    self.vocab_size = self.embeddings.vocab_size
    del tmp_model
    # ------------------MODEL DEFINITIONS------------------------
    self.max_len = MAX_LENGTH_LABEL - 1
    self.decoder_dim = ModelConfig.decoder_dim
    self.encoder_dim = ModelConfig.encoder_dim
    self.num_att_heads = ModelConfig.num_att_heads
    self.decoder_layers = ModelConfig.decoder_layers
    self.dropout_rate = ModelConfig.dropout_rate
    self.feedforward_dim = ModelConfig.intermediate_dim

    self.decoders = [TransformerDecoder(
        self.encoder_dim,
        self.decoder_dim,
        self.feedforward_dim,
        self.num_att_heads,
        self.dropout_rate
    ) for _ in range(self.decoder_layers)]


    # PRECOMPUTE CAUSAL Attention MASKS for the per-replica batch size
    self.batch_size = bs
    self.attention_mask = self.causal_attention_mask(self.batch_size, self.max_len, self.max_len, tf.uint8)
    # PRECOMPUTE Positional Embeddings: (L, C), then tiled to (B, L, C)
    self.pos_enc = self.positional_embeddings(self.max_len, self.decoder_dim)
    self.pos_enc = tf.repeat(tf.expand_dims(self.pos_enc, axis = 0), self.batch_size, axis = 0)

  def positional_embeddings(self, max_length, dim):
    """Return a float32 (max_length, dim) tensor of sinusoidal position encodings.

    Even channels hold sin(pos / 10000 ** (i / dim)) and odd channels hold
    cos(pos / 10000 ** ((i + 1) / dim)). The exponent now uses the `dim`
    argument; it previously read self.decoder_dim, which silently ignored
    the parameter (the only visible caller passes self.decoder_dim, so the
    values are unchanged).
    """
    L, C = (max_length, dim)
    positional_encodings = np.zeros((L, C), np.float32)
    for pos in range(L):
      for i in range(0, C, 2):
        positional_encodings[pos, i] = math.sin(pos / 10000 ** (i / dim))
        if i + 1 < C:  # odd `dim`: the last even channel has no cos partner
          positional_encodings[pos, i + 1] = math.cos(pos / 10000 ** ((i + 1) / dim))
    return tf.identity(positional_encodings)


  def causal_attention_mask(self, batch_size, n_dest, n_src, dtype):
    """Masks the upper half of the dot product matrix in self attention.

    This prevents flow of information from future tokens to current token.
    1's in the lower triangle, counting from the lower right corner.

    Returns a (batch_size, n_dest, n_src) tensor of `dtype`.
    """
    i = tf.range(n_dest)[:, None]
    j = tf.range(n_src)
    m = i >= j - n_src + n_dest
    mask = tf.cast(m, dtype)
    mask = tf.reshape(mask, [1, n_dest, n_src])
    mult = tf.concat(
        [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], 0
    )
    return tf.tile(mask, mult) # Diagonal Mask

  def compute_padding_mask(self, decoder_ids, dtype):
    """Return a mask of `dtype` that is 1 where `decoder_ids` is not the pad token."""
    PAD_TOKEN_ID = ModelConfig.tokenizer.pad_token_id
    # Create a Mask over token ids.
    mask = tf.not_equal(decoder_ids, PAD_TOKEN_ID)
    return tf.cast(mask, dtype)

  def call_val(self, encoder, decoder, training):
    """Decode a prefix of arbitrary length (used for step-by-step inference).

    encoder: encoder hidden states, Tensor(B, L, C)
    decoder: decoder input ids, Tensor(B, Dec_Len)
    Returns decoder hidden states, Tensor(B, Dec_Len, C).
    """
    B, L, C = encoder.shape
    _, Dec_Len = decoder.shape

    padding_mask = self.compute_padding_mask(decoder, tf.uint8)
    padding_mask = tf.expand_dims(padding_mask, axis = -1) # (B, Dec_Len, 1)

    attention_mask = self.causal_attention_mask(B, Dec_Len, Dec_Len, tf.uint8) # (B, Dec_Len, Dec_Len)
    # Zero the rows that belong to padded decoder (query) positions.
    attention_mask = attention_mask * padding_mask # (B, Dec_Len, Dec_Len)

    # Same per-query mask, tiled across the encoder length for cross-attention.
    padding_mask = tf.repeat(padding_mask, L, axis = -1) # (B, Dec_Len, L)

    decoder_embeddings = self.embeddings(decoder, training = training) # (B, Dec_Len, C)
    # Slice the precomputed position encodings down to the current length.
    pos_enc = tf.identity(self.pos_enc)[:, :Dec_Len, :] # (B, Dec_Len, C)
    decoder_embeddings = decoder_embeddings + pos_enc

    for DECODER in self.decoders:
      decoder_embeddings = DECODER(encoder, decoder_embeddings, attention_mask, padding_mask, training = training)
    return decoder_embeddings

  def call(self, encoder, decoder, training):
    '''
    Fixed-length decoding path.

    Encoder: Encoder Embeddings: Tensor(B, L, C)
    Decoder: Decoder Input Ids: Tensor(B, L')
    training: in training mode?
    The causal mask is precomputed, but the padding mask depends on the
    decoder ids and therefore has to be built on every call.
    '''
    # NO NEED For TOKEN TYPE IDs, as they are always 0(Always 1 Sentence)
    B, L, C = encoder.shape
    _, Dec_Len = decoder.shape

    # GENERATE MASKS
    padding_mask = self.compute_padding_mask(decoder, tf.uint8) # (B, L')
    padding_mask = tf.expand_dims(padding_mask, axis = -1) # (B, L', 1)
    attention_mask = tf.identity(self.attention_mask) # (bs, max_len, max_len) - Used only in Decoder Attention
    attention_mask = attention_mask * padding_mask # (B, L', L')

    padding_mask = tf.repeat(padding_mask, L, axis = -1) # (B, L', L)


    # Convert Tokens to Embeddings
    decoder_embeddings = self.embeddings(decoder, training = training) # (B, L', C)
    # ----------------GET POS ENC FOR DECODER INPUTS(Encoder already got them) -------------------
    pos_enc = tf.identity(self.pos_enc) # (bs, max_len, C)

    decoder_embeddings = decoder_embeddings + pos_enc
    # Run through the Decoders:
    for DECODER in self.decoders:
      decoder_embeddings = DECODER(encoder, decoder_embeddings, attention_mask, padding_mask, training = training)
    # The vocabulary projection (head) is applied by the caller.
    return decoder_embeddings




# Heads

In [15]:
class DenseHead(keras.Model):
  """Linear projection from decoder hidden states to vocabulary logits."""
  def __init__(self, vocab_size, pad_token):
    super().__init__()
    self.vocab_size, self.pad_token = vocab_size, pad_token
    self.batch_size = bs
    self.head = keras.layers.Dense(self.vocab_size)
    self.label_smoothing = ModelConfig.label_smoothing

  def call(self, x, training):
    """Project every position of `x` to logits (no softmax or argmax is applied)."""
    return self.head(x, training = training)

  def call_val(self, x, training):
    """Project only the last sequence position of `x` to logits."""
    last_step = x[:, -1]
    return self.head(last_step, training = training)

# Accuracy function

In [16]:
def update_accuracy(outputs, GT, metric_name):
  """Feed `(outputs, GT)`, in that order, to the global metric stored under `metric_name`."""
  tracker = metrics[metric_name]
  tracker.update_state(outputs, GT)

# FULL MODEL

In [17]:
class FullModel(keras.Model):
  """Encoder, decoder and vocabulary head combined into one sequence-to-sequence model.

  `call_train` runs the teacher-forced path on given decoder input ids;
  `call_val` decodes greedily, one token at a time, from the start token.
  """
  def __init__(self):
    super().__init__()
    self.encoder = Encoder()
    self.START_TOKEN = ModelConfig.START_TOKEN
    self.END_TOKEN = ModelConfig.END_TOKEN # Default Special tokens for HuggingFace Tokenizers
    self.MAX_LEN = MAX_LENGTH_LABEL
    self.decoder = TransformerDecoderModel()
    # Width of the logits produced by the head. It is taken from the decoder
    # (the same value the head is built with) instead of the tokenizer, so the
    # empty logits buffer in call_val can never disagree with the head output.
    self.vocab_length = self.decoder.vocab_size
    self.model_head = DenseHead(self.decoder.vocab_size, ModelConfig.tokenizer.pad_token_id)

  def call_train(self, input_ids, attention_mask, token_type_ids, decoder_input_ids, training):
    """Return logits for every decoder position.

    decoder_input_ids: Tensor(B, L') of decoder input token ids.
    Returns Tensor(B, L', vocab).
    """
    encoded_embeddings = self.encoder(input_ids, attention_mask, token_type_ids, training = training) # (B, L, C)
    decoded_values = self.decoder(encoded_embeddings, decoder_input_ids, training = training)
    preds = self.model_head.call(decoded_values, training = training)
    return preds # (B, L', vocab)

  def call_val(self, input_ids, attention_mask, token_type_ids, training):
    """Greedy decoding for MAX_LEN - 1 steps.

    Returns:
      pred_logits: Tensor(B, MAX_LEN - 1, vocab), the logits of every step.
      sentence_tokens: Tensor(B, MAX_LEN) int64, the start token followed by
        the argmax token of every step. Decoding does not stop at the end
        token; whatever follows it has to be filtered out by the caller.
    """
    encoded_embeddings = self.encoder(input_ids, attention_mask, token_type_ids, training = training)
    # Seed every sequence with the start token.
    B, _, _ = encoded_embeddings.shape
    sentence_tokens = tf.ones((B, 1), tf.int64) * tf.cast(self.START_TOKEN, tf.int64)
    # Empty buffer along the time axis that the per-step logits are appended to.
    pred_logits = tf.ones((B, 0, self.vocab_length), encoded_embeddings.dtype)
    for _ in range(self.MAX_LEN - 1):
      embeddings = self.decoder.call_val(encoded_embeddings, sentence_tokens, training = training) # (B, t, C)
      pred = self.model_head.call_val(embeddings, training = training) # (B, vocab)
      # Add the logits of this step
      TMP_LOGITS = tf.expand_dims(pred, axis = 1) # (B, 1, vocab)
      pred_logits = tf.concat([pred_logits, TMP_LOGITS], axis = 1) # (B, t, vocab)

      # Softmax is monotonic, so the argmax of the raw logits selects the
      # same token; the separate softmax pass was redundant work.
      next_token = tf.argmax(pred, axis = -1) # (B, )
      next_token = tf.expand_dims(next_token, axis = 1) # (B, 1)
      sentence_tokens = tf.concat([sentence_tokens, tf.cast(next_token, sentence_tokens.dtype)], axis = 1)
    return pred_logits, sentence_tokens

# LR SCHEDULER

In [18]:
class ParamScheduler:
    """Base class for schedules that move a value from `start` to `end` over `num_iter` steps.

    Subclasses provide `func(start, end, pct)`, which maps the fraction of
    completed steps to the scheduled value.
    """
    def __init__(self, start, end, num_iter):
        self.start, self.end, self.num_iter = start, end, num_iter
        self.idx = -1  # becomes 0 on the first call to step()

    def step(self):
        """Advance one step and return the scheduled value for it."""
        self.idx += 1
        progress = self.idx / self.num_iter
        return self.func(self.start, self.end, progress)

    def reset(self):
        """Rewind the schedule to its initial state."""
        self.idx = -1

    def is_complete(self):
        """True once the step counter has reached `num_iter`."""
        return self.num_iter <= self.idx

class CosineScheduler(ParamScheduler):
    """Cosine interpolation: `start_val` at pct = 0, `end_val` at pct = 1."""
    def func(self, start_val, end_val, pct):
        half_span = (start_val - end_val)/2
        cos_out = np.cos(np.pi * pct) + 1  # falls from 2 to 0 as pct goes 0 -> 1
        return end_val + half_span * cos_out
class ConstantScheduler(ParamScheduler):
    """Schedule that returns `init_lr` on every step for `num_steps` steps.

    The parent constructor is not called, so this class keeps its own
    `steps` counter instead of the parent's `idx`.
    """
    def __init__(self, init_lr, num_steps):
        self.init_lr, self.num_steps = init_lr, num_steps
        self.steps = -1

    def step(self):
        """Advance one step and return the constant value."""
        self.steps = self.steps + 1
        return self.init_lr

    def reset(self):
        """Rewind the step counter."""
        self.steps = -1

    def is_complete(self):
        """True once the step counter has reached `num_steps`."""
        return self.num_steps <= self.steps
class OneCycleScheduler(keras.callbacks.Callback):
    """Three-phase learning-rate schedule: cosine warm-up, constant peak, cosine decay.

    Args:
        init_lr: stored on the instance; not used by the schedule itself.
        max_lr: learning rate held during the peak phase.
        min_lr: lower bound applied to every scheduled value, and the value
            used once all phases are exhausted.
        warm_steps: fraction of the (padded) step count spent warming up.
        peak_steps: fraction of the (padded) step count spent at `max_lr`.
        total_steps: planned number of optimizer steps; padded by 20% so the
            schedule does not run out before training ends.

    Each call to `step()` writes the next learning rate into the global
    `optimizer` wrapper (`optimizer.optimizer.learning_rate`).
    """

    def __init__(self, init_lr, max_lr, min_lr, warm_steps, peak_steps, total_steps):
        # Initialise the Keras callback base class as well, so the object is
        # also valid when handed to Keras APIs that expect a Callback.
        super().__init__()
        momentums=(0.95,0.85)
        start_div=25.
        pct_start=warm_steps
        pct_climax = peak_steps # fraction of training spent at the peak
        verbose=True
        sched=CosineScheduler
        end_div=None
        self.pct_climax = pct_climax
        self.max_lr, self.momentums, self.start_div, self.pct_start, self.verbose, self.sched, self.end_div = max_lr, momentums, start_div, pct_start, verbose, sched, end_div
        if self.end_div is None:
            self.end_div = start_div * 1e4
        self.logs = {}
        self.min_lr = min_lr
        self.init_lr = init_lr

        # Warm-up starts at max_lr / start_div; decay ends at max_lr / end_div.
        self.start_lr = self.max_lr/self.start_div
        self.end_lr = self.max_lr/self.end_div
        self.num_iter = int(total_steps * 1.2) # Pad the Steps a bit to make sure no overflow.
        self.num_iter_1 = int(self.pct_start*self.num_iter)
        self.num_iter_2 = int(self.pct_climax * self.num_iter)
        self.num_iter_3 = self.num_iter - self.num_iter_1 - self.num_iter_2

        self.lr_scheds = (self.sched(self.start_lr, self.max_lr, self.num_iter_1), ConstantScheduler(self.max_lr, self.num_iter_2), self.sched(self.max_lr, self.end_lr, self.num_iter_3))
        self.sched_idx = 0

    def optimizer_params_step(self):
        """Compute the next learning rate and assign it to the optimizer."""
        try:
          next_lr = self.lr_scheds[self.sched_idx].step()
        except (IndexError, ZeroDivisionError):
          # IndexError: every phase is finished.
          # ZeroDivisionError: the current phase has zero steps.
          # Both fall back to min_lr; any other error is a real bug and is
          # no longer swallowed.
          next_lr = self.min_lr
        next_lr = tf.maximum(next_lr, self.min_lr)
        next_lr = tf.cast(next_lr, tf.float32)
        # update optimizer params
        optimizer.optimizer.learning_rate.assign(next_lr)

    def step(self):
        """Apply the next learning rate, then move to the next phase when the current one is done."""
        self.optimizer_params_step()
        phases_left = self.sched_idx < len(self.lr_scheds)
        if phases_left and self.lr_scheds[self.sched_idx].is_complete():
            self.sched_idx += 1

# Optimizer

In [19]:
class GradAccAdam():
    """Wrapper that averages gradients over several calls before handing them to AdamW.

    Every call to `apply_gradients` adds `gradients / grad_acc_steps` to an
    internal buffer; on the `grad_acc_steps`-th call the buffer is applied
    through the wrapped optimizer and cleared.
    """
    def __init__(self, model, learning_rate, grad_acc_steps, prev_optim_path = None):
        self.learning_rate = learning_rate
        self.grad_acc_steps = grad_acc_steps

        self.weight_decay = TrainingConfig.weight_decay
        self.optimizer = tfa.optimizers.AdamW(learning_rate = self.learning_rate, weight_decay = self.weight_decay)

        self.PrevModelPath = prev_optim_path
        if self.PrevModelPath:
            self._restore_optimizer_state(model)

        self.gradients = None
        self.cur_grad_acc = 0

    def _restore_optimizer_state(self, model):
        # Loads saved optimizer weights from `PrevModelPath` into the wrapped optimizer.
        # NOTE(review): allow_pickle=True can execute arbitrary code from the
        # file; only load optimizer states that this project produced itself.
        self.opt_weights = np.load(f'{self.PrevModelPath}optimizer_last.npy', allow_pickle = True)

        trainable_weights = model.trainable_weights

        # One all-zero update is applied first, so that the optimizer has
        # created its variables before set_weights is called.
        zero_grads = [tf.zeros_like(w) for w in trainable_weights]
        @tf.function
        def f():
            self.optimizer.apply_gradients(zip(zero_grads, trainable_weights))
        strategy.run(f)
        self.optimizer.set_weights(self.opt_weights)
        print("Loaded Weights")

    def apply_gradients(self, gradients, variables):
        """Accumulate `gradients`; apply them to `variables` on every `grad_acc_steps`-th call."""
        scale = tf.constant(float(self.grad_acc_steps))
        scaled = [grad / scale for grad in gradients]
        if self.gradients is None:
            self.gradients = scaled
        else:
            for slot, contribution in enumerate(scaled):
                self.gradients[slot] += contribution
        self.cur_grad_acc += 1

        if self.cur_grad_acc == self.grad_acc_steps:
            self.optimizer.apply_gradients(zip(self.gradients, variables))
            self.gradients = None
            self.cur_grad_acc = 0

# TRAINING CONFIGS

In [20]:
class TrainingConfig:
  """Hyper-parameters and step bookkeeping for the training run."""
  learning_rate = 2e-5
  max_lr = 5e-5
  min_lr = 1e-13
  # Presumably the warm-up / peak fractions handed to OneCycleScheduler,
  # which treats its warm_steps / peak_steps arguments as fractions of the
  # total step count. TODO confirm against the training cell.
  warm_steps = 0.05
  peak_steps = 0.05

  NUM_EPOCHS = 100
  STEPS = TRAIN_NUMBER // BATCH_SIZE  # full batches per pass over the counted training records
  TOTAL_STEPS = STEPS * NUM_EPOCHS
  STEPS_PER = 100  # usage is not visible in this part of the notebook
  weight_decay = 0  # read by GradAccAdam when it builds the AdamW optimizer
  PREV_MODEL_PATH = None#f'{SAVE_PATH}'

# Prepare the Model

In [21]:
def save_states(path_name):
  """Save the optimizer state and the model weights under SAVE_PATH.

  Writes two files:
    {SAVE_PATH}{path_name}_optimizer.npy  - optimizer weights (object array)
    {SAVE_PATH}{path_name}_model.h5       - model weights

  Uses the global `optimizer` (a wrapper exposing `.optimizer`) and `model`.
  """
  print("SAVING STATES")
  optimizer_path = f'{SAVE_PATH}{path_name}_optimizer.npy'
  # Optimizer weights have different shapes. Pack them one by one into a 1-D
  # object array: passing the ragged list straight to np.save makes NumPy
  # build the array itself, which raises on recent NumPy versions.
  weights = optimizer.optimizer.get_weights()
  packed = np.empty(len(weights), dtype = object)
  for slot, weight in enumerate(weights):
    packed[slot] = weight
  np.save(optimizer_path, packed)

  # Both writers create their target file, so no empty file is pre-created.
  model_path = f"{SAVE_PATH}{path_name}_model.h5"
  model.save_weights(model_path)


class ValF1Score(keras.metrics.Metric):
  """F-beta style metric over dataset names split out of label / predicted token sequences.

  Each sequence is cut into "words" at end and split tokens; every ground-truth
  word is greedily matched to the predicted word with the highest Jaccard
  score, and counted as a true positive when that score is at least 0.5.
  Counts are kept as plain Python floats (tp / fp / fn), not as TF variables.

  NOTE: a second class with the same name is defined later in this notebook
  and replaces this one once that cell has run.
  """
  def __init__(self, name = 'f1score', **kwargs):
    super().__init__(name = name, **kwargs)
    # Running counts of true positives, false positives and false negatives.
    self.tp = 0.0
    self.fp = 0.0
    self.fn = 0.0
    self.tokenizer = ModelConfig.tokenizer
    self.start_token_id = ModelConfig.START_TOKEN_INT
    self.pad_token_id = ModelConfig.PAD_TOKEN_INT
    self.end_token_id = ModelConfig.END_TOKEN_INT
    self.split_token_id = ModelConfig.SPLIT_TOKEN_INT
    self.MAX_LEN = MAX_LENGTH_LABEL
    self.MAX_NUM_WORDS = MAX_NUM_WORDS
    self.MAX_LENGTH_WORD = MAX_LENGTH_WORD
    self.beta = 0.5
  def jaccard_similarity_score(self, GT, pred):
    """Return |GT ∩ pred| / (len(GT) + len(pred) - |GT ∩ pred|) as TARGET_DTYPE.

    The intersection is a set intersection of token ids, while the lengths
    are the raw sequence lengths. Returns 1.0 when both inputs are empty.
    """
    intersection = tf.sets.intersection(tf.expand_dims(GT, axis = 0), tf.expand_dims(pred, axis = 0))
    intersection = tf.sparse.to_dense(intersection)
    intersection = tf.reshape(intersection, (-1, ))

    if len(intersection) + len(GT) + len(pred) == 0:
      return tf.constant(1.0, dtype = TARGET_DTYPE)
    else:
      jaccard_score = len(intersection) / (len(GT) + len(pred) - len(intersection))
      return tf.cast(jaccard_score, dtype = TARGET_DTYPE)
  
  def split_based_on_id(self, text):
    """Split a token sequence into padded words.

    Positions of end / split tokens are collected into `indices` (unused
    slots stay -1), then the slices of `text` between them are written into
    a (MAX_NUM_WORDS, MAX_LENGTH_WORD) tensor. Each written row is truncated
    to MAX_LENGTH_WORD and right-padded with the pad token id; rows that are
    never written keep the value -1.
    """
    indices = tf.ones((self.MAX_NUM_WORDS), dtype = tf.int64) * tf.constant(-1, dtype = tf.int64)
    cur_idx = tf.constant(0, dtype = tf.int32)
    L = min(text.shape[0], self.MAX_NUM_WORDS)

    # Find the Indices Manually
    for i in range(self.MAX_LEN - 1):
      if text[i] == self.end_token_id:
        # Write i + 1 into slot cur_idx by rebuilding the tensor around it.
        begin = indices[:cur_idx]
        end = indices[cur_idx + 1:]
        middle = tf.expand_dims(tf.constant(i + 1, indices.dtype), axis = 0)
        indices = tf.concat([begin, middle, end], axis = 0)
        indices = tf.reshape(indices, (self.MAX_NUM_WORDS, ))
        cur_idx = tf.minimum(L - 1, cur_idx + 1)

      elif text[i] == self.split_token_id:
        # Same slot update, but a split token records its own position i.
        begin = indices[:cur_idx]
        end = indices[cur_idx + 1:]
        middle = tf.expand_dims(tf.constant(i, indices.dtype), axis = 0)
        indices = tf.concat([begin, middle, end], axis = 0)
        indices = tf.reshape(indices, (self.MAX_NUM_WORDS, ))
        cur_idx = tf.minimum(L - 1, cur_idx + 1)
    all_words = tf.ones((self.MAX_NUM_WORDS, self.MAX_LENGTH_WORD), dtype = text.dtype) * tf.constant(-1, dtype = text.dtype)
    word_idx = tf.constant(0, dtype = tf.int32)
    num_indices = len(indices)

    for split_idx in range(self.MAX_NUM_WORDS - 1):
      if indices[split_idx] == -1:
        continue
      elif split_idx == 0:
        # First word: everything before the first recorded boundary.
        word = text[:indices[split_idx]]
        # pad/truncate to max len
        length = tf.maximum(0, self.MAX_LENGTH_WORD - len(word))
        # Truncate
        word = word[:self.MAX_LENGTH_WORD]
        word = tf.pad(tf.expand_dims(word, axis = 0), [[0, 0], [0, length]],  constant_values = self.pad_token_id)    
        
        begin = all_words[:word_idx]
        end = all_words[word_idx + 1:]
        middle = word
        all_words = tf.concat([begin, middle, end], axis = 0)
        all_words = tf.reshape(all_words, (self.MAX_NUM_WORDS, self.MAX_LENGTH_WORD))
        word_idx = word_idx + 1
      else:
        # Later words: tokens strictly between this boundary and the next one.
        word = text[indices[split_idx] + 1: indices[split_idx + 1]]
        length = tf.maximum(0, self.MAX_LENGTH_WORD - len(word))
        # Truncate 
        word = word[:self.MAX_LENGTH_WORD]
        word = tf.pad(tf.expand_dims(word, axis = 0), [[0, 0], [0, length]],  constant_values = self.pad_token_id)
        
        begin = all_words[:word_idx]
        end = all_words[word_idx + 1:]
        middle = word
        all_words = tf.concat([begin, middle, end], axis = 0)
        all_words = tf.reshape(all_words, (self.MAX_NUM_WORDS, self.MAX_LENGTH_WORD))
        word_idx = word_idx + 1
  
    return all_words
  def constant_to_tensor(self, idx):
    """Wrap `idx` in a leading axis of size 1."""
    return tf.expand_dims(tf.identity(idx), axis = 0)
  def is_in(self, idx, total):
    """Return True when `idx` and `total` share at least one value."""
    total = tf.expand_dims(total, axis = 0)
    idx = tf.expand_dims(idx, axis = 0)

    intersection = tf.sparse.to_dense(tf.sets.intersection(idx, total))
    intersection = tf.reshape(intersection, (-1, ))
    return len(intersection) > 0
  def update_state(self, GT, pred_tokens):
    """Update the tp / fp / fn counts from a batch of label and predicted token ids.

    pred tokens: Tensor(B, L)
    GT: Tensor(B, L)
    """
    B, L = GT.shape
    # Convert to int
    pred_tokens= tf.cast(pred_tokens, tf.int64)
    GT = tf.cast(GT, tf.int64)
 
    for b in range(B):
      
      ground_truth = GT[b][1:] # Cut off Start Tokens
      predicted_text = pred_tokens[b][1:] # cut off start tokens # (MAX_LEN)
      split_ground_truth = self.split_based_on_id(ground_truth) # (MAX_NUM_WORDS, MAX_LENGTH_WORD)
      split_predicted_text = self.split_based_on_id(predicted_text) # (MAX_NUM_WORDS, MAX_LENGTH_WORD)
      # `removed` lists predicted-word indices that are excluded from matching
      # (empty rows or rows already matched); unused slots stay -1.
      removed = tf.ones((self.MAX_NUM_WORDS, ), dtype = tf.int32) * tf.constant(-1, dtype = tf.int32)  
      cur_removed_idx = tf.constant(0, dtype = tf.int32)
      for i in range(self.MAX_NUM_WORDS):   
        if split_ground_truth[i, 0] == -1:
          continue # Filter out bad Sentences
        # Despite its name, this is the i-th GROUND-TRUTH word.
        predicted_dataset = split_ground_truth[i] # padded row of length MAX_LENGTH_WORD
        #print(predicted_dataset)
        # Cut the row at the first pad token. `ground_truth` is reused here
        # and now holds a single word rather than the whole sequence.
        index = tf.argmax(tf.equal(predicted_dataset, self.pad_token_id))
        ground_truth = predicted_dataset[:index]
       
        best_score = tf.constant(0.0, dtype = TARGET_DTYPE)
        best_idx = tf.constant(0, dtype = tf.int32)
        for j in range(self.MAX_NUM_WORDS):
          #print(i, j)
          if self.is_in(self.constant_to_tensor(j), removed):
            continue
          if split_predicted_text[j, 0] == -1:
            # Empty predicted row: record it in `removed` and skip it.
            begin = removed[:cur_removed_idx]
            end = removed[cur_removed_idx + 1:]
            middle = self.constant_to_tensor(j)
            removed = tf.concat([begin, middle, end], axis = 0)
            removed = tf.reshape(removed, (self.MAX_NUM_WORDS, ))
            cur_removed_idx = cur_removed_idx + 1
            continue
          model_pred = split_predicted_text[j]
          model_index = tf.argmax(tf.equal(model_pred, self.pad_token_id))
          predicted_text = model_pred[:model_index]
          jaccard_score = self.jaccard_similarity_score(ground_truth, predicted_text)
          if jaccard_score > best_score:
            best_score = jaccard_score
            best_idx = j
        
        if best_score >= 0.5:
          # Match found: count it and exclude the matched prediction from later rounds.
          self.tp += 1
          begin = removed[:cur_removed_idx]
          end = removed[cur_removed_idx + 1:]
          middle = self.constant_to_tensor(best_idx)
          removed = tf.concat([begin, middle, end], axis = 0)
          removed = tf.reshape(removed, (self.MAX_NUM_WORDS, ))
          cur_removed_idx = cur_removed_idx + 1

        else:
          self.fn += 1
      # Every slot of `removed` that is still -1 adds one false positive.
      count = tf.where(tf.equal(removed, tf.constant(-1, dtype = removed.dtype)))
      count = tf.reshape(count, (-1, ))
      for _ in range(len(count)):
        self.fp += 1
  def reset_states(self):
    """Reset all counts to zero."""
    self.tp = 0.0
    self.fp = 0.0
    self.fn = 0.0
  def result(self):
    """Return (1 + beta^2) * tp / ((1 + beta^2) * tp + fp + beta^2 * fn), smoothed by a small epsilon."""
    # Compute BETA score
    tp = self.tp * (1 + self.beta ** 2)
    fn = self.fn * (self.beta ** 2)
    fp = self.fp
    eps = 1e-8
    fbeta = (tp + eps) / (tp + fp + fn + eps)
    return fbeta

In [22]:
class ValF1Score(keras.metrics.Metric):
  """Streaming token-overlap score between ground-truth and predicted ids.

  For each row, only the positions before
  max(first END index in GT, first END index in y_pred) are compared.
  `inter` accumulates the number of compared positions where both rows hold
  the same token; `union` accumulates (2 * compared length - inter).
  `result()` returns inter / union.
  """

  def __init__(self, name = 'f1score', **kwargs):
    super().__init__(name = name, **kwargs)

    # dtype has to be given by keyword: the second positional parameter of
    # tf.Variable is `trainable`, so passing the dtype positionally never set
    # the dtype at all. Metric accumulators are also not meant to be trained.
    self.inter = tf.Variable(0.0, dtype = TARGET_DTYPE, trainable = False)
    self.union = tf.Variable(0.0, dtype = TARGET_DTYPE, trainable = False)

  def reset_states(self):
    """Zero both accumulators."""
    self.inter.assign(tf.cast(0.0, TARGET_DTYPE))
    self.union.assign(tf.cast(0.0, TARGET_DTYPE))

  @tf.function
  def update_state(self, GT, y_pred):
    """Add one batch to the running intersection / union totals.

    Args:
      GT: ground-truth token ids, Tensor(B, MAX_LEN).
      y_pred: predicted token ids, Tensor(B, MAX_LEN), same dtype as GT.
    """
    END_TOKEN = ModelConfig.END_TOKEN_INT
    # Index of the END token in every row. NOTE(review): this relies on
    # argmax returning the first match, and yields 0 for a row that has no
    # END token (which makes that row's comparison window empty).
    end_tok_GT = tf.argmax(tf.equal(GT, END_TOKEN), axis = -1) # (B, )
    end_tok_pred = tf.argmax(tf.equal(y_pred, END_TOKEN), axis = -1) # (B, )
    # Compare up to whichever sequence ends later.
    end_toks = tf.maximum(end_tok_GT, end_tok_pred) # (B, )

    # Vectorised replacement for the former per-row Python loop: a (B, L)
    # boolean mask that is True for positions j < end_toks[b]. This works
    # with an unknown batch dimension and does not unroll over the batch.
    in_window = tf.sequence_mask(end_toks, maxlen = tf.shape(GT)[-1])
    matches = tf.logical_and(tf.equal(GT, y_pred), in_window)
    inter = tf.reduce_sum(tf.cast(matches, TARGET_DTYPE))

    # Each row contributes len(GT window) + len(pred window) = 2 * end_toks[b].
    total_len = tf.cast(2 * tf.reduce_sum(end_toks), TARGET_DTYPE)
    union = total_len - inter

    self.inter.assign_add(inter)
    self.union.assign_add(union)

  def result(self):
    """Return inter / union, with a small epsilon guarding the empty case."""
    eps = 1e-10
    return (self.inter + eps) / (self.union + eps)

# Test all Splits after.

In [23]:
def prep_training():
  """Create the metrics, loss function, model, optimizer and LR scheduler.

  Everything is constructed inside `strategy.scope()`.

  Returns:
    Tuple (model, optimizer, metrics, loss_fn, scheduler).
  """
  with strategy.scope():
    print('----------------CREATE METRICS-------------------')
    # val_f1 is only meant for validation; there is no train-time counterpart.
    metrics = dict(
        train_loss = keras.metrics.Mean(),
        val_loss = keras.metrics.Mean(),
        train_acc = keras.metrics.Accuracy(),
        val_acc = keras.metrics.Accuracy(),
        val_f1 = ValF1Score(),
    )

    print("-------------CREATE LOSS FUNCTION---------------")
    # Reduction.NONE keeps one loss value per token so PAD positions can be masked.
    loss_obj = keras.losses.CategoricalCrossentropy(
        from_logits = True,
        label_smoothing = ModelConfig.label_smoothing,
        reduction = tf.keras.losses.Reduction.NONE,
    )

    def loss_fn(GT, x):
      """Cross-entropy of logits `x` against token ids `GT`, ignoring <PAD> targets.

      The masked per-token losses are reduced with
      tf.nn.compute_average_loss using the global batch size `bs`.
      """
      tokenizer = ModelConfig.tokenizer
      targets = tf.one_hot(GT, tokenizer.vocab_size)
      per_token = loss_obj(targets, x) # (B, L)
      keep = tf.cast(tf.not_equal(GT, tokenizer.pad_token_id), per_token.dtype)
      return tf.nn.compute_average_loss(per_token * keep, global_batch_size= bs)

    print("------------------CREATE MODEL--------------------")
    model = FullModel()

    print("------------------CREATE OPTIMIZER----------------")
    # Optimizer state is looked up next to the previous model, when one is configured.
    prev_model_path = TrainingConfig.PREV_MODEL_PATH
    optim_state_path = None if prev_model_path is None else f"{prev_model_path}optimizer.npy"
    optimizer = GradAccAdam(model, TrainingConfig.learning_rate, 1, prev_optim_path = optim_state_path)

    print("------------------CREATE SCHEDULER----------------")
    scheduler = OneCycleScheduler(
        TrainingConfig.learning_rate,
        TrainingConfig.max_lr,
        TrainingConfig.min_lr,
        TrainingConfig.warm_steps,
        TrainingConfig.peak_steps,
        TrainingConfig.TOTAL_STEPS,
    )
    return model, optimizer, metrics, loss_fn, scheduler
model, optimizer, metrics, loss_fn, scheduler = prep_training()

----------------CREATE METRICS-------------------
-------------CREATE LOSS FUNCTION---------------
------------------CREATE MODEL--------------------


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=657434796.0, style=ProgressStyle(descri…




Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.
Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel f

------------------CREATE OPTIMIZER----------------
------------------CREATE SCHEDULER----------------


# Transformer Training Steps(elementwise Loss, remove loss from PAD token)

In [24]:
def train_step(input_ids, attention_mask, token_type_ids, labels):
  """Run one optimisation step and update the train metrics.

  `model.call_train` receives `labels` without the last position; the loss and
  accuracy are computed against `labels` without the first position.
  """
  model_labels = labels[:, :-1]
  targets = labels[:, 1:]

  with tf.GradientTape() as tape:
    logits = model.call_train(input_ids, attention_mask, token_type_ids, model_labels, training = True)
    loss = loss_fn(targets, logits)

  # Most likely token per position, used only for the accuracy metric.
  token_ids = tf.argmax(keras.activations.softmax(logits, axis = -1), axis = -1) # (B, L)
  # PAD targets get weight 0 so they do not count towards accuracy.
  not_pad = tf.where(tf.not_equal(targets, ModelConfig.PAD_TOKEN), 1.0, 0.0)
  metrics['train_acc'].update_state(targets, token_ids, sample_weight = not_pad)

  gradients = tape.gradient(loss, model.trainable_variables)

  # The logged loss is additionally divided by the number of target positions.
  metrics['train_loss'].update_state(loss / (MAX_LENGTH_LABEL - 1))

  optimizer.apply_gradients(gradients, model.trainable_variables)

@tf.function
def dist_train_step(iterator, num_steps):
  """Draw `num_steps` batches from `iterator` and run `train_step` on each through the strategy."""
  for step_idx in tf.range(num_steps):
    features, labels = next(iterator)
    input_ids, attention_mask, token_type_ids = features
    strategy.run(train_step, args = (input_ids, attention_mask, token_type_ids, labels))

Metrics

In [25]:
def val_step(input_ids, attention_mask, token_type_ids, labels):
  """Evaluate one batch with `model.call_val` and update the val metrics.

  Returns:
    (labels[:, 1:], pred_tokens[:, 1:]) for downstream scoring.
  """
  targets = labels[:, 1:]
  preds, pred_tokens = model.call_val(input_ids, attention_mask, token_type_ids, training = False)
  decoded = pred_tokens[:, 1:]

  # Same per-position normalisation as the training loss.
  metrics['val_loss'].update_state(loss_fn(targets, preds) / (MAX_LENGTH_LABEL - 1))

  # Accuracy over non-PAD target positions only.
  not_pad = tf.where(tf.not_equal(targets, ModelConfig.PAD_TOKEN), 1.0, 0.0)
  metrics['val_acc'].update_state(targets, decoded, sample_weight= not_pad)

  return targets, decoded

@tf.function
def dist_val_step(input_ids, attention_mask, token_type_ids, labels):
  """Run `val_step` through the strategy and gather its two outputs along axis 0."""
  replica_labels, replica_tokens = strategy.run(
      val_step, args = (input_ids, attention_mask, token_type_ids, labels))
  gathered_labels = strategy.gather(replica_labels, axis = 0)
  gathered_tokens = strategy.gather(replica_tokens, axis = 0)
  return gathered_labels, gathered_tokens

# Stat Logger

In [26]:
class StatLogger():
  """Prints train / validation statistics and triggers checkpoint saves.

  Reads the module-level `metrics` dict and `optimizer`, and calls
  `save_states(tag)` whenever the validation loss or accuracy matches or
  beats the best value seen so far.
  """
  def __init__(self):
    self.EPOCHS = 0
    self.STEPS = 0

    self.best_f1 = tf.Variable(tf.constant(0.0, dtype = TARGET_DTYPE))
    self.best_acc = tf.Variable(tf.constant(0.0, dtype = TARGET_DTYPE))
    self.best_loss = tf.Variable(1e6, dtype = TARGET_DTYPE)

  def update_train(self):
    """Print the current LR and train metrics, then reset the train metrics.

    Intended to be called once every TrainingConfig.STEPS_PER steps. STEPS is
    advanced after printing, so the printed value is the step count at the
    start of the window that was just trained.
    """
    lr = optimizer.optimizer.learning_rate.numpy().item()
    train_acc = metrics['train_acc'].result().numpy().item()
    train_loss = metrics['train_loss'].result().numpy().item()

    print(f"E: {self.EPOCHS}, S: {self.STEPS}, TA: {train_acc}, TL: {train_loss}, LR: {lr}")
    self.STEPS += TrainingConfig.STEPS_PER
    # Reset the States
    metrics['train_acc'].reset_states()
    metrics['train_loss'].reset_states()

  def update_val(self):
    """Print validation metrics, save on improvement, and reset every metric."""
    val_acc = metrics['val_acc'].result().numpy().item()
    val_loss = metrics['val_loss'].result().numpy().item()
    #val_f1 = metrics['val_f1'].result().numpy().item()

    #if val_f1 >= self.best_f1:
    #  self.best_f1.assign(val_f1)
    #  save_states('f1')
    if val_loss <= self.best_loss:
      self.best_loss.assign(val_loss)
      save_states('loss')
    if val_acc >= self.best_acc:
      self.best_acc.assign(val_acc)
      save_states('acc')

    # Log plain numbers; interpolating the tf.Variable objects directly
    # printed their full repr ("<tf.Variable 'Variable:0' shape=() ...>").
    best_acc = self.best_acc.numpy().item()
    best_loss = self.best_loss.numpy().item()
    print(f"E: {self.EPOCHS}, BA: {best_acc}, BL: {best_loss}, VA: {val_acc}, VL: {val_loss}")
    self.EPOCHS += 1
    # Train metrics are cleared here as well, not only the validation ones.
    for m in metrics:
      metrics[m].reset_states()
  


    

# train the model
- FIX F1 Score
- Val Loss is Broken too.
- Problem with LR Scheduler too.

Change Metric to F1 in BMS
- Or just use Accuracy as metric.
- Consider Changing the tokenizer method(Subword? BPE?)
- Try Larger ROBERTA model.

# Training Rapidly
1) ROBERTA BASE on LM - TPU Colab

2) XLNET BASE on LM - TPU Kaggle

3) DistilROBERTA on LM - GPU Kaggle

4) XLNET LARGE - TPU Kaggle
# Then, Inference using the ROBERTA sets, while doing this:
1) Best XLNET on NER

2) Best XLNET on QA

3) Best ROBERTA on NER

4) Best ROBERTA on QA

- Ensemble while waiting for inference.

In [27]:
# Let TensorFlow place an op on another device when the requested device cannot run it.
tf.config.set_soft_device_placement(True)

In [28]:
# Manual epoch cap used by the training loop below, in addition to TrainingConfig.NUM_EPOCHS.
NUM_EPOCHS = 20

In [29]:
N_LOOPS_EVERY_VAL = 1 # Val is super slow; this is the number of training passes run before each validation pass (currently 1).

In [30]:
FOLD_IDX = 0
stat_logger = StatLogger()
# Run at most NUM_EPOCHS outer iterations. The previous `if epoch == NUM_EPOCHS: break`
# was checked after the epoch body, so it only fired once epoch index NUM_EPOCHS had
# already been trained (one epoch more than the cap).
n_outer_epochs = min(TrainingConfig.NUM_EPOCHS // N_LOOPS_EVERY_VAL, NUM_EPOCHS)
for epoch in range(n_outer_epochs):
  print(f'EPOCH: {epoch}')
  # Datasets and their distributed iterators are rebuilt at the start of every epoch.
  train_ds, val_ds = load_tf_dataset(FOLD_IDX)
  train_dist_ds = iter(strategy.experimental_distribute_dataset(train_ds))
  val_dist_ds = iter(strategy.experimental_distribute_dataset(val_ds))
  for _ in range(N_LOOPS_EVERY_VAL):
    for _ in range(TrainingConfig.STEPS // TrainingConfig.STEPS_PER):
      tf.config.set_soft_device_placement(True)
      # One call executes STEPS_PER training steps inside a single tf.function.
      dist_train_step(train_dist_ds, TrainingConfig.STEPS_PER)
      # Advance the scheduler once for each step that was just executed.
      for _ in range(TrainingConfig.STEPS_PER):
        scheduler.step()
      stat_logger.update_train()
  print("VALIDATION LOOP")
  for (input_ids, attention_mask, token_type_ids), labels in val_dist_ds:
    ground_truth, prediction = dist_val_step(input_ids, attention_mask, token_type_ids, labels)
    #metrics['val_f1'].update_state(ground_truth, prediction)
  stat_logger.update_val()


EPOCH: 0
E: 0, S: 0, TA: 0.21552331745624542, TL: 0.7022749185562134, LR: 2.0025686353619676e-06
E: 0, S: 100, TA: 0.22265516221523285, TL: 0.6121121644973755, LR: 2.010378011618741e-06
E: 0, S: 200, TA: 0.23111367225646973, TL: 0.5746179223060608, LR: 2.023426532105077e-06
E: 0, S: 300, TA: 0.2209639549255371, TL: 0.586740255355835, LR: 2.041711468336871e-06
E: 0, S: 400, TA: 0.22704696655273438, TL: 0.5550917983055115, LR: 2.06522895496164e-06
E: 0, S: 500, TA: 0.21747994422912598, TL: 0.5661287903785706, LR: 2.0939737623848487e-06
E: 0, S: 600, TA: 0.23869909346103668, TL: 0.4964577555656433, LR: 2.1279392967699096e-06
E: 0, S: 700, TA: 0.21840022504329681, TL: 0.53590989112854, LR: 2.167118736906559e-06
E: 0, S: 800, TA: 0.22650057077407837, TL: 0.5024697184562683, LR: 2.21150298784778e-06
E: 0, S: 900, TA: 0.21222974359989166, TL: 0.526620626449585, LR: 2.2610827272728784e-06
E: 0, S: 1000, TA: 0.21508267521858215, TL: 0.4958970546722412, LR: 2.3158470412454335e-06
E: 0, S: 1100, 

  return array(a, dtype, copy=False, order=order, subok=True)


SAVING STATES
E: 0, BA: <tf.Variable 'Variable:0' shape=() dtype=float32, numpy=0.21830986>, BL: <tf.Variable 'Variable:0' shape=() dtype=float32, numpy=0.5233979>, VA: 0.21830986440181732, VL: 0.5233979225158691
EPOCH: 1
E: 1, S: 3500, TA: 0.5673118233680725, TL: 0.2683919668197632, LR: 5.3154117267695256e-06
E: 1, S: 3600, TA: 0.5033743381500244, TL: 0.30196383595466614, LR: 5.49754076928366e-06
E: 1, S: 3700, TA: 0.5742615461349487, TL: 0.2672550678253174, LR: 5.684147254214622e-06
E: 1, S: 3800, TA: 0.543166995048523, TL: 0.2741634249687195, LR: 5.8751911637955345e-06
E: 1, S: 3900, TA: 0.6217756867408752, TL: 0.24449342489242554, LR: 6.070629751775414e-06
E: 1, S: 4000, TA: 0.6459279656410217, TL: 0.2212294638156891, LR: 6.270421181397978e-06
E: 1, S: 4100, TA: 0.6461604237556458, TL: 0.24033623933792114, LR: 6.474521342170192e-06
E: 1, S: 4200, TA: 0.6280891299247742, TL: 0.25889381766319275, LR: 6.682885668851668e-06
E: 1, S: 4300, TA: 0.6273812651634216, TL: 0.23640255630016327


# Distributed Variables inside of ValF1
- Also, ValF1 States aren't updating
- Debug on Max size = 2

I'm SO DUMB. I didn't split a val set.

# TODO:
- Fix Attention Masks
- Input Id type(from tokenizer)
- Fix Validation Loop 
- ADD A CRF Layer.
- SEND THE TFRECORDS to GCS.

Runs on TPU now. Compile time seems to be pretty crazy.


# TODO: Tune the validation metrics and Training Hyper Params. I'm trying to just get it to run.

I think Batch size was too large. I'm not sure.
- Model isn't Converging: Reaches 2.27 Loss, much worse than baseline notebook: 2.1 Loss.
- Estimated for actually good performance; 1.1 Loss or so.
- Tesla V100 is the fastest GPU, then P100, then T4, then K80.





Cyclic LR isn't working as well
- It seems a simple One-Cycle would be perfect for the task.
- KEY NOTE: VAL Loss vs Training loss: Big Gap RN.
- Lower and Lower LR needed
- Restarting training helps a lot for some reason.
- Base LR should be 0.0 or like 1e-13

Starts Overfitting by EPOCH 1.5 or around there.
- Do one Overnight training today.
- IDK why, but the Validation Score is super duper low.
- Normal People have large Gaps(Ex. 0.8)
- Seems like a bug. debug later.
- It's the Learning Rate Scheduler: BERT is tough to tune
- 2 EPOCH with 2e-5 is perfect, so probably freeze the backbone and train on smaller LR.
- Current LR Scheduler: Just decay linearly down to 0.0
- I think with a better LR schedule, it can work(One cycle with higher LR)
- Sometimes, with weird testing, you can get down to 1.7 ~1.6 loss.
- ROBERTA can further improve performance(ROBERTA base or large)
- Freeze BEGINNING of ROBERTA-Large.
- Dropout, weight decay
- NLP Augmentations(avoiding Dataset names)
- Training Baseline tonight
- Seems to train pretty well so far using CyclicLR.
- Overfitting RN.
- Still Overfitting, but with more and more epochs, the gap decreases.
- Converged to 2.1 Train Loss
- Val Loss keeps Converging.
- 



### Save model

# TESTING:
1) Baseline:
- LR 2e-5, 2 EPOCHS - Able to achieve baseline results - 0.17 LB
- LR with Warm Restarts:Works well, but when it reaches the peak, it fails, so OneCycleLR should work.

# Requirements:
- MLM Model: 0.39 - 0.4 LB Explicit Remove Train Labels
- NER Model: 0.4 LB
- QA: 0.4LB
- ENSEMBLE ALL: 0.45LB.
- SOTA = ~0.5LB without string matching.

# TO TEST:
1) INFERENCE ON VALIDATION
2) Add Text Augmentation: See if Validation is true or not.
3) Start Using LB as Validation Please(we have enough submissions i guess)
- BERT + Dropout - Doesn't Help at all: oof.(3.12)
- BERT + Text Augmentation - Testing RN - Helps a lot to converge and bridge the gap between VL and TL! - I think it's converging worse. Oof. - Failed Experiment(2.39)
- Frozen BERT
- Add Final Dropout Layer. - Doesn't Help. why does nothing work.(2.42)
- Baseline(2.36) - Converges well with decent consistency. Problems is due to overfitting LB, since the public LB has train samples, which is really stupid.
- Roberta
- CRF Layer
- FINE-TUNE the BERT model until 0.4 or 0.5 LB without string-matching.
- NEED MINIMUM to 0.4 BEFORE MOVING TO ANOTHER MODEL
- Other Models
- MLM, QA, NER, SPACY models. 
- SPACY is a Machine Learning Based Classifier
- BERT based Classifier
- Pure Language Modelling Task(Seems to work well enough, if you check the LM output is actually in the text)
- LM has very good performance compared to other.
- Big Problem with overfitting to be honest.(Not sure how to prevent overfitting the new sentences)


- Like BirdClef and BMS, folds are kinda useless, but you can add 5 FOLDS for each, for like 30 models total.
- Test Hypothesis of randomly restarting training.
- Heavy Bugs from restarting training.

2) Testing:
- It seems that everyone is heavily overfitting LM
- Trying ROBERTA models right now, to see at least if better models get better results.

3) Models:
- SPACY: I suspect heavy LB overfitting
- Attention based seems to work decently, at not overfitting
- Longer Lengths should help.(500 token length?)
- I think continue with Attention based model or test out MLM model
- He didn't even use a pretrained BERT, so I think that this model can be heavily improved!
- It also doesn't overfit as much even with train removed.
- Swapping to Attention RN, can come back to MLM later(no time to fine tune a broken model)

# ORDER:
- Attention Model LM
- NER
- QA
- SPACY 
- CLASSIFIER
- MLM



# Current Model Uses up too much RAM
- Can fix with TPU + HighRam
- 15 GB RAM needed.
- 6 Ex takes 11 Minutes
- 10 Ex takes: 30 Minutes(MAX CAP)
