# Setting

# 8 Models:
- 2 Models
    - Roberta
    - XLNet
    - *Electra(Only works for classification)
        - Electra's VOCAB is weird, it's the same as BERT.
    - No BERT needed, as RoBERTa is just a better BERT (the same goes for DistilBERT)
- 3 Models
    - QA
        - + ELECTRA
    - Ner
        - + ELECTRA
    - LM
        - No Electra
        
- For QA and NER, no need for checking. Set their logits to 0.
       
  

# Best Models:
- ROBERTA LM:
    - Roberta-Base, Weird LR Scheduler
    - 6 Decoder
    - 94.6 CV
    - Irreproducible?
 - XLNet LM
 

# Install packages

In [None]:
# Install the dependencies from files bundled in the attached `coleridge-packages`
# input directory; `--no-index` keeps pip from contacting PyPI for `datasets`.
!pip install datasets --no-index --find-links=file:///kaggle/input/coleridge-packages/packages/datasets
!pip install ../input/coleridge-packages/seqeval-1.2.2-py3-none-any.whl
!pip install ../input/coleridge-packages/tokenizers-0.10.1-cp37-cp37m-manylinux1_x86_64.whl
!pip install ../input/coleridge-packages/transformers-4.5.0.dev0-py3-none-any.whl

# Clear the cell output produced by the pip commands above.
from IPython.display import clear_output
clear_output()

# Import

In [None]:
import os
import re
import json
import time
import random
import glob
import importlib
import math

import numpy as np
import pandas as pd

from tqdm.autonotebook import tqdm

import tensorflow as tf
import tensorflow.keras as keras 
import tensorflow_addons as tfa
import tensorflow.keras.backend as backend
from datasets import load_dataset
import transformers
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, \
AutoModelForMaskedLM, Trainer, TrainingArguments, pipeline

from typing import List
import string
from functools import partial
import warnings
warnings.filterwarnings("ignore")



# Sample submission: its 'Id' column lists the papers that need a prediction.
# NOTE(review): the same file is read again in the "Load data" cell below.
sample_submission = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')

# Load data

In [None]:
# Training labels (CSV) and the folder of training papers (one JSON file per paper).
train_path = '../input/coleridgeinitiative-show-us-the-data/train.csv'
train_files_path = '../input/coleridgeinitiative-show-us-the-data/train'
train = pd.read_csv(train_path)

# Papers to predict for: ids come from the sample submission, JSON files from the test folder.
sample_submission = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')
paper_test_folder = '../input/coleridgeinitiative-show-us-the-data/test'
test_files_path = paper_test_folder
    
# NOTE(review): path to an additional label list; not referenced by the visible code.
adnl_govt_labels_path = '../input/bigger-govt-dataset-list/data_set_800.csv'

In [None]:
# Parse every test paper once up front, keyed by its submission id.
papers = {}
for paper_id in tqdm(sample_submission['Id']):
    paper_file = f'{paper_test_folder}/{paper_id}.json'
    with open(paper_file, 'r') as f:
        paper = json.load(f)
    papers[paper_id] = paper

In [None]:
def clean_text(txt):
    """Lowercase ``txt`` and reduce it to alphanumeric words separated by single spaces.

    Any run of characters outside ``A-Za-z0-9`` becomes one space, and
    leading/trailing spaces are removed. Non-string input is converted with ``str``.
    """
    lowered = str(txt).lower()
    spaced = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return spaced.strip()


def totally_clean_text(txt):
    """Apply ``clean_text`` and then collapse any remaining runs of spaces to one space."""
    return re.sub(' +', ' ', clean_text(txt))


"""
if not BS_CLEANING:
    def text_cleaning(text):
        '''
        Converts all text to lower case, Removes special charecters, emojis and multiple spaces
        text - Sentence that needs to be cleaned
        '''
        text = re.sub('[^A-Za-z0-9]+', ' ', str(text).lower()).strip()
        text = re.sub(' +', ' ', text)
        emoji_pattern = re.compile("["
                                   u"\U0001F600-\U0001F64F"  # emoticons
                                   u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                   u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                   u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                   "]+", flags=re.UNICODE)
        text = emoji_pattern.sub(r'', text)
        return text
else:
    def text_cleaning(text):
        '''
        Converts all text to lower case, Removes special charecters, emojis and multiple spaces
        text - Sentence that needs to be cleaned
        '''
        text = ''.join([k for k in text if k not in string.punctuation])
        text = re.sub('[^A-Za-z0-9]+', ' ', str(text).lower()).strip()
        # text = re.sub("/'+/g", ' ', text)
        return text

"""
def read_json_pub(filename, train_data_path=train_files_path, output='text'):
    """Read one publication JSON file and return its text as a single string.

    Args:
        filename: Publication id, i.e. the file name without the ``.json`` suffix.
        train_data_path: Directory that contains the JSON file.
        output: ``'text'`` returns the section bodies joined by spaces,
            ``'head'`` returns the section titles joined by spaces, and any
            other value returns titles and bodies interleaved, joined by ``'. '``.

    Returns:
        The requested concatenation as a ``str``.

    Sections that lack ``section_title`` or ``text`` (or carry ``None``)
    contribute an empty string instead of making ``str.join`` raise ``TypeError``.
    """
    json_path = os.path.join(train_data_path, filename + '.json')
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(json_path, 'r', encoding='utf-8') as f:
        sections = json.load(f)

    # `.get` yields None for a missing key, which `str.join` rejects, so fall back to ''.
    headings = [section.get('section_title') or '' for section in sections]
    contents = [section.get('text') or '' for section in sections]

    if output == 'text':
        return ' '.join(contents)
    if output == 'head':
        return ' '.join(headings)

    # Interleave title, body, title, body, ... in document order.
    combined = []
    for heading, content in zip(headings, contents):
        combined.extend((heading, content))
    return '. '.join(combined)

### Paths and Hyperparameters

In [None]:
# Directory with the tokenizer files; get_model also reads `config.json` from it.
TOKENIZER_PATH = '../input/coleridge-mlm-model/model_tokenizer/'
# NOTE(review): not referenced by the visible code; LOAD_MODEL loads 'acc_model.h5' instead.
PRETRAINED_PATH = '../input/coleridge-mlm-model/98.8 Model.h5'

# Encoder input length: padding/truncation target in tokenize_data and chunk size
# in the shorten_* helpers.
MAX_LENGTH = 512
# Number of decoder positions, start token included (FullModel.MAX_LEN).
MAX_LENGTH_LABEL = 60
# Number of items shared between consecutive chunks in the shorten_* helpers.
OVERLAP = 20

In [None]:
def jaccard_similarity(s1, s2):
    """Return the Jaccard similarity of the word sets of two strings.

    Both strings are split on single spaces and compared as *sets*, so a
    repeated word counts once. The previous implementation took the union
    size from list lengths, which let duplicates inflate the denominator.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        ``|A & B| / |A | B|`` as a float in ``[0, 1]``. ``str.split(" ")``
        never returns an empty list, so the denominator is never zero.
    """
    words1 = set(s1.split(" "))
    words2 = set(s2.split(" "))
    shared = len(words1 & words2)
    return shared / (len(words1) + len(words2) - shared)
def clean_text(txt):
    """Lowercase ``txt`` and reduce it to alphanumeric words separated by single spaces.

    This definition replaces the ``clean_text`` from an earlier cell. It now
    strips leading/trailing spaces like that earlier version (and like
    ``clean_paper_sentence``), so cleaned dataset names never start or end
    with a space and an all-punctuation input cleans to ``''``.
    """
    return re.sub('[^A-Za-z0-9]+', ' ', str(txt).lower()).strip()
def clean_paper_sentence(s):
    """Normalise a sentence to space-separated alphanumeric words, keeping case.

    Runs of characters outside ``A-Za-z0-9`` become a single space, the
    result is stripped, and repeated spaces are collapsed.
    """
    alnum_only = re.sub('[^A-Za-z0-9]+', ' ', str(s))
    trimmed = alnum_only.strip()
    return re.sub(' +', ' ', trimmed)

def shorten_sentences(sentences):
    """Split over-long sentences into overlapping word windows.

    A sentence with at most MAX_LENGTH whitespace-separated words is kept
    unchanged. A longer one is replaced by windows of up to MAX_LENGTH words
    whose start positions advance by MAX_LENGTH - OVERLAP words, each window
    re-joined with single spaces.
    """
    stride = MAX_LENGTH - OVERLAP
    result = []
    for text in sentences:
        tokens = text.split()
        if len(tokens) <= MAX_LENGTH:
            result.append(text)
            continue
        result.extend(
            ' '.join(tokens[start:start + MAX_LENGTH])
            for start in range(0, len(tokens), stride)
        )
    return result

def shorten_sentences_tokens(sentences):
    """Split over-long sequences into overlapping windows.

    Each element of ``sentences`` is treated as a sliceable sequence (the
    caller passes lists of token ids). The window size is MAX_LENGTH - 2 and
    window starts advance by that size minus OVERLAP. Sequences that already
    fit are passed through untouched.
    """
    window = MAX_LENGTH - 2
    stride = window - OVERLAP
    chunks = []
    for sequence in sentences:
        if len(sequence) <= window:
            chunks.append(sequence)
            continue
        chunks.extend(
            sequence[start:start + window]
            for start in range(0, len(sequence), stride)
        )
    return chunks


connection_tokens = {'s', 'of', 'and', 'in', 'on', 'for', 'data', 'dataset'}
def find_mask_candidates(sentence):
    """
    Locate candidate dataset-name spans in $sentence (a list of words).

    A span is a maximal run of consecutive words, each of which starts with
    an uppercase letter or is one of $connection_tokens. A run is kept only
    if at least 2 words remain once connection tokens (compared
    case-insensitively) are peeled off both of its ends. The returned
    (start, end) indices are inclusive and refer to the untrimmed run.
    The scan starts at index 1, so the first word never opens a span.
    """
    def candidate_qualified(words):
        # Shrink a [lo, hi) window past connection tokens at both ends.
        lo, hi = 0, len(words)
        while lo < hi and words[lo].lower() in connection_tokens:
            lo += 1
        while hi > lo and words[hi - 1].lower() in connection_tokens:
            hi -= 1
        return hi - lo >= 2

    spans = []
    run_start = None  # index where the current run began, or None outside a run
    for pos in range(1, len(sentence)):
        token = sentence[pos]
        if token[0].isupper() or token in connection_tokens:
            if run_start is None:
                run_start = pos
            continue
        # A non-matching word closes whatever run was open.
        if run_start is not None:
            if candidate_qualified(sentence[run_start:pos]):
                spans.append((run_start, pos - 1))
            run_start = None

    # A run that reaches the end of the sentence is closed here.
    if run_start is not None and candidate_qualified(sentence[run_start:]):
        spans.append((run_start, len(sentence) - 1))

    return spans

### Transform

In [None]:
class ModelConfig:
  """Static configuration shared by the encoder, decoder and inference code.

  Everything here is a class attribute evaluated once, when the class body
  runs; this includes loading the tokenizer from TOKENIZER_PATH.
  """
  # Selects the "large" or "base" set of dimensions below.
  model_checkpoint = 'roberta-base'
  tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, use_fast=True, return_token_type_ids = True, return_attention_masks = True)
  
  # Architecture sizes: the first value applies to 'roberta-large', the second otherwise.
  encoder_dim = 1024 if model_checkpoint == 'roberta-large' else 768
  decoder_dim = 1024 if model_checkpoint == 'roberta-large' else 768
  num_att_heads = 16 if model_checkpoint == 'roberta-large' else 12
  decoder_layers = 12 if model_checkpoint == 'roberta-large' else 6
  intermediate_dim = 4096 if model_checkpoint == 'roberta-large' else 3072
  dropout_rate = 0.1 
  # CONFIG:
  model_head = 'linear'
  label_smoothing = 0.1
  # SPECIAL TOKS
  # Encoding the literal "<pad>" is used to read off three ids by position:
  # index 0 -> start, index 1 -> pad, index 2 -> end.
  # (assumes the tokenizer wraps the text in start/end special tokens — TODO confirm)
  values = tokenizer.encode("<pad>")
  PAD_TOKEN = tf.constant(values[1], tf.int64)
  PAD_TOKEN_INT = PAD_TOKEN.numpy().item()
  START_TOKEN = tf.constant(values[0], tf.int64)
  START_TOKEN_INT = START_TOKEN.numpy().item()
  END_TOKEN = tf.constant(values[2], tf.int64)
  END_TOKEN_INT = END_TOKEN.numpy().item()
  # Id at index 1 of encode(" |"); decode_values splits decoded text on " | ".
  SPLIT_TOKEN = tf.constant(tokenizer.encode(" |")[1], tf.int64)
  SPLIT_TOKEN_INT = SPLIT_TOKEN.numpy().item()
  # Drop the temporary so it does not remain as a class attribute.
  del values
  '''
  ROBERTA-LARGE:
  - 1024 Decoder Dim = 1024 // num_heads
  - 4096 Intermediate Dim - Intermediate = FFN dim
  - 16 Att Heads = Number of Att Heads
  - 12 Decoder Layers = Number of Decoder Layers

  ROBERTA-BASE:
  - 3072 Intermediate Dim 
  - 6 Decoder Layers
  - 12 att Heads
  - 768 Decoder Dim
  '''
  

In [None]:
# Build the model inputs: all_test_data[i] is the list of text chunks for the
# i-th paper of sample_submission (possibly an empty list).
all_test_data = []

for paper_id in tqdm(sample_submission['Id']):
    # load paper
    paper = papers[paper_id]

    # extract sentences
    # (section text is split on '.', then each piece is reduced to alphanumeric words)
    sentences = [clean_paper_sentence(sentence) for section in paper 
                     for sentence in section['text'].split('.')
                    ]

    sentences = shorten_sentences(sentences) # make sentences short
    sentences = [sentence for sentence in sentences if len(sentence) > 1] # only accept sentences with length > 1 chars
    # keep only sentences that contain the substring 'data' or 'study' (case-insensitive)
    sentences = [sentence for sentence in sentences if any(word in sentence.lower() for word in ['data', 'study'])]
    sentences = [sentence.split() for sentence in sentences] # sentence = list of words
    # Flatten: from here on `sentences` is ONE flat list of words for the whole paper.
    new_sentences = []
    for sent in sentences:
        new_sentences += sent
    sentences = new_sentences
    # mask
    # For each candidate span take a window of up to LOOKAHEAD words on either
    # side. `cur_pos` marks the end of the last window so windows do not overlap;
    # candidates that end before it are skipped.
    test_data = []
    LOOKAHEAD = 100
    cur_pos = 0
    for phrase_start, phrase_end in find_mask_candidates(sentences):
        if phrase_end < cur_pos:
            continue
        # NOTE(review): the bare except silently drops a window on any error;
        # slicing and joining a list of strings is not expected to raise.
        try:
            dt_point = sentences[max(cur_pos, phrase_start - LOOKAHEAD): phrase_end + LOOKAHEAD]
            cur_pos = phrase_end + LOOKAHEAD
            test_data.append(" ".join(dt_point))
        except:
            pass
    test_data = '. '.join(test_data)
    # Tokenize the Data
    # ([1:-1] drops the first and last ids — presumably the start/end special tokens; confirm)
    test_data = ModelConfig.tokenizer(test_data)['input_ids'][1:-1]
    # Re-chunk at token level and discard chunks of 10 tokens or fewer.
    test_data = [sentence for sentence in shorten_sentences_tokens([test_data]) if len(sentence) > 10]
    # Untokenize the data
    test_data = ModelConfig.tokenizer.batch_decode(test_data)
    all_test_data.append(test_data)

# Grab the Model
- LM.

In [None]:
def get_model(model_name):
  """Build a RoBERTa model from the saved config and freeze its lower half.

  The architecture comes from ``config.json`` under TOKENIZER_PATH;
  ``model_name`` is accepted but not used. The embeddings and the first half
  of the encoder layers are marked non-trainable. The model has three parts
  (embeddings, main RoBERTa layers, pooler); the pooler is left untouched here.
  """
  config_file = f"{TOKENIZER_PATH}config.json"
  roberta_config = transformers.RobertaConfig.from_json_file(config_file)
  model = transformers.TFAutoModel.from_config(roberta_config)

  backbone = model.roberta
  backbone.embeddings.trainable = False

  # Freeze the first half of the transformer layers (transfer learning).
  encoder_layers = backbone.encoder.layer
  frozen_count = len(encoder_layers) // 2
  for layer_index in range(frozen_count):
    encoder_layers[layer_index].trainable = False
  return model
    
class Encoder(keras.Model):
  """Wraps the RoBERTa backbone and returns its per-token hidden states.

  NOTE(review): despite the attribute name, `frozen_backbone` is only partly
  frozen: get_model freezes the embeddings and the first half of the encoder
  layers, and the pooler is frozen here.
  """
  def __init__(self):
    super().__init__()
    self.model_checkpoint = ModelConfig.model_checkpoint
    self.frozen_backbone = get_model(self.model_checkpoint)
    self.frozen_backbone.roberta.pooler.trainable = False
    #self.frozen_backbone.config.use_bfloat16 = True
  def call(self, input_ids, attention_mask, token_type_ids, training):
    """Run the backbone and return its `last_hidden_state` output."""
    # Just grabs the Embeddings from the Roberta Model
    embeddings = self.frozen_backbone(input_ids = input_ids, attention_mask = attention_mask, token_type_ids = token_type_ids, training = training)
    return embeddings['last_hidden_state'] # presumably (B, L, encoder_dim) — TODO confirm

class EncoderMultiHeadAttention(keras.layers.Layer):
  """Self-attention sub-block: MHA(x, x, x) + residual -> LayerNorm -> Dropout.

  Note: the dropout rate is read from ModelConfig.dropout_rate rather than
  passed in. This class is defined again, identically, in the next cell.
  """
  def __init__(self, encoder_dim, num_att_heads):
    # x -> MAH + x -> LayerNorm -> Dropout
    super().__init__()
    self.encoder_dim = encoder_dim
    self.num_att_heads = num_att_heads
    self.drop_prob = ModelConfig.dropout_rate

    # Per-head key/value size is the model width divided evenly across heads.
    self.MultiHeadAttention = keras.layers.MultiHeadAttention(num_heads = self.num_att_heads, 
      key_dim = self.encoder_dim // self.num_att_heads,
      value_dim = self.encoder_dim // self.num_att_heads,
      dropout = self.drop_prob
    )
    self.LayerNorm = keras.layers.LayerNormalization(epsilon = 1e-6)
    self.Dropout = keras.layers.Dropout(self.drop_prob)
  def call(self, x, attention_mask, training):
    """Attend `x` to itself under `attention_mask`, then add, normalise and drop out."""
    MAH = self.MultiHeadAttention(query = x, key = x, value = x, attention_mask = attention_mask, training = training)
    norm = self.LayerNorm(MAH + x, training = training)
    return self.Dropout(norm, training = training)

class DecoderMultiHeadAttention(keras.layers.Layer):
  """Cross-attention sub-block: decoder states query the encoder states.

  Computes MHA(query=decoder, key=encoder, value=encoder) + decoder ->
  LayerNorm -> Dropout. Encoder and decoder widths must be equal (asserted).
  This class is defined again, identically, in the next cell.
  """
  def __init__(self, encoder_dim, decoder_dim, num_att_heads, drop_prob):
    super().__init__()
    self.encoder_dim = encoder_dim
    self.decoder_dim = decoder_dim
    # The residual connection in call() adds attention output to the decoder states.
    assert self.encoder_dim == self.decoder_dim
    self.num_att_heads = num_att_heads
    self.drop_prob = drop_prob 
    
    
    self.dec_enc_attention = keras.layers.MultiHeadAttention(num_heads = self.num_att_heads,
      key_dim = self.encoder_dim // self.num_att_heads, 
      value_dim = self.encoder_dim // self.num_att_heads,
      dropout = self.drop_prob
    )
    self.LayerNorm = keras.layers.LayerNormalization(epsilon = 1e-6)
    self.Dropout = keras.layers.Dropout(self.drop_prob)
  
  def call(self, encoder, decoder, padding_mask, training):
    """Attend decoder states to encoder states under `padding_mask`."""
    # Encoder: Tensor(B, L, C)
    # Decoder: Tensor(B, L, C)
    MAH = self.dec_enc_attention(query = decoder, key = encoder, value = encoder, attention_mask = padding_mask, training = training)
    norm = self.LayerNorm(MAH + decoder, training = training)
    return self.Dropout(norm, training = training)

class FFN(keras.layers.Layer):
  """Position-wise feed-forward sub-block with residual, LayerNorm and Dropout.

  Dense(feedforward_dim, relu) -> Dense(decoder_dim), added back to the input.
  This class is defined again, identically, in the next cell.
  """
  def __init__(self, decoder_dim, feedforward_dim, dropout_rate):
    super().__init__()
    self.decoder_dim = decoder_dim
    self.feedforward_dim = feedforward_dim
    self.drop_prob = dropout_rate
  
    # Expand to feedforward_dim, then project back to decoder_dim for the residual add.
    self.FFN = keras.Sequential([
      keras.layers.Dense(self.feedforward_dim, activation = 'relu'),
      keras.layers.Dense(self.decoder_dim)
    ])
    self.LayerNorm = keras.layers.LayerNormalization(epsilon = 1e-6)
    self.Dropout = keras.layers.Dropout(self.drop_prob)

  def call(self, x, training):
    """Return Dropout(LayerNorm(FFN(x) + x))."""
    ffn = self.FFN(x, training = training)
    norm = self.LayerNorm(ffn + x, training = training)
    return self.Dropout(norm, training = training)


In [None]:
# NOTE(review): this cell repeats the class of the same name from the previous
# cell verbatim; this later definition is the one bound when later cells run.
class EncoderMultiHeadAttention(keras.layers.Layer):
  """Self-attention sub-block: MHA(x, x, x) + residual -> LayerNorm -> Dropout.

  The dropout rate is read from ModelConfig.dropout_rate rather than passed in.
  """
  def __init__(self, encoder_dim, num_att_heads):
    # x -> MAH + x -> LayerNorm -> Dropout
    super().__init__()
    self.encoder_dim = encoder_dim
    self.num_att_heads = num_att_heads
    self.drop_prob = ModelConfig.dropout_rate

    # Per-head key/value size is the model width divided evenly across heads.
    self.MultiHeadAttention = keras.layers.MultiHeadAttention(num_heads = self.num_att_heads, 
      key_dim = self.encoder_dim // self.num_att_heads,
      value_dim = self.encoder_dim // self.num_att_heads,
      dropout = self.drop_prob
    )
    self.LayerNorm = keras.layers.LayerNormalization(epsilon = 1e-6)
    self.Dropout = keras.layers.Dropout(self.drop_prob)
  def call(self, x, attention_mask, training):
    """Attend `x` to itself under `attention_mask`, then add, normalise and drop out."""
    MAH = self.MultiHeadAttention(query = x, key = x, value = x, attention_mask = attention_mask, training = training)
    norm = self.LayerNorm(MAH + x, training = training)
    return self.Dropout(norm, training = training)

# NOTE(review): verbatim repeat of the class of the same name from the previous cell.
class DecoderMultiHeadAttention(keras.layers.Layer):
  """Cross-attention sub-block: decoder states query the encoder states.

  Computes MHA(query=decoder, key=encoder, value=encoder) + decoder ->
  LayerNorm -> Dropout. Encoder and decoder widths must be equal (asserted).
  """
  def __init__(self, encoder_dim, decoder_dim, num_att_heads, drop_prob):
    super().__init__()
    self.encoder_dim = encoder_dim
    self.decoder_dim = decoder_dim
    # The residual connection in call() adds attention output to the decoder states.
    assert self.encoder_dim == self.decoder_dim
    self.num_att_heads = num_att_heads
    self.drop_prob = drop_prob 
    
    
    self.dec_enc_attention = keras.layers.MultiHeadAttention(num_heads = self.num_att_heads,
      key_dim = self.encoder_dim // self.num_att_heads, 
      value_dim = self.encoder_dim // self.num_att_heads,
      dropout = self.drop_prob
    )
    self.LayerNorm = keras.layers.LayerNormalization(epsilon = 1e-6)
    self.Dropout = keras.layers.Dropout(self.drop_prob)
  
  def call(self, encoder, decoder, padding_mask, training):
    """Attend decoder states to encoder states under `padding_mask`."""
    # Encoder: Tensor(B, L, C)
    # Decoder: Tensor(B, L, C)
    MAH = self.dec_enc_attention(query = decoder, key = encoder, value = encoder, attention_mask = padding_mask, training = training)
    norm = self.LayerNorm(MAH + decoder, training = training)
    return self.Dropout(norm, training = training)

# NOTE(review): verbatim repeat of the class of the same name from the previous cell.
class FFN(keras.layers.Layer):
  """Position-wise feed-forward sub-block with residual, LayerNorm and Dropout.

  Dense(feedforward_dim, relu) -> Dense(decoder_dim), added back to the input.
  """
  def __init__(self, decoder_dim, feedforward_dim, dropout_rate):
    super().__init__()
    self.decoder_dim = decoder_dim
    self.feedforward_dim = feedforward_dim
    self.drop_prob = dropout_rate
  
    # Expand to feedforward_dim, then project back to decoder_dim for the residual add.
    self.FFN = keras.Sequential([
      keras.layers.Dense(self.feedforward_dim, activation = 'relu'),
      keras.layers.Dense(self.decoder_dim)
    ])
    self.LayerNorm = keras.layers.LayerNormalization(epsilon = 1e-6)
    self.Dropout = keras.layers.Dropout(self.drop_prob)

  def call(self, x, training):
    """Return Dropout(LayerNorm(FFN(x) + x))."""
    ffn = self.FFN(x, training = training)
    norm = self.LayerNorm(ffn + x, training = training)
    return self.Dropout(norm, training = training)

In [None]:
class TransformerDecoder(keras.layers.Layer):
  """One decoder layer: masked self-attention -> cross-attention -> feed-forward.

  Each of the three sub-blocks applies its own residual connection,
  LayerNorm and Dropout.
  """
  def __init__(self, encoder_dim, decoder_dim, feedforward_dim, num_att_heads, dropout_rate):
    super().__init__()
    self.encoder_dim = encoder_dim
    self.decoder_dim = decoder_dim
    self.feedforward_dim = feedforward_dim
    self.num_att_heads = num_att_heads
    self.dropout_rate = dropout_rate

    # Self-attention over decoder states (reuses the "encoder" self-attention block).
    self.DecoderAttention = EncoderMultiHeadAttention(self.decoder_dim, self.num_att_heads)
    self.EncoderDecoderAttention = DecoderMultiHeadAttention(self.encoder_dim, self.decoder_dim, self.num_att_heads, self.dropout_rate)
    self.FFN = FFN(self.decoder_dim, self.feedforward_dim, self.dropout_rate)

  def call(self, encoder, decoder, attention_mask, padding_mask, training):
    """Apply the layer.

    `attention_mask` is passed to the decoder self-attention and
    `padding_mask` to the encoder-decoder attention.
    """
    decoder_values = self.DecoderAttention(decoder, attention_mask = attention_mask, training = training)
    decoder_attended = self.EncoderDecoderAttention(encoder, decoder_values, padding_mask = padding_mask, training = training)
    ffn = self.FFN(decoder_attended, training = training)
    return ffn

class TransformerDecoderModel(keras.Model):
  """Stack of TransformerDecoder layers on top of RoBERTa's token embeddings.

  Maps decoder token ids plus encoder hidden states to decoder hidden states;
  the vocabulary projection is done separately (see DenseHead).
  """
  def __init__(self):
    super().__init__()
    # ----------------------PROCESS---------------------
    # 1) GET EMBEDDINGS
    # 2) ADD POSITIONAL EMBEDDINGS
    # 3) RUN THROUGH THE DECODERS
    # 4) FINAL FFN
    # ---------------------PRETRAINING PARTS------------------------------
    # EMBEDDINGS
    self.model_checkpoint = ModelConfig.model_checkpoint
    # Load an Encoder Model
    tmp_model = get_model(self.model_checkpoint)
    # Steal embeddings(Pretrained Embeddings)
    # (get_model has already marked this embedding layer as non-trainable)
    self.embeddings = tmp_model.roberta.embeddings # call(input_ids, token_type_ids)
    self.vocab_size = self.embeddings.vocab_size
    del tmp_model 
    # ------------------MODEL DEFINITIONS------------------------
    # Decoder input length used by call(): one less than the label length.
    self.max_len = MAX_LENGTH_LABEL - 1 
    self.decoder_dim = ModelConfig.decoder_dim
    self.encoder_dim = ModelConfig.encoder_dim
    self.num_att_heads = ModelConfig.num_att_heads
    self.decoder_layers = ModelConfig.decoder_layers
    self.dropout_rate = ModelConfig.dropout_rate
    self.feedforward_dim = ModelConfig.intermediate_dim

    self.decoders = [TransformerDecoder(
        self.encoder_dim,
        self.decoder_dim,
        self.feedforward_dim,
        self.num_att_heads,
        self.dropout_rate
    ) for _ in range(self.decoder_layers)]
    
    
    # PRECOMPUTE CAUSAL Attention MASKS
    self.attention_mask = self.causal_attention_mask(self.max_len, self.max_len, tf.uint8)
    # PRECOMPUTE Positional Embeddings
    self.pos_enc = self.positional_embeddings(self.max_len, self.decoder_dim) # (L, C)
    self.pos_enc = tf.expand_dims(self.pos_enc, axis = 0) # (1, L, C)
  def positional_embeddings(self, max_length, dim):
    """Return a (max_length, dim) float32 table of sinusoidal position encodings.

    Even columns hold sin(pos / 10000 ** (i / decoder_dim)) and odd columns
    hold cos(pos / 10000 ** ((i + 1) / decoder_dim)).
    NOTE(review): the exponent divides by self.decoder_dim, not the `dim`
    argument, and the cos term uses (i + 1) rather than the even column's i.
    Presumably the checkpoint was trained with this form; confirm before changing.
    """
    L, C = (max_length, dim) 
    positional_encodings = np.zeros((L, C), np.float32)
    for pos in range(L):
      # i walks the even column indices; the two guards keep i and i + 1 inside [0, C).
      for i in range(0, C  + 2, 2):
        if i >= C:
          continue
        positional_encodings[pos, i] = math.sin(pos / 10000 ** (i / self.decoder_dim))
        if i + 1 >= C:
          continue 
        positional_encodings[pos, i + 1] = math.cos(pos / 10000 ** ((i + 1) / self.decoder_dim))
    return tf.identity(positional_encodings) 


  def causal_attention_mask(self, n_dest, n_src, dtype):
    """Masks the upper half of the dot product matrix in self attention.

    This prevents flow of information from future tokens to current token.
    1's in the lower triangle, counting from the lower right corner.

    Returns a tensor of shape (1, n_dest, n_src) with the given dtype.
    """
    i = tf.range(n_dest)[:, None]
    j = tf.range(n_src)
    m = i >= j - n_src + n_dest
    mask = tf.cast(m, dtype)
    mask = tf.reshape(mask, [1, n_dest, n_src])
    # `mult` evaluates to [1, 1, 1], so the tile below leaves the mask unchanged.
    mult = tf.concat(
        [tf.expand_dims(1, -1), tf.constant([1, 1], dtype=tf.int32)], 0
    )
    return tf.tile(mask, mult) # Diagonal Mask 

  def compute_padding_mask(self, decoder_ids, dtype):
    """Return a mask shaped like `decoder_ids`: 1 for real tokens, 0 for padding."""
    # Returns an Attention Mask to mask all padding tokens
    PAD_TOKEN_ID = ModelConfig.tokenizer.pad_token_id
    # Create a Mask over token ids.
    mask = tf.not_equal(decoder_ids, PAD_TOKEN_ID)
    return tf.cast(mask, dtype)
  def call_val(self, encoder, decoder, training):
    """Inference-time forward pass for a decoder prefix of any length up to max_len.

    Unlike call(), the causal mask is rebuilt for the current prefix length
    and the positional table is sliced to it.
    """
    B, L, C = encoder.shape 
    _, Dec_Len = decoder.shape 

    padding_mask = self.compute_padding_mask(decoder, tf.uint8)
    padding_mask = tf.expand_dims(padding_mask, axis = -1) # (B, Dec_Len, 1) 

    attention_mask = self.causal_attention_mask(Dec_Len, Dec_Len, tf.uint8) # (1, Dec_Len, Dec_Len)
    # Broadcasting zeroes the rows (query positions) that are padding.
    attention_mask = attention_mask * padding_mask # (B, Dec_Len, Dec_Len) 

    # Cross-attention mask: the per-query flag repeated across the L encoder positions.
    padding_mask = tf.repeat(padding_mask, L, axis = -1) # (B, Dec_Len, L)

    decoder_embeddings = self.embeddings(decoder, training = training) # (B, Dec_Len, C)
    # Pos Enc
    pos_enc = tf.identity(self.pos_enc)[:, :Dec_Len, :] # (1, Dec_Len, C)
    decoder_embeddings = decoder_embeddings + pos_enc

    for DECODER in self.decoders:
      decoder_embeddings = DECODER(encoder, decoder_embeddings, attention_mask, padding_mask, training = training)
    return decoder_embeddings 
  def call(self, encoder, decoder, training):
    '''
    Encoder: Encoder Embeddings: Tensor(B, L, C)
    Decoder: Decoder Input Ids: Tensor(B, L')
    training: in training mode?
    Unfortunately, you cannot precompute the full attention mask, since the padding mask depends on the decoder_ids

    NOTE(review): uses the precomputed (1, max_len, max_len) causal mask and the
    unsliced positional table, so L' is expected to equal self.max_len - confirm.
    '''
    # NO NEED For TOKEN TYPE IDs, as they are always 0(Always 1 Sentence)
    B, L, C = encoder.shape
    _, Dec_Len = decoder.shape

    # GENERATE MASKS
    padding_mask = self.compute_padding_mask(decoder, tf.uint8) # (B, L')
    padding_mask = tf.expand_dims(padding_mask, axis = -1) # (B, L', 1)
    attention_mask = tf.identity(self.attention_mask) # (1, max_len, max_len) - Used only in Decoder Attention
    attention_mask = attention_mask * padding_mask # (B, L', L')
    
    # Cross-attention mask: the per-query flag repeated across the L encoder positions.
    padding_mask = tf.repeat(padding_mask, L, axis = -1)
    
    
    # Convert Tokens to Embeddings
    decoder_embeddings = self.embeddings(decoder, training = training) # (B, L', C)
    # ----------------GET POS ENC FOR DECODER INPUTS(Encoder already got them) -------------------
    pos_enc = tf.identity(self.pos_enc) # (1, max_len, C)

    decoder_embeddings = decoder_embeddings + pos_enc
    # Run through the Decoders:
    for DECODER in self.decoders:
      decoder_embeddings = DECODER(encoder, decoder_embeddings, attention_mask, padding_mask, training = training)
    # FINAL HEAD
    # (hidden states are returned; the vocabulary projection is applied by the caller)
    return decoder_embeddings

In [None]:
class DenseHead(keras.Model):
  """Linear projection from decoder hidden states to vocabulary logits.

  `pad_token` and `label_smoothing` are stored but not used by the methods
  visible here.
  """
  def __init__(self, vocab_size, pad_token):
    super().__init__()
    self.vocab_size = vocab_size
    self.pad_token = pad_token
 
    self.head = keras.layers.Dense(self.vocab_size)
    
    
    self.label_smoothing = ModelConfig.label_smoothing
    

  def call(self, x, training):
    """Project every position of `x` to vocabulary logits (no argmax is applied here)."""
    pred = self.head(x, training = training) # (..., vocab_size)
    return pred
  def call_val(self, x, training):
    """Project only the last position of `x` (the next-token logits)."""
    pred = self.head(x[:, -1], training = training)
    return pred

def replace_index(cur_tokens, index, new_val):
    """Return ``cur_tokens`` with column ``index`` replaced by ``new_val``.

    ``cur_tokens`` is indexed as (batch, position) and ``new_val`` holds one
    value per batch row. Tensors are not assigned in place here, so the
    result is rebuilt by concatenating the columns before ``index``, the new
    column, and the columns after it.
    """
    prefix = cur_tokens[:, :index]
    suffix = cur_tokens[:, index + 1:]
    column = tf.expand_dims(new_val, axis = 1)
    return tf.concat([prefix, column, suffix], axis = 1)

class FullModel(keras.Model):
  """Encoder-decoder model: RoBERTa encoder, transformer decoder, linear vocab head.

  `call` performs greedy decoding and returns token ids, `call_train` is the
  teacher-forced pass returning logits, and `call_val` is greedy decoding that
  also returns the per-step logits.
  """
  def __init__(self):
    super().__init__()
    self.encoder = Encoder()
    self.START_TOKEN = ModelConfig.START_TOKEN
    self.END_TOKEN = ModelConfig.END_TOKEN # Default Special tokens for HuggingFace Tokenizers
    self.MAX_LEN = MAX_LENGTH_LABEL
    # NOTE(review): call_val sizes its empty logits tensor with this value while the
    # head emits decoder.vocab_size logits; the two must agree for the concat to work.
    self.vocab_length = ModelConfig.tokenizer.vocab_size 
    self.decoder = TransformerDecoderModel()
    self.model_head = DenseHead(self.decoder.vocab_size, ModelConfig.tokenizer.pad_token_id)
  
  def call(self, inputs, training):
    """Greedy-decode a token sequence for each row of `inputs`.

    `inputs` stacks input ids, attention mask and token type ids on the last
    axis (see tokenize_data). Returns a (B, MAX_LEN) int64 tensor whose first
    column is the start token.
    """
    input_ids = inputs[:, :, 0]
    attention_mask = inputs[:, :, 1]
    token_type_ids = inputs[:, :, 2]
    
    encoding_embeddings = self.encoder(input_ids, attention_mask, token_type_ids, training = training)
    # Read from the static shape, so the batch size must be known here.
    B = encoding_embeddings.shape[0]
    # Predict again and again, up to MAX_LEN. 
    # Every position starts as the start token and is overwritten one step at a time.
    predicted_tokens = tf.ones((B, self.MAX_LEN), dtype = tf.int64) * ModelConfig.START_TOKEN
    for i in range(1, self.MAX_LEN):
        cur_tokens = predicted_tokens[:, :i] # (B, :i)
        decoded_embeddings = self.decoder.call_val(encoding_embeddings, cur_tokens, training = training) 
        pred_tokens = self.model_head.call_val(decoded_embeddings, training = training) # (B, vocab) logits
        pred_tokens = tf.argmax(pred_tokens, axis = 1) # (B, )
    
        predicted_tokens = replace_index(predicted_tokens, i, pred_tokens) # (B, L)
    return predicted_tokens
        
  def call_train(self, input_ids, attention_mask, token_type_ids, decoder_input_ids, training):
    """Teacher-forced pass: return vocabulary logits for every decoder position."""
    # decoder_input_ids: Tensor(B, L)
    encoded_embeddings = self.encoder(input_ids, attention_mask, token_type_ids, training = training) # (B, L, C)
    decoded_values = self.decoder(encoded_embeddings, decoder_input_ids, training = training)
    preds = self.model_head.call(decoded_values, training = training)
    return preds # (B, L, vocab)
  def call_val(self, input_ids, attention_mask, token_type_ids, training):
    """Greedy decoding that returns (per-step logits, generated token ids).

    Runs MAX_LEN - 1 steps; the token tensor starts with the start token, so it
    ends up with MAX_LEN columns while the logits have MAX_LEN - 1 steps.
    """
    # Inference Loop: 
    encoded_embeddings = self.encoder(input_ids, attention_mask, token_type_ids, training = training)
    # create Starter token
    B, _, _ = encoded_embeddings.shape
    sentence_tokens = tf.ones((B, 1), tf.int64) * tf.cast(self.START_TOKEN, tf.int64)
    # Zero-length time axis: acts as the empty accumulator for the concat below.
    pred_logits = tf.ones((B, 0, self.vocab_length), encoded_embeddings.dtype)
    for i in range(self.MAX_LEN - 1):
      embeddings = self.decoder.call_val(encoded_embeddings, sentence_tokens, training = training) # (B, L, C)
      pred = self.model_head.call_val(embeddings, training = training) # (B, C)
      # Add the logits 
      TMP_LOGITS = tf.expand_dims(pred, axis = 1) # (B, 1, C)
      
      pred_logits = tf.concat([pred_logits, TMP_LOGITS], axis = 1) # (B, i + 1, C)

      pred = keras.activations.softmax(pred) # (B, C)
      pred = tf.argmax(pred, axis = -1) # (B, )
      pred = tf.expand_dims(pred, axis = 1)
      # Just append the values, should predict <END> and then just random garbage(We filter it out)
      sentence_tokens = tf.concat([sentence_tokens, tf.cast(pred, sentence_tokens.dtype)], axis = 1)
    return pred_logits, sentence_tokens

In [None]:
def LOAD_MODEL():
    """Instantiate FullModel, build it with a dummy batch, and load the saved weights.

    The model is called once on a placeholder (1, 512, 3) input before
    ``load_weights`` so that its variables exist when the checkpoint is read.
    """
    dummy_shape = (1, 512)
    dummy_features = [
        tf.ones(dummy_shape, dtype = tf.int64),   # input ids
        tf.ones(dummy_shape, dtype = tf.int64),   # attention mask
        tf.zeros(dummy_shape, dtype = tf.int64),  # token type ids
    ]

    restored = FullModel()
    restored(tf.stack(dummy_features, axis = -1), training = False)
    restored.load_weights('../input/coleridge-mlm-model/acc_model.h5')
    return restored

In [None]:
# Global model instance used by the batched prediction loop below.
model = LOAD_MODEL()

### Predict

In [None]:
def tokenize_data(data):
    """Tokenize text chunks into the stacked input format FullModel.call expects.

    Each chunk is padded/truncated to MAX_LENGTH. The input ids, attention
    mask and token type ids are stacked, in that order, on a new last axis.
    """
    encoded = ModelConfig.tokenizer(data, padding = 'max_length', truncation= True, max_length = MAX_LENGTH, return_attention_mask = True, return_token_type_ids = True)
    feature_keys = ('input_ids', 'attention_mask', 'token_type_ids')
    features = [np.array(encoded[key]) for key in feature_keys]
    return tf.stack(features, axis = -1)

In [None]:
def decode_values(labels):
    """Turn generated token ids for one paper into a submission string.

    Args:
        labels: Tensor(N, 60) of generated token ids, one row per text chunk.

    Returns:
        The distinct cleaned dataset names found in any row, sorted and
        joined with ``'|'``; ``''`` when nothing was predicted.

    Each row is cut at its first end token, and the leading start token and
    that end token are dropped before decoding. The decoded text is split on
    ``" | "`` into individual names. Empty names are discarded one by one, so
    a trailing separator such as ``"adni | "`` no longer produces ``"adni|"``.
    Names are stripped and the result is sorted so the output is deterministic.
    """
    end_token = ModelConfig.tokenizer.eos_token_id
    # Index of the first end token per row. A row with no end token presumably
    # yields 0, which makes the slice below empty, so that row adds no names.
    first_end_token = tf.argmax(tf.cast(tf.equal(labels, end_token), tf.int64), axis = -1)
    N, _ = labels.shape
    predicted_values = set()
    for n in range(N):
        value = labels[n][:first_end_token[n] + 1][1:-1]
        value = ModelConfig.tokenizer.decode(value)
        for ex in value.split(" | "):
            # Strip here as well, so the result does not depend on which
            # clean_text definition is currently bound.
            name = clean_text(ex).strip()
            if name:
                predicted_values.add(name)

    return '|'.join(sorted(predicted_values))

# Batched submission: Single Pred is too slow.

In [None]:
# Batched inference: papers are processed NUM_BATCHES at a time. All chunks of
# the papers in one group go through a single model call, and the predictions
# are then split back per paper. Exactly one entry is appended to
# PREDICTION_LABELS per paper, in all_test_data order. A failure falls back to
# an empty prediction, but the error is now printed instead of being swallowed,
# and KeyboardInterrupt is no longer caught.
PREDICTION_LABELS = []
NUM_BATCHES = 20  # number of papers per model call
cur_idx = 0
cur_COUNT = 0  # next paper count at which progress is printed

while True:
    min_idx = cur_idx
    max_idx = min_idx + NUM_BATCHES
    cur_idx = max_idx

    selected_test_data = all_test_data[min_idx: max_idx]
    TST_DATA = []   # tokenized inputs of the papers that have text
    indices = []    # per paper: (start, end) rows in the concatenated batch, or []
    padded = []     # per paper: True when the paper contributed rows to the batch
    CUR_IDX = 0
    all_zero = True

    for test_data in selected_test_data:
        if not test_data:
            padded.append(False)
            indices.append([])
            continue
        # Only tokenization sits in the try, so a failure can never leave two
        # `padded` entries behind for the same paper.
        try:
            input_values = tokenize_data(test_data)
        except Exception as err:
            print(f"Tokenization failed, using empty prediction: {err!r}")
            padded.append(False)
            indices.append([])
            continue

        padded.append(True)
        NEXT_IDX = len(input_values) + CUR_IDX
        indices.append((CUR_IDX, NEXT_IDX))
        TST_DATA.append(input_values)
        CUR_IDX = NEXT_IDX
        all_zero = False

    if all_zero or len(TST_DATA) == 0:
        print("ALLL ZEROOOS")
        PREDICTION_LABELS += [''] * len(padded)
    else:
        # TST_DATA = [] raises an error, caught in the prev if.
        valid = True
        try:
            TST_DATA = tf.concat(TST_DATA, axis = 0)
            with tf.device("GPU:0"):
                prediction = model(TST_DATA, training = False)
        except Exception as err:
            # Best effort: the whole group gets empty predictions, but say why.
            print(f"Model inference failed for papers {min_idx}-{max_idx - 1}: {err!r}")
            valid = False
            PREDICTION_LABELS += [''] * len(padded)
        if valid:
            for has_input, span in zip(padded, indices):
                if not has_input:
                    PREDICTION_LABELS.append('')
                    continue
                try:
                    idx1, idx2 = span
                    pred = prediction[idx1: idx2] # (B, L)
                    decoded_predictions = decode_values(pred)
                except Exception as err:
                    print("ERRROOROOOROOROOR")
                    print(repr(err))
                    decoded_predictions = ''
                PREDICTION_LABELS.append(decoded_predictions)

    # Progress report roughly every 200 papers.
    if cur_idx >= cur_COUNT:
        cur_COUNT += 200
        print(len(PREDICTION_LABELS))
    if cur_idx >= len(all_test_data):
        # Done.
        break
        

# Inference isn't working
- First part: getting the tokens should be fine
- Assuming token types are the issue.

In [None]:
# PREDICTION_LABELS holds one string per paper, in the same order as sample_submission.
sample_submission['PredictionString'] = PREDICTION_LABELS
sample_submission.to_csv("./submission.csv", index = False)

In [None]:
# Notebook display: show the first prediction as a quick sanity check.
sample_submission.PredictionString.iloc[0]