In [None]:
import numpy as np
import pandas as pd 
import os
import gc
import sys
import time
import random
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from typing import Optional, Dict
from overrides import overrides


! pip install --upgrade pip
! pip install --upgrade allennlp


import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch import nn
import torch.nn.functional as F
import allennlp
from allennlp.common.checks import ConfigurationError
from allennlp.data import Vocabulary, Instance, Batch
from allennlp.data.tokenizers import PretrainedTransformerTokenizer
from allennlp.data.token_indexers import PretrainedTransformerIndexer
from allennlp.data.fields import TextField, LabelField, TensorField
from allennlp.modules.token_embedders.pretrained_transformer_embedder import PretrainedTransformerEmbedder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules import FeedForward, Seq2VecEncoder, TextFieldEmbedder
from allennlp.models.model import Model
from allennlp.nn import InitializerApplicator, RegularizerApplicator
from allennlp.nn import util
from allennlp.training.metrics import MeanAbsoluteError


from transformers import AutoModel, AutoTokenizer

from sklearn.model_selection import KFold

In [None]:
from allennlp.modules.token_embedders.pretrained_transformer_embedder import PretrainedTransformerEmbedder
# import inspect
# inspect.signature(PretrainedTransformerEmbedder.__init__)

In [None]:
# Root folder of the competition data; the training split is read from it.
data_dir = '../input/commonlitreadabilityprize/'
train = pd.read_csv(os.path.join(data_dir, 'train.csv'))

# Generate a sequence of word representations (vectors) for model input
* Huggingface: the `Tokenizer` is responsible for everything, with some useful methods to probe: `convert_tokens_to_ids(self.tokenize())`, `encode`, `encode_plus`
* AllenNLP has more control for varying Tokenization and Indexing with `Vocabulary`, `Indexer` and `Tokenizer`.

# AllenNLP: Text -> Vectors
1.`Tokenizers` (Text → Tokens) and `TokenIndexers` (Tokens → Ids):  
   * Tokenizers: AllenNLP has its own tokenizers as well as tokenizers built on top of `Spacy` tokenizers. Either way, a tokenizer outputs a list of `Token` objects.
   * Indexers: This is super convenient when we experiment with multiple indexers. Combined with the tokenization step above, a simple Text → Ids example is shown in the `TextField` snippet below.
   * Working with 🤗transformers: Since transformers models have a fixed scheme for Tokens → Ids, `Tokenizers` and `TokenIndexers` have to be matched using `PretrainedTransformerTokenizer` and `PretrainedTransformerIndexer`. Under the hood, they call the 🤗transformers Auto Classes, which
        > automatically retrieve the relevant model given the name/path to the pretrained weights/config/vocabulary
        + `AutoModel.from_pretrained('bert-base-cased')`
        + `AutoTokenizer.from_pretrained('bert-base-cased')`: this will access the vocabulary and output [`PreTrainedTokenizerBase`](https://huggingface.co/transformers/internal/tokenization_utils.html#pretrainedtokenizerbase) object for both tokenization and indexing.
        + `AutoConfig.from_pretrained('bert-base-cased')`: this is inferred automatically by the above two methods which output Configuration objects like [`DistilBertConfig`](https://huggingface.co/transformers/_modules/transformers/models/distilbert/configuration_distilbert.html#DistilBertConfig)
        + **Since the code has to be submitted without network access, we have to point to a local directory that contains `config.json`, e.g., `config_dir = "../input/roberta-base"`**
    

2. `TextField`
```
    tokenizer = ...  # Whatever tokenizer you want
    sentence = "We are learning about TextFields"
    tokens = tokenizer.tokenize(sentence)
    token_indexers = {"indexer1": SingleIdTokenIndexer()} # we'll talk about this in the next section
    text_field = TextField(tokens, token_indexers)
    token_tensor = text_field.as_tensor() # The output would be: {"indexer1": {"tokens": torch.LongTensor([[1, 3, 2, 9, 4, 3]])}}
```

3. `TextFieldEmbedders` (Ids → Vectors): Embed those IDs into a vector space; this happens on the model side. The names have to align with the ones used in the `TokenIndexers`.
```
embedder = BasicTextFieldEmbedder(token_embedders={"indexer1": Embedding(num_embeddings=10, embedding_dim=3)}) # This 'indexer1' key must match the 'indexer1' key in the `token_tensor` above. 
```
        
        

In [None]:
# Path to a serialized weights file (.bin); presumably a fine-tuned RoBERTa checkpoint — TODO confirm its key layout.
backbone = '../input/d/maunish/clr-roberta/model3/model3.bin'
# Local directory passed as `model_name` to the tokenizer, indexer and vocabulary.
config_dir = "../input/roberta-base"

In [None]:
# construct instances
# `device` is needed at the end of this cell. Define it here so the notebook
# survives Restart & Run All (it used to be defined only in a later cell).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_instances, test_instances = [], []
tokenizer = PretrainedTransformerTokenizer(model_name=config_dir, add_special_tokens=True)
indexer = PretrainedTransformerIndexer(model_name=config_dir)
vocab = Vocabulary.from_pretrained_transformer(config_dir)
for text, y in zip(train['excerpt'], train['target']):
    tokens = tokenizer.tokenize(text)  # text -> tokens
    text_field = TextField(tokens, token_indexers={'tokens': indexer})  # tokens -> ids
    text_field.index(vocab)
    # To feed a plain Huggingface model instead of the AllenNLP embedder, the
    # tensors could be extracted (and given a batch dimension) like this:
    #   token_tensor = text_field.as_tensor(text_field.get_padding_lengths())
    #   token_tensor['tokens']['token_ids'] = token_tensor['tokens']['token_ids'].unsqueeze(0)
    #   token_tensor['tokens']['mask'] = token_tensor['tokens']['mask'].unsqueeze(0)
    #   token_tensor['tokens']['type_ids'] = token_tensor['tokens']['type_ids'].unsqueeze(0)
    fields = {
        "tokens": text_field,
        "label": TensorField(torch.FloatTensor([y])),  # regression target, shape (1,)
    }
    train_instances.append(Instance(fields))

# `model_name` has to be a model identifier or a directory holding `config.json`.
# Passing the raw weights file (`backbone`, a .bin) made the config loader read
# binary data as text, which is what raised
#   UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 64
# `load_weights=False` is dropped as well: combined with `train_parameters=False`
# it would leave a frozen, randomly initialised transformer.
# NOTE(review): this assumes `config_dir` also contains the pretrained weights — confirm.
# TODO(review): to load the checkpoint in `backbone`, pass
#   override_weights_file=backbone
# (plus `override_weights_strip_prefix=...` if its keys carry a prefix); verify
# the checkpoint's key layout before enabling it.
text_field_embedder = PretrainedTransformerEmbedder(model_name=config_dir, train_parameters=False)
text_field_embedder = text_field_embedder.to(device)


In [None]:
# Due to the unsolved bug above, I choose to directly use transformers model
# text_field_embedder = BasicTextFieldEmbedder(token_embedders={"tokens": token_embedder})
# print("Using the TextFieldEmbedder:", text_field_embedder(token_tensor))
# print("Using the TokenEmbedder:", token_embedder(token_tensor['tokens']['token_ids'], token_tensor['tokens']['mask']))


# # `mask` is needed for `encoder`
# tokens_mask = token_tensor['tokens']['mask'] #get_text_field_mask(tokens) # shape: (B, T)
# print(tokens_mask)
# from allennlp.nn import util
# tokens_mask = util.get_text_field_mask(token_tensor) # shape: (B, T)
# print(tokens_mask)


# # Even though `TextFieldEmbedders` is the standard way to do embeddings, 
# # we have another option by just using `TokenEmbedder` (since normally we donot have > 1 (indexer, embedder) pair to concatenate).
# token_ids = tokens['tokens']['token_ids'] # shape: (B, T)
# embeddings = self.pretrained_embeddings.forward


# indexer logic: e.g. the length of: "[CLS] A B C [SEP] [CLS] D E F [SEP]" .
#         (token_ids=token_ids, mask=mask) # shape: (B, T, C)

In [None]:
# construct AllenNLP model for this regression task
class ClrpModel(Model):
    """Regression model: text-field embedder -> optional Seq2Vec pooling -> regressor head.

    Parameters
    ----------
    vocab : Vocabulary
        Vocabulary handed to the AllenNLP ``Model`` base class.
    text_field_embedder : TextFieldEmbedder
        Maps the indexed ``tokens`` field to per-token vectors.
    regressor : nn.Module
        Head applied to the pooled vector to produce the prediction.
    seq2vec_encoder : Seq2VecEncoder, optional
        Pools the token vectors into one vector. When ``None`` the vector of the
        first token is used instead.
    initializer : InitializerApplicator, optional
        Applied to the model after construction. A fresh default applicator is
        created when omitted.
    regularizer : RegularizerApplicator, optional
        Passed through to the ``Model`` base class.
    remove_cls : bool
        When a ``seq2vec_encoder`` is given, drop the first token (and its mask
        entry) before pooling.
    """

    def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 regressor: nn.Module,
                 seq2vec_encoder: Optional[Seq2VecEncoder] = None,
                 initializer: Optional[InitializerApplicator] = None,
                 regularizer: Optional[RegularizerApplicator] = None,
                 remove_cls: bool = False) -> None:
        super(ClrpModel, self).__init__(vocab, regularizer)

        self.text_field_embedder = text_field_embedder
        self.seq2vec_encoder = seq2vec_encoder
        self.regressor = regressor
        self.metrics = {
                "mae": MeanAbsoluteError()
        }
        self.loss = nn.MSELoss()
        self.remove_cls = remove_cls

        # Build the default applicator per instance instead of sharing a single
        # object created once at class-definition time.
        if initializer is None:
            initializer = InitializerApplicator()
        initializer(self)

    @overrides
    def forward(self,
                tokens: Dict[str, Dict[str, torch.LongTensor]],
                label: torch.Tensor = None) -> Dict[str, torch.Tensor]:

        """
        Parameters
        ----------
        tokens : Dict[str, Dict[str, torch.LongTensor]], required
            The tensorised ``tokens`` text field of a batch.
        label : torch.Tensor, optional (default = None)
            Regression target for each instance in the batch.
        Returns
        -------
        An output dictionary consisting of:
        logits : torch.Tensor
            The raw output of ``regressor`` for each instance
            (expected shape ``(batch_size, 1)``).
        loss : torch.Tensor, optional
            Scalar mean-squared-error loss; only present when ``label`` is given.
        """

        tokens_mask = util.get_text_field_mask(tokens) # shape: (B, T)
        torch.cuda.empty_cache() # kept from the original: "in case of OOM error"
        embedded_tokens = self.text_field_embedder(tokens) # shape: (B, T, C)
        if self.seq2vec_encoder is None:
            # No pooling module: the first token's vector summarises the sequence.
            encoded_tokens = embedded_tokens[:,0,:]
        else:
            if self.remove_cls:
                embedded_tokens = embedded_tokens[:,1:,:]
                tokens_mask = tokens_mask[:, 1:]
            encoded_tokens = self.seq2vec_encoder(embedded_tokens, mask=tokens_mask) # shape: (B, hidden)

        logits = self.regressor(encoded_tokens) # shape: (B, hidden) -> (B, 1)

        output_dict = {'logits': logits}
        if label is not None:
            # Flatten both sides first so that a (B,) label can never broadcast
            # against (B, 1) predictions inside the metric or the loss.
            flat_predictions = logits.reshape(-1)
            flat_targets = label.reshape(-1)
            for metric in self.metrics.values():
                metric(flat_predictions, flat_targets)
            output_dict["loss"] = self.loss(flat_predictions, flat_targets)

        return output_dict

    @overrides
    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """Return the current value of every tracked metric, optionally resetting them."""
        return {metric_name: metric.get_metric(reset) for metric_name, metric in self.metrics.items()}

class View(nn.Module):
    """Insert a singleton dimension at position 1 of the input tensor."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        # (d0, d1, ...) -> (d0, 1, d1, ...)
        return x.unsqueeze(1)
    
# instantiate the regressor
def crt_model(hidden_size=1024):
    """Build the regression head that maps a feature vector to a single value.

    Parameters
    ----------
    hidden_size : int
        Size of the incoming feature vector; also used as the hidden width.

    Returns
    -------
    nn.Sequential
        ``Linear(hidden_size, hidden_size) -> Dropout(0.1) -> Linear(hidden_size, 1)``.
    """
    # Experiment log (training loss recorded for each head that was tried):
    #   * View -> 3 x Conv1d(kernel 5) + ReLU with MaxPool1d -> Linear(256, 120) -> Linear(120, 1):
    #       1429762551.7295775
    #   * View -> 8 x Conv1d(kernel 5) growing to 8196 channels -> MaxPool1d(229) -> 5 Linear layers:
    #       2.057927086896341e+36 with normal_ weights / zero bias init;
    #       1.1041184755698057 (presumably without that init)
    #   * the linear head below: 59182.79889552157
    layers = [
        nn.Linear(hidden_size, hidden_size),
        nn.Dropout(0.1),
        nn.Linear(hidden_size, 1),
    ]
    return nn.Sequential(*layers)


    
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
config = {
#     'lr': 1e-5,
#     'wd':1e-1,
#     'batch_size_pretrained_infer': 128,
    'batch_size_reg_train':8,
    'max_len':256,
    'epochs':7,
    'nfolds':3,
    'seed':42,
}

# Apply the configured seed so that head initialisation and dropout are
# reproducible across runs (the seed was declared but never used).
random.seed(config['seed'])
np.random.seed(config['seed'])
torch.manual_seed(config['seed'])

# construct model
# The head's input width must equal the embedder's output width. Derive it from
# the embedder instead of relying on crt_model's default of 1024.
regressor = crt_model(hidden_size=text_field_embedder.get_output_dim())
regressor.to(device)
model = ClrpModel(vocab, BasicTextFieldEmbedder(token_embedders={'tokens': text_field_embedder}), regressor)
# Move the assembled model as a whole so no sub-module is left on the CPU.
model.to(device)

In [None]:
# train model
class CLRPDataset(torch.utils.data.Dataset):
    """Map-style dataset that exposes a sequence of items to a ``DataLoader``.

    It used to derive from ``nn.Module`` without initialising it; a dataset is
    not a network layer, so it now derives from ``torch.utils.data.Dataset``.

    Parameters
    ----------
    instances : sequence
        Items returned one by one (in this notebook: AllenNLP ``Instance`` objects).
    """

    def __init__(self, instances):
        super().__init__()
        self.instances = instances

    def __getitem__(self, idx):
        """Return the item stored at position ``idx``."""
        return self.instances[idx]

    def __len__(self):
        """Return the number of stored items."""
        return len(self.instances)
    
def collate_fn(instances):
    """Collate a list of AllenNLP instances into a dictionary of tensors."""
    batch = Batch(instances)
    # Index against the notebook-level vocabulary before tensorising.
    batch.index_instances(vocab)
    return batch.as_tensor_dict()


optimizer = torch.optim.Adam(model.parameters())
lr_scheduler = None

best_loss = 99999
best_valid_predictions = list()

train_dl = DataLoader(CLRPDataset(train_instances),
                batch_size=config["batch_size_reg_train"],
                shuffle=True,  # reshuffle every epoch; training on a fixed order was unintended
                collate_fn=collate_fn,
#                 num_workers = 4,
                pin_memory=True,
                drop_last=False)

# Train for the configured number of epochs. The epoch loop used to be commented
# out, so `config['epochs']` was ignored and only a single pass was made.
model.train()
for epoch in range(config['epochs']):
    train_loss = 0
    for inputs in train_dl:
        inputs = util.move_to_device(inputs, device)
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs['loss']
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Mean loss over the batches of this epoch.
    train_loss /= len(train_dl)
    print(f"epoch:{epoch} | Training loss:{train_loss}")

# TODO: add a validation pass per epoch (track `best_loss` and
# `best_valid_predictions`) once a validation loader and loop exist.
# torch.save(model.state_dict(), './model.th')

In [None]:
# predict
test = pd.read_csv(data_dir + 'test.csv')
test_instances = list()
for text in test['excerpt']:
    tokens = tokenizer.tokenize(text)
    text_field = TextField(tokens, token_indexers={'tokens': indexer})
    test_instances.append(Instance({"tokens": text_field}))

# Run inference in mini-batches: pushing the whole test set through the
# transformer in a single batch can exhaust memory on larger test sets.
test_dl = DataLoader(test_instances,
                     batch_size=config["batch_size_reg_train"],
                     shuffle=False,  # keep row order aligned with `test.id`
                     collate_fn=collate_fn)

# Switch to evaluation mode so dropout is disabled while predicting.
model.eval()
batch_predictions = []
with torch.no_grad():
    for inputs in test_dl:
        inputs = util.move_to_device(inputs, device)
        logits = model(**inputs)['logits']
        # reshape(-1) always yields a 1-D array, even for a batch with a single row.
        batch_predictions.append(logits.reshape(-1).cpu().numpy())
test_predictions = np.concatenate(batch_predictions)

submission = pd.DataFrame({'id': test.id, 'target': test_predictions})
submission.to_csv('submission.csv', index=False)