In [1]:
import os
from tqdm.auto import tqdm

## Source
1. https://link.springer.com/chapter/10.1007/978-3-030-14799-0_11
1. https://colab.research.google.com/drive/1ZQvuAVwA3IjybezQOXnrXMGAnMyZRuPU#scrollTo=dYapTjoYa0kO

In [None]:
# Bert 
import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader
from torch.utils.data import RandomSampler
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import BertConfig, BertForSequenceClassification, BertTokenizer, BertModel

In [None]:
# Build BERT-style input tensors for the first 10 rows of df['clean_txt'].
# NOTE(review): every iteration rebinds the same variables, so after the loop
# only the tensors built from the last of the 10 rows remain.
# NOTE(review): `df` and `tokenizer` are not assigned in this cell; `tokenizer`
# is created in a cell further down, which has to be run first.
for i in df['clean_txt'].head(10):
    text = i
    # Wrap the raw text in the "[CLS]" / "[SEP]" marker strings before tokenizing.
    marked_text = "[CLS] " + text + " [SEP]"
    # Split into tokens, then map each token to its vocabulary id.
    tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    # One segment id (always 1) per token.
    segments_ids = [1] * len(tokenized_text)
    # Wrapping each list in another list yields tensors with a leading dimension of 1.
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

In [None]:
# Run a forward pass on the tensors built by the tokenisation cell.
# Gradients are not needed for feature extraction, so tracking is disabled.
with torch.no_grad():
    # Pass the segment ids by keyword: in `transformers` the second positional
    # parameter of BertModel.forward is `attention_mask`, not `token_type_ids`,
    # so a positional call silently uses the segment ids as an attention mask.
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
    # Index rather than tuple-unpack: indexing works both when the model
    # returns a plain tuple and when it returns a ModelOutput object.
    encoded_layers = outputs[0]
    print(encoded_layers)

In [None]:
# Report the shape of the first element of `encoded_layers` (set by the forward-pass cell).
print('Tensor shape for each layer: ', encoded_layers[0].shape)

In [None]:
# Load the pre-trained BERT encoder weights named by PRETRAINED_MODEL.
# NOTE(review): PRETRAINED_MODEL is assigned in the "Global variables" cell
# further down; that cell has to be run before this one.
model = BertModel.from_pretrained(PRETRAINED_MODEL)

# Switch the module to evaluation mode (as opposed to training mode); this
# part of the notebook only runs forward passes. As the last expression of
# the cell, the call also makes the notebook display the model.
model.eval()

In [None]:
# Load the tokenizer (vocabulary) that belongs to the PRETRAINED_MODEL checkpoint.
# NOTE(review): PRETRAINED_MODEL is assigned in the "Global variables" cell
# further down; that cell has to be run before this one.
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL)

In [None]:
# Global variables
# Notebook-wide configuration. The Transformers class reads several of these
# as module-level globals, so this cell has to run before any training or
# prediction cell.
#PAD_TOKEN_LABEL_ID = CrossEntropyLoss().ignore_index  #FIXME Not used anywhere
# Checkpoint name handed to every from_pretrained() call in the notebook.
PRETRAINED_MODEL = 'bert-base-multilingual-uncased'
# Not referenced in the visible cells — presumably used where the DataLoaders are built; confirm.
BATCH_SIZE = 2
# Learning rate of the optimizer group holding model.bert parameters.
LEARNING_RATE_MODEL = 1e-5
# Learning rate of the optimizer group holding model.classifier parameters.
LEARNING_RATE_CLASSIFIER = 1e-3
# num_warmup_steps for the linear learning-rate schedule.
WARMUP_STEPS = 0
# Batches whose gradients are accumulated before one optimizer step.
GRADIENT_ACCUMULATION_STEPS = 1
# Max norm passed to clip_grad_norm_ before each optimizer step.
MAX_GRAD_NORM = 1.0
# Seed applied by Transformers._set_seed().
SEED = 42
# When True the CPU is used even if CUDA is available.
NO_CUDA = False
# Not referenced in the visible cells — presumably the tokenisation length limit; confirm.
MAX_SENTENCE_LENGTH = 128
# NOTE(review): NUM_LABELS is commented out here but is still passed to
# BertConfig.from_pretrained() in a later cell — it has to be defined before that cell runs.
#NUM_LABELS = len(df_train.y.unique())
# Number of training epochs passed to Transformers.train().
EPOCHS = 2
# NOTE(review): MODEL_DIR is commented out here but is still passed to
# evaluate() in a later cell — it has to be defined before that cell runs.
#MODEL_DIR = "./model/{}-batch-{}-epoch-{}/".format(PRETRAINED_MODEL, BATCH_SIZE, EPOCHS)
#try: os.mkdir(MODEL_DIR) 
#except FileExistsError: pass

In [None]:
class Transformers:
    """Train, load and run a BERT sequence-classification model.

    An instance keeps the tokenizer, the torch device and, after ``train()``
    or ``load()``, the model. Hyper-parameters are read from the notebook's
    module-level constants (NO_CUDA, SEED, LEARNING_RATE_MODEL,
    LEARNING_RATE_CLASSIFIER, WARMUP_STEPS, GRADIENT_ACCUMULATION_STEPS,
    MAX_GRAD_NORM).
    """

    # Class-level default meaning "no model yet"; train() / load() set the
    # real model on the instance.
    model = None

    def __init__(self, tokenizer):
        """Store the tokenizer and select the compute device.

        Args:
            tokenizer: tokenizer object kept on ``self.tokenizer``.
        """
        use_cuda = torch.cuda.is_available() and not NO_CUDA
        self.device = torch.device("cuda" if use_cuda else "cpu")
        self.tokenizer = tokenizer

    def predict(self, sentence):
        """Predict the class index for a single sentence.

        Calls ``load()`` with its default directory when no model or
        tokenizer is available yet.

        Args:
            sentence: the text to classify.

        Returns:
            list: predicted class indices, one per input row.
        """
        if self.model is None or self.tokenizer is None:
            self.load()

        # NOTE(review): `convert_to_embedding` is not defined in the visible
        # cells; it is expected to yield batches of tensors — confirm.
        # -1 is passed in the label slot of the (sentence, label) pair.
        embeddings = list(convert_to_embedding([(sentence, -1)]))
        return self._predict_tags_batched(embeddings)

    def evaluate(self, dataloader, y_true):
        """Predict over ``dataloader`` and print a classification report.

        Args:
            dataloader: iterable of batches accepted by ``_predict_tags_batched``.
            y_true: ground-truth labels aligned with the predictions.
        """
        from sklearn.metrics import classification_report
        y_pred = self._predict_tags_batched(dataloader)
        score = classification_report(y_true, y_pred)
        print(score)

    def _predict_tags_batched(self, dataloader):
        """Run the model over every batch and collect arg-max class indices.

        Args:
            dataloader: iterable of batches; each batch is an iterable of
                tensors whose first element is fed to the model.

        Returns:
            list: one predicted class index per input row.
        """
        preds = []
        self.model.eval()
        for batch in tqdm(dataloader, desc="Computing NER tags"):
            batch = tuple(t.to(self.device) for t in batch)
            # No gradients are needed for prediction; disabling them saves
            # memory and speeds up the forward pass.
            with torch.no_grad():
                outputs = self.model(batch[0])
                # outputs[0] holds the per-class scores; take the arg-max over dim 1.
                _, is_neg = torch.max(outputs[0], 1)
                preds.extend(is_neg.cpu().detach().numpy())

        return preds

    def train(self, dataloader, model, epochs):
        """Fine-tune ``model`` on ``dataloader`` for ``epochs`` epochs.

        Args:
            dataloader: sized iterable of batches; ``batch[0]`` is the model
                input and ``batch[1]`` the labels.
            model: model exposing ``.bert`` and ``.classifier`` sub-modules,
                which are optimized with separate learning rates.
            epochs: number of passes over ``dataloader``.

        Returns:
            tuple: ``(global_step, mean_loss)`` where ``global_step`` is the
            number of optimizer steps taken and ``mean_loss`` the accumulated
            loss divided by it (``0.0`` when no step was taken).

        Raises:
            AssertionError: if a model is already attached to this instance.
        """
        assert self.model is None  # make sure we are not training after load() command
        model.to(self.device)
        self.model = model

        t_total = len(dataloader) // GRADIENT_ACCUMULATION_STEPS * epochs

        # Prepare optimizer and schedule (linear warmup and decay); the
        # encoder and the classification head get different learning rates.
        optimizer_grouped_parameters = [
            {"params": model.bert.parameters(), "lr": LEARNING_RATE_MODEL},
            {"params": model.classifier.parameters(), "lr": LEARNING_RATE_CLASSIFIER}
        ]
        optimizer = AdamW(optimizer_grouped_parameters)
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=t_total)

        # print() does not interpolate logging-style arguments, so the values
        # are formatted explicitly. len(dataloader) counts batches, not examples.
        print("***** Running training *****")
        print("Using: {}".format(self.device))
        print("Training on %d batches per epoch" % len(dataloader))
        print("Num Epochs = %d" % epochs)
        print("Total optimization steps = %d" % t_total)

        global_step = 0
        tr_loss = 0.0
        model.zero_grad()
        # tqdm(range(n)) is what trange(n) expands to; `trange` itself is not
        # imported in this notebook.
        train_iterator = tqdm(range(epochs), desc="Epoch")
        self._set_seed()
        for _ in train_iterator:
            epoch_iterator = tqdm(dataloader, desc="Iteration")
            for step, batch in enumerate(epoch_iterator):
                model.train()
                batch = tuple(t.to(self.device) for t in batch)
                outputs = model(batch[0], labels=batch[1])
                loss = outputs[0]  # the loss comes first when labels are supplied

                if GRADIENT_ACCUMULATION_STEPS > 1:
                    # Scale so the accumulated gradient matches one large batch.
                    loss = loss / GRADIENT_ACCUMULATION_STEPS

                loss.backward()

                tr_loss += loss.item()
                if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)

                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule

                    model.zero_grad()
                    global_step += 1

        # Guard against an empty dataloader or epochs == 0, where no
        # optimizer step happened.
        mean_loss = tr_loss / global_step if global_step else 0.0
        return global_step, mean_loss

    def _set_seed(self):
        """Seed torch's random number generators with ``SEED``."""
        torch.manual_seed(SEED)
        # Compare the device *type*: a torch.device never equals the string
        # 'gpu', so the CUDA branch was previously unreachable.
        if self.device.type == 'cuda':
            torch.cuda.manual_seed_all(SEED)

    def load(self, model_dir='weights/'):
        """Load tokenizer and classifier from ``model_dir`` and move the model to the device.

        Args:
            model_dir: directory passed to both ``from_pretrained`` calls.
        """
        self.tokenizer = BertTokenizer.from_pretrained(model_dir)
        self.model = BertForSequenceClassification.from_pretrained(model_dir)
        self.model.to(self.device)

In [None]:
# Load the serialized test and train loader objects from ./model/dataLoaders/
# (presumably DataLoader instances written with torch.save — confirm).
# NOTE(review): torch.load unpickles the file contents; only load files that
# come from a trusted source.
dataloader_test = torch.load('./model/dataLoaders/dataTest')
dataloader_train = torch.load('./model/dataLoaders/dataTrain')

In [None]:
# Build the configuration, tokenizer and sequence-classification model from
# the PRETRAINED_MODEL checkpoint.
# NOTE(review): NUM_LABELS is only present as a commented-out line in the
# "Global variables" cell, so it has to be defined before this cell runs.
# NOTE(review): this rebinds `tokenizer` and `model`, which earlier cells
# assigned to a plain BertTokenizer / BertModel.
config = BertConfig.from_pretrained(PRETRAINED_MODEL, 
                                    num_labels=NUM_LABELS)
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL, 
                                          do_lower_case=True)
model = BertForSequenceClassification.from_pretrained(PRETRAINED_MODEL, 
                                                      config=config)

In [None]:
# List the model's named parameters and show three slices of them
# (name left-aligned, tensor shape right-aligned).
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

row_format = "{:<55} {:>12}"
sections = (
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-4:]),
)

for header, section in sections:
    print(header)
    for p in section:
        print(row_format.format(p[0], str(tuple(p[1].size()))))

In [None]:
# Evaluate the model stored in MODEL_DIR against the test labels.
# NOTE(review): `evaluate` is defined in a cell further down, and MODEL_DIR is
# only present as a commented-out line in the "Global variables" cell; both
# have to exist before this cell runs.
# NOTE(review): `df_test_test` is not defined in the visible cells — confirm
# the intended dataframe name.
evaluate(MODEL_DIR, y_true=df_test_test.y.values)

In [None]:
# Fine-tune the classification model on the training loader.
predictor = Transformers(tokenizer)
#predictor.load(MODEL_DIR)
# The training loader is loaded as `dataloader_train`; the bare name
# `dataloader` is not assigned in any visible cell.
predictor.train(dataloader_train, model, epochs=EPOCHS)

In [None]:
def evaluate(model_dir, y_true, dataloader=None):
    """Load a saved classifier from ``model_dir`` and print its classification report.

    Args:
        model_dir: directory handed to ``Transformers.load``.
        y_true: ground-truth labels, forwarded to ``Transformers.evaluate``.
        dataloader: batches to predict on. Defaults to the notebook-level
            ``dataloader_test``.
    """
    if dataloader is None:
        # The test loader is stored as `dataloader_test`; the previously used
        # name `data_loader_test` is not assigned in any visible cell.
        dataloader = dataloader_test
    # Transformers.load() replaces this tokenizer with the one saved in model_dir.
    tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL, do_lower_case=True)
    predictor = Transformers(tokenizer)
    predictor.load(model_dir=model_dir)
    predictor.evaluate(dataloader, y_true)