## Colab Setup

In [1]:
import sys
#sys.path.append("..") # only used if we can a notebook folder?

is_in_colab = 'google.colab' in sys.modules

if is_in_colab:
    from google.colab import drive
    drive.mount('/content/drive')
    sys.path.insert(0,'/content/drive/MyDrive/nlp_question_answer_project')

    %cd /content/drive/MyDrive/nlp_question_answer_project/

Mounted at /content/drive
/content/drive/MyDrive/nlp_question_answer_project


In [2]:
%%capture
!pip install -r requirements.txt

In [3]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

## Code

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import TrainingArguments, Trainer
from utils.dataset import load_datasets_by_language, save_dataset, load_dataset

import pandas as pd

import torch
from torch import nn
import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score # @TODO: pytorch equviavlent or my own implementation?

import numpy as np

from tqdm import tqdm

import datasets
import math

import gc

In [5]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [6]:
path_to_training_set   = "data/raw_train_set.pkl"
path_to_validation_set = "data/raw_validation_set.pkl"
dataset = load_datasets_by_language(path_to_training_set, path_to_validation_set)

Beam search: Repeating patterns
No repeat ngram: Remedy to repeating patterns
Sample from top K: Leads to more variation. However, the setences make less sense given the context
Sample from top p (nucleus): Again more variation, and possibly the words make more sense

In [None]:
def transformers_generate_text(prompt: str, tokenizer, model, max_length: int = 50):
    ## Beam search
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    outputs = model.generate(input_ids, max_length=max_length, num_beams=5, num_return_sequences=5)
    print("----- Beam Search -----")
    for i, output in enumerate(outputs):
        print("{}: <S>{}<E>".format(i, tokenizer.decode(output, skip_special_tokens=True)))

    ## Beam search + no repeat ngram
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    outputs = model.generate(input_ids, max_length=max_length, num_beams=5, num_return_sequences=5, no_repeat_ngram_size=2)
    print("\n\n----- Beam Search + no repeat ngram=2 -----")
    for i, output in enumerate(outputs):
        print("{}: <S>{}<E>".format(i, tokenizer.decode(output, skip_special_tokens=True)))

    ## sample from Top k words
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    outputs = model.generate(input_ids, do_sample=True, max_length=max_length, top_k=50, num_return_sequences=5, early_stopping=True)
    print("\n\n----- Top 50 words -----")
    for i, output in enumerate(outputs):
        print("{}: <S>{}<E>".format(i, tokenizer.decode(output, skip_special_tokens=True)))

    ## sample from Top p (nucleus) words with top k to filter out low probability words
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    outputs = model.generate(input_ids, do_sample=True, max_length=max_length, top_p=0.80, top_k=50, num_return_sequences=5, early_stopping=True)
    print("\n\n----- Intersection between Top 80% words + Top 50 words -----")
    for i, output in enumerate(outputs):
        print("{}: <S>{}<E>".format(i, tokenizer.decode(output, skip_special_tokens=True)))

# English

## A. Pretrained

In [7]:
model_checkpoint = "gpt2"
tokenizer_en = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
model_en = AutoModelForCausalLM.from_pretrained(model_checkpoint, pad_token_id=tokenizer_en.eos_token_id)

model_en.to(device)

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/548M [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

## A. Finetune

In [8]:
def chunkify_transformer_tokens(tokens, chunk_size: int = 128):
    concatenated_examples = {k: sum(tokens[k], []) for k in tokens.keys()}
  
    # Drop the last chunk if it's smaller than chunk_size
    total_length = len(concatenated_examples[list(tokens.keys())[0]])
    total_length = (total_length // chunk_size) * chunk_size

    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }

    # Create a new labels column
    result["label"] = result["input_ids"].copy()
    result["labels"] = result["input_ids"].copy()

    return result


In [9]:
def prepare_text_for_finetuning(text):
    tokens = tokenizer_en(text)
    token_chunks = chunkify_transformer_tokens(tokens)
    return datasets.Dataset.from_dict(token_chunks)

In [10]:
document = dataset["en"]["train_set"]["document_plaintext"].tolist()
train_set = prepare_text_for_finetuning(document)

document = dataset["en"]["validation_set"]["document_plaintext"].tolist()
eval_set = prepare_text_for_finetuning(document)

Token indices sequence length is longer than the specified maximum sequence length for this model (1173 > 1024). Running this sequence through the model will result in indexing errors


In [11]:
training_args = TrainingArguments(
    "lm-finetuned/gpt_en",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    #num_train_epochs=1,
)

trainer = Trainer(
    model=model_en,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=eval_set
)

In [12]:
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

trainer.train()

eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

***** Running Evaluation *****
  Num examples = 1032
  Batch size = 8


***** Running training *****
  Num examples = 7586
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2847


Perplexity: 37.97


Epoch,Training Loss,Validation Loss
1,3.7035,3.466416
2,3.5179,3.463253
3,3.4396,3.464621


Saving model checkpoint to lm-finetuned/gpt_en/checkpoint-500
Configuration saved in lm-finetuned/gpt_en/checkpoint-500/config.json
Model weights saved in lm-finetuned/gpt_en/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1032
  Batch size = 8
Saving model checkpoint to lm-finetuned/gpt_en/checkpoint-1000
Configuration saved in lm-finetuned/gpt_en/checkpoint-1000/config.json
Model weights saved in lm-finetuned/gpt_en/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to lm-finetuned/gpt_en/checkpoint-1500
Configuration saved in lm-finetuned/gpt_en/checkpoint-1500/config.json
Model weights saved in lm-finetuned/gpt_en/checkpoint-1500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1032
  Batch size = 8
Saving model checkpoint to lm-finetuned/gpt_en/checkpoint-2000
Configuration saved in lm-finetuned/gpt_en/checkpoint-2000/config.json
Model weights saved in lm-finetuned/gpt_en/checkpoint-2000/pytorch_model.bin
Saving model checkp

Perplexity: 31.96


In [18]:
model_en.save_pretrained("gpt2-en-fine")

NameError: ignored

## B.

In [None]:
transformers_generate_text("whales are", tokenizer_en, model_en)

## (C.)

## D.

We can you use trainer here to compute the perplexity.
Or use https://huggingface.co/docs/transformers/perplexity

## E.

In [7]:
model_checkpoint = "distilgpt2" # @TODO: Use our finetuned model
tokenizer_en = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
tokenizer_en.pad_token = tokenizer_en.eos_token
model_en = AutoModelForCausalLM.from_pretrained(model_checkpoint, pad_token_id=tokenizer_en.eos_token_id)

Downloading:   0%|          | 0.00/762 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/353M [00:00<?, ?B/s]

In [8]:
class Tiny_Network(nn.Module):

    def __init__(self, input_dim):
        super(Tiny_Network, self).__init__()

        self.hidden = nn.Linear(2*input_dim, 64)
        self.relu = nn.ReLU()
        self.linear_out = nn.Linear(64, 2)
        
    def forward(self, question, document):
        x = torch.cat((question, document), -1)
        x = self.hidden(x)
        x = self.relu(x)
        logits = self.linear_out(x)
        return logits

In [9]:
net = Tiny_Network(6)
net

Tiny_Network(
  (hidden): Linear(in_features=12, out_features=64, bias=True)
  (relu): ReLU()
  (linear_out): Linear(in_features=64, out_features=2, bias=True)
)

In [10]:
net(torch.zeros(6), torch.ones(6))

tensor([0.1507, 0.3388], grad_fn=<AddBackward0>)

In [37]:
class IsQuestionAnsweredDataset(Dataset):
    
    def __init__(self, dataset_question_embeddings: pd.DataFrame, dataset_document_embeddings: pd.DataFrame, labels: np.ndarray):
        self.dataset_question_embeddings = dataset_question_embeddings
        self.dataset_document_embeddings = dataset_document_embeddings

        self.labels = labels
        
    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        question = self.dataset_question_embeddings.values[idx]
        document = self.dataset_document_embeddings.values[idx]

        label = self.labels[idx]
        
        return question, document, label

In [38]:
def get_labels_from_dataset(dataset: pd.DataFrame):
    annotation_column = dataset['annotations']
    
    labels = np.empty(annotation_column.shape[0], dtype=np.int32)
    for i, annotation in enumerate(annotation_column):
        labels[i] = 0 if annotation["answer_start"][0] == -1 else 1
        
    return labels

In [39]:
def mean_pooling(model_output, attention_mask):
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(model_output.size()).float()
    masked_model_output = model_output * input_mask_expanded
    
    sum_embeddings = torch.sum(masked_model_output, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    mean_pool = sum_embeddings / sum_mask
    
    return mean_pool

In [40]:
def get_lm_last_hidden_state(model_output, attention_mask):
    num_tokens = torch.sum(attention_mask)
    last = model_output[:, num_tokens-1, :]
    return last

In [41]:
def preprocess_with_language_model(lm_tokenizer, lm_model, dataset_column):
    n_obs = dataset_column.shape[0]
    emb_dims = 768

    storage = np.zeros((n_obs, 2, emb_dims))
    for i in tqdm(range(n_obs)):
        element = dataset_column.iloc[i]

        tokens = tokenizer_en(element, padding=True, truncation=True, return_tensors="pt")
        attention_mask = tokens["attention_mask"]
        
        with torch.no_grad():
            lm_output = model_en(tokens["input_ids"], output_hidden_states=True)
        
        lm_hidden_states_all_decoders = lm_output["hidden_states"]
        lm_hidden_states_last_decoder = lm_hidden_states_all_decoders[-1].detach()
        pooled = mean_pooling(lm_hidden_states_last_decoder, attention_mask)
        last_state = get_lm_last_hidden_state(lm_hidden_states_last_decoder, attention_mask)

        storage[i,0] = pooled.numpy()
        storage[i,1] = last_state.numpy()

        gc.collect()

    return {"mean_pooling": storage[:,0,:], "last_state": storage[:,1,:]}

In [42]:
def preprocess_dataset_with_language_model_then_save(lm_tokenizer, lm_model, dataset: pd.DataFrame, language: str, dataset_type: str):

    def preprocess_dataset_column_with_language_model_then_save(lm_tokenizer, lm_model, dataset: pd.DataFrame, column_name: str, language: str, dataset_type: str):
        output = preprocess_with_language_model(lm_tokenizer, lm_model, dataset[column_name])
        
        mean_df = pd.DataFrame(output["mean_pooling"])
        save_dataset(mean_df, f"data/lm_{column_name}_mean_pooling_{dataset_type}_{language}.pkl")

        last_state_df = pd.DataFrame(output["last_state"])
        save_dataset(last_state_df, f"data/lm_{column_name}_last_state_{dataset_type}_{language}.pkl")


    preprocess_dataset_column_with_language_model_then_save(lm_tokenizer, lm_model, dataset, "question_text", language, dataset_type)
    print("\nCompleted preprocessing question column with language model!")

    preprocess_dataset_column_with_language_model_then_save(lm_tokenizer, lm_model, dataset, "document_plaintext", language, dataset_type)
    print("\nCompleted preprocessing document column with language model!")
    

In [43]:
preprocess_dataset_with_language_model_then_save(tokenizer_en, model_en, dataset["en"]["train_set"], "en", "train")

  0%|          | 3/7389 [00:01<45:33,  2.70it/s]


KeyboardInterrupt: ignored

In [None]:
preprocess_dataset_with_language_model_then_save(tokenizer_en, model_en, dataset["en"]["validation_set"], "en", "validation")

In [52]:
def train(model: nn.Module, lm_tokenizer, lm_model, question_train_data, document_train_data, train_labels, question_validation_data, document_validation_data, validation_labels, n_epochs = 5, batch_size = 16, weight_decay=1e-6):
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=weight_decay)

    # Load dataset
    train_dataset = IsQuestionAnsweredDataset(question_train_data, document_train_data, train_labels)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    val_dataset = IsQuestionAnsweredDataset(question_validation_data, document_validation_data, validation_labels)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # store improvement per epoch
    train_losses = []
    train_accuracies = []
    test_accuracies = []

    for epoch in range(n_epochs):
        
        ### Training
        model.train()

        # Store batch loss and accuracy
        loss_epoch = []
        accuracy_epoch = []

        batch_pbar = tqdm(train_dataloader)
        for batch in batch_pbar:
            questions, documents, targets = batch
            
            questions = questions.float().to(device)
            documents = documents.float().to(device)
            targets = targets.long().to(device)

            # training
            outputs = model(questions, documents)
            loss = loss_fn(outputs, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # prediction
            predictions = outputs.detach().cpu().max(1)[1]
            accuracy = accuracy_score(targets.detach().cpu(), predictions)

            loss_epoch.append(loss.detach().cpu().item())
            accuracy_epoch.append(accuracy)

            batch_pbar.set_description(f"epoch={epoch+1}/{n_epochs} | loss={loss.item():.2f}, accuracy={accuracy:.2f}")

            gc.collect()

        train_loss = np.mean(loss_epoch)
        train_losses.append(train_loss)

        train_acc = np.mean(accuracy_epoch)
        train_accuracies.append(train_acc)

        ### Evaluation
        #test_acc = evaluate_inplace(model, val_dataloader) # @TODO: copy&paste
        test_acc = 0
        test_accuracies.append(test_acc)

        # @TODO: live plot
        print(f"epoch={epoch+1}/{n_epochs} | loss={train_loss:.2f}, train_accuracy={train_acc:.2f}, test_accuracy={test_acc:.2f}")

    print("Finished training.")
    
    return train_losses, train_accuracies, test_accuracies

In [53]:
question_train_en = load_dataset("data/lm_question_text_last_state_train_en.pkl")
document_train_en = load_dataset("data/lm_document_plaintext_last_state_train_en.pkl")

#question_validation_en = load_dataset("data/lm_question_text_last_state_validation_en.pkl")
#document_validation_en = load_dataset("data/lm_document_plaintext_last_state_validation_en.pkl")
question_validation_en = question_train_en
document_validation_en = document_train_en

net = Tiny_Network(768)
net.to(device)

train(net, tokenizer_en, model_en, question_train_en, document_train_en, get_labels_from_dataset(dataset["en"]["train_set"]), question_validation_en, document_validation_en, get_labels_from_dataset(dataset["en"]["validation_set"]))

epoch=1/5 | loss=0.67, accuracy=0.62: 100%|██████████| 462/462 [01:41<00:00,  4.55it/s]


epoch=1/5 | loss=0.65, train_accuracy=0.67, test_accuracy=0.00


epoch=2/5 | loss=0.36, accuracy=0.85: 100%|██████████| 462/462 [01:29<00:00,  5.14it/s]


epoch=2/5 | loss=0.55, train_accuracy=0.73, test_accuracy=0.00


epoch=3/5 | loss=0.19, accuracy=0.92: 100%|██████████| 462/462 [01:33<00:00,  4.96it/s]


epoch=3/5 | loss=0.51, train_accuracy=0.75, test_accuracy=0.00


epoch=4/5 | loss=0.47, accuracy=0.69: 100%|██████████| 462/462 [01:30<00:00,  5.10it/s]


epoch=4/5 | loss=0.49, train_accuracy=0.77, test_accuracy=0.00


epoch=5/5 | loss=0.56, accuracy=0.69: 100%|██████████| 462/462 [01:28<00:00,  5.20it/s]

epoch=5/5 | loss=0.49, train_accuracy=0.77, test_accuracy=0.00
Finished training.





([0.6481658019673773,
  0.5488219412761334,
  0.510776621374217,
  0.48926949894531463,
  0.4875315646943334],
 [0.673274642024642,
  0.7276161338661338,
  0.7468573093573093,
  0.7669205794205793,
  0.7659736097236097],
 [0, 0, 0, 0, 0])

In [33]:
dataset["en"]["validation_set"]

Unnamed: 0,question_text,document_title,language,annotations,document_plaintext,document_url
30,What is a way to increase your wound healing s...,Wound healing,english,"{'answer_start': [51], 'answer_text': ['cleani...",Wound care encourages and speeds wound healing...,https://en.wikipedia.org/wiki/Wound%20healing
47,Who founded the Burntisland Shipbuilding Company?,Burntisland Shipbuilding Company,english,"{'answer_start': [0], 'answer_text': ['Brother...",Brothers Amos and Wilfrid Ayre founded Burntis...,https://en.wikipedia.org/wiki/Burntisland%20Sh...
59,What is the surface area of the human cortex?,Cerebral cortex,english,"{'answer_start': [295], 'answer_text': ['2.3–2...","For species of mammals, larger brains (in abso...",https://en.wikipedia.org/wiki/Cerebral%20cortex
77,When did the case of R (Factortame Ltd) v Secr...,R (Factortame Ltd) v Secretary of State for Tr...,english,"{'answer_start': [352], 'answer_text': ['Decem...","As from 31 March 1989, fishing vessel registra...",https://en.wikipedia.org/wiki/R%20%28Factortam...
106,When was Quezon City founded?,Quezon City,english,"{'answer_start': [32], 'answer_text': ['1939']}","When Quezon City was created in 1939, the foll...",https://en.wikipedia.org/wiki/Quezon%20City
...,...,...,...,...,...,...
13276,What's the average income in West Virginia?,List of West Virginia locations by per capita ...,english,"{'answer_start': [-1], 'answer_text': ['']}",Note: County Data is from the 2011–2015 Americ...,https://en.wikipedia.org/wiki/List%20of%20West...
13287,How often do LSAT tests take place?,Law School Admission Test,english,"{'answer_start': [-1], 'answer_text': ['']}",A recent controversy surrounding the LSAT was ...,https://en.wikipedia.org/wiki/Law%20School%20A...
13298,How much does a bushel of barley weigh?,Bushel,english,"{'answer_start': [-1], 'answer_text': ['']}",\nThe Spanish bushel (fanega) was used as a me...,https://en.wikipedia.org/wiki/Bushel
13305,What is the most common first word by babies?,Vocabulary development,english,"{'answer_start': [-1], 'answer_text': ['']}","Social pragmatic theories, also in contrast to...",https://en.wikipedia.org/wiki/Vocabulary%20dev...


# Finnish

## A.

In [None]:
tokenizer_fi = AutoTokenizer.from_pretrained("Finnish-NLP/gpt2-finnish")
model_fi = AutoModelForCausalLM.from_pretrained("Finnish-NLP/gpt2-finnish", pad_token_id=tokenizer_fi.eos_token_id)

## B.

In [None]:
generate_text("valaat ovat", tokenizer_fi, model_fi)

# Japanese

## A.

In [None]:
tokenizer_ja = AutoTokenizer.from_pretrained("rinna/japanese-gpt2-medium")
model_ja = AutoModelForCausalLM.from_pretrained("rinna/japanese-gpt2-medium", pad_token_id=tokenizer_ja.eos_token_id)

## B.

In [None]:
generate_text("クジラは", tokenizer_ja, model_ja)