
# **Analytics 2 :** <font color=#DF4807>**Transformers**</font>





In [None]:
#load libraries
#!pip install transformers datasets torch


In [None]:
from datasets import load_dataset
from datasets import load_metric

from transformers import AutoTokenizer
from transformers import AutoModelForQuestionAnswering, DistilBertConfig
from transformers import Trainer, TrainingArguments

from sklearn.metrics import f1_score

import torch

## **Load Data**

We will be using the TyDiQA (Typologically Diverse Question Answering) dataset from Google research. It includes "over 200,000 question-answer pairs from 11 languages representing a diverse range of linguistic phenomena and data challenges". For today we will only be using the English subset of the data. Loading the data will require a few minutes.

Further reading: https://ai.google.com/research/tydiqa/dataset


In [None]:
def _is_english(example):
    """Return True when a TyDiQA example is labelled as English."""
    return example['language'] == 'english'


# Load the TyDiQA primary task, then keep only the English examples of every split.
train_data = load_dataset('tydiqa', 'primary_task')
tydiqa_data = train_data.filter(_is_english)

In [None]:
# View the data structure: the available splits, their feature names and row counts
tydiqa_data

DatasetDict({
    train: Dataset({
        features: ['passage_answer_candidates', 'question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url'],
        num_rows: 9211
    })
    validation: Dataset({
        features: ['passage_answer_candidates', 'question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url'],
        num_rows: 1031
    })
})

Each example of the data is stored as a dictionary object. Data is stored as questions, contexts and indices. The indices show where in the context the answer to the question lies. You can access the indices using the `annotations` key. Let's look at a sample question and answer:

In [None]:
index = 700  # changing the index will give you different Q&A pairs

# Read the example once instead of re-fetching the row for every field
example = tydiqa_data['train'][index]
example_context = example['document_plaintext']

# starting offset of the minimal answer (counted in UTF-8 bytes, as the field name says)
start_index = example['annotations']['minimal_answers_start_byte'][0]

# ending offset of the minimal answer (also in bytes)
end_index = example['annotations']['minimal_answers_end_byte'][0]

# The offsets count bytes, not characters, so slice the encoded context and decode the slice.
# Slicing the str directly would drift as soon as a non-ASCII character precedes the answer.
example_answer = example_context.encode('utf-8')[start_index:end_index].decode('utf-8', errors='replace')

print("Question: " + example['question_text'])
print("\nContext: "+ example_context[0:550] + '...') # some of the contexts are quite long so we will only view a part of it.
print("\nAnswer: " + example_answer)

Question: What was the lingua franca of the Ottoman Empire?

Context: 

The language of the court and government of the Ottoman Empire was Ottoman Turkish,[1] but many other languages were in contemporary use in parts of the empire. Although the minorities of the Ottoman Empire were free to use their language amongst themselves, if they needed to communicate with the government they had to use Ottoman Turkish.[2]
The Ottomans had three influential languages: Turkish, spoken by the majority of the people in Anatolia and by the majority of Muslims of the Balkans except in Albania, Bosnia, and various Aegean Sea isl...

Answer: Ottoman Turkish


The model we are going to train predicts the start and end positions of the answer within the context. So we need to extract the start and end points from the data. It should be noted that some of the questions do not have answers. In these cases, the start and end indices are set to -1.





In [None]:
# Let's look at the annotations of a sample without an answer.
tydiqa_data['train'][0]['annotations']

{'passage_answer_candidate_index': [-1],
 'minimal_answers_start_byte': [-1],
 'minimal_answers_end_byte': [-1],
 'yes_no_answer': ['NONE']}

In [None]:
# Let's look at the annotations of a sample with an answer (the example selected by `index`).
tydiqa_data['train'][index]['annotations']

{'passage_answer_candidate_index': [0],
 'minimal_answers_start_byte': [69],
 'minimal_answers_end_byte': [84],
 'yes_no_answer': ['NONE']}

To make the processing a bit easier, we will flatten the data so we don't have to work with a nested dictionary structure. Flattening the data will give it a table structure.

In [None]:
# Flatten each split so nested fields become top-level columns
# (e.g. 'annotations.minimal_answers_start_byte'), then keep only the first rows of each.
flattened_splits = {split: tydiqa_data[split].flatten() for split in ('train', 'validation')}
flattened_train_data = flattened_splits['train'].select(range(3500))
flattened_test_data = flattened_splits['validation'].select(range(1000))


## **Tokenization**

If you plan on using a pretrained model, it's important to use the associated pretrained tokenizer. This ensures the text is split the same way as the pretraining corpus, and uses the same corresponding tokens-to-index (usually referred to as the vocab) during pretraining.

In [None]:
# Load the pretrained tokenizer that belongs to the checkpoint used for the model
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")

In [None]:
# Processing samples: tokenize, locate the answer characters, map them to token positions.
def _byte_to_char_offset(encoded_text, byte_offset):
    """Convert a byte offset into UTF-8 ``encoded_text`` to a character offset.

    The annotation fields are byte offsets, while Python strings and the
    tokenizer's ``char_to_token`` count characters. The two only agree while
    every preceding character is ASCII.
    """
    # errors="ignore" drops a partial trailing character if the offset falls mid-codepoint
    return len(encoded_text[:byte_offset].decode("utf-8", errors="ignore"))


def process_samples(sample):
    """Tokenize one flattened example and express its answer span as token positions.

    Args:
        sample: mapping with the keys 'document_plaintext', 'question_text',
            'annotations.minimal_answers_start_byte' and
            'annotations.minimal_answers_end_byte' (the last two are lists; only
            the first entry is used).

    Returns:
        dict with 'input_ids', 'attention_mask', 'start_positions' and
        'end_positions'. Unanswerable examples point both positions at the CLS
        token; an answer end that was truncated away is given the out-of-range
        position ``tokenizer.model_max_length``.
    """
    context = sample['document_plaintext']

    # Encode as (question, context), the same order the Testing cell uses at inference time,
    # and truncate only the context so the question is always kept whole.
    tokenized_data = tokenizer(sample['question_text'], context, truncation="only_second", padding="max_length")

    input_ids = tokenized_data["input_ids"]  # sequence of integers that represent the tokens

    # We will label impossible answers with the index of the CLS token.
    cls_index = input_ids.index(tokenizer.cls_token_id)

    start_byte = sample["annotations.minimal_answers_start_byte"][0]
    end_byte = sample["annotations.minimal_answers_end_byte"][0]

    # If no answers are given, set the cls_index as answer.
    if start_byte == -1:
        start_position = cls_index
        end_position = cls_index
    else:
        # Translate the byte offsets into character offsets before asking the tokenizer.
        context_bytes = context.encode("utf-8")
        start_char = _byte_to_char_offset(context_bytes, start_byte)
        end_char = _byte_to_char_offset(context_bytes, end_byte)

        # sequence_index=1 selects the context, which is the second sequence of the pair.
        # end_char is exclusive, so the last answer character is end_char - 1.
        start_token = tokenized_data.char_to_token(start_char, sequence_index=1)
        end_token = tokenized_data.char_to_token(end_char - 1, sequence_index=1)

        # None means the character has no token, e.g. the answer passage has been truncated.
        # Use an out-of-range position; NOTE(review): the Hugging Face QA heads are documented
        # to leave positions outside the sequence out of the loss - confirm for this model.
        if start_token is None:
            start_token = tokenizer.model_max_length
        if end_token is None:
            end_token = tokenizer.model_max_length

        start_position = start_token
        end_position = end_token

    return {'input_ids': tokenized_data['input_ids'],
          'attention_mask': tokenized_data['attention_mask'],
          'start_positions': start_position,
          'end_positions': end_position}


In [None]:
# Tokenize and process the flattened datasets by applying process_samples to each split
processed_train_data = flattened_train_data.map(process_samples)
processed_test_data = flattened_test_data.map(process_samples)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

## **Model**

Read Me:  https://huggingface.co/transformers/v3.0.2/model_doc/auto.html

Read Me: https://huggingface.co/transformers/v3.0.2/model_doc/auto.html#automodelforquestionanswering

The model we will use is "distilbert-base-cased-distilled-squad". DistilBERT is a smaller, lighter and, as a result, faster version of BERT. If you read the documentation here...

Link: https://huggingface.co/distilbert-base-cased-distilled-squad#model-details

you will see that it has 40% fewer parameters than bert-base-uncased and runs 60% faster, while preserving over 95% of BERT's performance (as measured on the GLUE language understanding benchmark).

In [None]:
# Load the pretrained question-answering model from the same checkpoint as the tokenizer.
# NOTE(review): nothing visible in this notebook freezes the base layers, so training presumably
# updates all weights rather than only the head - confirm if head-only fine-tuning is intended.
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad")

In [None]:
# Columns passed on to the model; attention_mask tells the model which inputs are tokens and which are padding
columns_to_return = ['input_ids', 'attention_mask', 'start_positions', 'end_positions']

# Return only those columns, as PyTorch tensors, from both processed datasets
for processed_split in (processed_train_data, processed_test_data):
    processed_split.set_format(type='pt', columns=columns_to_return)

## **Compiling and Fine Tuning**

In [None]:
#!pip install accelerate -U #setting up distributed training in pytorch

In [None]:
# Fine-tuning configuration. A full training run takes roughly 12-15 minutes.
training_args = TrainingArguments(
    # --- output ---
    output_dir='model_results5',     # directory for model predictions and checkpoints
    overwrite_output_dir=True,
    # --- schedule ---
    num_train_epochs=4,              # total number of training epochs
    evaluation_strategy="epoch",     # evaluate at the end of every epoch
    logging_steps=50,                # how often to log metrics
    # --- batching ---
    per_device_train_batch_size=4,   # training batch size per GPU/TPU core/CPU
    per_device_eval_batch_size=4,    # evaluation batch size per GPU/TPU core/CPU
    # --- optimisation ---
    optim="adamw_torch",             # optimization algorithm
    learning_rate=1e-5,              # learning rate reached after warmup
    warmup_steps=20,                 # steps of linear warmup from 0 to learning_rate
    weight_decay=1e-4,               # weight decay strength (regularisation against overfitting)
)

trainer = Trainer(
    model=model,                         # transformer model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=processed_train_data,  # training dataset
    eval_dataset=processed_test_data,    # evaluation dataset
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,1.8846,1.816227
2,1.2202,1.907752
3,1.2878,2.081372
4,1.0617,2.170346


TrainOutput(global_step=3500, training_loss=1.4096927141462055, metrics={'train_runtime': 856.1443, 'train_samples_per_second': 16.352, 'train_steps_per_second': 4.088, 'total_flos': 1829143400448000.0, 'train_loss': 1.4096927141462055, 'epoch': 4.0})

Note: If you get an error message when running Training arguments, follow these steps:


  1. Run pip install accelerate -U in a cell
  2. In the top menu click Runtime → Restart Runtime
  3. Do not rerun any cells with !pip install in them
  4. Rerun all the other code cells and you should be good to go!


In [None]:
# Evaluate the fine-tuned model on the processed test data (may take around 30 seconds)
trainer.evaluate(processed_test_data)

{'eval_loss': 2.1703460216522217,
 'eval_runtime': 17.7679,
 'eval_samples_per_second': 56.281,
 'eval_steps_per_second': 14.07,
 'epoch': 4.0}

## **Testing**

In [None]:

# Context passage the model must extract its answers from.
# NOTE(review): "avarege" in the passage below is misspelled - confirm whether that is intentional.
text = r"""
Heidelberg has a humid subtropical climate.
The year round warm temperatures are determined by air currents.
This results in drier summers and wetter winters.
Heidelberg's position in the valley causes more wind than average.
Spring starts early and is one of the warmest in Europe.
April can be very dry.
The rising temperatures in May can create some storms.
Nights start cold and stay fresh throughout spring.
Day temperatures can become hot from April.
The avarege temperature in April is about 30°C.
Summers are long, hot and mostly dry.
Day temperatures are around 28-30 on average.
Temperatures will often rise beyond 35°C in midsummer.
July is also the sunniest month of the year.
Autumn starts very warm and cools down by the end of November.
The region gets affected by fog from the second part of October on.
Day temperatures will stay around 20°C until at least mid-October.
Nights cool down during October, but remain above 10°C.
Winters are mostly mild.
Snow is a rare event and it rains often.
Winters are the wettest time of the year.
Storms can create severe damage.
The region is often affected by floods.
"""

# Questions to ask about the passage above
questions = ["What has a humid climate?",
             "What is the sunniest month?",
             "What is the average temperature in April?",
             "What is the wettest time of year?"]

# Answer every question against the context passage with the fine-tuned model.
model.eval()  # inference mode: disables dropout

for question in questions:
    # Encode the (question, context) pair and move the tensors to whichever device holds
    # the model, so the cell also runs on CPU-only machines instead of assuming "cuda".
    inputs = tokenizer(question, text, return_tensors="pt").to(model.device)
    input_ids = inputs["input_ids"][0].tolist()

    # No gradients are needed for prediction
    with torch.no_grad():
        answer_model = model(**inputs)

    # Most likely first answer token: argmax of the start scores
    answer_start = int(torch.argmax(answer_model['start_logits']))
    # Most likely last answer token: argmax of the end scores (+1 because slicing excludes the end)
    answer_end = int(torch.argmax(answer_model['end_logits'])) + 1

    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))

    print(f"Question: {question}")
    print(f"Answer: {answer}\n")

Question: What has a humid climate?
Answer: Heidelberg

Question: What is the sunniest month?
Answer: July

Question: What is the average temperature in April?
Answer: 30°C

Question: What is the wettest time of year?
Answer: [CLS]



Note: remember that if you get the [CLS] token as an answer, the model was not able to find an answer in the context.

## **Adding Layers to a Pre-Trained Model**

ResNet-50 is a convolutional neural network that is 50 layers deep.


```
from tensorflow.keras import models
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import Dense, Dropout, Flatten

# Pre-trained convolutional base. include_top=False leaves out the original classification
# head, so `classes` is not passed here: the number of classes is set by the last Dense layer.
ResNet = ResNet50(
    include_top=False, weights='imagenet', input_tensor=None, input_shape=(128, 217, 3),
    pooling=None)

# Stack a new classification head on top of the pre-trained base
model = models.Sequential()
model.add(ResNet)
model.add(Flatten())
model.add(Dense(units=512, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(units=256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(units=5, activation='softmax'))  # 5 output classes

```

## **Freezing Layers**




```
from tensorflow.keras import layers, models
from tensorflow.keras.applications import ResNet50

# Create a ResNet50 model with pre-trained weights. include_top=False leaves out the original
# classification head, so `classes` is not passed: the last Dense layer sets the class count.
ResNet = ResNet50(
    include_top=False, weights='imagenet', input_tensor=None, input_shape=(128, 217, 3),
    pooling=None)

# Set the ResNet layers as untrainable
for layer in ResNet.layers:
    layer.trainable = False

# New classification head added on top of the frozen base
model = models.Sequential()
model.add(ResNet)
model.add(layers.Flatten())
model.add(layers.Dense(units=512, activation='relu'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(units=256, activation='relu'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(units=5, activation='softmax'))  # 5 output classes

# Compile and train the model
```