In [1]:
# Import libraries for data loading and cleaning

import re
import numpy as np
import pandas as pd
import json

In [2]:
# Log in to the Hugging Face Hub; the fine-tuning section later pushes the model there

from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
# Load the CoQA training file straight from the Stanford NLP download server

COQA_TRAIN_URL = 'http://downloads.cs.stanford.edu/nlp/data/coqa/coqa-train-v1.0.json'
data = pd.read_json(COQA_TRAIN_URL)
data.head()

Unnamed: 0,version,data
0,1,"{'source': 'wikipedia', 'id': '3zotghdk5ibi9ce..."
1,1,"{'source': 'cnn', 'id': '3wj1oxy92agboo5nlq4r7..."
2,1,"{'source': 'gutenberg', 'id': '3bdcf01ogxu7zdn..."
3,1,"{'source': 'cnn', 'id': '3ewijtffvo7wwchw6rtya..."
4,1,"{'source': 'gutenberg', 'id': '3urfvvm165iantk..."


In [4]:
# Flatten the raw records into one row per question

# NOTE: the question-answering model uses the question, the answer span text,
# the story (context) and the position at which the answer starts.

all_list = []
for record in data['data']:
    story = record['story']
    replies = record['answers']
    # Questions and answers are paired by their position in the two lists.
    for turn, asked in enumerate(record['questions']):
        reply = replies[turn]
        all_list.append([story, asked['input_text'], reply['span_text'], reply['span_start']])

coll = ['Context', 'Question', 'Answer', 'Answer_Start']

df_new = pd.DataFrame(all_list, columns=coll)
df_new.head()

Unnamed: 0,Context,Question,Answer,Answer_Start
0,"The Vatican Apostolic Library (), more commonl...",When was the Vat formally opened?,Formally established in 1475,151
1,"The Vatican Apostolic Library (), more commonl...",what is the library for?,he Vatican Library is a research library,454
2,"The Vatican Apostolic Library (), more commonl...",for what subjects?,Vatican Library is a research library for hist...,457
3,"The Vatican Apostolic Library (), more commonl...",and?,Vatican Library is a research library for hist...,457
4,"The Vatican Apostolic Library (), more commonl...",what was started in 2014?,"March 2014, the Vatican Library began an initi...",769


### Even without fine-tuning the model on our dataset, it is possible to get good answers by using pre-trained models.
### First, I will use the pre-trained large BERT model from Transformers. Then, I will fine-tune the model on my dataset to improve the answers given by the model.
### I chose the large BERT model for question answering because:
###### Using a large BERT model from the Hugging Face Transformers library for question answering tasks provides a powerful combination of advanced contextual understanding, transfer learning benefits, ease of use, and community support, leading to highly accurate and efficient question answering systems.

### LARGE BERT MODEL WITH HUGGING FACE TRANSFORMERS

In [6]:
# Import libraries for base model

from transformers import AutoTokenizer
from transformers import TFAutoModelForQuestionAnswering
import tensorflow as tf

### CREATE A FUNCTION PIPELINE TO AUTOMATE THE QUESTION-ANSWER TASK

In [7]:

QA_CHECKPOINT = "bert-large-uncased-whole-word-masking-finetuned-squad"
QA_MAX_TOKENS = 512  # BERT cannot encode more positions than this
_QA_COMPONENTS = {}


def _load_qa_components(checkpoint=QA_CHECKPOINT):
    """Return the ``(tokenizer, model)`` pair for ``checkpoint``, loading it only once."""
    if checkpoint not in _QA_COMPONENTS:
        _QA_COMPONENTS[checkpoint] = (
            AutoTokenizer.from_pretrained(checkpoint),
            TFAutoModelForQuestionAnswering.from_pretrained(checkpoint),
        )
    return _QA_COMPONENTS[checkpoint]


def answering(question, context):
    """Answer ``question`` from ``context`` with the SQuAD-fine-tuned large BERT model.

    The tokenizer and the model are loaded on the first call and reused
    afterwards, so repeated calls no longer re-create the large model.

    Args:
        question: The question text.
        context: The passage that is searched for the answer. If the encoded
            question/context pair is longer than ``QA_MAX_TOKENS`` tokens, the
            end of the context is truncated.

    Returns:
        The decoded text of the tokens between the highest-scoring start and
        end positions (inclusive). The string is empty when the predicted end
        lies before the predicted start. It can contain special tokens such as
        ``[CLS]`` when the predicted span starts outside the context, as
        happens for questions the context does not answer.
    """
    qa_tokenizer, qa_model = _load_qa_components()

    # Encode the question/context pair as TensorFlow tensors; only the context
    # (second sequence) may be shortened to fit the model.
    inputs = qa_tokenizer(
        question,
        context,
        truncation="only_second",
        max_length=QA_MAX_TOKENS,
        return_tensors='tf',
    )

    # Run the model to obtain the start/end logits for every token
    outputs = qa_model(**inputs)

    # The predicted span runs from the best-scoring start token to the best-scoring end token
    start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0])
    end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0])

    # Token ids of the predicted answer span
    predict_answer = inputs.input_ids[0, start_index: end_index + 1]

    # Convert the ids back to a string
    return qa_tokenizer.decode(predict_answer)

In [8]:
# Pick one row at random to test the function.
# A fixed seed keeps the sampled example identical across "Restart & Run All".
SAMPLE_SEED = 42
rand_num = int(np.random.default_rng(SAMPLE_SEED).integers(0, len(df_new)))

qs = df_new['Question'][rand_num]
ct = df_new['Context'][rand_num]
# The variable name keeps its original spelling because later cells refer to it.
orginal_answer = df_new['Answer'][rand_num]

In [9]:
# Show the sampled story
print(ct)

CHAPTER XXXVI 

Selingman had scarcely left the place when Ernshaw arrived, piloted into the room by Aaron, who had been waiting for him below. Maraton and he gripped hands heartily. During the first few days of the campaign they had been constant companions. 

"At least," he declared, as he looked into Maraton's face, "whatever the world may think of the justice of their cause, no one will ever any longer deny the might of the people." 

"None but fools ever did deny it," Maraton answered. 

"How are they in the north?" Ernshaw asked. 

"United and confident," Maraton assured him. "Up there I don't think they realise the position so much as here. In Nottingham and Leicester, people are leading their usual daily lives. It was only as we neared London that one began to understand." 

"London is paralysed with fear," Ernshaw asserted, "perhaps with reason. The Government are working the telephones and telegraph to a very small extent. The army engineers are doing the best they can with t

In [10]:
# Show the sampled question
print(qs)

Who arrived shortly after he left?


In [11]:
# Show the answer span stored in the dataset for the sampled question
print(orginal_answer)

Selingman had scarcely left the place when Ernshaw arrived


In [12]:
# Run the pre-trained model (no fine-tuning yet) on the sampled question and story
print('Model Answer:')
answering(qs,ct)

Model Answer:


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
All PyTorch model weights were used when initializing TFBertForQuestionAnswering.

All the weights of TFBertForQuestionAnswering were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForQuestionAnswering for predictions without further training.


'ernshaw'

In [13]:
# Ask a made-up question about the sampled story
# NOTE(review): the question looks unrelated to the story, so this presumably probes the no-answer case — confirm
new_q = 'what does Tom do at the weekend'
answering(new_q,ct)


All PyTorch model weights were used when initializing TFBertForQuestionAnswering.

All the weights of TFBertForQuestionAnswering were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForQuestionAnswering for predictions without further training.


'[CLS] what does tom do at the weekend [SEP]'

### NOTE: Even without fine-tuning, the model performs well on most of the contexts. I will apply fine-tuning to improve model performance.

## FINE-TUNING #####

In [16]:
# Separate the dataset into train and test parts before fine-tuning:
# the first 90000 rows are used for training, the remaining rows for testing.
# NOTE(review): the cut is positional, so questions about one story may end up on both sides — confirm this is acceptable

train_data = df_new[:90000]
test_data = df_new[90000:]

print('Train dataset length:', len(train_data), 'Test dataset length:', len(test_data))

Train dataset length: 90000 Test dataset length: 18647


In [17]:
# Peek at the first training rows
train_data.head()

Unnamed: 0,Context,Question,Answer,Answer_Start
0,"The Vatican Apostolic Library (), more commonl...",When was the Vat formally opened?,Formally established in 1475,151
1,"The Vatican Apostolic Library (), more commonl...",what is the library for?,he Vatican Library is a research library,454
2,"The Vatican Apostolic Library (), more commonl...",for what subjects?,Vatican Library is a research library for hist...,457
3,"The Vatican Apostolic Library (), more commonl...",and?,Vatican Library is a research library for hist...,457
4,"The Vatican Apostolic Library (), more commonl...",what was started in 2014?,"March 2014, the Vatican Library began an initi...",769


In [18]:
# Import AutoTokenizer from Transformers.
from transformers import AutoTokenizer

# Checkpoint used for fine-tuning: the cased *base* BERT model
# (not the large SQuAD checkpoint that answering() uses).
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)



### Preprocessing the data for fine-tuning

#### 1- To deal with text sequences that are too long for the model's maximum input size, I set `max_length = 384` and truncate only the context part of each question–context pair (`truncation="only_second"`).

In [19]:
max_length = 384


def _answer_token_span(offsets, sequence_ids, start_char, end_char):
    """Map the character span [start_char, end_char) to (start_token, end_token); (0, 0) if it is not fully encoded."""
    sequence_ids = list(sequence_ids)
    if 1 not in sequence_ids:
        # No context token was encoded at all, so there is nothing to point at.
        return 0, 0

    # Context tokens are the contiguous run whose sequence id is 1.
    context_start = sequence_ids.index(1)
    context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

    # The answer must lie completely inside the encoded (possibly truncated)
    # context. This also rejects a negative start offset.
    if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
        return 0, 0

    # Last context token that starts at or before the answer start
    idx = context_start
    while idx <= context_end and offsets[idx][0] <= start_char:
        idx += 1
    start_token = idx - 1

    # First context token that ends at or after the answer end
    idx = context_end
    while idx >= context_start and offsets[idx][1] >= end_char:
        idx -= 1
    end_token = idx + 1

    return start_token, end_token


def preprocess_finetuning(dataset, qa_tokenizer=None, max_len=None):
    """Tokenize question/context pairs and attach answer start/end token labels.

    Args:
        dataset: Column container (for example a pandas DataFrame or a dict of
            lists) with the columns "Question", "Context", "Answer" and
            "Answer_Start". "Answer_Start" is used as the character offset of
            the answer inside the unmodified context.
        qa_tokenizer: Tokenizer to call. Defaults to the notebook-level
            ``tokenizer``.
        max_len: Length every encoded pair is padded or truncated to. Defaults
            to the notebook-level ``max_length``.

    Returns:
        The tokenizer output without "offset_mapping", extended with
        "start_positions" and "end_positions" (one token index per row).
        Rows whose answer is not completely contained in the encoded context
        are labelled (0, 0).
    """
    tok = tokenizer if qa_tokenizer is None else qa_tokenizer
    limit = max_length if max_len is None else max_len

    questions = [q.strip() for q in dataset["Question"]]
    # Contexts are passed through unchanged: stripping them would shift every
    # character offset relative to Answer_Start.
    contexts = list(dataset["Context"])
    inputs = tok(
        questions,
        contexts,
        max_length=limit,
        truncation="only_second",  # Truncate the context part if the pair is too long for the model.
        return_offsets_mapping=True,
        padding="max_length",
    )

    # With return_offsets_mapping=True the tokenizer also returns, per token,
    # the (start, end) character positions of that token in the original text.
    offset_mapping = inputs.pop("offset_mapping")

    # Plain lists give positional access even when the frame's index does not
    # start at 0 (for example a slice such as df[90000:]).
    answers = list(dataset["Answer"])
    answers_start = list(dataset["Answer_Start"])
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        start_char = int(answers_start[i])
        end_char = start_char + len(answers[i])
        start_token, end_token = _answer_token_span(
            offset, inputs.sequence_ids(i), start_char, end_char
        )
        start_positions.append(start_token)
        end_positions.append(end_token)

    # Add the start and end positions to the encoded inputs.
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions

    return inputs


In [20]:
# Tokenize the training rows and attach the start/end token labels
inputs = preprocess_finetuning(train_data)

In [21]:
# List the fields produced by the preprocessing step
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'])

In [22]:
# The next cells need the `datasets` package. If it is missing, install it first: %pip install datasets

In [23]:
# Number of encoded examples
len(inputs['input_ids'])

90000

In [24]:
# Wrap the encoded features in a `datasets.Dataset`

from datasets import Dataset, DatasetDict

feature_names = ("input_ids", 'token_type_ids', "attention_mask", 'start_positions', 'end_positions')
dataset_dict = {name: inputs[name] for name in feature_names}

# From here on `train_data` is the encoded Dataset, no longer the DataFrame slice.
train_data = Dataset.from_dict(dataset_dict)

print(train_data)

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 90000
})


In [25]:
from transformers import DefaultDataCollator

# Collator that builds TensorFlow batches; the features were already padded to max_length during preprocessing
data_collator = DefaultDataCollator(return_tensors="tf")

In [26]:
# Set up the optimizer together with its learning-rate schedule

from transformers import create_optimizer

batch_size = 64
# Must equal the number of epochs passed to model.fit; otherwise the decay
# schedule ends before (or after) training does.
num_epochs = 3
# Only the training share of the later train/validation split is iterated
# during fit, so the schedule is sized on that share and not on all rows.
VALIDATION_FRACTION = 0.2  # keep in sync with test_size in train_test_split
num_train_rows = round(len(train_data) * (1 - VALIDATION_FRACTION))
total_train_steps = (num_train_rows // batch_size) * num_epochs
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=total_train_steps,
)

In [27]:
from transformers import TFAutoModelForQuestionAnswering

# Load the base checkpoint with a question-answering head. The load log reports that
# the head weights (qa_outputs) are newly initialized, which is why the model is trained next.
model = TFAutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForQuestionAnswering.

Some weights or buffers of the TF 2.0 model TFBertForQuestionAnswering were not initialized from the PyTorch model and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
# Split the training data into train and test parts (the "test" part is used for validation during fit).
# A fixed seed makes the split reproducible across kernel restarts.
train_data = train_data.train_test_split(test_size=0.2, seed=42)

In [29]:
# Convert both splits to TensorFlow datasets. They are batched with `batch_size`,
# the same value the learning-rate schedule was sized with.

tf_train_set = model.prepare_tf_dataset(
    train_data["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    train_data["test"],
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator,
)

In [30]:
import tensorflow as tf

# Compile the model with the optimizer created earlier. No loss is passed here.
# NOTE(review): presumably the model falls back to its built-in task loss — confirm against the Transformers docs
model.compile(optimizer=optimizer)

In [31]:
from transformers.keras_callbacks import PushToHubCallback

# Callback for pushing the model (and the tokenizer passed here) to the Hub.
# "test_onrt" is the local working directory; the log shows the Hub repository being cloned into it.
callback = PushToHubCallback(
    output_dir="test_onrt",
    tokenizer=tokenizer,
)

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/OnurT/test_onrt into local empty directory.


Download file tf_model.h5:   0%|          | 24.0k/411M [00:00<?, ?B/s]

Clean file tf_model.h5:   0%|          | 1.00k/411M [00:00<?, ?B/s]

In [None]:
# Train the model. The epoch count comes from `num_epochs` so that it matches
# the number of epochs the learning-rate schedule was built for.
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=num_epochs, callbacks=[callback])

In [None]:
from transformers import pipeline

# Try the fine-tuned model on row 100004, which lies beyond the 90000 rows used for training
question_answerer = pipeline("question-answering", model="test_onrt")
held_out_question = df_new['Question'][100004]
held_out_context = df_new['Context'][100004]
question_answerer(question=held_out_question, context=held_out_context)

In [None]:
# Story of the row used to try the fine-tuned model
df_new['Context'][100004]

In [None]:
# Question of the same row
df_new['Question'][100004]

In [None]:
# Answer span stored in the dataset for the same row
df_new['Answer'][100004]

In [None]:
# Ask the pre-trained SQuAD model (via answering) a question about the same story
answering('Who helped him',df_new['Context'][100004])
