In [1]:
# #installing required libraries
# !pip install datasets
# !pip install transformers
# !pip install datasets evaluate transformers
# !pip install bert_score

In [2]:
from datasets import load_dataset
from datasets import ClassLabel, Value
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
import collections
from tqdm.auto import tqdm
import numpy as np
from transformers import DefaultDataCollator
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
from evaluate import load
import evaluate
import json
import torch
from torch.utils.data import Dataset, DataLoader


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# select device
device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
# device = 'cpu'
device

device(type='cuda', index=2)

In [4]:
root_path = "/soe/vigneshs/projects/nlp_243/project_1/data/qa/"
results_dir = root_path+"results"

original_train_path = root_path+"train_qa_data_all.json"
original_val_path = root_path+"val_qa_data_all.json"
original_val_top10 = root_path+"val_qa_data_custom_bert_lstm_top_10.json"
original_val_top4 = root_path+"val_qa_data_custom_bert_lstm_top_4.json"
original_val_oracle = root_path+"val_qa_data_oracle.json"

original_train_path_reformat = root_path+"train_qa_data_all_reformat.json"
original_val_path_reformat = root_path+"val_qa_data_all_reformat.json"
original_val_top10_reformat = root_path+"val_qa_data_custom_bert_lstm_top_10_reformat.json"
original_val_top4_reformat = root_path+"val_qa_data_custom_bert_lstm_top_4_reformat.json"
original_val_oracle_reformat = root_path+"val_qa_data_oracle_reformat.json"

original_train_path_unstructured = root_path+"train_qa_data_all_unstructured.json"
original_val_path_unstructured = root_path+"val_qa_data_all_unstructured.json"
original_val_top10_unstructured = root_path+"val_qa_data_custom_bert_lstm_top_10_unstructured.json"
original_val_top4_unstructured = root_path+"val_qa_data_custom_bert_lstm_top_4_unstructured.json"
original_val_oracle_unstructured = root_path+"val_qa_data_oracle_unstructured.json"

original_train_path_structured = root_path+"train_qa_data_all_structured.json"
original_val_path_structured = root_path+"val_qa_data_all_structured.json"
original_val_top10_structured = root_path+"val_qa_data_custom_bert_lstm_top_10_structured.json"
original_val_top4_structured = root_path+"val_qa_data_custom_bert_lstm_top_4_structured.json"
original_val_oracle_structured = root_path+"val_qa_data_oracle_structured.json"

In [5]:

#loading model check points for tokenizers
bleu = evaluate.load("bleu")
model_checkpoint = "deepset/bert-base-uncased-squad2"
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
bertscore = load("bertscore")
model_type_bert_score = "distilbert-base-uncased"


In [6]:

def reformat_write(source_p, destination_p):
  f = json.load(open(source_p,"r"))
  json_object_ = json.dumps(f)
  with open(destination_p, "w") as outfile:
    outfile.write(json_object_)


In [7]:
reformat_write(original_train_path, original_train_path_reformat)
reformat_write(original_val_top10, original_val_top10_reformat)
reformat_write(original_val_path , original_val_path_reformat)
reformat_write(original_val_top4, original_val_top4_reformat)
reformat_write(original_val_oracle, original_val_oracle_reformat)


In [8]:
def load_data(file_path):
  """
  Reads the input file and extracts sentences for each input data point
  from the answer index and return the list of json objects
  

  Parameters
  ----------
  file_path : Str
      path to the input jsonl file

  Returns
  -------
  filtered_sentences : List
      Contains json objects with the following fields:
      'uuid', 'postText', 'targetTitle', 'targetParagraph', 'spoilerType', 'spoilerPos', 'answers', 'id'

  """

  with open(file_path, 'r') as f:
    sentences = [json.loads(sentence.rstrip()) for sentence in f.readlines()]
    filtered_sentences = []
    c=0
    for _sentence in sentences[0]:
      # print(_sentence)
      # break
      temp_dict = dict()
      for _key, _value in _sentence.items():
        temp_dict[_key] = _value
      temp_dict_ansers = dict()
      temp_text = []
      temp_answer_start = []
      spoilerPoss = temp_dict['spoilerPos']
      if spoilerPoss is not None:
        for spoilpos in spoilerPoss:
          ttext = temp_dict['targetParagraph']
          temp_text.append(ttext[spoilpos[0]:spoilpos[1]])
          temp_answer_start.append(spoilpos[0])
      else:
        temp_text.append("")
        temp_answer_start.append(0)
      temp_dict_ansers['text'] = temp_text
      temp_dict_ansers['answer_start'] = temp_answer_start
      temp_dict['answers'] = temp_dict_ansers
      temp_dict['id'] = c
      c = c+1
      filtered_sentences.append(temp_dict)
      # print(temp_dict)
      


    # filtered_sentences = [{_key: _value for _key, _value in _sentence.items() } for _sentence in sentences[0]]

    return filtered_sentences

In [9]:
train_unstructured = load_data(original_train_path_reformat)
val_unstructured = load_data(original_val_path_reformat)
val_oracle_unstructured = load_data(original_val_oracle_reformat)

In [10]:
#remove improper datapoints
train_file = []
for train_file_ in  train_unstructured:
  # if train_file_['id']> 30000 and train_file_['id']< 40000:
  #   continue
  if train_file_['id']> 15000 and train_file_['id']< 17000:
    continue
  if train_file_['id']> 46999 and train_file_['id']< 47998:
    continue
  else:
    train_file.append(train_file_)
val_file = []
for val_file_ in  val_unstructured:
  if val_file_['id']> 1000 and val_file_['id']< 2000:
    continue
  if val_file_['id']> 9999 and val_file_['id']< 10998:
    continue
  else:
    val_file.append(val_file_)

In [11]:
def write_structured_data(structured_data, out_file_path):
    """
    Writes the strucured data on the disk in json format

    Parameters
    ----------
    structured_data : Dict
        Contains data to be written
    out_file_path : Str
        Path of the file

    Returns
    -------
    None.

    """
    with open(out_file_path, 'w') as outfile:
        for entry in structured_data:
            json.dump(entry, outfile)
            outfile.write('\n')

In [12]:
# train_file = train_file[:50]
write_structured_data(train_file, original_train_path_structured)

In [13]:
# val_file = val_file[:50]
write_structured_data(val_file, original_val_path_structured)
write_structured_data(val_oracle_unstructured, original_val_oracle_structured)

In [14]:

train_file[0]

{'uuid': '0af11f6b-c889-4520-9372-66ba25cb7657',
 'targetParagraphId': 0,
 'postText': 'Wes Welker Wanted Dinner With Tom Brady, But Patriots QB Had Better Idea',
 'targetTitle': 'Wes Welker Wanted Dinner With Tom Brady, But Patriots QB Had A Better Idea',
 'targetParagraph': 'Wes Welker Wanted Dinner With Tom Brady, But Patriots QB Had A Better Idea',
 'spoilerType': None,
 'spoilerPos': None,
 'answers': {'text': [''], 'answer_start': [0]},
 'id': 0}

In [15]:
#reading structured to Hugging face dataset object
dict_train = load_dataset("json", data_files= original_train_path_structured , split='train')
dict_val = load_dataset("json", data_files= original_val_path_structured , split='train')
dict_val_oracle = load_dataset("json", data_files= original_val_oracle_structured, split='train')

Using custom data configuration default-cdf7e8672a42dea5


Downloading and preparing dataset json/default to /soe/vigneshs/.cache/huggingface/datasets/json/default-cdf7e8672a42dea5/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 2420.26it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 406.35it/s]
                            

Dataset json downloaded and prepared to /soe/vigneshs/.cache/huggingface/datasets/json/default-cdf7e8672a42dea5/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


Using custom data configuration default-7ed95695a5796a9f


Downloading and preparing dataset json/default to /soe/vigneshs/.cache/huggingface/datasets/json/default-7ed95695a5796a9f/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 1738.21it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 464.74it/s]
                            

Dataset json downloaded and prepared to /soe/vigneshs/.cache/huggingface/datasets/json/default-7ed95695a5796a9f/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


Using custom data configuration default-64b0c3f88f5ceb34


Downloading and preparing dataset json/default to /soe/vigneshs/.cache/huggingface/datasets/json/default-64b0c3f88f5ceb34/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 2761.23it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 418.80it/s]
                            

Dataset json downloaded and prepared to /soe/vigneshs/.cache/huggingface/datasets/json/default-64b0c3f88f5ceb34/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.




In [16]:
#renaming and dropping columns in training file
dict_train = dict_train.rename_column("targetTitle", "title")
dict_train = dict_train.rename_column("targetParagraph", "context")
dict_train = dict_train.rename_column("postText", "question")

dict_train = dict_train.remove_columns(['spoilerPos'])

In [17]:
#renaming and dropping columns in validation file
dict_val = dict_val.rename_column("targetTitle", "title")
dict_val = dict_val.rename_column("targetParagraph", "context")
dict_val = dict_val.rename_column("postText", "question")

dict_val = dict_val.remove_columns(['spoilerPos'])

In [18]:
#renaming and dropping columns in validation file oracle
dict_val_oracle = dict_val_oracle.rename_column("targetTitle", "title")
dict_val_oracle = dict_val_oracle.rename_column("targetParagraph", "context")
dict_val_oracle = dict_val_oracle.rename_column("postText", "question")

dict_val_oracle = dict_val_oracle.remove_columns(['spoilerPos'])

In [19]:
dict_train

Dataset({
    features: ['uuid', 'targetParagraphId', 'question', 'title', 'context', 'spoilerType', 'answers', 'id'],
    num_rows: 45564
})

In [20]:
#initializing tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [21]:
# Tokenize the training data
def preprocess_function(examples):
      """
      Tokenizes the input data and also formats it according to the input format of bert trained on squad.
      
      Parameters
      ----------
      examples : Dataset Dict

      Returns
      -------
      inputs : Dataset Dict

      """
  # try:
      questions = [q.strip() for q in examples["question"]]
      inputs = tokenizer(
          questions,
          examples["context"],
          max_length=384,
          truncation="only_second",
          return_offsets_mapping=True,
          padding="max_length",
      )

      offset_mapping = inputs.pop("offset_mapping")
      answers = examples["answers"]
      start_positions = []
      end_positions = []

      try:
        for i, offset in enumerate(offset_mapping):
            answer = answers[i]
            start_char = answer["answer_start"][0]
            try:
              end_char = answer["answer_start"][0] + len(answer["text"][0])
            except Exception as E:

              end_char = 0 + len(answer["text"][0])

            sequence_ids = inputs.sequence_ids(i)

            # Find the start and end of the context
            idx = 0

            while sequence_ids[idx] != 1:
                idx += 1

            context_start = idx
            while sequence_ids[idx] == 1:
                idx += 1
            context_end = idx - 1

            # If the answer is not fully inside the context, label it (0, 0)
            try:
              if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
                  start_positions.append(0)
                  end_positions.append(0)
              else:
                  # Otherwise it's the start and end token positions
                  idx = context_start
                  while idx <= context_end and offset[idx][0] <= start_char:
                      idx += 1
                  start_positions.append(idx - 1)

                  idx = context_end
                  while idx >= context_start and offset[idx][1] >= end_char:
                      idx -= 1
                  end_positions.append(idx + 1)
            except:
              start_positions.append(0)
              end_positions.append(0)
      except:
        print(examples["id"])

        

      inputs["start_positions"] = start_positions
      inputs["end_positions"] = end_positions
      return inputs


tokenized_train = dict_train.map(preprocess_function, batched=True)#, remove_columns=dict_train.column_names)
tokenized_val = dict_val.map(preprocess_function, batched=True)#, remove_columns=dict_train["train"].column_names)

 98%|█████████▊| 45/46 [00:08<00:00,  5.01ba/s]
 91%|█████████ | 10/11 [00:02<00:00,  4.74ba/s]


In [22]:
dict_train.column_names

['uuid',
 'targetParagraphId',
 'question',
 'title',
 'context',
 'spoilerType',
 'answers',
 'id']

In [23]:
data_collator = DefaultDataCollator()

In [24]:
#Training the model
training_args = TrainingArguments(
    output_dir=results_dir,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=2,

)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()



The following columns in the training set don't have a corresponding argument in `BertForQuestionAnswering.forward` and have been ignored: spoilerType, targetParagraphId, context, title, answers, uuid, question, id. If spoilerType, targetParagraphId, context, title, answers, uuid, question, id are not expected by `BertForQuestionAnswering.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 45564
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2848
  Number of trainable parameters = 108893186


Epoch,Training Loss,Validation Loss
1,0.4716,0.354292


Saving model checkpoint to /soe/vigneshs/projects/nlp_243/project_1/data/qa_1/results/checkpoint-500
Configuration saved in /soe/vigneshs/projects/nlp_243/project_1/data/qa_1/results/checkpoint-500/config.json
Model weights saved in /soe/vigneshs/projects/nlp_243/project_1/data/qa_1/results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /soe/vigneshs/projects/nlp_243/project_1/data/qa_1/results/checkpoint-500/tokenizer_config.json
Special tokens file saved in /soe/vigneshs/projects/nlp_243/project_1/data/qa_1/results/checkpoint-500/special_tokens_map.json
Saving model checkpoint to /soe/vigneshs/projects/nlp_243/project_1/data/qa_1/results/checkpoint-1000
Configuration saved in /soe/vigneshs/projects/nlp_243/project_1/data/qa_1/results/checkpoint-1000/config.json
Model weights saved in /soe/vigneshs/projects/nlp_243/project_1/data/qa_1/results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in /soe/vigneshs/projects/nlp_243/project_1/data/qa_1/results/che

TrainOutput(global_step=2848, training_loss=0.3745309078288945, metrics={'train_runtime': 1032.5666, 'train_samples_per_second': 44.127, 'train_steps_per_second': 2.758, 'total_flos': 8929294067939328.0, 'train_loss': 0.3745309078288945, 'epoch': 1.0})

In [25]:
#tokenize the validation data and Oracle data
def preprocess_function_val(examples):
    """
    Tokenizes the validation data and also formats it according to the input format of bert trained on squad.
    
    Parameters
    ----------
    examples : Dataset Dict

    Returns
    -------
    inputs : Dataset Dict

    """
  # try:
    questions = [q.strip() for q in examples["question"]]
    try:
      inputs = tokenizer(
          questions,
          examples["context"],
          max_length=384,
          truncation="only_second",
          return_offsets_mapping=True,
          return_overflowing_tokens=True,
          padding="max_length",
      )
    except Exception as E:
      # print(e)
      pass
      # print(examples['id'])


    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    max_len = min(len(inputs['offset_mapping']),len(inputs['input_ids']),len(inputs['attention_mask']), 1000 )
    # Some Lenghts are problematic and have been handled here
    print(max_len)
    if(max_len < 1000):
      max_len = min(len(inputs['offset_mapping']),len(inputs['input_ids']),len(inputs['attention_mask']))
      # if(max_len < 790):
      #   max_len = 400
    if len(inputs['input_ids']) == 824:
      max_len = 818 
    if len(inputs['input_ids']) == 407:
      max_len = 406
    if len(inputs['input_ids']) == 410:
      max_len = 408
      # print(len(inputs['offset_mapping']),len(inputs['input_ids']),len(inputs['attention_mask']))
    inputs['input_ids'] = inputs['input_ids'][:max_len]
    inputs['token_type_ids'] = inputs['token_type_ids'][:max_len]
    inputs['attention_mask'] = inputs['attention_mask'][:max_len]
    inputs['offset_mapping'] = inputs['offset_mapping'][:max_len]

    try:
      for i in range(len(inputs["input_ids"])):
          sample_idx = sample_map[i]
          example_ids.append(examples["id"][sample_idx])

          sequence_ids = inputs.sequence_ids(i)
          offset = inputs["offset_mapping"][i]
          inputs["offset_mapping"][i] = [
              o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
          ]

      inputs["example_id"] = example_ids
    except:
      pass
      # print("a")
      # print(examples['id'])

    return inputs

In [26]:
tokenized_val = dict_val.map(preprocess_function_val, batched=True)
tokenized_val_oracle = dict_val_oracle.map(preprocess_function_val, batched=True)

  0%|          | 0/11 [00:00<?, ?ba/s]

1000


 18%|█▊        | 2/11 [00:00<00:02,  3.37ba/s]

1000


 27%|██▋       | 3/11 [00:00<00:02,  3.57ba/s]

1000


 36%|███▋      | 4/11 [00:01<00:01,  3.75ba/s]

1000


 45%|████▌     | 5/11 [00:01<00:01,  3.80ba/s]

1000


 55%|█████▍    | 6/11 [00:01<00:01,  3.84ba/s]

1000


 64%|██████▎   | 7/11 [00:02<00:01,  2.76ba/s]

1000


 73%|███████▎  | 8/11 [00:02<00:00,  3.03ba/s]

1000


 82%|████████▏ | 9/11 [00:02<00:00,  3.25ba/s]

1000


 91%|█████████ | 10/11 [00:02<00:00,  3.41ba/s]

1000


 91%|█████████ | 10/11 [00:03<00:00,  3.23ba/s]


410


  0%|          | 0/3 [00:00<?, ?ba/s]

1000


 67%|██████▋   | 2/3 [00:00<00:00,  3.69ba/s]

1000


 67%|██████▋   | 2/3 [00:00<00:00,  2.66ba/s]


824


In [27]:


def compute_metrics(start_logits, end_logits, features, examples):
    """
    Converts the logits back to text and returns the predicted text and the original text
    """
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    n_best = 20
    max_answer_length = 30
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return predicted_answers, theoretical_answers, examples
    # return metric.compute(predictions=predicted_answers, references=theoretical_answers)

In [28]:
#######


# LOAD THE MODEL

#########



In [29]:
training_args = TrainingArguments(
    output_dir= results_dir,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=2,

)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
)



PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [30]:
#Restructure the predictions and print the blue score for complete validation data and oracle data
def print_single_blue(predicted_answers, theoretical_answers):
  pred_list = []
  references_list = []
  for i in range(0,len(predicted_answers) ):
    pred_list.append(predicted_answers[i]['prediction_text'])
    references_list.append(theoretical_answers[i]['answers']['text'])
  results = bleu.compute(predictions=pred_list, references=references_list)
  print(results)
  return

In [31]:

#Restructure the predictions and print the bert score for complete validation data and oracle datadef print_single_bert(predicted_answers, theoretical_answers):
def print_single_bert(predicted_answers, theoretical_answers):
  pred_list = []
  references_list = []
  for i in range(0,len(predicted_answers) ):
    pred_list.append(predicted_answers[i]['prediction_text'])
    references_list.append(theoretical_answers[i]['answers']['text'])
  results = bertscore.compute(predictions=pred_list, references=references_list, model_type=model_type_bert_score)
  # print(results)
  # print(results)
  print("F1 Score")
  print(sum(results['f1'])/len(results['f1']))

  print("Precision Score")
  print(sum(results['precision'])/len(results['precision']))

  print("Recall Score")
  print(sum(results['recall'])/len(results['recall']))
  return

In [32]:
#get Predictions on the complete validation data and print bert and bleu scores
predictions, _, _ = trainer.predict(tokenized_val)
start_logits, end_logits = predictions
predicted_answers, theoretical_answers, examples = compute_metrics(start_logits, end_logits, tokenized_val, dict_val)
print("Bleu Score For Entire Validation Set")
print_single_blue(predicted_answers, theoretical_answers)

The following columns in the test set don't have a corresponding argument in `BertForQuestionAnswering.forward` and have been ignored: spoilerType, targetParagraphId, context, example_id, title, offset_mapping, answers, uuid, question, id. If spoilerType, targetParagraphId, context, example_id, title, offset_mapping, answers, uuid, question, id are not expected by `BertForQuestionAnswering.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 10408
  Batch size = 16


100%|██████████| 10408/10408 [00:07<00:00, 1429.08it/s]


Bleu Score For Entire Validation Set
{'bleu': 0.48928033324159165, 'precisions': [0.12419190200748553, 0.7744305657604702, 0.772572402044293, 0.7712871287128713], 'brevity_penalty': 1.0, 'length_ratio': 1.0691160421971626, 'translation_length': 11756, 'reference_length': 10996}


In [33]:
#get Predictions on the oracle data and print bert and bleu scores
predictions, _, _ = trainer.predict(tokenized_val_oracle)
start_logits, end_logits = predictions
predicted_answers, theoretical_answers, examples = compute_metrics(start_logits, end_logits, tokenized_val_oracle, dict_val_oracle)
print("Bleu Score For Oracle Validation Set")
print_single_bert(predicted_answers, theoretical_answers)

The following columns in the test set don't have a corresponding argument in `BertForQuestionAnswering.forward` and have been ignored: spoilerType, targetParagraphId, context, example_id, title, offset_mapping, answers, uuid, question, id. If spoilerType, targetParagraphId, context, example_id, title, offset_mapping, answers, uuid, question, id are not expected by `BertForQuestionAnswering.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 2818
  Batch size = 16


100%|██████████| 2818/2818 [00:01<00:00, 1411.65it/s]


Bleu Score For Oracle Validation Set


loading configuration file config.json from cache at /soe/vigneshs/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.24.0",
  "vocab_size": 30522
}

loading file vocab.txt from cache at /soe/vigneshs/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from c

F1 Score
0.7479740014898599
Precision Score
0.7968280240734729
Recall Score
0.7114927988153049




Getting Results on Top 10 and Top 4 Predictios of the Classification Model

In [34]:
def load_data_top(file_path):
    """
    Reads the input file and extracts sentences for each input data point
    from the answer index and return the list of json objects
    

    Parameters
    ----------
    file_path : Str
        path to the input jsonl file

    Returns
    -------
    filtered_sentences : List
        Contains json objects with the following fields:
        'uuid', 'postText', 'targetTitle', 'targetParagraph', 'spoilerType', 'spoilerPos', 'answers', 'id'

    """

    with open(file_path, 'r') as f:
      sentences = [json.loads(sentence.rstrip()) for sentence in f.readlines()]
      filtered_sentences = []
      c=0
      for _sentence in sentences[0]:

        temp_dict = dict()
        for _key, _value in _sentence.items():
          temp_dict[_key] = _value
        temp_dict_ansers = dict()
        temp_text = []
        temp_answer_start = []
        spoilerPoss = temp_dict['spoilerPos']
        if spoilerPoss is not None:
          for spoilpos in spoilerPoss:
            ttext = temp_dict['targetParagraph']
            temp_text.append(ttext[spoilpos[0]:spoilpos[1]])
            temp_answer_start.append(spoilpos[0])
        else:
          temp_text.append("")
          temp_answer_start.append(0)
        temp_dict_ansers['text'] = temp_text
        temp_dict_ansers['answer_start'] = temp_answer_start
        temp_dict['answers'] = temp_dict_ansers
        temp_dict['id'] = c
        temp_dict['uuid'] = temp_dict['uuid']
        c = c+1
        filtered_sentences.append(temp_dict)
      return filtered_sentences

In [35]:
#loading data for top 10 and top4 predictions
val_top10_unstructured = load_data_top(original_val_top10_reformat)
val_top4_unstructured = load_data_top(original_val_top4_reformat)

In [36]:
#writing structered data to the disk
write_structured_data(val_top10_unstructured, original_val_top10_structured)
write_structured_data(val_top4_unstructured, original_val_top4_structured)

In [37]:
#Reading Structred data files in a hugging face data object
dict_val_top10 = load_dataset("json", data_files= original_val_top10_structured , split='train')
dict_val_top4 = load_dataset("json", data_files= original_val_top4_structured , split='train')

Using custom data configuration default-5821924657eebb7a


Downloading and preparing dataset json/default to /soe/vigneshs/.cache/huggingface/datasets/json/default-5821924657eebb7a/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 3146.51it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 306.24it/s]
                            

Dataset json downloaded and prepared to /soe/vigneshs/.cache/huggingface/datasets/json/default-5821924657eebb7a/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


Using custom data configuration default-aaff0922598291ee


Downloading and preparing dataset json/default to /soe/vigneshs/.cache/huggingface/datasets/json/default-aaff0922598291ee/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 1895.30it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 473.67it/s]
                            

Dataset json downloaded and prepared to /soe/vigneshs/.cache/huggingface/datasets/json/default-aaff0922598291ee/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.




In [38]:
#renaming and dropping columns
dict_val_top10 = dict_val_top10.rename_column("targetTitle", "title")
dict_val_top10 = dict_val_top10.rename_column("targetParagraph", "context")
dict_val_top10 = dict_val_top10.rename_column("postText", "question")
dict_val_top10 = dict_val_top10.remove_columns(['spoilerPos'])

In [39]:
#renaming and dropping columns
dict_val_top4 = dict_val_top4.rename_column("targetTitle", "title")
dict_val_top4 = dict_val_top4.rename_column("targetParagraph", "context")
dict_val_top4 = dict_val_top4.rename_column("postText", "question")

dict_val_top4 = dict_val_top4.remove_columns(['spoilerPos'])

In [40]:
# Create Tokenizer for Top 10 and Top 4 Data inputs. There are seperate tokenizers for different data sets because of the extra parameter requirements.
def preprocess_function_inference_top(examples):
    """
    Tokenizes the input data and also formats it according to the input format of bert trained on squad.
    
    Parameters
    ----------
    examples : Dataset Dict

    Returns
    -------
    inputs : Dataset Dict

    """
  # try:
    questions = [q.strip() for q in examples["question"]]
    try:
      inputs = tokenizer(
          questions,
          examples["context"],
          max_length=384,
          truncation="only_second",
          return_offsets_mapping=True,
          return_overflowing_tokens=True,
          padding="max_length",
      )
    except:
      pass
      # print(examples['id'])


    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    max_len = min(len(inputs['offset_mapping']),len(inputs['input_ids']),len(inputs['attention_mask']), 1000 )
    # if max_len 
    print(max_len)
    # Some Lenghts are problematic and have been handled here
    if(max_len < 1000):
      max_len = 790
    if len(inputs['input_ids'])== 822:
      max_len = 820
    if len(inputs['input_ids'])== 148:
      max_len = 147
      # print(len(inputs['offset_mapping']),len(inputs['input_ids']),len(inputs['attention_mask']))
    inputs['input_ids'] = inputs['input_ids'][:max_len]
    inputs['token_type_ids'] = inputs['token_type_ids'][:max_len]
    inputs['attention_mask'] = inputs['attention_mask'][:max_len]
    inputs['offset_mapping'] = inputs['offset_mapping'][:max_len]

    try:
      for i in range(len(inputs["input_ids"])):
          sample_idx = sample_map[i]
          example_ids.append(examples["id"][sample_idx])

          sequence_ids = inputs.sequence_ids(i)
          offset = inputs["offset_mapping"][i]
          inputs["offset_mapping"][i] = [
              o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
          ]

      inputs["example_id"] = example_ids
    except:
      pass
      # print("a")
      # print(examples['id'])

    return inputs



tokenized_val_top10 = dict_val_top10.map(preprocess_function_inference_top, batched=True)
tokenized_val_top4 = dict_val_top4.map(preprocess_function_inference_top, batched=True)

 14%|█▍        | 1/7 [00:00<00:01,  3.18ba/s]

1000


 29%|██▊       | 2/7 [00:00<00:01,  3.80ba/s]

1000


 43%|████▎     | 3/7 [00:00<00:00,  4.04ba/s]

1000


 57%|█████▋    | 4/7 [00:01<00:01,  2.85ba/s]

1000


 71%|███████▏  | 5/7 [00:01<00:00,  3.24ba/s]

1000


 86%|████████▌ | 6/7 [00:01<00:00,  3.40ba/s]

1000


 86%|████████▌ | 6/7 [00:01<00:00,  3.05ba/s]


822


  0%|          | 0/4 [00:00<?, ?ba/s]

1000


 50%|█████     | 2/4 [00:00<00:00,  3.63ba/s]

1000


 75%|███████▌  | 3/4 [00:00<00:00,  3.45ba/s]

1000
148





In [41]:
#create tokenizedval_example_id to uuid mapping:
def create_tokenizedval_example_id(tokenized_val_c):
    """
    Create a mapping of index id to uuid which is unique
    for every datapoint in the original.

    Parameters
    ----------
    tokenized_val_c : Dict
        tokenized input

    Returns
    -------
    id_uuid_mapping : Dcit
        Contains a mapping of example_id to uuid

    """
    id_uuid_mapping = dict()
    for i in tokenized_val_c:
      temp_uid = i['uuid']
      id_uuid_mapping[i['example_id']] = temp_uid
      # id_uuid_mapping[temp_uid] = id_uuid_mapping.get(temp_uid, [])+[i['example_id']]
    return id_uuid_mapping 



In [42]:
#getting id and uuid mappings for both the inputs
id_uuid_mapping_top10 = create_tokenizedval_example_id(tokenized_val_top10)
id_uuid_mapping_top4 = create_tokenizedval_example_id(tokenized_val_top4)

In [43]:
# Bleu Score
def print_bleu_score_top(uuid_theory,uuid_pred):
    """
    Reformats the data according to the input format of bleu.
    Calculates the bleu score and prints it.

    Parameters
    ----------
    uuid_theory : Dict
        Contains the original labels
    uuid_pred : Dict
        Contains the answers predicted by the model

    Returns
    -------
    None.

    """
    final_predictions_list = []
    final_reference_list = []

    for key, value in uuid_theory.items():
      temp_ref_list= value
      temp_pred_list = uuid_pred[key]
      for i in temp_pred_list:
        final_predictions_list.append(i)
        final_reference_list.append(temp_ref_list)


    results = bleu.compute(predictions=final_predictions_list,
                          references=final_reference_list)
    print(results)
    return

In [44]:
# Bert Score
def print_bert_score_top(uuid_theory,uuid_pred):
    """
    Reformats the data according to the input format of bertscore function.
    Calculates the bert score and prints it.

    Parameters
    ----------
    uuid_theory : Dict
        Contains the original labels
    uuid_pred : Dict
        Contains the answers predicted by the model

    Returns
    -------
    None.

    """
    final_predictions_list = []
    final_reference_list = []

    for key, value in uuid_theory.items():
      temp_ref_list= value
      temp_pred_list = uuid_pred[key]
      for i in temp_pred_list:
        final_predictions_list.append(i)
        final_reference_list.append(temp_ref_list[0])


    results = bertscore.compute(predictions=final_predictions_list, references=final_reference_list,
                                model_type= model_type_bert_score)
    # print(results)
    print("F1 Score")
    print(sum(results['f1'])/len(results['f1']))

    print("Precision Score")
    print(sum(results['precision'])/len(results['precision']))

    print("Recall Score")
    print(sum(results['recall'])/len(results['recall']))
    return

In [45]:
#Get QA predictions for Top 10 Predictions of the Classification Model and print the Bleu and Bert Scores
predictions, _, _ = trainer.predict(tokenized_val_top10)
start_logits, end_logits = predictions
predicted_answers, theoretical_answers, examples = compute_metrics(start_logits, end_logits, tokenized_val_top10, dict_val_top10)
uuid_theory = dict()
uuid_pred = dict()
for i in range(len(theoretical_answers)):
  try:
    temp_uuid = id_uuid_mapping_top10[theoretical_answers[i]['id']]
    # print(temp_uuid)
    uuid_theory[temp_uuid] = uuid_theory.get(temp_uuid, []) + theoretical_answers[i]['answers']['text']
    uuid_pred[temp_uuid] = uuid_pred.get(temp_uuid, []) + [predicted_answers[i]['prediction_text']]
  except Exception as e:
    pass



for key, value in uuid_pred.items():
  temp_string = uuid_pred[key]
  uuid_pred[key] = list(set(temp_string))


for key, value in uuid_theory.items():
  temp_string = uuid_theory[key]
  temp_final_theory = []
  for i in temp_string:
    if len(i)> 0:
      temp_final_theory.append(i)
  if len(temp_final_theory)>0:
    uuid_theory[key] = temp_final_theory
  else:
    uuid_theory[key] = [""]

print("Bleu Score for top 10 is")
print_bleu_score_top(uuid_theory,uuid_pred )

print("Bert Score for top 10 is")
print_bert_score_top(uuid_theory,uuid_pred )

The following columns in the test set don't have a corresponding argument in `BertForQuestionAnswering.forward` and have been ignored: spoilerType, targetParagraphId, context, example_id, title, offset_mapping, answers, uuid, question, id. If spoilerType, targetParagraphId, context, example_id, title, offset_mapping, answers, uuid, question, id are not expected by `BertForQuestionAnswering.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 6820
  Batch size = 16


100%|██████████| 6820/6820 [00:04<00:00, 1428.47it/s]


Bleu Score for top 10 is
{'bleu': 0.0009055218696851716, 'precisions': [0.25096679557274304, 0.7946317103620475, 0.7860230547550432, 0.7814070351758794], 'brevity_penalty': 0.0015306477376356225, 'length_ratio': 0.1336529550153276, 'translation_length': 7499, 'reference_length': 56108}
Bert Score for top 10 is




F1 Score
0.5627420705661971
Precision Score
0.5934504868873866
Recall Score
0.536867658605451


In [46]:
#Get QA predictions for Top 4 Predictions of the Classification Model and print the Bleu and Bert Scores
predictions, _, _ = trainer.predict(tokenized_val_top4)
start_logits, end_logits = predictions
predicted_answers, theoretical_answers, examples = compute_metrics(start_logits, end_logits, tokenized_val_top4, dict_val_top4)
uuid_theory = dict()
uuid_pred = dict()
for i in range(len(theoretical_answers)):
  try:
    temp_uuid = id_uuid_mapping_top4[theoretical_answers[i]['id']]
    uuid_theory[temp_uuid] = uuid_theory.get(temp_uuid, []) + theoretical_answers[i]['answers']['text']
    uuid_pred[temp_uuid] = uuid_pred.get(temp_uuid, []) + [predicted_answers[i]['prediction_text']]
  except Exception as e:
    pass



for key, value in uuid_pred.items():
  temp_string = uuid_pred[key]
  uuid_pred[key] = list(set(temp_string))


for key, value in uuid_theory.items():
  temp_string = uuid_theory[key]
  temp_final_theory = []
  for i in temp_string:
    if len(i)> 0:
      temp_final_theory.append(i)
  if len(temp_final_theory)>0:
    uuid_theory[key] = temp_final_theory
  else:
    uuid_theory[key] = [""]

print("Bleu Score for top4 is")
print_bleu_score_top(uuid_theory,uuid_pred )

print("Bert Score for top4 is")
print_bert_score_top(uuid_theory,uuid_pred )

The following columns in the test set don't have a corresponding argument in `BertForQuestionAnswering.forward` and have been ignored: spoilerType, targetParagraphId, context, example_id, title, offset_mapping, answers, uuid, question, id. If spoilerType, targetParagraphId, context, example_id, title, offset_mapping, answers, uuid, question, id are not expected by `BertForQuestionAnswering.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 3147
  Batch size = 16


100%|██████████| 3147/3147 [00:02<00:00, 1410.46it/s]


Bleu Score for top4 is
{'bleu': 0.01177687857062422, 'precisions': [0.33403508771929824, 0.7913238593866866, 0.7865168539325843, 0.7853560682046138], 'brevity_penalty': 0.018526780265865017, 'length_ratio': 0.20045953296445654, 'translation_length': 4275, 'reference_length': 21326}
Bert Score for top4 is




F1 Score
0.46187906198512946
Precision Score
0.4860323393239384
Recall Score
0.4418107842591606
