In [1]:
# !pip install datasets evaluate transformers[sentencepiece]
# !pip install accelerate
# # To run the training on TPU, you will need to uncomment the following line:
# # !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
# !apt install git-lfs

In [2]:
import torch

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [4]:
from datasets import load_dataset

from datasets import load_from_disk

load_dir = "../data/my_data_NO_ZERO"

raw_datasets = load_from_disk(load_dir)
print(raw_datasets)

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 7464
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 1792
    })
})


In [5]:
raw_datasets['train'][0]
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [6]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [7]:
tokenized_dataset = raw_datasets.map(preprocess_function, batched=True, remove_columns=raw_datasets["train"].column_names)

In [8]:
from transformers import DefaultDataCollator
data_collator = DefaultDataCollator()

2024-04-26 00:07:32.547031: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-26 00:07:32.547079: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-26 00:07:32.547925: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-26 00:07:32.553978: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [9]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer, AutoConfig

config = AutoConfig.from_pretrained('distilbert-base-uncased')
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased").to('cuda')

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
training_args = TrainingArguments(
    output_dir="span_nz",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33msachinsharma[0m ([33miiitd-sachin[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,No log,0.588528
2,0.933300,0.59974
3,0.470400,0.631586
4,0.349800,0.704786
5,0.268200,0.752165


TrainOutput(global_step=2335, training_loss=0.4655816023048548, metrics={'train_runtime': 390.9338, 'train_samples_per_second': 95.464, 'train_steps_per_second': 5.973, 'total_flos': 3656980269895680.0, 'train_loss': 0.4655816023048548, 'epoch': 5.0})

In [11]:
model.save_pretrained("../models/span_nz")
tokenizer.save_pretrained("../models/span_nz")

('../models/span_nz/tokenizer_config.json',
 '../models/span_nz/special_tokens_map.json',
 '../models/span_nz/vocab.txt',
 '../models/span_nz/added_tokens.json',
 '../models/span_nz/tokenizer.json')

In [12]:
from transformers import AutoModelForQuestionAnswering

model = AutoModelForQuestionAnswering.from_pretrained("../models/span_nz").to('cuda')

In [13]:
raw_datasets['validation']['question']

['hey',
 'check this out huh yeah that is the stuff what do you think',
 'all right look you are not really gonna buy that are you do not you think you have embarrassed me enough for one day',
 'oh i embarrass you',
 'oh i embarrass you',
 'all right look if you insist on wearing that in public you know you are gonna spend the rest of the afternoon all by yourself',
 'oh yeah if you are gonna make me choose between you and the hat i choose the hat',
 'oh yeah if you are gonna make me choose between you and the hat i choose the hat',
 'okay wait all right that is it okay i am out of here i am not going to be embarrassed anymore',
 'look this is ridiculous we should be packing you',
 'great monica moving',
 'i am not',
 'oh really then how come all your stuff is in this box',
 'hey',
 'hey',
 'hey',
 'you were the next caller five hours ago you must be going crazy',
 'you were the next caller five hours ago you must be going crazy',
 'hey hey what going on',
 'she broke my arm',
 'he tou

In [14]:
from transformers import pipeline

question_answerer = pipeline("question-answering", model="../models/span_nz", tokenizer="../models/span_nz", device=device)
predictions = question_answerer(question=raw_datasets['validation']['question'], context=raw_datasets['validation']['context'])
predictions

[{'score': 0.9999858140945435, 'start': 0, 'end': 3, 'answer': 'hey'},
 {'score': 0.4520484209060669,
  'start': 24,
  'end': 41,
  'answer': 'that is the stuff'},
 {'score': 0.2852475047111511,
  'start': 74,
  'end': 116,
  'answer': 'you have embarrassed me enough for one day'},
 {'score': 0.6674448251724243,
  'start': 74,
  'end': 116,
  'answer': 'you have embarrassed me enough for one day'},
 {'score': 0.992686927318573,
  'start': 3,
  'end': 18,
  'answer': 'i embarrass you'},
 {'score': 0.26844319701194763,
  'start': 18,
  'end': 54,
  'answer': 'you insist on wearing that in public'},
 {'score': 0.0586727000772953,
  'start': 15,
  'end': 54,
  'answer': 'if you insist on wearing that in public'},
 {'score': 0.21603237092494965,
  'start': 11,
  'end': 80,
  'answer': 'you are gonna make me choose between you and the hat i choose the hat'},
 {'score': 0.08288056403398514,
  'start': 11,
  'end': 80,
  'answer': 'you are gonna make me choose between you and the hat i choose 

In [15]:
import evaluate

def findEvalMetrics(true_labels, predictions):
    bleu = evaluate.load("bleu")
    results_bleu1 = bleu.compute(predictions=predictions, references=true_labels,max_order = 1)
    results_bleu2 = bleu.compute(predictions=predictions, references=true_labels,max_order = 2)
    results_bleu3 = bleu.compute(predictions=predictions, references=true_labels,max_order = 3)
    results_bleu4 = bleu.compute(predictions=predictions, references=true_labels,max_order = 4)

    results_bleu=[results_bleu1, results_bleu2, results_bleu3, results_bleu4]

    meteor = evaluate.load("meteor")
    results_meteor = meteor.compute(predictions=predictions, references=true_labels)

    return [results_bleu,results_meteor]

In [16]:
# true_labels = [x['text'][0] if x['text'][0] != '' else 'null' for x in raw_datasets['validation']['answers']]
true_labels = [x['text'][0] for x in raw_datasets['validation']['answers']]
true_labels

['hey',
 'check this out huh yeah that is the stuff',
 'you are not really gonna buy that are you do not you think you have embarrassed me enough for one day',
 'you have embarrassed me enough for one day',
 'i embarrass you',
 'you insist on wearing that in public',
 'if you insist on wearing that in public you know you are gonna spend the rest of the afternoon all by yourself',
 'you are gonna make me choose between you and the hat',
 'if you are gonna make me choose between you and the hat i choose the hat',
 'you are moving',
 'monica moving',
 'monica moving',
 'all your stuff is in this box',
 'hey',
 'hey',
 'hey',
 'you are the next caller',
 'you were the next caller five hours ago',
 'what going on',
 'she broke my arm',
 'he touched my fanny',
 'she touched mine first',
 'he touched my fanny',
 'no she touched mine first',
 'this girl is good',
 'this girl is good',
 'look at you all jealous',
 'everyone else is happy she is done',
 'my next song called phoebe buffay what ca

In [17]:
# pred_labels = [x['answer'] if len(x['answer'].split()) != 1 else 'null' for x in predictions]
# pred_labels = [x['answer'] if len(x['answer'].split()) != 1 else '' for x in predictions]
pred_labels = [x['answer'] for x in predictions]
pred_labels

['hey',
 'that is the stuff',
 'you have embarrassed me enough for one day',
 'you have embarrassed me enough for one day',
 'i embarrass you',
 'you insist on wearing that in public',
 'if you insist on wearing that in public',
 'you are gonna make me choose between you and the hat i choose the hat',
 'you are gonna make me choose between you and the hat i choose the hat',
 'how about you are moving',
 'great monica moving',
 'monica moving',
 'how come all your stuff is in this box',
 'hey',
 'hey',
 'hey',
 'you are the next caller',
 'you were the next caller five hours ago you must be going crazy',
 'what going on',
 'she broke my arm',
 'he touched my fanny',
 'she touched mine first',
 'he touched my fanny',
 'she touched mine first',
 'this girl is good',
 'this girl is good',
 'look at you all jealous',
 'everyone else is happy she is done',
 'my next song called phoebe buffay what can i say',
 'i should not have left you that way',
 'no one of those look for the hidden meanin

In [18]:
metrics = findEvalMetrics(true_labels, pred_labels)
metrics

[nltk_data] Downloading package wordnet to /home/nsl/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/nsl/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/nsl/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


[[{'bleu': 0.7968855295258308,
   'precisions': [0.8980452280567267],
   'brevity_penalty': 0.8873556750033685,
   'length_ratio': 0.8932484250890167,
   'translation_length': 13045,
   'reference_length': 14604},
  {'bleu': 0.7903650243044782,
   'precisions': [0.8980452280567267, 0.8834088687461121],
   'brevity_penalty': 0.8873556750033685,
   'length_ratio': 0.8932484250890167,
   'translation_length': 13045,
   'reference_length': 14604},
  {'bleu': 0.7842001659169244,
   'precisions': [0.8980452280567267, 0.8834088687461121, 0.8700167855644146],
   'brevity_penalty': 0.8873556750033685,
   'length_ratio': 0.8932484250890167,
   'translation_length': 13045,
   'reference_length': 14604},
  {'bleu': 0.777436437515643,
   'precisions': [0.8980452280567267,
    0.8834088687461121,
    0.8700167855644146,
    0.8536523610583618],
   'brevity_penalty': 0.8873556750033685,
   'length_ratio': 0.8932484250890167,
   'translation_length': 13045,
   'reference_length': 14604}],
 {'meteor': 

In [19]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, f1_score
accuracy = accuracy_score(true_labels, pred_labels)
precision = precision_score(true_labels, pred_labels, average='weighted', zero_division=1)
recall = recall_score(true_labels, pred_labels, average='weighted', zero_division=1)
f1 = f1_score(true_labels, pred_labels, average='weighted', zero_division=1)

In [20]:
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

Accuracy: 0.6166294642857143
Precision: 0.9844447544642857
Recall: 0.6166294642857143
F1 Score: 0.6218154167261309


In [21]:
# context = 'all right look you are not really gonna buy that are you do not you think you have embarrassed me enough for one day'
# question = 'oh i embarrass you'
# ans = 'you have embarrassed me enough for one day'
# question = 'Ohh , you are about to get a little luckier .'
# context = 'You look amazing . I am the luckiest man in the world .'
# answer = 'You look amazing . I am the luckiest man in the world .'
# from transformers import pipeline

# question_answerer = pipeline("question-answering", model="../models/span_nz", tokenizer="../models/span_nz", device=device)
# test_predictions = question_answerer(question=question, context=context)
# test_predictions
# from transformers import AutoTokenizer

# tokenizer = AutoTokenizer.from_pretrained("my_awesome_qa_model")
# inputs = tokenizer(question, context, return_tensors="pt")
# import torch
# from transformers import AutoModelForQuestionAnswering

# model = AutoModelForQuestionAnswering.from_pretrained("my_awesome_qa_model")
# with torch.no_grad():
#     outputs = model(**inputs)
# answer_start_index = outputs.start_logits.argmax()
# answer_end_index = outputs.end_logits.argmax()
# predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
# tokenizer.decode(predict_answer_tokens)