In [1]:
# !pip install datasets evaluate transformers[sentencepiece]
# !pip install accelerate
# # To run the training on TPU, you will need to uncomment the following line:
# # !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
# !apt install git-lfs

In [1]:
import torch

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [3]:
from datasets import load_dataset

from datasets import load_from_disk

load_dir = "../data/my_data_50K"

raw_datasets = load_from_disk(load_dir)
print(raw_datasets)

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 50000
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 34930
    })
})


In [4]:
raw_datasets['train'][0]
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [5]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [6]:
tokenized_dataset = raw_datasets.map(preprocess_function, batched=True, remove_columns=raw_datasets["train"].column_names)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/34930 [00:00<?, ? examples/s]

In [7]:
from transformers import DefaultDataCollator
data_collator = DefaultDataCollator()

2024-04-26 00:28:12.922577: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-26 00:28:12.922629: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-26 00:28:12.923430: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-26 00:28:12.929451: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer, AutoConfig

config = AutoConfig.from_pretrained('distilbert-base-uncased')
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased").to('cuda')

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [9]:
training_args = TrainingArguments(
    output_dir="span_50",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33msachinsharma[0m ([33miiitd-sachin[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,0.1745,0.151054
2,0.1326,0.148084
3,0.1099,0.201807
4,0.0927,0.210643
5,0.0772,0.248516


TrainOutput(global_step=15625, training_loss=0.1337382134399414, metrics={'train_runtime': 3026.4678, 'train_samples_per_second': 82.605, 'train_steps_per_second': 5.163, 'total_flos': 2.4497456256e+16, 'train_loss': 0.1337382134399414, 'epoch': 5.0})

In [10]:
model.save_pretrained("../models/span_50")
tokenizer.save_pretrained("../models/span_50")

('../models/span_50/tokenizer_config.json',
 '../models/span_50/special_tokens_map.json',
 '../models/span_50/vocab.txt',
 '../models/span_50/added_tokens.json',
 '../models/span_50/tokenizer.json')

In [11]:
from transformers import AutoModelForQuestionAnswering

model = AutoModelForQuestionAnswering.from_pretrained("../models/span_50").to('cuda')

In [12]:
raw_datasets['validation']['question']

['hey',
 'hey',
 'hey',
 'hey',
 'hey',
 'hey',
 'hey',
 'hey',
 'hey',
 'hey',
 'hey',
 'hey',
 'hey',
 'hey',
 'hey',
 'so what are you guys in the market for we have got uh scarves tulip post cards',
 'so what are you guys in the market for we have got uh scarves tulip post cards',
 'so what are you guys in the market for we have got uh scarves tulip post cards',
 'so what are you guys in the market for we have got uh scarves tulip post cards',
 'so what are you guys in the market for we have got uh scarves tulip post cards',
 'so what are you guys in the market for we have got uh scarves tulip post cards',
 'so what are you guys in the market for we have got uh scarves tulip post cards',
 'so what are you guys in the market for we have got uh scarves tulip post cards',
 'so what are you guys in the market for we have got uh scarves tulip post cards',
 'so what are you guys in the market for we have got uh scarves tulip post cards',
 'so what are you guys in the market for we have g

In [13]:
from transformers import pipeline

question_answerer = pipeline("question-answering", model="../models/span_50", tokenizer="../models/span_50", device=device)
predictions = question_answerer(question=raw_datasets['validation']['question'], context=raw_datasets['validation']['context'])
predictions

[{'score': 0.9999971389770508, 'start': 0, 'end': 3, 'answer': 'hey'},
 {'score': 0.9996564388275146, 'start': 0, 'end': 2, 'answer': 'so'},
 {'score': 0.999902606010437, 'start': 0, 'end': 5, 'answer': 'check'},
 {'score': 0.9999277591705322, 'start': 0, 'end': 4, 'answer': 'well'},
 {'score': 0.9996613264083862, 'start': 0, 'end': 4, 'answer': 'well'},
 {'score': 0.9979308843612671, 'start': 0, 'end': 3, 'answer': 'all'},
 {'score': 0.9998599290847778, 'start': 0, 'end': 2, 'answer': 'oh'},
 {'score': 0.9998688697814941, 'start': 0, 'end': 3, 'answer': 'how'},
 {'score': 0.9989694952964783, 'start': 0, 'end': 2, 'answer': 'oh'},
 {'score': 0.999990701675415, 'start': 0, 'end': 4, 'answer': 'they'},
 {'score': 0.995757520198822, 'start': 0, 'end': 3, 'answer': 'all'},
 {'score': 0.9993592500686646, 'start': 0, 'end': 2, 'answer': 'oh'},
 {'score': 0.999792754650116, 'start': 0, 'end': 4, 'answer': 'good'},
 {'score': 0.9999994039535522, 'start': 0, 'end': 6, 'answer': 'thanks'},
 {'sc

In [14]:
import evaluate

def findEvalMetrics(true_labels, predictions):
    bleu = evaluate.load("bleu")
    results_bleu1 = bleu.compute(predictions=predictions, references=true_labels,max_order = 1)
    results_bleu2 = bleu.compute(predictions=predictions, references=true_labels,max_order = 2)
    results_bleu3 = bleu.compute(predictions=predictions, references=true_labels,max_order = 3)
    results_bleu4 = bleu.compute(predictions=predictions, references=true_labels,max_order = 4)

    results_bleu=[results_bleu1, results_bleu2, results_bleu3, results_bleu4]

    meteor = evaluate.load("meteor")
    results_meteor = meteor.compute(predictions=predictions, references=true_labels)

    return [results_bleu,results_meteor]

In [18]:
null_token = '<null>'

In [19]:
true_labels = [x['text'][0] if x['text'][0] != '' else null_token for x in raw_datasets['validation']['answers']]
# true_labels = [x['text'][0] for x in raw_datasets['validation']['answers']]
true_labels

['hey',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 'check this out huh yeah that is the stuff',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 'you are not really gonna buy that are you do not you think you have embarrassed me eno

In [20]:
pred_labels = [x['answer'] if len(x['answer'].split()) != 1 else null_token for x in predictions]
# pred_labels = [x['answer'] if len(x['answer'].split()) != 1 else '' for x in predictions]
# pred_labels = [x['answer'] for x in predictions]
pred_labels

['<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 'well i like it here you go',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 'you are not really gonna buy',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 '<null>',
 

In [21]:
metrics = findEvalMetrics(true_labels, pred_labels)
metrics

[nltk_data] Downloading package wordnet to /home/nsl/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/nsl/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/nsl/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


[[{'bleu': 0.888278584925901,
   'precisions': [0.9271250617351698],
   'brevity_penalty': 0.9581000682512396,
   'length_ratio': 0.9589538493922012,
   'translation_length': 109338,
   'reference_length': 114018},
  {'bleu': 0.8831873725710925,
   'precisions': [0.9271250617351698, 0.9165277927104613],
   'brevity_penalty': 0.9581000682512396,
   'length_ratio': 0.9589538493922012,
   'translation_length': 109338,
   'reference_length': 114018},
  {'bleu': 0.8720846642255463,
   'precisions': [0.9271250617351698, 0.9165277927104613, 0.8874816353412027],
   'brevity_penalty': 0.9581000682512396,
   'length_ratio': 0.9589538493922012,
   'translation_length': 109338,
   'reference_length': 114018},
  {'bleu': 0.7167426776110766,
   'precisions': [0.9271250617351698,
    0.9165277927104613,
    0.8874816353412027,
    0.41530412034009156],
   'brevity_penalty': 0.9581000682512396,
   'length_ratio': 0.9589538493922012,
   'translation_length': 109338,
   'reference_length': 114018}],
 {'

In [22]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, f1_score

accuracy = accuracy_score(true_labels, pred_labels)
precision = precision_score(true_labels, pred_labels, average='weighted', zero_division=1)
recall = recall_score(true_labels, pred_labels, average='weighted', zero_division=1)
f1 = f1_score(true_labels, pred_labels, average='weighted', zero_division=1)

In [23]:
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

Accuracy: 0.9438877755511023
Precision: 0.9615942397799855
Recall: 0.9438877755511023
F1 Score: 0.9346441248870244


In [21]:
# context = 'all right look you are not really gonna buy that are you do not you think you have embarrassed me enough for one day'
# question = 'oh i embarrass you'
# ans = 'you have embarrassed me enough for one day'
# question = 'Ohh , you are about to get a little luckier .'
# context = 'You look amazing . I am the luckiest man in the world .'
# answer = 'You look amazing . I am the luckiest man in the world .'
# from transformers import pipeline

# question_answerer = pipeline("question-answering", model="../models/span_nz", tokenizer="../models/span_nz", device=device)
# test_predictions = question_answerer(question=question, context=context)
# test_predictions
# from transformers import AutoTokenizer

# tokenizer = AutoTokenizer.from_pretrained("my_awesome_qa_model")
# inputs = tokenizer(question, context, return_tensors="pt")
# import torch
# from transformers import AutoModelForQuestionAnswering

# model = AutoModelForQuestionAnswering.from_pretrained("my_awesome_qa_model")
# with torch.no_grad():
#     outputs = model(**inputs)
# answer_start_index = outputs.start_logits.argmax()
# answer_end_index = outputs.end_logits.argmax()
# predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
# tokenizer.decode(predict_answer_tokens)