In [1]:
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForQuestionAnswering, DefaultDataCollator
from datasets import load_dataset, Dataset
import torch
import numpy as np
import tensorflow as tf

In [11]:
def sentence_length(sentence):
    """Return the number of whitespace-delimited tokens in ``str(sentence)``.

    The argument is passed through ``str()`` first, so non-string values are
    measured by their string form.
    """
    tokens = str(sentence).split()
    return len(tokens)

In [12]:
# Tokenizer: loaded from the "bert-base-uncased" checkpoint name; later cells
# read `tokenizer.model_max_length` and call `tokenizer(...)` on text pairs.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [13]:
# Load the "sciq" dataset; the result is indexed by split name
# ("train", "validation", "test") in later cells.
science_questions = load_dataset("sciq")

Found cached dataset sciq (C:/Users/Sravanth/.cache/huggingface/datasets/sciq/default/0.1.0/50e5c6e3795b55463819d399ec417bfd4c3c621105e00295ddb5f3633d708493)


  0%|          | 0/3 [00:00<?, ?it/s]

In [23]:

# `arrange_data` is imported from a module named `help` that is not part of this
# notebook (presumably a project-local help.py — confirm).
# NOTE(review): what it does with `context` / `file_name`, and whether it writes
# any file, cannot be determined from this notebook.
from help import arrange_data
arrange_data(science_questions, context="support", file_name="sciq")

In [47]:
def new_columns(dataset, list_of_columns):
    """Rename dotted columns to their short name, updating ``dataset`` in place.

    Each entry of ``list_of_columns`` is split on ``'.'``. The first piece is
    used as the key into ``dataset`` and the second piece is the new column
    name, i.e. ``dataset[first] = dataset[first].rename_column(entry, second)``.

    Args:
        dataset: Mapping-like object supporting item get/set whose values
            expose ``rename_column(old_name, new_name)``.
        list_of_columns: Iterable of column-name strings.

    Returns:
        None. ``dataset`` is modified through item assignment.

    Notes:
        * Entries without a ``'.'`` have no short name to rename to and are
          skipped instead of aborting the loop with ``IndexError``.
        * A ``ValueError`` raised while renaming is ignored so that one bad
          entry does not stop the remaining renames (best-effort).
        * A first piece that is not a key of ``dataset`` still raises whatever
          ``dataset[...]`` raises (e.g. ``KeyError`` for a dict).
    """
    for name in list_of_columns:
        parts = name.split('.')
        if len(parts) < 2:
            # No dotted prefix, so there is nothing to rename.
            continue
        main, sub = parts[0], parts[1]
        try:
            dataset[main] = dataset[main].rename_column(name, sub)
        except ValueError:
            # Best-effort: leave this entry untouched and move on.
            continue

In [43]:
# Keep a direct handle on each split. Indexing the DatasetDict already returns
# a `Dataset`, so no `Dataset.from_dict` wrapper is needed: `from_dict` treats
# the dict keys as column names, which would nest the whole split under a
# single column called "train" / "validation" / "test".
train_dataset = science_questions["train"]
validation_dataset = science_questions["validation"]
test_dataset = science_questions["test"]


In [44]:
# Rebind `science_questions` to the result of `flatten()`.
# NOTE(review): the split listing displayed later in the notebook shows only
# plain top-level column names, so this may be a no-op for sciq — confirm.
science_questions = science_questions.flatten()

In [52]:
# Every example is padded/truncated to the tokenizer's declared maximum length.
max_length = tokenizer.model_max_length


def preprocess_inputs(data):
    """Tokenize (question, support) pairs with the module-level ``tokenizer``.

    Args:
        data: Object indexable by ``"question"`` and ``"support"``. The
            notebook calls this through ``map(..., batched=True)``.

    Returns:
        The tokenizer's output for the pairs, requested with padding to
        ``max_length``, truncation applied only to the second sequence
        (the support text), and offset mappings included.

    NOTE(review): only the tokenizer output is returned. No answer-position
    columns are added here — confirm whether the training step expects them.
    """
    return tokenizer(
        data["question"],
        data["support"],
        truncation="only_second",
        padding="max_length",
        max_length=max_length,
        return_offsets_mapping=True,
    )

In [53]:
# Display the DatasetDict to inspect its splits, column names and row counts.
science_questions

DatasetDict({
    train: Dataset({
        features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support'],
        num_rows: 11679
    })
    validation: Dataset({
        features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support'],
        num_rows: 1000
    })
})

In [54]:
# NOTE(review): `data_columns` is not defined in any cell visible in this
# notebook; on a fresh kernel this call raises NameError unless it is defined
# elsewhere — confirm and define it before this cell.
new_columns(science_questions, data_columns)
# Tokenize every split in batches. `remove_columns` is given the train split's
# column names so the original text columns are dropped from the mapped result.
tokenized_datasets = science_questions.map(preprocess_inputs, batched=True, remove_columns=science_questions["train"].column_names)


Map:   0%|          | 0/11679 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [26]:
# Display the tokenized DatasetDict.
# NOTE(review): this cell's execution count (26) is lower than that of the cell
# that builds `tokenized_datasets` (54), so the output shown below may be stale;
# re-run after Restart & Run All.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 11679
    })
    validation: Dataset({
        features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [56]:
# Train/validation splits with their format set to "torch"; these are the
# objects passed to the Trainer as `train_dataset` / `eval_dataset`.
training = tokenized_datasets["train"].with_format("torch")
validation = tokenized_datasets["validation"].with_format("torch")

In [None]:
# NOTE(review): unexecuted scratch cell. It repeats the expression already
# assigned to `validation` and only displays the result — candidate for removal.
tokenized_datasets["validation"].with_format("torch")


In [29]:
# Load the "bert-base-uncased" checkpoint through the question-answering auto
# class (the log output of this cell reports a BertForQuestionAnswering model).
model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased")

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the chec

In [None]:
# Fine-tuning run configuration. Checkpoints and logs go under ./<model_name>.
model_name = "SciQ_1"

training_args = TrainingArguments(
    output_dir=f"./{model_name}",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=f"./{model_name}/logs",
    logging_steps=10,
    # `save_steps` stays a multiple of `eval_steps` so that every checkpoint has
    # a matching evaluation, which `load_best_model_at_end` relies on.
    save_steps=1000,
    evaluation_strategy="steps",
    # Set explicitly: when omitted, `eval_steps` falls back to `logging_steps`,
    # which would run a full validation pass every 10 training steps.
    eval_steps=500,
    load_best_model_at_end=True,
    # No `compute_metrics` is passed to the Trainer below, so an "f1" metric is
    # never produced; rank checkpoints by evaluation loss (lower is better).
    # NOTE(review): this needs evaluation to report a loss — confirm the
    # tokenized splits carry the label columns the model needs.
    metric_for_best_model="loss",
    greater_is_better=False,
    do_train=True,
    do_eval=True,
    learning_rate=5e-5,
)

# Collator that batches already-padded features without further processing.
data_collator = DefaultDataCollator()

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=training,
    eval_dataset=validation,
    data_collator=data_collator,
    tokenizer=tokenizer
)

trainer.train()

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

# Run one question through a QA checkpoint that is loaded separately from the
# model being fine-tuned above, using that checkpoint's own tokenizer.
model_2 = AutoModelForQuestionAnswering.from_pretrained("deepset/tinyroberta-squad2")
tokenizer_2 = AutoTokenizer.from_pretrained("deepset/tinyroberta-squad2")

# Define the question and context
question = "What is the capital of France?"
context = "Paris is the capital and most populous city of France."

# Tokenize the question and context
inputs = tokenizer_2(question, context, add_special_tokens=True, return_tensors="pt")

# Make predictions on the tokenized input (inference only, so no gradients).
with torch.no_grad():
    outputs = model_2(**inputs)

# Read the logits by attribute: the model returns a dict-like output object,
# and tuple-unpacking it would yield its keys rather than the tensors.
start_scores = outputs.start_logits
end_scores = outputs.end_logits

print(start_scores)
print(end_scores)

# Get the predicted answer: most likely start token through most likely end
# token (the +1 makes the end index inclusive when slicing).
answer_start = torch.argmax(start_scores)
answer_end = torch.argmax(end_scores) + 1
# Convert ids back to text with `tokenizer_2`, the tokenizer that produced
# them; the BERT `tokenizer` uses a different vocabulary and word markers.
answer = tokenizer_2.convert_tokens_to_string(tokenizer_2.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end]))

print(f"Question: {question}")
print(f"Answer: {answer}")