In [None]:
!pip install datasets



Preprocessing

In [None]:
import json
from datasets import Dataset

In [None]:
# Load the QA annotations. The encoding is pinned to UTF-8 so the text
# (and therefore the character offsets stored in "answer_start") is decoded
# the same way regardless of the platform's default locale encoding.
with open("/content/book_qa_SLM.json", "r", encoding="utf-8") as json_file:
    qa_data = json.load(json_file)

In [None]:
# Parallel accumulators: position i in each list describes the same QA pair
# (its context, question, answer text and answer start offset).
context_train, question_train, answer_train, answer_starts_train = [], [], [], []

In [None]:
# Flatten the nested data -> paragraphs -> qas structure into the four
# parallel lists (one entry per question).

# Start from empty lists so that re-running this cell does not append the
# same examples a second time.
for _column in (context_train, question_train, answer_train, answer_starts_train):
    _column.clear()

# Walk every entry under "data" rather than only the first one, so that no
# paragraphs are silently dropped when the file holds more than one entry.
for article in qa_data["data"]:
    for paragraph in article["paragraphs"]:
        context = paragraph["context"]
        for qa in paragraph["qas"]:
            # A question with no recorded answer has no span to learn from;
            # skip it instead of failing on answers[0].
            answers = qa.get("answers") or []
            if not answers:
                continue
            first_answer = answers[0]  # only the first listed answer is used
            question_train.append(qa["question"])
            context_train.append(context)
            answer_train.append(first_answer["text"])
            answer_starts_train.append(first_answer["answer_start"])

In [None]:
# Column-oriented view of the examples: each key maps to one of the parallel
# lists built above, so row i of the dataset is one question/answer pair.
train_data = dict(
    question=question_train,
    answer=answer_train,
    context=context_train,
    answer_starts=answer_starts_train,
)

# Wrap the columns in a `datasets.Dataset` so they can be mapped and split later.
train_dataset = Dataset.from_dict(train_data)


In [None]:
from transformers import AutoTokenizer

# Tokenizer for the cased BERT base checkpoint; preprocess_function below
# reads this module-level name when it encodes question/context pairs.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-cased"
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
def preprocess_function(examples, max_length=384, stride=128):
    """Tokenize a batch of QA examples and label each window with the answer span.

    Each (question, context) pair is encoded with the module-level `tokenizer`.
    Only the context is truncated; a context that does not fit in `max_length`
    tokens is split into several overlapping windows, so one input example can
    produce more than one output row.

    Args:
        examples: Batch mapping with the keys "question", "context", "answer"
            (answer text) and "answer_starts" (character offset of the answer
            inside the context), each holding one value per example.
        max_length: Length every window is truncated/padded to.
        stride: Number of tokens shared by two consecutive windows of the
            same context.

    Returns:
        The tokenizer output (without "offset_mapping" and
        "overflow_to_sample_mapping") extended with "start_positions" and
        "end_positions": the token indices of the answer inside each window.
        Windows that do not fully contain the answer are labelled (0, 0).
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )

    # These two entries are only needed to compute labels; they are removed
    # from the returned encoding.
    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answer"]
    answer_starts = examples["answer_starts"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        # Map the window back to the example it was cut from.
        sample_idx = sample_map[i]
        start_char = answer_starts[sample_idx]
        end_char = start_char + len(answers[sample_idx])
        sequence_ids = inputs.sequence_ids(i)

        # Token indices that belong to the context (sequence id 1).
        context_tokens = [
            idx for idx, seq_id in enumerate(sequence_ids) if seq_id == 1
        ]
        if not context_tokens:
            # Window without any context token (e.g. empty context): there is
            # no span to point at, so use the "no answer" label.
            start_positions.append(0)
            end_positions.append(0)
            continue
        context_start = context_tokens[0]
        context_end = context_tokens[-1]

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Last context token that starts at or before the answer start.
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # First context token (scanning backwards) that ends at or after
            # the answer end.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

Training

In [None]:
# Tokenize the examples into labelled windows and drop the raw text columns.
# The guard keeps this cell re-runnable: after the first run the raw columns
# are gone, so mapping preprocess_function again would fail on the missing
# "question" column.
if "start_positions" not in train_dataset.column_names:
    train_dataset = train_dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=train_dataset.column_names,
    )

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

# Checkpoint shared by the model and its tokenizer.
model_name = "bert-base-cased"  # Change to another model if needed

# Reload the tokenizer from the same checkpoint, then load the checkpoint
# as a question-answering model.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from datasets import DatasetDict

# Splitting the dataset: 90% training, 10% validation.
# A fixed seed makes the shuffle, and therefore the reported validation
# metrics, reproducible across kernel restarts.
split_dataset = train_dataset.train_test_split(test_size=0.1, seed=42)
dataset = DatasetDict({
    "train": split_dataset["train"],
    "validation": split_dataset["test"]
})


In [None]:
from transformers import DataCollatorWithPadding

# Collator that batches features using the tokenizer's padding rules.
data_collator = DataCollatorWithPadding(tokenizer)


In [None]:
from transformers import TrainingArguments, Trainer

# Hyper-parameters for fine-tuning. Evaluation and checkpointing both run
# once per epoch; only the two most recent checkpoints are kept on disk.
training_args = TrainingArguments(
    output_dir="./qa_model",
    # `evaluation_strategy` is the deprecated spelling of this argument and is
    # rejected by newer transformers releases; `eval_strategy` replaces it.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,  # Adjust based on performance
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=500,
    save_total_limit=2
)




In [None]:
# Wire model, hyper-parameters, data splits and collator into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    # `tokenizer=` is deprecated for Trainer (it triggers a FutureWarning);
    # `processing_class=` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator
)


  trainer = Trainer(


In [None]:
!pip install wandb
import wandb





In [None]:
# Start a Weights & Biases run in the "SLM-QA" project. As the recorded
# output shows, this prompts for an API key when the session is not logged in.
wandb.init(project="SLM-QA")


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mprerna7788arora[0m ([33mprerna7788arora-rajiv-gandhi-institute-of-petroleum-tech[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


In [None]:
# Run fine-tuning with the configured Trainer. The displayed result is a
# TrainOutput holding the global step count, training loss and run metrics.
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.007843
2,No log,0.003548


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
1,No log,0.007843
2,No log,0.003548
3,No log,0.003035


TrainOutput(global_step=21, training_loss=0.2420180184500558, metrics={'train_runtime': 770.0701, 'train_samples_per_second': 0.199, 'train_steps_per_second': 0.027, 'total_flos': 49973004725760.0, 'train_loss': 0.2420180184500558, 'epoch': 3.0})

In [None]:
# Evaluate on the validation split passed to the Trainer; the displayed
# result is a dict of metrics such as 'eval_loss' and runtime statistics.
trainer.evaluate()


{'eval_loss': 0.0030345541890710592,
 'eval_runtime': 17.9261,
 'eval_samples_per_second': 0.335,
 'eval_steps_per_second': 0.056,
 'epoch': 3.0}

In [None]:
# Persist the fine-tuned weights and the tokenizer files side by side so the
# directory can be loaded as a self-contained checkpoint.
model.save_pretrained(save_directory="./qa_model")
tokenizer.save_pretrained(save_directory="./qa_model")


('./qa_model/tokenizer_config.json',
 './qa_model/special_tokens_map.json',
 './qa_model/vocab.txt',
 './qa_model/added_tokens.json',
 './qa_model/tokenizer.json')

Inference

In [None]:
from transformers import pipeline

# Question-answering pipeline backed by the model and tokenizer saved in ./qa_model.
qa_pipeline = pipeline(
    task="question-answering",
    model="./qa_model",
    tokenizer="./qa_model",
)

def answer_question(question, context):
    """Return the answer text that `qa_pipeline` extracts from `context` for `question`."""
    prediction = qa_pipeline(question=question, context=context)
    answer_text = prediction["answer"]
    return answer_text

Device set to use cpu


In [32]:
# Smoke test: ask one question about a short hand-written passage.
question = "What is the capital of France?"
context = "The capital of France is Paris. It is known for the Eiffel Tower."
print(answer_question(question=question, context=context))

France is Paris. It is known for the Eiffel


In [33]:
import os
import shutil
import tempfile

# Remove an archive left by a previous run so it is not packed into the new one.
if os.path.exists('/content/project.zip'):
    os.remove('/content/project.zip')

# Build the zip in a staging directory outside /content and move it into
# place afterwards. Writing it directly to /content/project.zip would put the
# half-written archive inside the very directory that is being archived, so
# it could end up containing a partial copy of itself.
with tempfile.TemporaryDirectory() as staging_dir:
    staged_zip = shutil.make_archive(os.path.join(staging_dir, 'project'), 'zip', '/content/')
    archive_path = shutil.move(staged_zip, '/content/project.zip')

archive_path


'/content/project.zip'