In [206]:
import torch
from transformers import TrainingArguments, Trainer, default_data_collator
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering
from datasets import Dataset, load_metric
import json
import pandas as pd
import numpy as np
import accelerate
from sklearn.model_selection import train_test_split

In [207]:
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("MPS is available. Using MPS device.")
else:
    device = torch.device("cpu")
    print("MPS device not found. Using CPU.")

MPS is available. Using MPS device.


In [208]:
# Tokenizer and model initialization for DistilBERT
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad")
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased-distilled-squad")


In [209]:
# Read context data and questions
context_data_files = [
    "../NLP Processing/after_scraping/Context-Data/fine-tuning-traveltriangle-goa.json",
    "../NLP Processing/after_scraping/Context-Data/fine-tuning-traveltriangle-japan.json",
    "../NLP Processing/after_scraping/Context-Data/fine-tuning-traveltriangle-vietnam.json"
]
dataset_files = [
    "../NLP Processing/after_scraping/four_qns/fine-tuning-dataset-traveltriangle-goa.json",
    "../NLP Processing/after_scraping/four_qns/fine-tuning-dataset-traveltriangle-japan.json",
    "../NLP Processing/after_scraping/four_qns/fine-tuning-dataset-traveltriangle-vietnam.json"
]

contexts = []
questions_dataset = []
answers_text = []
answers_start = []

# Load context data
context_data = {}
for i, file_path in enumerate(context_data_files):
    with open(file_path, "r") as file:
        context_data[i] = json.load(file)

# Define questions
questions = [
    "What is the name of the attraction?",
    "What is the location of the attraction?",
    "Describe the attraction in detail.",
    "What type of attraction is it? (e.g. historical, natural, amusement, beach)"
]

# Read dataset files
for i, file_path in enumerate(dataset_files):
    with open(file_path, "r") as file:
        dataset = json.load(file)
        for entry in dataset:
            id = entry['context_index']
            for question in questions:
                if question == entry['question'] and str(id) in context_data[i].keys():
                    contexts.append(context_data[i][str(id)])
                    questions_dataset.append(entry["question"])
                    answers_text.append(entry["answer"])
                    answers_start.append(0)

# Create DataFrame
df = pd.DataFrame({
    'context': contexts,
    'question': questions_dataset,
    'answers': answers_text,
})

print(df.head())

def tokenize_function(examples):
    context = examples["context"]
    questions = examples["question"]
    answers = examples["answers"]

    input_texts = [f"Context: {context[i]} Question: {questions[i]}" for i in range(len(context))]
    target_texts = answers

    model_inputs = tokenizer(input_texts, max_length=512, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(target_texts, max_length=512, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs



# Map tokenization function to dataset
train, test = train_test_split(df, test_size=0.2)
print(train.shape)
print(test.shape)
train_dataset = Dataset.from_pandas(train).map(tokenize_function, batched=True)
test_dataset = Dataset.from_pandas(test).map(tokenize_function, batched=True)

print("Train dataset type:", type(train_dataset))
print("Test dataset type:", type(test_dataset))

print(train_dataset['input_ids'][0])
print(train_dataset['question'][0])
print(train_dataset['answers'][0])

                                             context  \
0   Aguada Fort: Beautiful Ambiance  Image Source...   
1   Aguada Fort: Beautiful Ambiance  Image Source...   
2   Aguada Fort: Beautiful Ambiance  Image Source...   
3   Aguada Fort: Beautiful Ambiance  Image Source...   
4   Chapora Fort: For Selfie Lovers  Image Source...   

                                            question  \
0                What is the name of the attraction?   
1            What is the location of the attraction?   
2                 Describe the attraction in detail.   
3  What type of attraction is it? (e.g. historica...   
4                What is the name of the attraction?   

                                             answers  
0                                        Aguada Fort  
1  Fort Aguada Rd, Aguada Fort Area, Candolim, Go...  
2  Sightseeing in Goa is incomplete without a vis...  
3                                         Historical  
4                                       Chapora For


Map:   0%|          | 0/428 [00:00<?, ? examples/s]


KeyError: 'questions'

In [None]:
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Load metric
metric = load_metric("squad")

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=default_data_collator,
    compute_metrics=metric.compute
)




In [None]:

print("Train dataset batch size:", len(train_dataset))
print("Test dataset batch size:", len(test_dataset))

trainer.train()


# Save the fine-tuned model
trainer.save_model("fine-tuned-distilbert-model")


  0%|          | 0/81 [00:21<?, ?it/s]


Train dataset batch size: 428
Test dataset batch size: 108




ValueError: The model did not return a loss from the inputs, only the following keys: start_logits,end_logits. For reference, the inputs it received are input_ids,attention_mask.