In [19]:
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer, default_data_collator
from datasets import Dataset, load_metric
import json
import pandas as pd
import numpy as np
import accelerate


In [20]:
# Tokenizer and model initialization for DistilBERT
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad")
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased-distilled-squad")


In [21]:
# Read context data and questions
context_data_files = [
    "../NLP Processing/after_scraping/Context-Data/fine-tuning-traveltriangle-goa.json",
    "../NLP Processing/after_scraping/Context-Data/fine-tuning-traveltriangle-japan.json",
    "../NLP Processing/after_scraping/Context-Data/fine-tuning-traveltriangle-vietnam.json"
]
dataset_files = [
    "../NLP Processing/after_scraping/four_qns/fine-tuning-dataset-traveltriangle-goa.json",
    "../NLP Processing/after_scraping/four_qns/fine-tuning-dataset-traveltriangle-japan.json",
    "../NLP Processing/after_scraping/four_qns/fine-tuning-dataset-traveltriangle-vietnam.json"
]

contexts = []
questions_dataset = []
answers_text = []
answers_start = []

# Load context data
context_data = {}
for i, file_path in enumerate(context_data_files):
    with open(file_path, "r") as file:
        context_data[i] = json.load(file)

# Define questions
questions = [
    "What is the name of the attraction?",
    "What is the location of the attraction?",
    "Describe the attraction in detail.",
    "What type of attraction is it? (e.g. historical, natural, amusement, beach)"
]

# Read dataset files
for i, file_path in enumerate(dataset_files):
    with open(file_path, "r") as file:
        dataset = json.load(file)
        for entry in dataset:
            id = entry['context_index']
            for question in questions:
                if question == entry['question'] and str(id) in context_data[i].keys():
                    contexts.append(context_data[i][str(id)])
                    questions_dataset.append(entry["question"])
                    answers_text.append(entry["answer"])
                    answers_start.append(0)

# Create DataFrame
df = pd.DataFrame({
    'context': contexts,
    'question': questions_dataset,
    'answers_text': answers_text,
    'answers_start': answers_start
})

def tokenize_function(examples):
    return tokenizer(
        examples['context'], 
        examples['question'], 
        truncation=True, 
        padding='max_length',  # Ensure all examples are padded to max length
        max_length=256  # Adjust max_length as per your model's requirements
    )


# Map tokenization function to dataset
tokenized_datasets = Dataset.from_pandas(df).map(tokenize_function, batched=True)



[A

Map: 100%|██████████| 536/536 [00:00<00:00, 9470.83 examples/s]


In [22]:
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Load metric
metric = load_metric("squad")

# Compute metrics function
# Compute metrics function
def compute_metrics(pred):
    # pred.predictions is a tuple: (start_logits, end_logits)
    start_logits, end_logits = pred.predictions

    # Convert logits to predictions
    predictions = postprocess_qa_predictions(
        examples=pred.predictions,
        features=pred.label_ids,
        tokenizer=tokenizer,
        max_answer_length=30,  # Adjust as needed
        squad_v2=False,  # Depending on whether your dataset is SQuAD v1 or v2
    )

    # Calculate exact match and F1 score using the SQuAD evaluation metric
    squad_metric = load_metric("squad")
    results = squad_metric.compute(
        predictions=predictions, references=pred.label_ids
    )

    return {
        "exact_match": results["exact"],
        "f1": results["f1"],
    }

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        outputs = model(**inputs)
        
        if outputs is None:
            raise ValueError("Model outputs are None.")
        
        start_logits = outputs.start_logits
        
        if start_logits is None:
            raise ValueError("Missing start_logits in model outputs.")
        
        start_positions = inputs.get("start_positions")
        
        if start_positions is None:
            raise ValueError("Missing start_positions in inputs.")
        
        # Compute the CrossEntropy loss for start positions
        loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100)
        start_loss = loss_fct(start_logits.view(-1, 2), start_positions.view(-1))
        
        return (start_loss, outputs) if return_outputs else start_loss





# Initialize Trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets,  # Example: using training data for evaluation
    tokenizer=tokenizer,
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,
)




In [23]:
# Train the model
trainer.train()

# Save the fine-tuned model
trainer.save_model("fine-tuned-distilbert-model")


  0%|          | 0/102 [01:18<?, ?it/s]
  0%|          | 0/102 [00:00<?, ?it/s]

ValueError: Missing start_positions in inputs.