In [1]:
import logging
from simpletransformers.question_answering import QuestionAnsweringModel, QuestionAnsweringArgs
import json
from sklearn.model_selection import train_test_split
import torch


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("MPS is available. Using MPS device.")
else:
    device = torch.device("cpu")
    print("MPS device not found. Using CPU.")

MPS is available. Using MPS device.


In [3]:
context_data_files = [
    "../NLP Processing/after_scraping/Context-Data/fine-tuning-goa_traveltriangle.json",
    "../NLP Processing/after_scraping/Context-Data/fine-tuning-japan_traveltriangle.json",
    "../NLP Processing/after_scraping/Context-Data/fine-tuning-vietnam_traveltriangle.json"
]
dataset_files = [
    "../NLP Processing/after_scraping/four_qns/fine-tuning-dataset-traveltriangle-goa.json",
    "../NLP Processing/after_scraping/four_qns/fine-tuning-dataset-traveltriangle-japan.json",
    "../NLP Processing/after_scraping/four_qns/fine-tuning-dataset-traveltriangle-vietnam.json"
]

context_data = {}
for i, file_path in enumerate(context_data_files):
    with open(file_path, "r") as file:
        context_data[i] = json.load(file)
print(context_data)
questions = [
    "What is the name of the attraction?",
    "What is the location of the attraction?",
    "Describe the attraction in detail.",
    "What type of attraction is it? (e.g. historical, natural, amusement, beach)"
]


{0: {'0': 'Aguada Fort Beautiful Ambiance Sightseeing in Goa is incomplete without a visit to Fort Aguada strategically situated at the estuary of the river Mandovi On the north side of the fort a rampart of laterite just into the bay to form a jetty between two small sandy coves This picturesque spot is known as Sinquerim Beach Location Fort Aguada Rd Aguada Fort Area Candolim Goa Timings AM PM all days of the week Built By Portuguese Houses The Central Jail and a 19th Century Lighthouse How To Reach The fort is located on the Road and can be easily reached by road Entry Fee No entry fee Must Read 26 Beach Resorts In Goa Planning your holiday in Goa but confused about what to do These Goa travel stories help you find your best trip ever Romance Beaches Churches Rahul Talks Of A Sizzling Honeymoon In Goa No wonder Goa is the Beach Capital of India Read More Iresh Lists The Most Romantic Experiences From His Goa Honeymoon Trip Dolphin spotting snorkeling boat rides top the list Read Mor

In [4]:
training_data = []
for i, file_path in enumerate(dataset_files):
    with open(file_path, "r") as file:
        dataset = json.load(file)
        for entry in dataset:
    
            if entry['question'] != "What type of attraction is it? (e.g. historical, natural, amusement, beach)":
                unique_id = str(i) + str(entry["context_index"])
                ans = {
                    "context": context_data[i][str(entry['context_index'])],
                    "qas": [{
                        "id": unique_id,
                        "is_impossible": False,
                        "question": entry["question"],
                        "answers": [{"text": entry["answer"], "answer_start": 0}]
                    }]
                }
                training_data.append(ans)
                print(ans)


print(len(training_data))
training_data, testing_data = train_test_split(training_data, test_size=0.2, random_state=42)

print(len(training_data))
print(len(testing_data))

{'context': 'Aguada Fort Beautiful Ambiance Sightseeing in Goa is incomplete without a visit to Fort Aguada strategically situated at the estuary of the river Mandovi On the north side of the fort a rampart of laterite just into the bay to form a jetty between two small sandy coves This picturesque spot is known as Sinquerim Beach Location Fort Aguada Rd Aguada Fort Area Candolim Goa Timings AM PM all days of the week Built By Portuguese Houses The Central Jail and a 19th Century Lighthouse How To Reach The fort is located on the Road and can be easily reached by road Entry Fee No entry fee Must Read 26 Beach Resorts In Goa Planning your holiday in Goa but confused about what to do These Goa travel stories help you find your best trip ever Romance Beaches Churches Rahul Talks Of A Sizzling Honeymoon In Goa No wonder Goa is the Beach Capital of India Read More Iresh Lists The Most Romantic Experiences From His Goa Honeymoon Trip Dolphin spotting snorkeling boat rides top the list Read M

In [5]:
model_type = "distilbert"
model_name = "distilbert-base-uncased"

model_args = QuestionAnsweringArgs()
model_args.train_batch_size = 8
model_args.evaluate_during_training = True
model_args.n_best_size = 3
model_args.num_train_epochs = 3

In [6]:
train_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "use_cached_eval_features": True,
    "output_dir": f"outputs/{model_type}",
    "best_model_dir": f"outputs/{model_type}/best_model",
    "evaluate_during_training": True,
    "max_seq_length": 512,
    "num_train_epochs": 3,
    "evaluate_during_training_steps": 1000,
    "wandb_project": "Question-Answering",
    "wandb_kwargs": {"name": model_name},
    "save_model_every_epoch": True,
    "save_eval_checkpoints": False,
    "n_best_size": 3
}

In [7]:
model = QuestionAnsweringModel(
    model_type,
    model_name,
    args=train_args,
    use_cuda=False
)

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
model.train_model(training_data, eval_data=testing_data)

convert squad examples to features:   0%|          | 0/321 [00:00<?, ?it/s]Could not find answer: 'Nagano Great For Family Nihonsuku for wikipedia You can not miss one of the best places to visit in Japan on your vacation which is the tropical retreat to beat the heat Located in the heart of central Japan Nagano is home to a lot of hidden gems like the Ninja Village for kids Shiga Kogen Ski Resort and Temple which make it a perfect place for all the types of travellers But what makes it more exclusive is the pleasant breeze that surrounds the city throughout the year You can not miss one of the best places to visit in Japan on your vacation Top Attractions Karuizawa Matsumoto Castle Kamikouchi Best Things To Do Visit the Temple' vs. 'Located in the heart of central Japan, Nagano is known for hidden gems like the Ninja Village for kids, Shiga Kogen Ski Resort, and Zenko-ji Temple. The city offers a pleasant breeze throughout the year. Top attractions include Karuizawa, Matsumoto Castle,

Epochs 1/3. Running Loss:    2.3613: 100%|██████████| 14/14 [00:30<00:00,  2.19s/it]
convert squad examples to features: 100%|██████████| 81/81 [00:00<00:00, 391.64it/s]
add example index and unique id: 100%|██████████| 81/81 [00:00<00:00, 716748.15it/s]
Running Evaluation: 100%|██████████| 1/1 [00:06<00:00,  6.66s/it]
Epochs 2/3. Running Loss:    1.0397: 100%|██████████| 14/14 [00:31<00:00,  2.22s/it]
Running Evaluation: 100%|██████████| 1/1 [00:06<00:00,  6.51s/it]
Epochs 3/3. Running Loss:    1.1818: 100%|██████████| 14/14 [00:30<00:00,  2.20s/it]
Running Evaluation: 100%|██████████| 1/1 [00:06<00:00,  6.84s/it]
Epoch 3 of 3: 100%|██████████| 3/3 [02:07<00:00, 42.54s/it]


(42,
 {'global_step': [14, 28, 42],
  'correct': [8, 9, 8],
  'similar': [19, 38, 20],
  'incorrect': [39, 19, 38],
  'train_loss': [2.361349582672119, 1.0397248268127441, 1.1817796230316162],
  'eval_loss': [-1.7760356664657593, -3.611274003982544, -3.9986178874969482]})

In [9]:
print(testing_data[2])
prediction = model.predict([testing_data[2]])
print(prediction)

{'context': 'The Fisherman s Wharf For Seafood Lovers This place has been attracting crowds for years now While Salcette offers a decor and serves delicious tandoori pomfret masala fried prawns and fish curry rice Live performances add up to the fun at this beautiful place On Sundays you can enjoy the live performances during lunch hours as well The view of backwaters make the moments memorable for lifetime So next time you are in Goa don t forget to add this to your list of places to visit in Goa Location Behind The Leela Mobor Cavelossim Goa Timings AM all days of the week How To Reach The place is by road Price For Two INR 1500', 'qas': [{'id': '011', 'is_impossible': False, 'question': 'What is the name of the attraction?', 'answers': [{'text': "The Fisherman's Wharf", 'answer_start': 0}]}]}


convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 437.82it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 18477.11it/s]
Running Prediction: 100%|██████████| 1/1 [00:00<00:00, 11.03it/s]

([{'id': '011', 'answer': ['The Fisherman', 'The', 'The Fisherman s Wharf For']}], [{'id': '011', 'probability': [0.4873323617507335, 0.26063586780803966, 0.2511648080568431]}])





In [11]:
from transformers import DistilBertForQuestionAnswering, DistilBertTokenizer

# Load the model and tokenizer using transformers library
model = DistilBertForQuestionAnswering.from_pretrained("outputs/distilbert/best_model")
tokenizer = DistilBertTokenizer.from_pretrained("outputs/distilbert/best_model")



# Push the model and tokenizer to the Hub
model.push_to_hub("fine-tuned-distilbert", token='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')
tokenizer.push_to_hub("fine-tuned-distilbert", token="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")

print("Model pushed to Hugging Face Hub")

model.safetensors: 100%|██████████| 265M/265M [01:34<00:00, 2.82MB/s] 


Model pushed to Hugging Face Hub
