In [1]:
!pip install transformers
from torchvision.datasets.utils import download_url



In [39]:
from google.colab import drive
from sklearn.model_selection import train_test_split
import pandas as pd

# Step 1: Mount Google Drive
drive.mount('/content/drive')

# Step 2: Load the dataset
# Replace 'your_dataset.csv' with the path to your file in Google Drive
file_path = '/content/drive/MyDrive/Squad.v2/SQuAD_data_10K_sample.csv'
data = pd.read_csv(file_path)

# Step 3: Split the dataset into train (80%) and test (20%) with a fixed random seed
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
train_data, val_data =train_test_split(train_data, test_size=0.15, random_state=42 )
# Verify the split
print("Training set size:", len(train_data))
print("Testing set size:", len(test_data))
print("Validation set size", len(val_data))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Training set size: 6800
Testing set size: 2000
Validation set size 1200


In [45]:
train_data = train_data.reset_index(drop=True)


In [46]:
test_data = test_data.reset_index(drop=True)

In [47]:
val_data = val_data.reset_index(drop=True)

In [10]:
print(train_data)

                                              knowledge  \
0     American Idol employs a panel of judges who cr...   
1     I/O is the means by which a computer exchanges...   
2     Henry VIII assumed direct royal control in 153...   
3     Alternatives to pesticides are available and i...   
4     Presently, the Central African Republic has ac...   
...                                                 ...   
7995  In 1393 King Richard II compelled landlords to...   
7996  New York became the most populous urbanized ar...   
7997  Many other databases have application software...   
7998  The Strategic Defence and Security Review 2015...   
7999  In early 1991, non-Arabs of the Zaghawa tribe ...   

                                               question  \
0     Which original judge was a record producer and...   
1     Devices that give input or output to a compute...   
2     What did changing the status of the abbey create?   
3     What is one thing that can be used in place of...

In [35]:
print(test_data)

                                              knowledge  \
0     The Somali language is a member of the Cushiti...   
1     A nonstandard dialect, like a standard dialect...   
2     A new index was released on December 18, 2008....   
3     Hunting big game typically requires a "tag" fo...   
4     In the days following the disaster, an interna...   
...                                                 ...   
1995  Just over a quarter of the jobs available in t...   
1996  However questions still remain, as some of the...   
1997  Treaties can be loosely compared to contracts:...   
1998  The North American mnemonic "spring forward, f...   
1999  The game features nine dungeons—large, contain...   

                                               question  \
0     Before what year did studies on the Somali lan...   
1     What was created for the sake of comparing Eng...   
2     What was the index published on December 18, 2...   
3     What kind of stamp is required to hunt migrato...

In [61]:
def read_squad_from_csv(data):
    contexts = data['knowledge'].tolist()
    questions = data['question'].tolist()
    answers = []

    for i in range(len(data)):
        context = data.loc[i, 'knowledge']
        answer_text = data.loc[i, 'right_answer']

        # Find the start index of the answer within the context
        answer_start = context.find(answer_text)
        answer_end = answer_start + len(answer_text)  # Calculate the end index of the answer

        # Append the answer as a dictionary with text, answer_start, and answer_end
        answer = {
            'text': answer_text,
            'answer_start': answer_start,
            'answer_end': answer_end
        }
        answers.append(answer)

    return contexts, questions, answers


In [62]:
import json
from pathlib import Path


# Example usage
train_contexts, train_questions, train_answers = read_squad_from_csv(train_data)
val_contexts, val_questions, val_answers = read_squad_from_csv(val_data)


# Read all data
#train_contexts, train_questions, train_answers = read_squad(train_data)
#val_contexts, val_questions, val_answers = read_squad('dev-v2.0.json')

# Limit the datasets to the first 10,000 and 3,000 samples, respectively
#train_contexts = train_contexts[:10000]
#train_questions = train_questions[:10000]
#train_answers = train_answers[:10000]

#val_contexts = val_contexts[:3000]
#val_questions = val_questions[:3000]
#val_answers = val_answers[:3000]


In [31]:
answers[0]

{'text': '78.3 years', 'answer_start': 198}

In [52]:
print(f'{train_contexts[0]} \n')
print(f'{train_questions[0]} \n')
print(train_answers[0])

The BeiDou Navigation Satellite System (BDS, simplified Chinese: 北斗卫星导航系统; traditional Chinese: 北斗衛星導航系統; pinyin: Běidǒu wèixīng dǎoháng xìtǒng) is a Chinese satellite navigation system. It consists of two separate satellite constellations – a limited test system that has been operating since 2000, and a full-scale global navigation system that is currently under construction. 

How long has the limited test system been operating? 

{'text': 'since 2000', 'answer_start': 288}


In [63]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)



In [36]:
train_encodings[0:5]

[Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])]

In [60]:
print(train_answers[1])

{'text': 'to create a more responsive market', 'answer_start': 174}


In [64]:
def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1))
        # if None, the answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        if end_positions[-1] is None:
            end_positions[-1] = tokenizer.model_max_length
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

In [65]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

In [66]:
from transformers import DistilBertForQuestionAnswering
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [67]:
from tqdm.notebook import tqdm

In [68]:
from torch.utils.data import DataLoader
from transformers import AdamW

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
    for batch in tqdm(train_loader):
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs[0]
        loss.backward()
        optim.step()

model.eval()



  0%|          | 0/425 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

DistilBertForQuestionAnswering(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
      

In [69]:
pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=75a9bf6cd0fc8a97b7beb0a0279153b3ff166fa5b8f2063abdba84d8f2952c8d
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [70]:
import torch
from torch.utils.data import DataLoader
from transformers import AdamW
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
from tqdm import tqdm
import numpy as np

# Set device and model to evaluation mode
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
optim = AdamW(model.parameters(), lr=5e-5)

# Initialize ROUGE scorer
rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Training loop
for epoch in range(3):
    model.train()
    for batch in tqdm(train_loader):
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs[0]
        loss.backward()
        optim.step()

# Evaluation loop
model.eval()
val_loader = DataLoader(val_dataset, batch_size=16)
predictions = []
references = []

with torch.no_grad():
    for batch in tqdm(val_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        start_logits = outputs.start_logits
        end_logits = outputs.end_logits

        # Process predictions and references
        for i in range(len(start_logits)):
            # Get predicted and true answer spans
            pred_start = torch.argmax(start_logits[i])
            pred_end = torch.argmax(end_logits[i])
            pred_answer = tokenizer.decode(input_ids[i][pred_start:pred_end+1], skip_special_tokens=True)
            true_answer = tokenizer.decode(input_ids[i][start_positions[i]:end_positions[i]+1], skip_special_tokens=True)

            predictions.append(pred_answer)
            references.append(true_answer)

# Calculate metrics
def compute_metrics(predictions, references):
    # Calculate Exact Match and Partial Match
    exact_matches = [1 if pred == ref else 0 for pred, ref in zip(predictions, references)]
    partial_matches = [1 if ref in pred or pred in ref else 0 for pred, ref in zip(predictions, references)]
    exact_match_score = np.mean(exact_matches)
    partial_match_score = np.mean(partial_matches)

    # Calculate Accuracy, Precision, Recall, and F1
    binary_predictions = [1 if pred == ref else 0 for pred, ref in zip(predictions, references)]
    accuracy = accuracy_score(binary_predictions, [1] * len(binary_predictions))
    precision, recall, f1, _ = precision_recall_fscore_support(binary_predictions, [1] * len(binary_predictions), average='binary')

    # Calculate BLEU Score
    bleu_scores = [sentence_bleu([ref.split()], pred.split()) for ref, pred in zip(references, predictions)]
    bleu_score_avg = np.mean(bleu_scores)

    # Calculate ROUGE Scores
    rouge_1, rouge_2, rouge_L = [], [], []
    for ref, pred in zip(references, predictions):
        rouge_scores = rouge.score(ref, pred)
        rouge_1.append(rouge_scores['rouge1'].fmeasure)
        rouge_2.append(rouge_scores['rouge2'].fmeasure)
        rouge_L.append(rouge_scores['rougeL'].fmeasure)

    rouge_1_avg = np.mean(rouge_1)
    rouge_2_avg = np.mean(rouge_2)
    rouge_L_avg = np.mean(rouge_L)

    # Combine all metrics in a dictionary
    metrics = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "exact_match": exact_match_score,
        "partial_match": partial_match_score,
        "bleu": bleu_score_avg,
        "rouge_1": rouge_1_avg,
        "rouge_2": rouge_2_avg,
        "rouge_L": rouge_L_avg,
    }
    return metrics

# Compute and print evaluation metrics
metrics = compute_metrics(predictions, references)
print("Evaluation Metrics:", metrics)


100%|██████████| 425/425 [01:25<00:00,  4.99it/s]
100%|██████████| 425/425 [01:24<00:00,  5.02it/s]
100%|██████████| 425/425 [01:24<00:00,  5.02it/s]
100%|██████████| 75/75 [00:05<00:00, 13.78it/s]
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Evaluation Metrics: {'accuracy': 0.4691666666666667, 'precision': 0.4691666666666667, 'recall': 1.0, 'f1_score': 0.6386840612592173, 'exact_match': 0.4691666666666667, 'partial_match': 0.8025, 'bleu': 0.09142507426118104, 'rouge_1': 0.6273409803098385, 'rouge_2': 0.3875488402713125, 'rouge_L': 0.627216842378804}
