In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertForQuestionAnswering, BertTokenizerFast
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm import tqdm
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Load the medical dataset
data = pd.read_csv("data.csv")
data = data.head(10)
# Split the dataset into training and validation sets (80% train, 20% validation)
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# Define stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function for text preprocessing
def preprocess_text(text):
    # Lowercase the text
    if type(text) != str:
      text = str(text)
      text = text.lower()

    # Remove special characters, numbers, and punctuation
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stopwords and lemmatize tokens
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

    # Join tokens back into text
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text

# Apply preprocessing to questions and answers columns
train_data['questions'] = train_data['question'].apply(preprocess_text)
train_data['answers'] = train_data['answer'].apply(preprocess_text)
val_data['questions'] = val_data['question'].apply(preprocess_text)
val_data['answers'] = val_data['answer'].apply(preprocess_text)

In [2]:
# Load BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Tokenize and preprocess training data
train_inputs = tokenizer(train_data['questions'].tolist(), train_data['answers'].tolist(), padding=True, truncation=True, return_tensors='pt')

# Convert answers to start and end token positions
train_start_positions = []
train_end_positions = []

for i in range(len(train_data)):
    encoding = tokenizer(train_data['answers'].iloc[i], return_offsets_mapping=True, padding=True, truncation=True)
    start_idx = encoding.char_to_token(0) if encoding.char_to_token(0) is not None else 0
    end_idx = encoding.char_to_token(len(train_data['answers'].iloc[i]) - 1) if encoding.char_to_token(len(train_data['answers'].iloc[i]) - 1) is not None else 0
    train_start_positions.append(start_idx)
    train_end_positions.append(end_idx)

# Create TensorDataset for training
train_dataset = TensorDataset(train_inputs.input_ids, train_inputs.attention_mask, train_inputs.token_type_ids,
                              torch.tensor(train_start_positions), torch.tensor(train_end_positions))

# Define DataLoader for training
batch_size = 32
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
# Load pre-trained BERT-based QA model
qa_model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

# Define optimizer
optimizer = torch.optim.AdamW(qa_model.parameters(), lr=5e-5)

# Training loop
num_epochs = 3
qa_model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        input_ids = batch[0]
        attention_mask = batch[1]
        token_type_ids = batch[2]
        start_positions = batch[3]
        end_positions = batch[4]
        optimizer.zero_grad()

        outputs = qa_model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, start_positions=start_positions, end_positions=end_positions)

        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Save the trained QA model
qa_model.save_pretrained('trained_qa_model')

# Tokenize and preprocess validation data
val_inputs = tokenizer(val_data['questions'].tolist(), val_data['answers'].tolist(), padding=True, truncation=True, return_tensors='pt')

# Convert answers to start and end token positions
val_start_positions = []
val_end_positions = []

for i in range(len(val_data)):
    encoding = tokenizer(val_data['answers'].iloc[i], return_offsets_mapping=True, padding=True, truncation=True)
    start_idx = encoding.char_to_token(0)
    end_idx = encoding.char_to_token(len(val_data['answers'].iloc[i]) - 1)
    val_start_positions.append(start_idx)
    val_end_positions.append(end_idx)

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# Create TensorDataset for validation
val_dataset = TensorDataset(val_inputs.input_ids, val_inputs.attention_mask, val_inputs.token_type_ids,
                            torch.tensor(val_start_positions), torch.tensor(val_end_positions))

# Define DataLoader for validation
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

# Evaluate the trained QA model on the validation set
qa_model.eval()
predictions = []

for batch in tqdm(val_dataloader, desc="Evaluating"):
    with torch.no_grad():
        input_ids = batch[0]
        attention_mask = batch[1]
        token_type_ids = batch[2]

        outputs = qa_model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

        start_logits = outputs.start_logits
        end_logits = outputs.end_logits

        # Get the best start and end positions
        start_index = torch.argmax(start_logits, dim=1)
        end_index = torch.argmax(end_logits, dim=1)

        # Get predicted answers
        for i in range(len(input_ids)):
            pred_answer = tokenizer.decode(input_ids[i][start_index[i]:end_index[i]+1])
            predictions.append(pred_answer)

# Calculate F1 score
f1 = f1_score(val_data['answers'], predictions, average='micro')
print("F1 Score:", f1)

Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.14s/it]

F1 Score: 0.0





In [None]:
f1 = f1_score(val_data['answers'], predictions, average='micro')
print("F1 Score:", f1)

In [5]:
from sklearn.metrics import accuracy_score

def mean_reciprocal_rank(targets, predictions):
    ranks = []
    for target, prediction in zip(targets, predictions):
        if target in prediction:
            rank = 1 / (prediction.index(target) + 1)
            ranks.append(rank)
        else:
            ranks.append(0)
    return np.mean(ranks)

def exact_match(targets, predictions):
    em = 0
    for target, prediction in zip(targets, predictions):
        if target == prediction:
            em += 1
    return em / len(targets)

# Evaluate the trained QA model on the validation set
qa_model.eval()
all_targets = val_data['answers'].tolist()
all_predictions = []

for batch in tqdm(val_dataloader, desc="Evaluating"):
    with torch.no_grad():
        input_ids = batch[0]
        attention_mask = batch[1]
        token_type_ids = batch[2]

        outputs = qa_model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

        start_logits = outputs.start_logits
        end_logits = outputs.end_logits

        # Get the best start and end positions
        start_index = torch.argmax(start_logits, dim=1)
        end_index = torch.argmax(end_logits, dim=1)

        # Get predicted answers
        for i in range(len(input_ids)):
            pred_answer = tokenizer.decode(input_ids[i][start_index[i]:end_index[i]+1])
            all_predictions.append(pred_answer)

# Calculate MRR
mrr = mean_reciprocal_rank(all_targets, all_predictions)
print("Mean Reciprocal Rank (MRR):", mrr)

# Calculate Exact Match (EM)
em = exact_match(all_targets, all_predictions)
print("Exact Match (EM):", em)


Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.30s/it]

Mean Reciprocal Rank (MRR): 0.0
Exact Match (EM): 0.0





In [9]:
all_targets

['Encourage comprehensive dilated eye exam least every two year Remember lowering eye pressure glaucoma early stage slows progression disease help save vision Get tip finding eye care professional',
 'The optic nerve bundle million nerve fiber It connects retina brain']

In [7]:
all_predictions

['who risk glaucoma', 'what glaucoma']