### Installation & Loading Dataset

In [1]:
!pip install datasets

Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading fsspec-2024.12.0-py3-none-any.whl (183 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.12.0 which is incompatible.
torch 2.5.1+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.8.4.1 which is 

In [2]:
!pip install transformers



In [None]:
# !pip install torchtext==0.17.0 --index-url https://download.pytorch.org/whl/cu118

In [1]:

import torch
# import torchtext
import transformers

print(f'torch version: {torch.__version__}')
# print(f'torchtext version: {torchtext.__version__}')
print(f'transformers version: {transformers.__version__}')

torch version: 2.5.1+cu124
transformers version: 4.51.1


### Load and preprocess data

In [17]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import load_dataset
from datasets import Dataset, DatasetDict
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
# from torchtext.vocab import GloVe, build_vocab_from_iterator
from torch.utils.data import DataLoader, Dataset
import itertools

import torch.optim as optim
from transformers import BertTokenizer, BertModel
from transformers import AutoTokenizer
import matplotlib.pyplot as plt

nltk.download('punkt')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ## Load and preprocess dataset

# dataset = load_dataset('squad')

# print(dataset)

# df = pd.DataFrame(dataset['train'])
# df = df[['context', 'question', 'answers']]
# df['answer_text'] = df['answers'].apply(lambda x: x['text'][0])
# df['answer_start'] = df['answers'].apply(lambda x: x['answer_start'][0])
# df = df.drop(columns=['answers'])

# print(df.head(2))

# # Use a subset
# df['context_len'] = df['context'].apply(len)
# df = df.sort_values(by='context_len').iloc[:10000].drop(columns=['context_len'])

# # Tokenization
# df['context_tok'] = df['context'].apply(lambda x: word_tokenize(x.lower()))
# df['question_tok'] = df['question'].apply(lambda x: word_tokenize(x.lower()))
# df['answer_tok'] = df['answer_text'].apply(lambda x: word_tokenize(x.lower()))

dataset = load_dataset('squad')

print(dataset)

# Take the first 16k rows with the shortest context (train data)
# train_data_old = dataset['train']
# train_data_old = train_data_old.map(lambda example: {'context_length': len(example['context'])})
# train_data_sorted = train_data_old.sort('context_length').select(range(16000))
train_df = dataset['train'].to_pandas()
train_df['context_length'] = train_df['context'].apply(len)
train_df = train_df.sort_values('context_length').head(16000)

# Take the first 4k rows with the shortest context (validation data)
# val_data_old = dataset['validation']
# val_data_old = val_data_old.map(lambda example: {'context_length': len(example['context'])})
# val_data_sorted = val_data_old.sort('context_length').select(range(10000))
val_df = dataset['validation'].to_pandas()
val_df['context_length'] = val_df['context'].apply(len)
val_df = val_df.sort_values('context_length').head(4000)

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

test_tok = 2

def tokenize_data(example):
    # Tokenize the context and question
    encoding = tokenizer(example['context'], example['question'], truncation=True, padding='max_length', max_length=512, return_offsets_mapping=True)
    offsets = encoding['offset_mapping']
    input_ids = encoding['input_ids']

    # Convert the answer start index from character index to token index
    start_char = example['answers']['answer_start'][0]
    end_char = start_char + len(example['answers']['text'][0])

    start_token = end_token = None
    for idx, (start, end) in enumerate(offsets):
        if start_token is None and start <= start_char < end:
            start_token = idx
        if end_token is None and start < end_char <= end:
            end_token = idx

    if start_token is None or end_token is None:
        return None

    encoding['start_positions'] = start_token
    encoding['end_positions'] = end_token

    reconstructed_answer = tokenizer.decode(input_ids[start_token:end_token + 1], skip_special_tokens=True)

    global test_tok
    if test_tok > 0:
        test_tok = test_tok - 1
        print('\n\n')
        print(example)
        print(f'start_char ${start_char}')
        print(f'end_char ${end_char}')
        print(f'start_token ${start_token}')
        print(f'end_token ${end_token}')

        print(f"\n\nOriginal Answer: {example['answers']['text'][0]}")
        print(f"Reconstructed Answer: {reconstructed_answer}")

    return encoding

# Apply tokenization to both training and validation sets

# tokenized_train = train_data_sorted.map(tokenize_data, remove_columns=train_data_sorted.column_names)
# tokenized_train = tokenized_train.filter(lambda x: x is not None)

# tokenized_val = val_data_sorted.map(tokenize_data, remove_columns=val_data_sorted.column_names)
# tokenized_val = tokenized_val.filter(lambda x: x is not None)

# train_encoded = train_df.apply(tokenize_data, axis=1).dropna()
# val_encoded = val_df.apply(tokenize_data, axis=1).dropna()

print(train_df.head(2))

# Convert filtered DataFrames back into Hugging Face Datasets
# train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
# val_dataset = Dataset.from_pandas(val_df.reset_index(drop=True))

# # Apply tokenization using map (which expects dictionary input/output)
# tokenized_train = train_dataset.map(tokenize_data, remove_columns=train_dataset.column_names)
# tokenized_train = tokenized_train.filter(lambda x: x is not None)

# tokenized_val = val_dataset.map(tokenize_data, remove_columns=val_dataset.column_names)
# tokenized_val = tokenized_val.filter(lambda x: x is not None)

# # Convert tokenized data to pandas DataFrames
# train_encoded_df = tokenized_train.to_pandas()
# val_encoded_df = tokenized_val.to_pandas()

# train_data = train_data_sorted.map(tokenize_data, remove_columns=train_data_sorted.column_names)
# train_data = train_data.filter(lambda x: x is not None)
# val_data = val_data_sorted.map(tokenize_data, remove_columns=val_data_sorted.column_names)
# val_data = val_data.filter(lambda x: x is not None)
# Apply tokenization to both training and validation sets
train_encoded_df = train_df.apply(tokenize_data, axis=1)
train_encoded_df = train_encoded_df.dropna()
train_encoded_df = train_encoded_df.reset_index(drop=True)
val_encoded_df = val_df.apply(tokenize_data, axis=1)
val_encoded_df = val_encoded_df.dropna()
val_encoded_df = val_encoded_df.reset_index(drop=True)
print(f"Train set size: {len(train_encoded_df)}")
print(f"Validation set size: {len(val_encoded_df)}")

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})
                             id          title  \
12947  56df7f0156340a1900b29c38  Oklahoma_City   
12697  56df6bf35ca0a614008f9a00      Christian   

                                                 context  \
12947  The Oklahoma School of Science and Mathematics...   
12697  In the past, the Malays used to call the Portu...   

                                                question  \
12947  Where is The Oklahoma School of Science and Ma...   
12697                  What does the term refer to now?    

                                                 answers  context_length  
12947  {'text': ['Oklahoma City'], 'answer_start': [1...             151  
12697  {'text': ['the modern Kristang creoles of Mala...             151 

In [21]:
# print(train_df_sorted.head(4))
# print(df.head(4))
# tokenized_train2 = tokenized_train.select(['input_ids', 'attention_mask', 'start_positions', 'end_positions'])
# tokenized_train_focused = tokenized_train.remove_columns(['token_type_ids', 'offset_mapping'])
# tokenized_val_focused = tokenized_val.remove_columns(['token_type_ids', 'offset_mapping'])

# print(tokenized_train_focused)
# print(tokenized_val_focused)

train_encoded_df.sample(2)
# print(val_encoded.head(2))

11600    [input_ids, token_type_ids, attention_mask, of...
13825    [input_ids, token_type_ids, attention_mask, of...
dtype: object

### Build Vocabulary & Embeddings

In [37]:
# # ## Build vocabulary

# specials = ['<pad>', '<unk>']

# def yield_tokens():
#     for tokens in itertools.chain(df['context_tok'], df['question_tok'], df['answer_tok']):
#         yield tokens

# vocab = build_vocab_from_iterator(yield_tokens(), specials=specials, max_tokens=25000)
# vocab.set_default_index(vocab['<unk>'])

# # Load GloVe vectors
# vectors = GloVe(name='6B', dim=100)
# embedding_matrix = torch.zeros(len(vocab), 100, dtype=torch.float32)
# for idx, token in enumerate(vocab.get_itos()):
#     if token in vectors.stoi:
#         embedding_matrix[idx] = vectors[token].float()
#     else:
#         embedding_matrix[idx] = torch.randn(100) * 0.1

# def encode(tokens):
#     return [vocab[t] for t in tokens]

# df['context_ids'] = df['context_tok'].apply(encode)
# df['question_ids'] = df['question_tok'].apply(encode)

In [38]:
# train_df_sorted[['context_idx', 'question_idx', 'answer_idx']].head(2)
# df[['context_ids', 'question_ids']].head(2)

Unnamed: 0,context_ids,question_ids
12947,"[2, 377, 135, 4, 1249, 7, 7799, 3, 9, 135, 15,...","[78, 11, 2, 377, 135, 4, 1249, 7, 7799, 172, 10]"
12697,"[6, 2, 855, 3, 2, 16898, 57, 8, 1131, 2, 669, ...","[13, 99, 2, 120, 1143, 8, 223, 10]"


In [39]:
# # ## Compute start and end positions

# def find_sublist(full, sub):
#     for i in range(len(full) - len(sub) + 1):
#         if full[i:i+len(sub)] == sub:
#             return i
#     return -1

# start_positions = []
# end_positions = []

# for idx, row in df.iterrows():
#     context = row['context_tok']
#     answer = row['answer_tok']
#     start_idx = find_sublist(context, answer)
#     if start_idx == -1:
#         start_positions.append(None)
#         end_positions.append(None)
#     else:
#         start_positions.append(start_idx)
#         end_positions.append(start_idx + len(answer) - 1)

# df['start_pos'] = start_positions
# df['end_pos'] = end_positions

# df = df[df['start_pos'].notnull()]  # Remove problematic rows

# df.head(2)

Unnamed: 0,context,question,answer_text,answer_start,context_tok,question_tok,answer_tok,context_ids,question_ids,start_pos,end_pos
12947,The Oklahoma School of Science and Mathematics...,Where is The Oklahoma School of Science and Ma...,Oklahoma City,137,"[the, oklahoma, school, of, science, and, math...","[where, is, the, oklahoma, school, of, science...","[oklahoma, city]","[2, 377, 135, 4, 1249, 7, 7799, 3, 9, 135, 15,...","[78, 11, 2, 377, 135, 4, 1249, 7, 7799, 172, 10]",27.0,28.0
12697,"In the past, the Malays used to call the Portu...",What does the term refer to now?,the modern Kristang creoles of Malaysia,111,"[in, the, past, ,, the, malays, used, to, call...","[what, does, the, term, refer, to, now, ?]","[the, modern, kristang, creoles, of, malaysia]","[6, 2, 855, 3, 2, 16898, 57, 8, 1131, 2, 669, ...","[13, 99, 2, 120, 1143, 8, 223, 10]",23.0,28.0


### DataLoader

In [3]:
class QA_Model(nn.Module):
    def __init__(self):
        super(QA_Model, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.start_linear = nn.Linear(self.bert.config.hidden_size, 1)
        self.end_linear = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state
        start_logits = self.start_linear(sequence_output).squeeze(-1)
        end_logits = self.end_linear(sequence_output).squeeze(-1)
        return start_logits, end_logits

In [4]:
import torch
from torch.utils.data import Dataset

class QADataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        return {
            'input_ids': torch.tensor(row['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(row['attention_mask'], dtype=torch.long),
            'start': torch.tensor(row['start_positions'], dtype=torch.long),
            'end': torch.tensor(row['end_positions'], dtype=torch.long)
        }

In [5]:
def collate_fn(batch):
    input_ids = torch.stack([item['input_ids'] for item in batch])
    attention_mask = torch.stack([item['attention_mask'] for item in batch])
    start = torch.stack([item['start'] for item in batch])
    end = torch.stack([item['end'] for item in batch])
    return input_ids, attention_mask, start, end

In [6]:
from torch.utils.data import DataLoader

train_dataset = QADataset(train_encoded)
val_dataset = QADataset(val_encoded)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=32, collate_fn=collate_fn, pin_memory=True)

In [41]:
# class QADataset(Dataset):
#     def __init__(self, df, max_len=512):
#         self.df = df
#         self.max_len = max_len

#     def __len__(self):
#         return len(self.df)

#     def __getitem__(self, idx):
#         row = self.df.iloc[idx]
#         context = row['context_ids'][:self.max_len]
#         question = row['question_ids'][:64]
#         return {
#             'context': torch.tensor(context),
#             'question': torch.tensor(question),
#             'start': torch.tensor(row['start_pos']),
#             'end': torch.tensor(row['end_pos']),
#         }

# def collate_fn(batch):
#     def pad(seq, max_len):
#         return F.pad(seq, (0, max_len - len(seq)), value=vocab['<pad>'])

#     context_lens = [len(x['context']) for x in batch]
#     question_lens = [len(x['question']) for x in batch]

#     context_max = max(context_lens)
#     question_max = max(question_lens)

#     context = torch.stack([pad(x['context'], context_max) for x in batch])
#     question = torch.stack([pad(x['question'], question_max) for x in batch])
#     start = torch.stack([x['start'] for x in batch])
#     end = torch.stack([x['end'] for x in batch])

#     return context, question, start, end

# # train_dataset = QADataset(df)
# # train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

### Model (Encoder + Attention + Decoder)

In [42]:
# class EncoderQA(nn.Module):
#     def __init__(self, vocab_size, emb_dim, hidden_dim, embedding_matrix, dropout=0.3):
#         super().__init__()
#         self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
#         self.context_lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True, bidirectional=True, dropout=dropout)
#         self.question_lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True, bidirectional=True, dropout=dropout)

#         self.linear_start = nn.Linear(hidden_dim * 2, 1)
#         self.linear_end = nn.Linear(hidden_dim * 2, 1)
#         self.dropout = nn.Dropout(dropout)

#     def forward(self, context, question):
#         context_embed = self.embedding(context)
#         question_embed = self.embedding(question)

#         context_out, _ = self.context_lstm(context_embed)
#         question_out, _ = self.question_lstm(question_embed)

#         # Attention mechanism: Compute dot product attention
#         attention_scores = torch.matmul(context_out, question_out.transpose(1, 2))
#         attention_weights = F.softmax(attention_scores, dim=-1)
#         attended_context = torch.matmul(attention_weights, question_out)

#         # Apply dropout to regularize the model
#         attended_context = self.dropout(attended_context)

#         start_logits = self.linear_start(attended_context).squeeze(-1)
#         end_logits = self.linear_end(attended_context).squeeze(-1)

#         return start_logits.float(), end_logits.float()

In [12]:
train_encoded.to_frame()

Unnamed: 0,0
12947,"[input_ids, token_type_ids, attention_mask, of..."
12697,"[input_ids, token_type_ids, attention_mask, of..."
12696,"[input_ids, token_type_ids, attention_mask, of..."
24533,"[input_ids, token_type_ids, attention_mask, of..."
24531,"[input_ids, token_type_ids, attention_mask, of..."
...,...
74447,"[input_ids, token_type_ids, attention_mask, of..."
62008,"[input_ids, token_type_ids, attention_mask, of..."
28774,"[input_ids, token_type_ids, attention_mask, of..."
28775,"[input_ids, token_type_ids, attention_mask, of..."


### Training Setup & Loop

In [12]:
# Define loss function and optimizer
import time
from tqdm import tqdm
import gc

# Define loss function and optimizer
loss_fn = nn.CrossEntropyLoss()
model = QA_Model().to(device)
# optimizer = optim.Adam(model.parameters(), lr=3e-5)
optimizer = optim.Adam(model.parameters(), lr=1e-4)

def train_model(model, train_loader, optimizer, loss_fn, num_epochs=3):
    train_losses = []
    val_losses = []

    gc.collect()
    torch.cuda.empty_cache()

    for epoch in range(num_epochs):
        model.train()
        total_train_loss = 0
        epoch_start_time = time.time()  # Track the time for the epoch

        # Use tqdm for a progress bar
        for i, (input_ids, attention_mask, start, end) in enumerate(tqdm(train_loader, desc=f"Epoch {epoch+1}", ncols=100)):
            t0 = time.time()

            # Step 1: Move to GPU
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            start = start.to(device)
            end = end.to(device)
            t1 = time.time()
        
            # Step 2: Forward pass
            start_logits, end_logits = model(input_ids, attention_mask)
            t2 = time.time()
        
            # Step 3: Loss and Backward
            loss = loss_fn(start_logits, start) + loss_fn(end_logits, end)
            optimizer.zero_grad()
            loss.backward()
            t3 = time.time()
        
            # Step 4: Optimizer step
            optimizer.step()
            t4 = time.time()

            if i < 10:
                print(f"Batch {i}: To GPU: {t1 - t0:.3f}s | Forward: {t2 - t1:.3f}s | Backward: {t3 - t2:.3f}s | Step: {t4 - t3:.3f}s")
            
            # if i == 10: break
                
            # input_ids = input_ids.to(device)
            # attention_mask = attention_mask.to(device)
            # start = start.to(device)
            # end = end.to(device)

            # optimizer.zero_grad()

            # start_logits, end_logits = model(input_ids, attention_mask)
            # loss = loss_fn(start_logits, start) + loss_fn(end_logits, end)

            # loss.backward()
            # optimizer.step()

            # total_train_loss += loss.item()

            # Print every 100 steps in the training loop
            if (i + 1) % 100 == 0:
                print(f"Step {i + 1}, Loss: {loss.item():.4f}")

        avg_train_loss = total_train_loss / len(train_loader)
        train_losses.append(avg_train_loss)

        # Calculate validation loss
        val_loss = evaluate_model(model, val_loader, loss_fn)
        val_losses.append(val_loss)

        print(f"Epoch {epoch + 1}, Training Loss: {avg_train_loss:.4f}, Validation Loss: {val_loss:.4f}")

    return train_losses, val_losses

# print(f'train_loader: ${train')
train_losses, val_losses = train_model(model, train_loader, optimizer, loss_fn, num_epochs=20)
# Plot training and validation loss
plt.plot(train_losses, label='Training Loss')
plt.plot(val_losses, label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.show()

Epoch 1:   0%|                                                              | 0/500 [00:00<?, ?it/s]


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [None]:
def evaluate_model(model, val_loader, loss_fn):
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, start, end = batch
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            start = start.to(device)
            end = end.to(device)
            t1 = time.time()
            start_logits, end_logits = model(input_ids, attention_mask)
            loss = loss_fn(start_logits, start) + loss_fn(end_logits, end)
            total_val_loss += loss.item()

    avg_val_loss = total_val_loss / len(val_loader)
    return avg_val_loss

eval_losses = evaluate_model(model, val_loader, loss_fn)
print(f"Validation Loss: {eval_losses:.4f}")
# Plot evaluation loss
plt.plot(eval_losses, label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Validation Loss')
plt.legend()
plt.show()

Validation Loss: 2.6121


NameError: name 'train_losses' is not defined

In [None]:
from sklearn.metrics import jaccard_score
from nltk.translate.bleu_score import sentence_bleu

def test_model(model, dataset, tokenizer, max_samples=10):
    model.eval()
    results = []

    with torch.no_grad():
        for i in range(min(max_samples, len(dataset))):
            sample = dataset[i]
            input_ids = sample['input_ids'].unsqueeze(0).to(device)
            attention_mask = sample['attention_mask'].unsqueeze(0).to(device)
            start_true = sample['start'].item()
            end_true = sample['end'].item()

            # Get model predictions
            start_logits, end_logits = model(input_ids, attention_mask)
            pred_start = torch.argmax(start_logits, dim=1).item()
            pred_end = torch.argmax(end_logits, dim=1).item()

            # Clip to valid range
            if pred_start > pred_end:
                pred_start, pred_end = pred_end, pred_start

            # Decode context, predicted answer, and true answer
            context_tokens = tokenizer.convert_ids_to_tokens(sample['input_ids'].tolist())
            pred_answer = context_tokens[pred_start:pred_end+1]
            true_answer = context_tokens[start_true:end_true+1]

            pred_answer_text = tokenizer.convert_tokens_to_string(pred_answer)
            true_answer_text = tokenizer.convert_tokens_to_string(true_answer)
            context_text = tokenizer.convert_tokens_to_string(context_tokens)
            question_text = tokenizer.decode(sample['input_ids'], skip_special_tokens=True)

            # Calculate similarity metrics
            jaccard = jaccard_score(
                set(pred_answer_text.split()), 
                set(true_answer_text.split()), 
                average='macro'
            )
            bleu = sentence_bleu([true_answer_text.split()], pred_answer_text.split())

            # Print details
            print(f"Sample {i+1}:")
            print(f"Context: {context_text}")
            print(f"Question: {question_text}")
            print(f"Predicted Answer: {pred_answer_text}")
            print(f"True Answer: {true_answer_text}")
            print(f"Jaccard Similarity: {jaccard:.4f}")
            print(f"BLEU Score: {bleu:.4f}")
            print("-" * 50)

            results.append({
                "context": context_text,
                "question": question_text,
                "predicted_answer": pred_answer_text,
                "true_answer": true_answer_text,
                "jaccard": jaccard,
                "bleu": bleu
            })

    return results

# Run the testing process
test_results = test_model(model, val_dataset, tokenizer)

In [45]:
# # Split dataset into train and validation (80% training, 20% validation)
# from sklearn.model_selection import train_test_split

# train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# # Create Dataset objects for training and validation
# train_dataset = QADataset(train_df)
# train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

# val_dataset = QADataset(val_df)
# val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

# # Function to compute loss
# def compute_loss(model, loader, loss_fn):
#     model.eval()
#     total_loss = 0
#     with torch.no_grad():
#         for context, question, start, end in loader:
#             context = context.to(device)
#             question = question.to(device)
#             start = start.to(device)
#             end = end.to(device)

#             start_logits, end_logits = model(context, question)

#             start_logits = start_logits.float()
#             end_logits = end_logits.float()
            
#             start = start.long()
#             end = end.long() 
            
#             loss = loss_fn(start_logits, start) + loss_fn(end_logits, end)
#             total_loss += loss.item()

#     return total_loss / len(loader)

# # Early stopping parameters
# best_val_loss = float('inf')
# patience = 3  # How many epochs to wait for improvement
# epochs_no_improve = 0  # Counter for early stopping

# # Training loop with validation loss monitoring
# model = EncoderQA(len(vocab), 100, 128, embedding_matrix).to(device)
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
# loss_fn = nn.CrossEntropyLoss()

# for epoch in range(30):  # Number of epochs you want to train
#     model.train()
#     total_loss = 0
#     for context, question, start, end in train_loader:
#         context = context.to(device)
#         question = question.to(device)
#         start = start.to(device)
#         end = end.to(device)

#         start_logits, end_logits = model(context, question)
#         start_logits = start_logits.float()
#         end_logits = end_logits.float()
        
#         start = start.long()
#         end = end.long()
        
#         loss = loss_fn(start_logits, start) + loss_fn(end_logits, end)

#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()

#         total_loss += loss.item()

#     # Calculate validation loss
#     val_loss = compute_loss(model, val_loader, loss_fn)

#     print(f"Epoch {epoch+1}, Training Loss: {total_loss / len(train_loader):.4f}, Validation Loss: {val_loss:.4f}")

#     # Early stopping check
#     if val_loss < best_val_loss:
#         best_val_loss = val_loss
#         epochs_no_improve = 0  # Reset the counter
#     else:
#         epochs_no_improve += 1

#     # Stop training if validation loss hasn't improved for 'patience' epochs
#     if epochs_no_improve >= patience:
#         print(f"Early stopping triggered at epoch {epoch+1}.")
#         break

# # Evaluate the model on the validation set
# results = evaluate(model, val_dataset, decode, vocab)
# print(f"\nValidation Evaluation -> Exact Match: {results['EM']:.4f}, F1 Score: {results['F1']:.4f}")

Epoch 1, Training Loss: 8.2623, Validation Loss: 7.2884
Epoch 2, Training Loss: 7.0736, Validation Loss: 6.9463
Epoch 3, Training Loss: 6.7766, Validation Loss: 6.8104
Epoch 4, Training Loss: 6.6115, Validation Loss: 6.7409
Epoch 5, Training Loss: 6.4355, Validation Loss: 6.6933
Epoch 6, Training Loss: 6.2452, Validation Loss: 6.6355
Epoch 7, Training Loss: 6.0362, Validation Loss: 6.6488
Epoch 8, Training Loss: 5.8400, Validation Loss: 6.6328
Epoch 9, Training Loss: 5.6340, Validation Loss: 6.5947
Epoch 10, Training Loss: 5.4059, Validation Loss: 6.5028
Epoch 11, Training Loss: 5.2022, Validation Loss: 6.6447
Epoch 12, Training Loss: 5.0028, Validation Loss: 6.7459
Epoch 13, Training Loss: 4.8243, Validation Loss: 6.7187
Early stopping triggered at epoch 13.


TypeError: slice indices must be integers or None or have an __index__ method

In [50]:
# ## Evaluation

def decode(tokens):
    return [vocab.get_itos()[token] for token in tokens]

def evaluate(model, dataset, tokenizer, vocab, max_samples=100):
    model.eval()
    EM_total, F1_total = 0, 0

    with torch.no_grad():
        for i in range(min(max_samples, len(dataset))):
            sample = dataset[i]
            context = sample['context'].unsqueeze(0).to(device)
            question = sample['question'].unsqueeze(0).to(device)
            start_true = sample['start'].item()
            end_true = sample['end'].item()

            start_true = int(start_true)
            end_true = int(end_true)

            start_logits, end_logits = model(context, question)
            pred_start = torch.argmax(start_logits, dim=1).item()
            pred_end = torch.argmax(end_logits, dim=1).item()

            # Clip to valid range
            if pred_start > pred_end:
                pred_start, pred_end = pred_end, pred_start

            context_tokens = decode(sample['context'])
            pred_answer = context_tokens[pred_start:pred_end+1]
            true_answer = context_tokens[start_true:end_true+1]

            pred_answer_text = " ".join(pred_answer).replace(" ##", "")
            true_answer_text = " ".join(true_answer).replace(" ##", "")

            # Exact Match
            EM = int(pred_answer_text == true_answer_text)
            EM_total += EM

            # F1 Score
            pred_set = set(pred_answer_text.split())
            true_set = set(true_answer_text.split())
            common = pred_set & true_set
            if len(common) == 0:
                F1 = 0
            else:
                precision = len(common) / len(pred_set)
                recall = len(common) / len(true_set)
                F1 = 2 * precision * recall / (precision + recall)
            F1_total += F1

    return {
        "EM": EM_total / max_samples,
        "F1": F1_total / max_samples
    }

# Run evaluation
results = evaluate(model, val_dataset, decode, vocab)
print(f"\nEvaluation -> Exact Match: {results['EM']:.4f}, F1 Score: {results['F1']:.4f}")


Evaluation -> Exact Match: 0.0600, F1 Score: 0.1196
