<a href="https://colab.research.google.com/github/Shaanu-cheeku/question-generation-project/blob/main/Code-Part3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-only step: upload the WikiQA TSV splits into the runtime
# (the cell output below lists the three files that were saved).
from google.colab import files
# Keep the return value of the upload call for later inspection if needed.
uploaded = files.upload()

Saving WikiQA-dev.tsv to WikiQA-dev.tsv
Saving WikiQA-test.tsv to WikiQA-test.tsv
Saving WikiQA-train.tsv to WikiQA-train.tsv


In [None]:
# Install necessary libraries if not already installed
!pip install transformers

# Import necessary libraries
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, AdamW, get_scheduler
from sklearn.metrics import classification_report, precision_recall_fscore_support
from transformers import logging
import numpy as np

# Suppress warnings from transformers
logging.set_verbosity_error()

# =========================
# Step 1: Data Loading
# =========================
# Locations of the uploaded WikiQA splits (adjust if the files live elsewhere)
train_file = "/content/WikiQA-train.tsv"
dev_file = "/content/WikiQA-dev.tsv"
test_file = "/content/WikiQA-test.tsv"

# Column names applied to every split; header=0 makes them replace the file's own header row
columns = ["question_id", "question", "document_title", "sentence", "label"]

# Read the three tab-separated splits with identical parsing options
train_data, dev_data, test_data = (
    pd.read_csv(split_path, sep="\t", names=columns, header=0)
    for split_path in (train_file, dev_file, test_file)
)

print("Data loaded successfully!")

# =========================
# Step 2: Tokenization Setup
# =========================
static_max_length = 64  # Short fixed sequence length keeps processing fast
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# =========================
# Step 3: Tokenization Function
# =========================
def tokenize_data(data, max_length):
    """Encode the question/sentence pairs in ``data`` with the module-level tokenizer.

    Args:
        data: Table exposing ``question`` and ``sentence`` columns.
        max_length: Every encoded pair is padded or truncated to this length.

    Returns:
        Whatever the tokenizer returns when asked for PyTorch tensors.
    """
    questions = data["question"].tolist()
    sentences = data["sentence"].tolist()
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_length,
        "return_tensors": "pt",
    }
    # Passing two parallel lists makes the tokenizer treat each row as a text pair.
    return tokenizer(questions, sentences, **encoding_options)

# Tokenize datasets (each split is encoded to the same fixed length)
print("Tokenizing Train Data...")
train_encodings = tokenize_data(data=train_data, max_length=static_max_length)

print("Tokenizing Dev Data...")
dev_encodings = tokenize_data(data=dev_data, max_length=static_max_length)

print("Tokenizing Test Data...")
test_encodings = tokenize_data(data=test_data, max_length=static_max_length)

# =========================
# Step 4: Prepare Labels
# =========================
# One label tensor per split, built from the "label" column
train_labels, dev_labels, test_labels = (
    torch.tensor(split["label"].tolist())
    for split in (train_data, dev_data, test_data)
)

# =========================
# Step 5: Custom Dataset Class
# =========================
class WikiQADataset(Dataset):
    """Dataset pairing pre-computed encodings with their labels.

    ``encodings`` must support ``["input_ids"]`` and ``["attention_mask"]``
    lookups whose results are indexable by sample; ``labels`` is stored as
    given and indexed the same way.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of labels defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        # Pull the two model inputs for this sample, then attach its label.
        sample = {
            key: self.encodings[key][idx]
            for key in ("input_ids", "attention_mask")
        }
        sample["labels"] = self.labels[idx]
        return sample

# Wrap each split's encodings and labels in a dataset
train_dataset = WikiQADataset(encodings=train_encodings, labels=train_labels)
dev_dataset = WikiQADataset(encodings=dev_encodings, labels=dev_labels)
test_dataset = WikiQADataset(encodings=test_encodings, labels=test_labels)

# =========================
# Step 6: DataLoader Setup
# =========================
batch_size = 8  # Small batches for efficiency

# Only the training split is shuffled; dev and test keep their original order
train_loader = DataLoader(train_dataset, batch_size, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size)
test_loader = DataLoader(test_dataset, batch_size)

print("DataLoaders created successfully!")

# =========================
# Step 7: Model Definition
# =========================
class EncoderDecoderModel(torch.nn.Module):
    """Encoder followed by dropout and a linear classification head.

    Args:
        encoder: Module called with ``input_ids`` and ``attention_mask`` whose
            output exposes ``last_hidden_state``.
        hidden_size: Width of the encoder's hidden vectors (classifier input).
        num_labels: Number of output logits.
        dropout_rate: Dropout probability applied before the classifier.
    """

    def __init__(self, encoder, hidden_size, num_labels, dropout_rate=0.3):
        super().__init__()
        self.encoder = encoder
        self.dropout = torch.nn.Dropout(p=dropout_rate)
        self.classifier = torch.nn.Linear(in_features=hidden_size, out_features=num_labels)

    def forward(self, input_ids, attention_mask):
        """Return classification logits computed from the first-position hidden state."""
        hidden_states = self.encoder(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state
        first_token = hidden_states[:, 0, :]  # CLS token
        return self.classifier(self.dropout(first_token))

# Load pre-trained BERT as encoder
encoder = BertModel.from_pretrained("bert-base-uncased")

# Instantiate the Encoder-Decoder model
num_labels = 2  # Binary classification
hidden_size = encoder.config.hidden_size  # Read from the encoder's own config
model = EncoderDecoderModel(encoder, hidden_size, num_labels, dropout_rate=0.3)

# Move model to device (GPU when one is available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Model loaded on {device}")

# =========================
# Step 8: Training Setup
# =========================
# Class counts used to up-weight the positive class in the loss
total_count = len(train_labels)
positive_count = train_labels.sum().item()
negative_count = total_count - positive_count
class_weights = torch.tensor(
    [1.0, total_count / (1.5 * positive_count)]
).to(device)

# Loss function weighted per class
loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)

# Optimizer with weight decay for regularization
optimizer = AdamW(model.parameters(), lr=3e-5, weight_decay=0.01)

# Linear learning-rate schedule spanning every optimizer step of the run
num_epochs = 10  # Upper bound; early stopping may end training sooner
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Early Stopping parameters
patience = 3  # Number of epochs to wait for improvement
# Start below any reachable F1 so the first epoch always writes a checkpoint;
# with a 0.0 start, a run whose dev F1 never rises above 0.0 saves nothing and
# the checkpoint reload after training has no file to read.
best_f1 = float("-inf")
epochs_no_improve = 0

print("Training setup completed!")

# =========================
# Step 9: Training Loop with Early Stopping
# =========================
print("Starting training...")
for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        lr_scheduler.step()  # stepped once per batch, matching num_training_steps
        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}")

    # ---- evaluation on dev set (argmax over the two logits) ----
    model.eval()
    all_preds, all_labels_dev = [], []
    with torch.no_grad():
        for batch in dev_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels_dev.extend(labels.cpu().numpy())

    # zero_division=0 reports 0.0 without a warning when no positives are predicted
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels_dev, all_preds, average="binary", zero_division=0
    )
    print(f"Dev Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")

    # Checkpoint on improvement; otherwise count towards early stopping
    if f1 > best_f1:
        best_f1 = f1
        epochs_no_improve = 0
        torch.save(model.state_dict(), "encoder_decoder_best_model.pt")
        print(f"Checkpoint saved at epoch {epoch + 1}")
    else:
        epochs_no_improve += 1
        print(f"No improvement in F1 for {epochs_no_improve} epoch(s)")

    # Early Stopping
    if epochs_no_improve >= patience:
        print("Early stopping triggered!")
        break

print("Training completed!")

# =========================
# Step 10: Load Best Model
# =========================
# map_location remaps the saved tensors onto the current device, so a checkpoint
# written on a GPU runtime still loads on a CPU-only one. weights_only=True
# restricts unpickling to tensor data, which is all a state_dict contains.
model.load_state_dict(
    torch.load("encoder_decoder_best_model.pt", map_location=device, weights_only=True)
)
model.to(device)
print("Best model loaded for evaluation.")

# =========================
# Step 11: Threshold Optimization
# =========================
def find_best_threshold(model, loader, thresholds):
    """Pick the positive-class probability cut-off that maximises binary F1.

    The model is run over ``loader`` exactly once; the collected positive-class
    probabilities are then re-thresholded for every candidate, instead of
    repeating the full forward pass per threshold.

    Args:
        model: ``torch.nn.Module`` called with ``input_ids`` and
            ``attention_mask`` and returning two-class logits.
        loader: Iterable of batches with ``input_ids``, ``attention_mask`` and
            ``labels`` entries.
        thresholds: Iterable of candidate cut-offs.

    Returns:
        ``(best_threshold, best_f1)``; ``(0.5, 0.0)`` when no candidate yields
        an F1 above zero or the loader is empty.

    Batches are moved to the module-level ``device``.
    """
    # Score in eval mode so dropout cannot perturb the probabilities, then
    # restore whatever mode the caller had the model in.
    was_training = model.training
    model.eval()
    proba_chunks, label_chunks = [], []
    try:
        with torch.no_grad():
            for batch in loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)

                logits = model(input_ids=input_ids, attention_mask=attention_mask)
                probas = torch.nn.functional.softmax(logits, dim=1)[:, 1]
                proba_chunks.append(probas.cpu())
                label_chunks.append(batch["labels"].cpu())
    finally:
        model.train(was_training)

    best_threshold = 0.5
    best_f1 = 0.0
    if not proba_chunks:
        # Nothing to score: fall back to the defaults.
        return best_threshold, best_f1

    all_probas = torch.cat(proba_chunks)
    all_labels = torch.cat(label_chunks).numpy()
    for threshold in thresholds:
        all_preds = (all_probas > threshold).long().numpy()
        # zero_division=0 yields 0.0 (no warning) when a threshold predicts no positives
        _, _, f1, _ = precision_recall_fscore_support(
            all_labels, all_preds, average="binary", zero_division=0
        )
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
    return best_threshold, best_f1

print("Finding best threshold...")
thresholds = np.arange(0.1, 0.9, 0.05)  # candidate cut-offs in steps of 0.05
best_threshold, best_f1 = find_best_threshold(model, dev_loader, thresholds)
print(f"Best Threshold: {best_threshold:.2f}, Best F1: {best_f1:.4f}")

# =========================
# Step 12: Final Evaluation on Test Data
# =========================
print("Evaluating on Test Data with Optimized Threshold...")
all_preds_test, all_labels_test = [], []
model.eval()
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        # Probability of the positive class, cut at the threshold tuned on dev
        probas = torch.nn.functional.softmax(logits, dim=1)[:, 1]
        preds = probas.gt(best_threshold).long()
        all_labels_test.extend(labels.cpu().numpy())
        all_preds_test.extend(preds.cpu().numpy())

print("Test Set Classification Report:")
print(classification_report(all_labels_test, all_preds_test))


Data loaded successfully!


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Tokenizing Train Data...
Tokenizing Dev Data...
Tokenizing Test Data...
DataLoaders created successfully!


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Model loaded on cuda
Training setup completed!
Starting training...




Epoch 1/10, Loss: 0.4798
Dev Precision: 0.2705, Recall: 0.4000, F1: 0.3228
Checkpoint saved at epoch 1
Epoch 2/10, Loss: 0.3910
Dev Precision: 0.2093, Recall: 0.5786, F1: 0.3074
No improvement in F1 for 1 epoch(s)
Epoch 3/10, Loss: 0.2344
Dev Precision: 0.2500, Recall: 0.3143, F1: 0.2785
No improvement in F1 for 2 epoch(s)
Epoch 4/10, Loss: 0.1144
Dev Precision: 0.2500, Recall: 0.3929, F1: 0.3056
No improvement in F1 for 3 epoch(s)
Early stopping triggered!
Training completed!


  model.load_state_dict(torch.load("encoder_decoder_best_model.pt"))


Best model loaded for evaluation.
Finding best threshold...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Best Threshold: 0.45, Best F1: 0.3370
Evaluating on Test Data with Optimized Threshold...
Test Set Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.93      0.95      5825
           1       0.23      0.42      0.30       291

    accuracy                           0.91      6116
   macro avg       0.60      0.68      0.62      6116
weighted avg       0.93      0.91      0.92      6116



In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_scheduler
from torch.optim import AdamW
from sklearn.metrics import classification_report, precision_recall_fscore_support
from imblearn.over_sampling import SMOTE
import numpy as np

# Suppress warnings from transformers
from transformers import logging
logging.set_verbosity_error()

# =========================
# Step 1: Data Loading
# =========================
# Locations of the uploaded WikiQA splits
train_file = "/content/WikiQA-train.tsv"
dev_file = "/content/WikiQA-dev.tsv"
test_file = "/content/WikiQA-test.tsv"

# Column names applied to every split; header=0 makes them replace the file's own header row
columns = ["question_id", "question", "document_title", "sentence", "label"]

# Read the three tab-separated splits with identical parsing options
train_data, dev_data, test_data = (
    pd.read_csv(split_path, sep="\t", names=columns, header=0)
    for split_path in (train_file, dev_file, test_file)
)

print("Data loaded successfully!")

# =========================
# Step 2: Tokenization
# =========================
max_length = 64  # Fixed max length for efficiency
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_data(data, max_length):
    """Encode question/sentence pairs and return them together with the labels.

    Args:
        data: Table exposing ``question``, ``sentence`` and ``label`` columns.
        max_length: Every encoded pair is padded or truncated to this length.

    Returns:
        ``(encodings, labels)`` where ``encodings`` is the tokenizer output as
        PyTorch tensors and ``labels`` is the ``label`` column as a list.
    """
    questions = data["question"].tolist()
    sentences = data["sentence"].tolist()
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_length,
        "return_tensors": "pt",
    }
    # Two parallel lists make the module-level tokenizer encode each row as a pair.
    encodings = tokenizer(questions, sentences, **encoding_options)
    labels = data["label"].tolist()
    return encodings, labels

print("Tokenizing Train Data...")
train_encodings, train_labels = tokenize_data(train_data, max_length)
print("Tokenizing Dev Data...")
dev_encodings, dev_labels = tokenize_data(dev_data, max_length)
print("Tokenizing Test Data...")
test_encodings, test_labels = tokenize_data(test_data, max_length)

# =========================
# Step 3: SMOTE for Balancing
# =========================
# NOTE(review): SMOTE creates minority samples by interpolating between feature
# vectors. The features here are raw token IDs, so the synthetic rows are
# presumably not valid tokenisations of any real sentence. Consider
# oversampling whole rows instead. TODO confirm.

# One row of token IDs per training example, as a NumPy matrix for SMOTE
input_ids_flat = train_encodings["input_ids"].numpy().reshape(len(train_labels), -1)

# Apply SMOTE (fixed seed for reproducibility)
smote = SMOTE(random_state=42)
input_ids_resampled, labels_resampled = smote.fit_resample(input_ids_flat, train_labels)

# Rebuild attention masks for the resampled rows: 1 for every non-zero
# (non-padding) token, 0 elsewhere. Done as a single vectorised comparison
# instead of a Python loop over every element.
attention_masks_resampled = (input_ids_resampled != 0).astype(int)

# Convert back to PyTorch tensors
train_encodings_resampled = {
    "input_ids": torch.tensor(input_ids_resampled),
    "attention_mask": torch.tensor(attention_masks_resampled)
}
train_labels_resampled = labels_resampled

print("SMOTE applied successfully!")


# =========================
# Step 4: Custom Dataset and DataLoader
# =========================
class WikiQADataset(Dataset):
    """Dataset pairing pre-computed encodings with their labels.

    ``encodings`` must support ``["input_ids"]`` and ``["attention_mask"]``
    lookups whose results are indexable by sample; ``labels`` is converted to
    a tensor on construction.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels)

    def __len__(self):
        # The number of labels defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        # Pull the two model inputs for this sample, then attach its label.
        sample = {
            key: self.encodings[key][idx]
            for key in ("input_ids", "attention_mask")
        }
        sample["labels"] = self.labels[idx]
        return sample

# Create DataLoaders
batch_size = 16
train_dataset = WikiQADataset(encodings=train_encodings_resampled, labels=train_labels_resampled)
dev_dataset = WikiQADataset(encodings=dev_encodings, labels=dev_labels)
test_dataset = WikiQADataset(encodings=test_encodings, labels=test_labels)

# Only the (resampled) training split is shuffled
train_loader = DataLoader(train_dataset, batch_size, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size)
test_loader = DataLoader(test_dataset, batch_size)

print("DataLoaders created successfully!")

# =========================
# Step 5: Model Initialization
# =========================
# GPU when one is available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.to(device)
print("Model loaded on", device)

# =========================
# Step 6: Training Setup
# =========================
optimizer = AdamW(model.parameters(), lr=2e-5)
# The cosine schedule is sized for 10 passes over the training loader
num_training_steps = 10 * len(train_loader)
scheduler = get_scheduler(
    "cosine",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Weighted loss for imbalance: positive-class weight is total / number of positives
class_weights = torch.tensor(
    [1.0, len(train_labels_resampled) / sum(train_labels_resampled)]
).to(device)
loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)

print("Training setup completed!")

# =========================
# Step 7: Training Loop with Early Stopping
# =========================
epochs = 10
patience = 3
# Start below any reachable F1 so the first epoch always writes a checkpoint,
# even when dev F1 is 0.0.
best_f1 = float("-inf")
patience_counter = 0

print("Starting training...")

for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()  # stepped once per batch, matching num_training_steps
        total_loss += loss.item()

    # Report the mean per-batch loss so the figure does not scale with the
    # number of batches and is comparable across runs.
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")

    # ---- evaluate on dev set (argmax over the two logits) ----
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in dev_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # zero_division=0 reports 0.0 without a warning when no positives are predicted
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average="binary", zero_division=0
    )
    print(f"Dev Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")

    # Early Stopping: checkpoint on improvement, otherwise count the stale epoch
    if f1 > best_f1:
        best_f1 = f1
        patience_counter = 0
        torch.save(model.state_dict(), "best_model.pt")
        print(f"Checkpoint saved at epoch {epoch + 1}")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered!")
            break

print("Training completed!")

# =========================
# Step 8: Final Evaluation
# =========================
print("Evaluating on Test Data...")
# Evaluate the best checkpoint rather than whatever weights the last epoch
# left behind (early stopping ends `patience` epochs after the best one).
# map_location keeps the load working if the device changed since saving.
try:
    model.load_state_dict(
        torch.load("best_model.pt", map_location=device, weights_only=True)
    )
except FileNotFoundError:
    # No checkpoint was ever written; fall back to the in-memory weights.
    print("No checkpoint found; evaluating the final-epoch weights instead.")
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
print("Test Set Classification Report:")
print(classification_report(all_labels, all_preds))


Data loaded successfully!
Tokenizing Train Data...
Tokenizing Dev Data...
Tokenizing Test Data...
SMOTE applied successfully!
DataLoaders created successfully!
Model loaded on cuda
Training setup completed!
Starting training...
Epoch 1/10, Loss: 262.0451
Dev Precision: 0.3750, Recall: 0.0429, F1: 0.0769
Checkpoint saved at epoch 1
Epoch 2/10, Loss: 203.4861
Dev Precision: 0.2651, Recall: 0.1571, F1: 0.1973
Checkpoint saved at epoch 2
Epoch 3/10, Loss: 125.7980
Dev Precision: 0.3115, Recall: 0.1357, F1: 0.1891
Epoch 4/10, Loss: 58.2211
Dev Precision: 0.2727, Recall: 0.1286, F1: 0.1748
Epoch 5/10, Loss: 33.4737
Dev Precision: 0.2568, Recall: 0.2714, F1: 0.2639
Checkpoint saved at epoch 5
Epoch 6/10, Loss: 21.4024
Dev Precision: 0.2653, Recall: 0.1857, F1: 0.2185
Epoch 7/10, Loss: 16.4814
Dev Precision: 0.2907, Recall: 0.1786, F1: 0.2212
Epoch 8/10, Loss: 12.7764
Dev Precision: 0.3256, Recall: 0.2000, F1: 0.2478
Early stopping triggered!
Training completed!
Evaluating on Test Data...
Test

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_scheduler
from torch.optim import AdamW
from sklearn.metrics import classification_report, precision_recall_fscore_support
from transformers import logging
from torch.nn import CrossEntropyLoss

# Suppress warnings from transformers
logging.set_verbosity_error()

# =========================
# Step 1: Data Loading
# =========================
# Locations of the uploaded WikiQA splits
train_file = "/content/WikiQA-train.tsv"
dev_file = "/content/WikiQA-dev.tsv"
test_file = "/content/WikiQA-test.tsv"

# Column names applied to every split; header=0 makes them replace the file's own header row
columns = ["question_id", "question", "document_title", "sentence", "label"]

# Read the three tab-separated splits with identical parsing options
train_data, dev_data, test_data = (
    pd.read_csv(split_path, sep="\t", names=columns, header=0)
    for split_path in (train_file, dev_file, test_file)
)
print("Data loaded successfully!")

# =========================
# Step 2: Set Static max_length
# =========================
static_max_length = 64  # Short fixed sequence length keeps processing fast
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# =========================
# Step 3: Tokenization
# =========================
def tokenize_data(data, max_length):
    """Encode the question/sentence pairs in ``data`` with the module-level tokenizer.

    Args:
        data: Table exposing ``question`` and ``sentence`` columns.
        max_length: Every encoded pair is padded or truncated to this length.

    Returns:
        Whatever the tokenizer returns when asked for PyTorch tensors.
    """
    questions = data["question"].tolist()
    sentences = data["sentence"].tolist()
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_length,
        "return_tensors": "pt",
    }
    # Passing two parallel lists makes the tokenizer treat each row as a text pair.
    return tokenizer(questions, sentences, **encoding_options)

# Tokenize datasets (each split is encoded to the same fixed length)
print("Tokenizing Train Data...")
train_encodings = tokenize_data(data=train_data, max_length=static_max_length)

print("Tokenizing Dev Data...")
dev_encodings = tokenize_data(data=dev_data, max_length=static_max_length)

print("Tokenizing Test Data...")
test_encodings = tokenize_data(data=test_data, max_length=static_max_length)

# Prepare labels: one plain list per split, taken from the "label" column
train_labels, dev_labels, test_labels = (
    split["label"].tolist() for split in (train_data, dev_data, test_data)
)

print("Tokenization completed!")

# =========================
# Step 4: Custom Dataset and DataLoader
# =========================
class WikiQADataset(Dataset):
    """Dataset pairing pre-computed encodings with their labels.

    ``encodings`` must support ``["input_ids"]`` and ``["attention_mask"]``
    lookups whose results are indexable by sample; ``labels`` is converted to
    a tensor on construction.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels)

    def __len__(self):
        # The number of labels defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        # Pull the two model inputs for this sample, then attach its label.
        sample = {
            key: self.encodings[key][idx]
            for key in ("input_ids", "attention_mask")
        }
        sample["labels"] = self.labels[idx]
        return sample

# Wrap each split's encodings and labels in a dataset
train_dataset = WikiQADataset(encodings=train_encodings, labels=train_labels)
dev_dataset = WikiQADataset(encodings=dev_encodings, labels=dev_labels)
test_dataset = WikiQADataset(encodings=test_encodings, labels=test_labels)

# Create DataLoaders; only the training split is shuffled
batch_size = 16  # Adjusted batch size
train_loader = DataLoader(train_dataset, batch_size, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size)
test_loader = DataLoader(test_dataset, batch_size)

print("DataLoaders created successfully!")

# =========================
# Step 5: Model and Loss Initialization
# =========================
# GPU when one is available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.to(device)

# Class counts used to up-weight the positive class in the loss
total_count = len(train_labels)
positive_count = sum(train_labels)
negative_count = total_count - positive_count
class_weights = torch.tensor(
    [1.0, total_count / (2.0 * positive_count)]
).to(device)

# CrossEntropyLoss weighted per class
loss_fn = CrossEntropyLoss(weight=class_weights)

# Optimizer plus a linear schedule sized for 10 passes over the training loader
optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=1e-4)
num_training_steps = 10 * len(train_loader)  # For 10 epochs
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

print("Model loaded on", device)

# =========================
# Step 6: Training Loop with Debugging
# =========================
epochs = 10
# Start below any reachable F1 so the first epoch always writes a checkpoint;
# with a 0.0 start, a run whose dev F1 stays at 0.0 never saves best_model.pt
# and the reload in the final evaluation has nothing (or a stale file) to read.
best_f1 = float("-inf")
patience = 3  # Early stopping patience
no_improvement_epochs = 0
decision_threshold = 0.5  # Positive-class probability cut-off used for dev and test

print("Starting training...")

for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        lr_scheduler.step()  # stepped once per batch, matching num_training_steps
        total_loss += loss.item()

        # Track accuracy during training
        preds = torch.argmax(outputs.logits, dim=1)
        correct_predictions += (preds == labels).sum().item()
        total_samples += labels.size(0)

    train_accuracy = correct_predictions / total_samples
    # Report the mean per-batch loss so the figure does not scale with the
    # number of batches and is comparable across runs.
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}, Training Accuracy: {train_accuracy:.4f}")

    # ---- evaluate on dev set using the probability threshold ----
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in dev_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            probas = torch.nn.functional.softmax(logits, dim=1)[:, 1]
            preds = (probas > decision_threshold).long()
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    # zero_division=0 reports 0.0 without a warning when no positives are predicted
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average="binary", zero_division=0
    )
    print(f"Dev Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")

    # Save the best model; otherwise count the stale epoch towards early stopping
    if f1 > best_f1:
        best_f1 = f1
        torch.save(model.state_dict(), "best_model.pt")
        print(f"Checkpoint saved at epoch {epoch + 1}")
        no_improvement_epochs = 0
    else:
        no_improvement_epochs += 1
        if no_improvement_epochs >= patience:
            print("Early stopping triggered!")
            break

print("Training completed!")

# =========================
# Step 7: Final Evaluation
# =========================
print("Evaluating on Test Data...")
# Reload the best checkpoint. map_location remaps the saved tensors onto the
# current device, so a checkpoint written on a GPU runtime still loads on a
# CPU-only one; weights_only=True restricts unpickling to tensor data.
model.load_state_dict(
    torch.load("best_model.pt", map_location=device, weights_only=True)
)
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Positive-class probability, cut at the same threshold used on dev
        probas = torch.nn.functional.softmax(logits, dim=1)[:, 1]
        preds = (probas > decision_threshold).long()
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
print("Test Set Classification Report:")
# zero_division=0 avoids undefined-metric warnings if a class is never predicted
print(classification_report(all_labels, all_preds, zero_division=0))


Data loaded successfully!


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Tokenizing Train Data...
Tokenizing Dev Data...
Tokenizing Test Data...
Tokenization completed!
DataLoaders created successfully!


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Model loaded on cpu
Starting training...
Epoch 1/10, Loss: 814.4974


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Dev Precision: 0.0000, Recall: 0.0000, F1: 0.0000
No improvement in F1 for 1 epoch(s)


In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_scheduler
from torch.optim import AdamW
from sklearn.metrics import classification_report, precision_recall_fscore_support
from transformers import logging
from torch.nn import CrossEntropyLoss

# Suppress warnings from transformers
logging.set_verbosity_error()

# =========================
# Step 1: Data Loading
# =========================
# Locations of the uploaded WikiQA splits
train_file = "/content/WikiQA-train.tsv"
dev_file = "/content/WikiQA-dev.tsv"
test_file = "/content/WikiQA-test.tsv"

# Column names applied to every split; header=0 makes them replace the file's own header row
columns = ["question_id", "question", "document_title", "sentence", "label"]

# Read the three tab-separated splits with identical parsing options
train_data, dev_data, test_data = (
    pd.read_csv(split_path, sep="\t", names=columns, header=0)
    for split_path in (train_file, dev_file, test_file)
)
print("Data loaded successfully!")

# =========================
# Step 2: Set Static max_length
# =========================
static_max_length = 64  # Short fixed sequence length keeps processing fast
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# =========================
# Step 3: Tokenization
# =========================
def tokenize_data(data, max_length):
    """Encode the question/sentence pairs in ``data`` with the module-level tokenizer.

    Args:
        data: Table exposing ``question`` and ``sentence`` columns.
        max_length: Every encoded pair is padded or truncated to this length.

    Returns:
        Whatever the tokenizer returns when asked for PyTorch tensors.
    """
    questions = data["question"].tolist()
    sentences = data["sentence"].tolist()
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_length,
        "return_tensors": "pt",
    }
    # Passing two parallel lists makes the tokenizer treat each row as a text pair.
    return tokenizer(questions, sentences, **encoding_options)

# Tokenize datasets (each split is encoded to the same fixed length)
print("Tokenizing Train Data...")
train_encodings = tokenize_data(data=train_data, max_length=static_max_length)

print("Tokenizing Dev Data...")
dev_encodings = tokenize_data(data=dev_data, max_length=static_max_length)

print("Tokenizing Test Data...")
test_encodings = tokenize_data(data=test_data, max_length=static_max_length)

# Prepare labels: one plain list per split, taken from the "label" column
train_labels, dev_labels, test_labels = (
    split["label"].tolist() for split in (train_data, dev_data, test_data)
)

print("Tokenization completed!")

# =========================
# Step 4: Custom Dataset and DataLoader
# =========================
class WikiQADataset(Dataset):
    """Map-style dataset pairing tokenizer output with one label per example.

    NOTE(review): only "input_ids" and "attention_mask" are forwarded. If the
    encodings also carry "token_type_ids" (BERT tokenizers normally emit them
    for sentence pairs), they are dropped here -- confirm that is intended.
    """

    # Encoding fields copied into every example, in this order
    _FEATURE_KEYS = ("input_ids", "attention_mask")

    def __init__(self, encodings, labels):
        """Keep a reference to *encodings* and convert *labels* to a tensor."""
        self.labels = torch.tensor(labels)
        self.encodings = encodings

    def __len__(self):
        """Return the number of labels held by the dataset."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the features and the label stored at position *idx*."""
        example = {key: self.encodings[key][idx] for key in self._FEATURE_KEYS}
        example["labels"] = self.labels[idx]
        return example

# Wrap each split's encodings and labels in a dataset
train_dataset, dev_dataset, test_dataset = (
    WikiQADataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (dev_encodings, dev_labels),
        (test_encodings, test_labels),
    )
)

# Batch the datasets; only the training split is shuffled
batch_size = 16  # Adjusted batch size
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
dev_loader, test_loader = (
    DataLoader(eval_dataset, batch_size=batch_size)
    for eval_dataset in (dev_dataset, test_dataset)
)

print("DataLoaders created successfully!")

# =========================
# Step 5: Model and Loss Initialization
# =========================
# Pick the GPU when one is visible, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.to(device)

# Class statistics of the training labels
total_count = len(train_labels)
positive_count = sum(train_labels)
negative_count = total_count - positive_count

# Label 0 keeps weight 1.0; label 1 is weighted by total / (2 * positives)
positive_weight = total_count / (2.0 * positive_count)
class_weights = torch.tensor([1.0, positive_weight]).to(device)

# Weighted cross-entropy applied to the raw logits
loss_fn = CrossEntropyLoss(weight=class_weights)

# Optimizer plus a linear schedule without warm-up.
# The factor 10 is the epoch count and must match `epochs` in the training loop.
optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=1e-4)
num_training_steps = len(train_loader) * 10  # For 10 epochs
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

print("Model loaded on", device)

# =========================
# Step 6: Training Loop with Debugging
# =========================
# Fine-tunes `model` for up to `epochs` passes over `train_loader`, scores the
# dev split after every pass, writes the best dev-F1 weights to
# "best_model.pt", and stops once `patience` consecutive epochs fail to improve.
epochs = 10
# Start below every reachable F1 (F1 is never negative) so the first epoch
# always writes a checkpoint. Starting at 0.0 with the strict ">" test below
# meant a run whose dev F1 stayed at 0 never saved anything, and the later
# torch.load("best_model.pt") either failed or picked up a stale file left by
# an earlier run.
best_f1 = -1.0
patience = 3  # Early stopping patience
no_improvement_epochs = 0
decision_threshold = 0.5  # class-1 probability above which a dev example is predicted positive

print("Starting training...")

for epoch in range(epochs):
    model.train()
    total_loss = 0  # sum of the per-batch losses of this epoch (not an average)
    correct_predictions = 0
    total_samples = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        # Labels are not passed to the model: the class-weighted loss_fn is
        # applied to the logits instead of the model's built-in loss.
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        lr_scheduler.step()  # the schedule advances once per batch
        total_loss += loss.item()

        # Track accuracy during training
        preds = torch.argmax(outputs.logits, dim=1)
        correct_predictions += (preds == labels).sum().item()
        total_samples += labels.size(0)

    train_accuracy = correct_predictions / total_samples
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss:.4f}, Training Accuracy: {train_accuracy:.4f}")

    # Evaluate on dev set
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in dev_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            # Probability of class 1, thresholded into a hard 0/1 prediction
            probas = torch.nn.functional.softmax(logits, dim=1)[:, 1]
            preds = (probas > decision_threshold).long()
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average="binary")
    print(f"Dev Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")

    # Save the best model
    if f1 > best_f1:
        best_f1 = f1
        torch.save(model.state_dict(), "best_model.pt")
        print(f"Checkpoint saved at epoch {epoch + 1}")
        no_improvement_epochs = 0
    else:
        no_improvement_epochs += 1
        if no_improvement_epochs >= patience:
            print("Early stopping triggered!")
            break

print("Training completed!")

# =========================
# Step 7: Final Evaluation
# =========================
print("Evaluating on Test Data...")
# Score the test split with the weights stored in "best_model.pt"
model.load_state_dict(torch.load("best_model.pt"))
model.eval()

all_preds = []
all_labels = []
with torch.no_grad():
    for batch in test_loader:
        labels = batch["labels"].to(device)
        outputs = model(
            batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        )
        # Probability of class 1, thresholded into a hard 0/1 prediction
        class_probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)
        probas = class_probabilities[:, 1]
        preds = (probas > decision_threshold).long()
        all_labels.extend(labels.cpu().numpy())
        all_preds.extend(preds.cpu().numpy())

print("Test Set Classification Report:")
print(classification_report(all_labels, all_preds))


Data loaded successfully!


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Tokenizing Train Data...
Tokenizing Dev Data...
Tokenizing Test Data...
Tokenization completed!
DataLoaders created successfully!


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Model loaded on cpu
Starting training...
Epoch 1/10, Loss: 635.6202, Training Accuracy: 0.8763
Dev Precision: 0.2575, Recall: 0.4929, F1: 0.3382
Checkpoint saved at epoch 1


In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_scheduler
from torch.optim import AdamW
from sklearn.metrics import classification_report, precision_recall_fscore_support
from sklearn.utils.class_weight import compute_class_weight
from transformers import logging
import numpy as np

# Silence the warning-level chatter emitted by transformers
logging.set_verbosity_error()

# =========================
# Step 1: Data Loading
# =========================
print("Loading data...")
columns = ["question_id", "question", "document_title", "sentence", "label"]

train_file = "/content/WikiQA-train.tsv"
dev_file = "/content/WikiQA-dev.tsv"
test_file = "/content/WikiQA-test.tsv"


def _read_split(path):
    """Read one tab-separated split, replacing its header row with `columns`.

    NOTE(review): if the files hold more fields per row than `columns` has
    names, pandas uses the leading extra fields as an index and the names are
    shifted -- confirm against the actual TSV layout.
    """
    return pd.read_csv(path, sep="\t", names=columns, header=0)


train_data = _read_split(train_file)
dev_data = _read_split(dev_file)
test_data = _read_split(test_file)
print("Data loaded successfully!")

# =========================
# Step 2: Tokenization
# =========================
print("Tokenizing data...")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_data(data, max_length=64):
    """Tokenize the "question"/"sentence" columns of *data* as text pairs.

    Args:
        data: Frame exposing "question" and "sentence" columns.
        max_length: Length every pair is padded or truncated to (default 64).

    Returns:
        The module-level tokenizer's output, requested as PyTorch tensors.
    """
    first_segment, second_segment = (
        data[column].tolist() for column in ("question", "sentence")
    )
    return tokenizer(
        first_segment,
        second_segment,
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )

# Encode the three splits in train/dev/test order with the default max_length
train_encodings, dev_encodings, test_encodings = map(
    tokenize_data, (train_data, dev_data, test_data)
)
print("Tokenization completed!")

# One plain Python list of labels per split
train_labels, dev_labels, test_labels = (
    split_frame["label"].tolist()
    for split_frame in (train_data, dev_data, test_data)
)

# =========================
# Step 3: Dataset and DataLoaders
# =========================
class WikiQADataset(Dataset):
    """Dataset serving one tokenized example and its label per index.

    NOTE(review): "token_type_ids", if present in the encodings, are not
    returned by __getitem__ -- confirm that dropping them is intended.
    """

    def __init__(self, encodings, labels):
        """Store *encodings* as given and hold *labels* as a tensor."""
        self.encodings = encodings
        self.labels = torch.tensor(labels)

    def __len__(self):
        """Return how many labels (and therefore examples) are stored."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Build the example dict for position *idx*."""
        token_ids = self.encodings["input_ids"]
        mask = self.encodings["attention_mask"]
        return dict(
            input_ids=token_ids[idx],
            attention_mask=mask[idx],
            labels=self.labels[idx],
        )

# Pair each split's encodings with its labels
train_dataset = WikiQADataset(encodings=train_encodings, labels=train_labels)
dev_dataset = WikiQADataset(encodings=dev_encodings, labels=dev_labels)
test_dataset = WikiQADataset(encodings=test_encodings, labels=test_labels)

# Same batch size everywhere; only the training split is shuffled
_loader_options = {"batch_size": 16}
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_options)
dev_loader = DataLoader(dev_dataset, **_loader_options)
test_loader = DataLoader(test_dataset, **_loader_options)
print("DataLoaders created successfully!")

# =========================
# Step 4: Model Initialization
# =========================
print("Loading model...")
# Pick the GPU when one is visible, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.to(device)

# "balanced" class weights computed from the training labels for classes 0 and 1
balanced_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.array([0, 1]),
    y=np.array(train_labels),
)
class_weights = torch.tensor(balanced_weights, dtype=torch.float).to(device)

# Weighted cross-entropy, AdamW, and a linear schedule without warm-up.
# The factor 5 is the epoch count and must match `epochs` in the training loop.
loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)
optimizer = AdamW(model.parameters(), lr=3e-5)
total_training_steps = len(train_loader) * 5
scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)
print("Model loaded on", device)

# =========================
# Step 5: Training with Early Stopping
# =========================
# Fine-tunes `model` for up to `epochs` passes over `train_loader`, scores the
# dev split after every pass, writes the best dev-F1 weights to
# "best_model.pt", and stops once `patience` consecutive epochs fail to improve.
#
# best_f1 starts below every reachable F1 (F1 is never negative) so the first
# epoch always writes a checkpoint. Starting at 0.0 with the strict ">" test
# below meant a run whose dev F1 stayed at 0 never saved anything, and the
# later torch.load("best_model.pt") either failed or picked up a stale file
# left by an earlier run.
best_f1 = -1.0
patience = 3
no_improve_counter = 0
epochs = 5
print("Starting training...")

for epoch in range(epochs):
    model.train()
    total_loss = 0  # sum of the per-batch losses of this epoch (not an average)
    correct_preds, total_preds = 0, 0

    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Labels are not passed to the model: the class-weighted loss_fn is
        # applied to the logits instead of the model's built-in loss.
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()  # the schedule advances once per batch

        total_loss += loss.item()
        preds = torch.argmax(outputs.logits, dim=1)
        correct_preds += (preds == labels).sum().item()
        total_preds += labels.size(0)

    train_acc = correct_preds / total_preds
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss:.4f}, Training Accuracy: {train_acc:.4f}")

    # Validation: hard predictions are the arg-max over the two logits
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in dev_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average="binary")
    print(f"Dev Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")

    # Early Stopping: checkpoint on improvement, otherwise count the stall
    if f1 > best_f1:
        best_f1 = f1
        no_improve_counter = 0
        torch.save(model.state_dict(), "best_model.pt")
        print(f"Checkpoint saved at epoch {epoch + 1}")
    else:
        no_improve_counter += 1
        if no_improve_counter >= patience:
            print("Early stopping triggered!")
            break

print("Training completed!")

# =========================
# Step 6: Evaluation on Test Data
# =========================
print("Evaluating on Test Data...")
# Score the test split with the weights stored in "best_model.pt"
model.load_state_dict(torch.load("best_model.pt"))
model.eval()

all_preds = []
all_labels = []
with torch.no_grad():
    for batch in test_loader:
        labels = batch["labels"].to(device)
        outputs = model(
            batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        )
        # Hard prediction: index of the larger of the two logits
        preds = torch.argmax(outputs.logits, dim=1)
        all_labels.extend(labels.cpu().numpy())
        all_preds.extend(preds.cpu().numpy())

print("Test Set Classification Report:")
print(classification_report(all_labels, all_preds))


Loading data...
Data loaded successfully!
Tokenizing data...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Tokenization completed!
DataLoaders created successfully!
Loading model...


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Model loaded on cpu
Starting training...
