<a href="https://colab.research.google.com/github/Shaanu-cheeku/question-generation-project/blob/main/Code-Part2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import files
# Open Colab's file-upload widget. The "Saving ..." lines in the recorded cell
# output show the three WikiQA TSV splits being stored. The return value is
# kept in `uploaded`, but no cell visible in this notebook reads it; later
# cells load the files by path from /content/.
uploaded = files.upload()


Saving WikiQA-dev.tsv to WikiQA-dev.tsv
Saving WikiQA-test.tsv to WikiQA-test.tsv
Saving WikiQA-train.tsv to WikiQA-train.tsv


In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_scheduler
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from imblearn.over_sampling import SMOTE
from transformers import logging

# Suppress warnings from transformers
logging.set_verbosity_error()

# =========================
# Step 1: Data Loading
# =========================
# File paths
train_file = "/content/WikiQA-train.tsv"
dev_file = "/content/WikiQA-dev.tsv"
test_file = "/content/WikiQA-test.tsv"

# Define column names.
# NOTE(review): assumes the standard WikiQA release layout of seven
# tab-separated fields (QuestionID, Question, DocumentID, DocumentTitle,
# SentenceID, Sentence, Label) -- confirm against the uploaded files.
# Supplying fewer names than the file has fields makes pandas consume the
# leading fields as an index, which shifts every remaining name onto the wrong
# field (e.g. "question" would receive the document title).
columns = [
    "question_id",
    "question",
    "document_id",
    "document_title",
    "sentence_id",
    "sentence",
    "label",
]


def load_wikiqa_split(path):
    """Read one WikiQA TSV split into a DataFrame named after `columns`.

    Args:
        path: Location of the tab-separated file. Its first row is treated as
            a header and replaced by the names in `columns`.

    Returns:
        A pandas DataFrame with one row per line of the file.

    Raises:
        ValueError: If any row has no label, which indicates that the file
            does not have the number of fields listed in `columns`.
    """
    import csv  # function-scope import: only the quoting constant is needed

    # QUOTE_NONE stops a stray double quote inside a sentence from being read
    # as the start of a quoted field that swallows the rows that follow it.
    frame = pd.read_csv(
        path,
        sep="\t",
        names=columns,
        header=0,
        quoting=csv.QUOTE_NONE,
    )
    if frame["label"].isna().any():
        raise ValueError(
            f"{path}: rows without a label; expected {len(columns)} tab-separated fields"
        )
    return frame


# Load data
train_data = load_wikiqa_split(train_file)
dev_data = load_wikiqa_split(dev_file)
test_data = load_wikiqa_split(test_file)

# =========================
# Step 2: Determine Dynamic max_length
# =========================
# Load the pretrained "bert-base-uncased" tokenizer. It is kept as a
# module-level global and read directly by the functions defined below.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def compute_dynamic_max_length(data, percentile=99, encoder=None):
    """Return the encoded pair length found at `percentile` of the data.

    Every question/sentence pair is encoded without truncation, the lengths
    are sorted, and the value at position ``int(n * percentile / 100)`` is
    returned. That position is clamped to the last element so that
    ``percentile=100`` yields the maximum instead of indexing past the end.

    Args:
        data: Mapping or DataFrame exposing "question" and "sentence"
            sequences of equal length.
        percentile: Number in [0, 100] selecting the cut-off.
        encoder: Object with an ``encode(text, text_pair, truncation=...)``
            method. Defaults to the module-level `tokenizer`.

    Returns:
        The selected length as an int. The value is also printed.

    Raises:
        ValueError: If `percentile` is outside [0, 100] or `data` has no rows.
    """
    if not 0 <= percentile <= 100:
        raise ValueError(f"percentile must be within [0, 100], got {percentile}")

    enc = tokenizer if encoder is None else encoder
    lengths = sorted(
        len(enc.encode(q, s, truncation=False))
        for q, s in zip(data["question"], data["sentence"])
    )
    if not lengths:
        raise ValueError("cannot derive max_length from an empty dataset")

    # Same rank formula as before, clamped so percentile=100 stays in range.
    rank = min(int(len(lengths) * (percentile / 100)), len(lengths) - 1)
    max_length = lengths[rank]
    print(f"Dynamic max_length set to: {max_length}")
    return max_length

# Derive the padding/truncation length from the training split
# (default percentile of compute_dynamic_max_length is 99).
dynamic_max_length = compute_dynamic_max_length(train_data)

# =========================
# Step 3: Tokenization
# =========================
def tokenize_data(data, max_length):
    """Encode the question/sentence pairs of `data` as padded PyTorch tensors.

    Args:
        data: DataFrame with "question" and "sentence" columns.
        max_length: Length every pair is padded or truncated to.

    Returns:
        Whatever the module-level `tokenizer` returns for the paired inputs
        with ``return_tensors="pt"``.
    """
    questions = data["question"].tolist()
    sentences = data["sentence"].tolist()
    return tokenizer(
        questions,
        sentences,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

# Encode each split with the same sequence length
print("Tokenizing Train Data...")
train_encodings = tokenize_data(train_data, dynamic_max_length)

print("Tokenizing Dev Data...")
dev_encodings = tokenize_data(dev_data, dynamic_max_length)

print("Tokenizing Test Data...")
test_encodings = tokenize_data(test_data, dynamic_max_length)

# Pull the label column of every split out as a plain Python list
train_labels, dev_labels, test_labels = (
    split["label"].tolist() for split in (train_data, dev_data, test_data)
)

# =========================
# Step 4: Minority-Class Oversampling
# =========================
# SMOTE creates new rows by interpolating between feature vectors. Token IDs
# are vocabulary indices, so interpolated rows are not real sentences, and the
# attention mask previously rebuilt as all ones made every padding position
# visible to the model. Instead, duplicate real minority-class rows (sampled
# with replacement) together with their true attention masks.
print("Applying random oversampling...")

label_tensor = torch.tensor(train_labels)
oversample_rng = torch.Generator().manual_seed(42)  # same seed as the old SMOTE call

class_counts = torch.bincount(label_tensor, minlength=2)
target_count = int(class_counts.max())

# Keep every original row first, then append duplicates for smaller classes.
index_parts = [torch.arange(len(label_tensor))]
for class_id, class_count in enumerate(class_counts.tolist()):
    deficit = target_count - class_count
    if class_count == 0 or deficit == 0:
        continue  # nothing to copy from, or class already at target size
    members = torch.nonzero(label_tensor == class_id, as_tuple=True)[0]
    picks = torch.randint(class_count, (deficit,), generator=oversample_rng)
    index_parts.append(members[picks])
resample_index = torch.cat(index_parts)

# Same output names and keys as before, now built from real rows only
train_encodings_resampled = {
    "input_ids": train_encodings["input_ids"][resample_index],
    "attention_mask": train_encodings["attention_mask"][resample_index],
}
train_labels_resampled = label_tensor[resample_index]

# =========================
# Step 5: Custom Dataset and DataLoader
# =========================
class WikiQADataset(Dataset):
    """Map-style dataset pairing tokenizer output with integer labels.

    Each item is a dict with "input_ids", "attention_mask" and "labels".
    NOTE(review): any other key present in `encodings` (for example segment
    ids, if the tokenizer produced them) is not returned -- confirm that this
    is intended.
    """

    def __init__(self, encodings, labels):
        """Store the encodings and convert `labels` to a tensor.

        Args:
            encodings: Mapping with indexable "input_ids" and "attention_mask".
            labels: Sequence of labels, or an existing tensor.
        """
        self.encodings = encodings
        if isinstance(labels, torch.Tensor):
            # torch.tensor(tensor) emits a copy-construct warning; this is the
            # equivalent copy without the warning.
            self.labels = labels.clone().detach()
        else:
            self.labels = torch.tensor(labels)

    def __len__(self):
        """Return the number of labelled examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the model inputs and label stored at position `idx`."""
        return {
            "input_ids": self.encodings["input_ids"][idx],
            "attention_mask": self.encodings["attention_mask"][idx],
            "labels": self.labels[idx]
        }

# Create PyTorch datasets (training uses the oversampled tensors from Step 4)
train_dataset = WikiQADataset(train_encodings_resampled, train_labels_resampled)
dev_dataset = WikiQADataset(dev_encodings, dev_labels)
test_dataset = WikiQADataset(test_encodings, test_labels)

# Create DataLoaders; only the training loader is shuffled
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# =========================
# Step 6: Model and Loss Initialization
# =========================
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
# Use a GPU when the runtime provides one and fall back to CPU otherwise,
# instead of always forcing CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Plain cross-entropy: no class weights are passed to the loss here.
loss_fn = torch.nn.CrossEntropyLoss()

# Optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=3e-5)
# The factor 3 is the epoch count used by the training loop in Step 8;
# keep the two in sync so the linear decay ends with the last step.
num_training_steps = len(train_loader) * 3
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# =========================
# Step 7: Evaluation and Error Analysis
# =========================
def evaluate_with_error_analysis(model, loader, threshold=0.5):
    """Score `model` on `loader` and print a classification report.

    A row is predicted as class 1 when the softmax probability of class 1 is
    strictly greater than `threshold`.

    Args:
        model: Model whose output exposes `.logits`.
        loader: Iterable of batches with "input_ids", "attention_mask" and
            "labels" tensors.
        threshold: Probability cut-off for predicting class 1.

    Returns:
        The result of `precision_recall_fscore_support` with
        ``average="binary"``.
    """
    model.eval()
    predicted = []
    expected = []
    with torch.no_grad():
        for batch in loader:
            logits = model(
                batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            ).logits
            positive_prob = torch.softmax(logits, dim=1)[:, 1]
            predicted.extend((positive_prob > threshold).long().cpu().numpy())
            expected.extend(batch["labels"].cpu().numpy())
    print(classification_report(expected, predicted))
    return precision_recall_fscore_support(expected, predicted, average="binary")

# =========================
# Step 8: Training Loop
# =========================
epochs = 3
early_stopping_patience = 2
best_f1 = 0.0
patience_counter = 0

print("Starting training...")
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        logits = model(
            batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        ).logits
        loss = loss_fn(logits, batch["labels"].to(device))
        loss.backward()
        optimizer.step()
        lr_scheduler.step()  # the schedule advances once per batch
        total_loss += loss.item()
    # The printed loss is the sum over batches, not a per-batch average
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss:.4f}")

    # Third element of the returned tuple is the positive-class F1 on dev
    dev_f1 = evaluate_with_error_analysis(model, dev_loader)[2]
    if dev_f1 > best_f1:
        best_f1 = dev_f1
        patience_counter = 0
        continue
    patience_counter += 1
    if patience_counter >= early_stopping_patience:
        print("Early stopping triggered!")
        break

print("Training completed!")
print("Testing on Test Data...")
evaluate_with_error_analysis(model, test_loader)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Dynamic max_length set to: 82
Tokenizing Train Data...
Tokenizing Dev Data...
Tokenizing Test Data...
Applying SMOTE for oversampling...


  self.labels = torch.tensor(labels)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Starting training...


KeyboardInterrupt: 

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_scheduler
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from transformers import logging

# Suppress warnings from transformers
logging.set_verbosity_error()

# =========================
# Step 1: Data Loading
# =========================
# File paths
train_file = "/content/WikiQA-train.tsv"
dev_file = "/content/WikiQA-dev.tsv"
test_file = "/content/WikiQA-test.tsv"

# Define column names.
# NOTE(review): assumes the standard WikiQA release layout of seven
# tab-separated fields (QuestionID, Question, DocumentID, DocumentTitle,
# SentenceID, Sentence, Label) -- confirm against the uploaded files.
# Supplying fewer names than the file has fields makes pandas consume the
# leading fields as an index, which shifts every remaining name onto the wrong
# field (e.g. "question" would receive the document title).
columns = [
    "question_id",
    "question",
    "document_id",
    "document_title",
    "sentence_id",
    "sentence",
    "label",
]


def load_wikiqa_split(path):
    """Read one WikiQA TSV split into a DataFrame named after `columns`.

    Args:
        path: Location of the tab-separated file. Its first row is treated as
            a header and replaced by the names in `columns`.

    Returns:
        A pandas DataFrame with one row per line of the file.

    Raises:
        ValueError: If any row has no label, which indicates that the file
            does not have the number of fields listed in `columns`.
    """
    import csv  # function-scope import: only the quoting constant is needed

    # QUOTE_NONE stops a stray double quote inside a sentence from being read
    # as the start of a quoted field that swallows the rows that follow it.
    frame = pd.read_csv(
        path,
        sep="\t",
        names=columns,
        header=0,
        quoting=csv.QUOTE_NONE,
    )
    if frame["label"].isna().any():
        raise ValueError(
            f"{path}: rows without a label; expected {len(columns)} tab-separated fields"
        )
    return frame


# Load data
train_data = load_wikiqa_split(train_file)
dev_data = load_wikiqa_split(dev_file)
test_data = load_wikiqa_split(test_file)

# =========================
# Step 2: Set Static max_length
# =========================
static_max_length = 64  # Reduced sequence length for faster processing
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# =========================
# Step 3: Tokenization
# =========================
def tokenize_data(data, max_length):
    """Encode the question/sentence pairs of `data` as padded PyTorch tensors.

    Args:
        data: DataFrame with "question" and "sentence" columns.
        max_length: Length every pair is padded or truncated to.

    Returns:
        Whatever the module-level `tokenizer` returns for the paired inputs
        with ``return_tensors="pt"``.
    """
    questions = data["question"].tolist()
    sentences = data["sentence"].tolist()
    return tokenizer(
        questions,
        sentences,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

# Encode each split with the same fixed sequence length
print("Tokenizing Train Data...")
train_encodings = tokenize_data(train_data, static_max_length)

print("Tokenizing Dev Data...")
dev_encodings = tokenize_data(dev_data, static_max_length)

print("Tokenizing Test Data...")
test_encodings = tokenize_data(test_data, static_max_length)

# Pull the label column of every split out as a plain Python list
train_labels, dev_labels, test_labels = (
    split["label"].tolist() for split in (train_data, dev_data, test_data)
)

# =========================
# Step 4: Custom Dataset and DataLoader
# =========================
class WikiQADataset(Dataset):
    """Map-style dataset pairing tokenizer output with integer labels.

    Each item is a dict with "input_ids", "attention_mask" and "labels".
    NOTE(review): any other key present in `encodings` (for example segment
    ids, if the tokenizer produced them) is not returned -- confirm that this
    is intended.
    """

    def __init__(self, encodings, labels):
        """Store the encodings and convert `labels` to a tensor.

        Args:
            encodings: Mapping with indexable "input_ids" and "attention_mask".
            labels: Sequence of labels, or an existing tensor.
        """
        self.encodings = encodings
        if isinstance(labels, torch.Tensor):
            # torch.tensor(tensor) emits a copy-construct warning; this is the
            # equivalent copy without the warning.
            self.labels = labels.clone().detach()
        else:
            self.labels = torch.tensor(labels)

    def __len__(self):
        """Return the number of labelled examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the model inputs and label stored at position `idx`."""
        return {
            "input_ids": self.encodings["input_ids"][idx],
            "attention_mask": self.encodings["attention_mask"][idx],
            "labels": self.labels[idx]
        }

# Create PyTorch datasets
train_dataset = WikiQADataset(train_encodings, train_labels)
dev_dataset = WikiQADataset(dev_encodings, dev_labels)
test_dataset = WikiQADataset(test_encodings, test_labels)

# Create DataLoaders; only the training loader is shuffled
batch_size = 8  # Reduced batch size
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# =========================
# Step 5: Model and Loss Initialization
# =========================
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Number of passes over the training data. Defined before the scheduler so
# the linear decay spans exactly the steps the loop below will run.
epochs = 3

# Define loss function and optimizer
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=3e-5)
num_training_steps = len(train_loader) * epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# =========================
# Step 6: Training Loop with Iterative Checkpoints
# =========================
best_f1 = 0.0
best_checkpoint_path = None  # file holding the weights with the best dev F1
print("Starting training...")

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(train_loader):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        lr_scheduler.step()  # the schedule advances once per batch
        total_loss += loss.item()
    # The printed loss is the sum over batches, not a per-batch average
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss:.4f}")

    # Evaluate after each epoch
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in dev_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    # zero_division=0 reports 0.0 without a warning when no positive is predicted
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average="binary", zero_division=0
    )
    print(f"Dev Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")

    # Save checkpoint if F1 improves
    if f1 > best_f1:
        best_f1 = f1
        best_checkpoint_path = f"bert_checkpoint_epoch_{epoch + 1}.pt"
        torch.save(model.state_dict(), best_checkpoint_path)
        print(f"Checkpoint saved at epoch {epoch + 1}")

# Restore the best-scoring weights so that everything run after training uses
# the checkpointed model rather than whatever the last epoch produced.
if best_checkpoint_path is not None:
    model.load_state_dict(torch.load(best_checkpoint_path, map_location=device))
model.eval()

print("Training completed!")

# =========================
# Step 7: Final Evaluation
# =========================
print("Evaluating on Test Data...")
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for batch in test_loader:
        logits = model(
            batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        ).logits
        # Predicted class is the index of the larger logit
        all_preds.extend(logits.argmax(dim=1).cpu().numpy())
        all_labels.extend(batch["labels"].cpu().numpy())
print(classification_report(all_labels, all_preds))


Tokenizing Train Data...
Tokenizing Dev Data...
Tokenizing Test Data...
Starting training...
Epoch 1/3, Loss: 475.9792


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Dev Precision: 0.0000, Recall: 0.0000, F1: 0.0000
Epoch 2/3, Loss: 385.1233
Dev Precision: 0.6667, Recall: 0.1143, F1: 0.1951
Checkpoint saved at epoch 2
Epoch 3/3, Loss: 215.5159
Dev Precision: 0.3261, Recall: 0.2143, F1: 0.2586
Checkpoint saved at epoch 3
Training completed!
Evaluating on Test Data...
              precision    recall  f1-score   support

           0       0.96      0.97      0.97      5825
           1       0.23      0.17      0.20       291

    accuracy                           0.93      6116
   macro avg       0.59      0.57      0.58      6116
weighted avg       0.92      0.93      0.93      6116



In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_scheduler
from torch.optim import AdamW
from sklearn.metrics import classification_report, precision_recall_fscore_support
from transformers import logging

# Suppress warnings from transformers
logging.set_verbosity_error()

# =========================
# Step 1: Data Loading
# =========================
# File paths
train_file = "/content/WikiQA-train.tsv"
dev_file = "/content/WikiQA-dev.tsv"
test_file = "/content/WikiQA-test.tsv"

# Define column names.
# NOTE(review): assumes the standard WikiQA release layout of seven
# tab-separated fields (QuestionID, Question, DocumentID, DocumentTitle,
# SentenceID, Sentence, Label) -- confirm against the uploaded files.
# Supplying fewer names than the file has fields makes pandas consume the
# leading fields as an index, which shifts every remaining name onto the wrong
# field (e.g. "question" would receive the document title).
columns = [
    "question_id",
    "question",
    "document_id",
    "document_title",
    "sentence_id",
    "sentence",
    "label",
]


def load_wikiqa_split(path):
    """Read one WikiQA TSV split into a DataFrame named after `columns`.

    Args:
        path: Location of the tab-separated file. Its first row is treated as
            a header and replaced by the names in `columns`.

    Returns:
        A pandas DataFrame with one row per line of the file.

    Raises:
        ValueError: If any row has no label, which indicates that the file
            does not have the number of fields listed in `columns`.
    """
    import csv  # function-scope import: only the quoting constant is needed

    # QUOTE_NONE stops a stray double quote inside a sentence from being read
    # as the start of a quoted field that swallows the rows that follow it.
    frame = pd.read_csv(
        path,
        sep="\t",
        names=columns,
        header=0,
        quoting=csv.QUOTE_NONE,
    )
    if frame["label"].isna().any():
        raise ValueError(
            f"{path}: rows without a label; expected {len(columns)} tab-separated fields"
        )
    return frame


# Load data
train_data = load_wikiqa_split(train_file)
dev_data = load_wikiqa_split(dev_file)
test_data = load_wikiqa_split(test_file)

# =========================
# Step 2: Set Static max_length
# =========================
static_max_length = 64  # Reduced sequence length for faster processing
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# =========================
# Step 3: Tokenization
# =========================
def tokenize_data(data, max_length):
    """Encode the question/sentence pairs of `data` as padded PyTorch tensors.

    Args:
        data: DataFrame with "question" and "sentence" columns.
        max_length: Length every pair is padded or truncated to.

    Returns:
        Whatever the module-level `tokenizer` returns for the paired inputs
        with ``return_tensors="pt"``.
    """
    questions = data["question"].tolist()
    sentences = data["sentence"].tolist()
    return tokenizer(
        questions,
        sentences,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

# Encode each split with the same fixed sequence length
print("Tokenizing Train Data...")
train_encodings = tokenize_data(train_data, static_max_length)

print("Tokenizing Dev Data...")
dev_encodings = tokenize_data(dev_data, static_max_length)

print("Tokenizing Test Data...")
test_encodings = tokenize_data(test_data, static_max_length)

# Pull the label column of every split out as a plain Python list
train_labels, dev_labels, test_labels = (
    split["label"].tolist() for split in (train_data, dev_data, test_data)
)

# =========================
# Step 4: Custom Dataset and DataLoader
# =========================
class WikiQADataset(Dataset):
    """Map-style dataset pairing tokenizer output with integer labels.

    Each item is a dict with "input_ids", "attention_mask" and "labels".
    NOTE(review): any other key present in `encodings` (for example segment
    ids, if the tokenizer produced them) is not returned -- confirm that this
    is intended.
    """

    def __init__(self, encodings, labels):
        """Store the encodings and convert `labels` to a tensor.

        Args:
            encodings: Mapping with indexable "input_ids" and "attention_mask".
            labels: Sequence of labels, or an existing tensor.
        """
        self.encodings = encodings
        if isinstance(labels, torch.Tensor):
            # torch.tensor(tensor) emits a copy-construct warning; this is the
            # equivalent copy without the warning.
            self.labels = labels.clone().detach()
        else:
            self.labels = torch.tensor(labels)

    def __len__(self):
        """Return the number of labelled examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the model inputs and label stored at position `idx`."""
        return {
            "input_ids": self.encodings["input_ids"][idx],
            "attention_mask": self.encodings["attention_mask"][idx],
            "labels": self.labels[idx]
        }

# Create PyTorch datasets
train_dataset = WikiQADataset(train_encodings, train_labels)
dev_dataset = WikiQADataset(dev_encodings, dev_labels)
test_dataset = WikiQADataset(test_encodings, test_labels)

# Create DataLoaders; only the training loader is shuffled
batch_size = 8  # Reduced batch size
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# =========================
# Step 5: Model and Loss Initialization
# =========================
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Calculate class weights
positive_count = sum(train_labels)
negative_count = len(train_labels) - positive_count
total_count = len(train_labels)
# Class 1 is weighted by total / (2 * positives); class 0 stays at 1.0.
# NOTE(review): `negative_count` is computed but not used in the weights --
# confirm whether class 0 was meant to get total / (2 * negatives) as well.
class_weights = torch.tensor([1.0, total_count / (2.0 * positive_count)]).to(device)

# Number of passes over the training data. Defined before the scheduler so
# the linear decay spans exactly the steps the loop below will run.
epochs = 3

# Define loss function and optimizer
loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)
optimizer = AdamW(model.parameters(), lr=3e-5)
num_training_steps = len(train_loader) * epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# =========================
# Step 6: Training Loop with Iterative Checkpoints
# =========================
best_f1 = 0.0
best_checkpoint_path = None  # file holding the weights with the best dev F1
decision_threshold = 0.5  # Can adjust to optimize precision-recall tradeoff
print("Starting training...")

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(train_loader):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        lr_scheduler.step()  # the schedule advances once per batch
        total_loss += loss.item()
    # The printed loss is the sum over batches, not a per-batch average
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss:.4f}")

    # Evaluate after each epoch
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in dev_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            # Probability of class 1, compared against the decision threshold
            probas = torch.nn.functional.softmax(logits, dim=1)[:, 1]
            preds = (probas > decision_threshold).long()
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    # zero_division=0 reports 0.0 without a warning when no positive is predicted
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average="binary", zero_division=0
    )
    print(f"Dev Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")

    # Save checkpoint if F1 improves
    if f1 > best_f1:
        best_f1 = f1
        best_checkpoint_path = f"bert_checkpoint_epoch_{epoch + 1}.pt"
        torch.save(model.state_dict(), best_checkpoint_path)
        print(f"Checkpoint saved at epoch {epoch + 1}")

# Restore the best-scoring weights so that everything run after training uses
# the checkpointed model rather than whatever the last epoch produced.
if best_checkpoint_path is not None:
    model.load_state_dict(torch.load(best_checkpoint_path, map_location=device))
model.eval()

print("Training completed!")

# =========================
# Step 7: Final Evaluation
# =========================
print("Evaluating on Test Data...")
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for batch in test_loader:
        logits = model(
            batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        ).logits
        # Class 1 is predicted when its probability exceeds the threshold
        positive_prob = torch.softmax(logits, dim=1)[:, 1]
        all_preds.extend((positive_prob > decision_threshold).long().cpu().numpy())
        all_labels.extend(batch["labels"].cpu().numpy())
print(classification_report(all_labels, all_preds))



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Tokenizing Train Data...
Tokenizing Dev Data...
Tokenizing Test Data...


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Starting training...
Epoch 1/3, Loss: 1155.7458
Dev Precision: 0.2607, Recall: 0.5214, F1: 0.3476
Checkpoint saved at epoch 1
Epoch 2/3, Loss: 857.8455
Dev Precision: 0.2921, Recall: 0.3714, F1: 0.3270
Epoch 3/3, Loss: 399.8821
Dev Precision: 0.2995, Recall: 0.4429, F1: 0.3573
Checkpoint saved at epoch 3
Training completed!
Evaluating on Test Data...
              precision    recall  f1-score   support

           0       0.97      0.93      0.95      5825
           1       0.20      0.36      0.26       291

    accuracy                           0.90      6116
   macro avg       0.58      0.64      0.60      6116
weighted avg       0.93      0.90      0.91      6116



In [None]:
# Add this to evaluate different thresholds after training
import numpy as np

def find_best_threshold(model, loader, thresholds):
    """Pick the decision threshold that gives the highest positive-class F1.

    The model is run over `loader` once and the class-1 probabilities are
    cached; each candidate threshold is then scored against that cache
    instead of repeating inference per threshold.

    Args:
        model: Model whose output exposes `.logits`.
        loader: Iterable of batches with "input_ids", "attention_mask" and
            "labels" tensors.
        thresholds: Iterable of candidate probability cut-offs.

    Returns:
        Tuple ``(best_threshold, best_f1)``. Stays at ``(0.5, 0.0)`` when
        `loader` is empty or no candidate reaches an F1 above zero.
    """
    model.eval()  # score in evaluation mode regardless of the caller's state
    prob_chunks, label_chunks = [], []
    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            logits = model(input_ids, attention_mask=attention_mask).logits
            probas = torch.nn.functional.softmax(logits, dim=1)[:, 1]
            prob_chunks.append(probas.cpu())
            label_chunks.append(batch["labels"].cpu())

    best_threshold = 0.5
    best_f1 = 0.0
    if not prob_chunks:
        return best_threshold, best_f1

    all_probas = torch.cat(prob_chunks)
    all_labels = torch.cat(label_chunks).numpy()
    for threshold in thresholds:
        all_preds = (all_probas > threshold).long().numpy()
        _, _, f1, _ = precision_recall_fscore_support(
            all_labels, all_preds, average="binary", zero_division=0
        )
        # Strict comparison: on ties the earliest candidate is kept
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
    return best_threshold, best_f1

# Sweep candidate cut-offs on the dev split (0.9 is excluded by np.arange)
print("Finding best threshold...")
thresholds = np.arange(0.1, 0.9, 0.1)
best_threshold, best_f1 = find_best_threshold(model, dev_loader, thresholds)
print(f"Best Threshold: {best_threshold}, Best F1: {best_f1:.4f}")

# Score the test split with the selected cut-off
print("Evaluating on Test Data with Optimized Threshold...")
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for batch in test_loader:
        logits = model(
            batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        ).logits
        positive_prob = torch.softmax(logits, dim=1)[:, 1]
        all_preds.extend((positive_prob > best_threshold).long().cpu().numpy())
        all_labels.extend(batch["labels"].cpu().numpy())
print(classification_report(all_labels, all_preds))


Finding best threshold...
Best Threshold: 0.4, Best F1: 0.3632
Evaluating on Test Data with Optimized Threshold...
              precision    recall  f1-score   support

           0       0.97      0.91      0.94      5825
           1       0.19      0.41      0.26       291

    accuracy                           0.89      6116
   macro avg       0.58      0.66      0.60      6116
weighted avg       0.93      0.89      0.91      6116



In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_scheduler
from torch.optim import AdamW
from sklearn.metrics import classification_report, precision_recall_fscore_support
from transformers import logging
import numpy as np

# Suppress warnings from transformers
logging.set_verbosity_error()

# =========================
# Step 1: Data Loading
# =========================
# File paths
train_file = "/content/WikiQA-train.tsv"
dev_file = "/content/WikiQA-dev.tsv"
test_file = "/content/WikiQA-test.tsv"

# Define column names.
# NOTE(review): assumes the standard WikiQA release layout of seven
# tab-separated fields (QuestionID, Question, DocumentID, DocumentTitle,
# SentenceID, Sentence, Label) -- confirm against the uploaded files.
# Supplying fewer names than the file has fields makes pandas consume the
# leading fields as an index, which shifts every remaining name onto the wrong
# field (e.g. "question" would receive the document title).
columns = [
    "question_id",
    "question",
    "document_id",
    "document_title",
    "sentence_id",
    "sentence",
    "label",
]


def load_wikiqa_split(path):
    """Read one WikiQA TSV split into a DataFrame named after `columns`.

    Args:
        path: Location of the tab-separated file. Its first row is treated as
            a header and replaced by the names in `columns`.

    Returns:
        A pandas DataFrame with one row per line of the file.

    Raises:
        ValueError: If any row has no label, which indicates that the file
            does not have the number of fields listed in `columns`.
    """
    import csv  # function-scope import: only the quoting constant is needed

    # QUOTE_NONE stops a stray double quote inside a sentence from being read
    # as the start of a quoted field that swallows the rows that follow it.
    frame = pd.read_csv(
        path,
        sep="\t",
        names=columns,
        header=0,
        quoting=csv.QUOTE_NONE,
    )
    if frame["label"].isna().any():
        raise ValueError(
            f"{path}: rows without a label; expected {len(columns)} tab-separated fields"
        )
    return frame


# Load data
train_data = load_wikiqa_split(train_file)
dev_data = load_wikiqa_split(dev_file)
test_data = load_wikiqa_split(test_file)

# =========================
# Step 2: Set Static max_length
# =========================
static_max_length = 64  # Reduced sequence length for faster processing
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# =========================
# Step 3: Tokenization
# =========================
def tokenize_data(data, max_length):
    """Encode the question/sentence pairs of `data` as padded PyTorch tensors.

    Args:
        data: DataFrame with "question" and "sentence" columns.
        max_length: Length every pair is padded or truncated to.

    Returns:
        Whatever the module-level `tokenizer` returns for the paired inputs
        with ``return_tensors="pt"``.
    """
    questions = data["question"].tolist()
    sentences = data["sentence"].tolist()
    return tokenizer(
        questions,
        sentences,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

# Encode each split with the same fixed sequence length
print("Tokenizing Train Data...")
train_encodings = tokenize_data(train_data, static_max_length)

print("Tokenizing Dev Data...")
dev_encodings = tokenize_data(dev_data, static_max_length)

print("Tokenizing Test Data...")
test_encodings = tokenize_data(test_data, static_max_length)

# Pull the label column of every split out as a plain Python list
train_labels, dev_labels, test_labels = (
    split["label"].tolist() for split in (train_data, dev_data, test_data)
)

# =========================
# Step 4: Custom Dataset and DataLoader
# =========================
class WikiQADataset(Dataset):
    """Map-style dataset pairing tokenizer output with integer labels.

    Each item is a dict with "input_ids", "attention_mask" and "labels".
    NOTE(review): any other key present in `encodings` (for example segment
    ids, if the tokenizer produced them) is not returned -- confirm that this
    is intended.
    """

    def __init__(self, encodings, labels):
        """Store the encodings and convert `labels` to a tensor.

        Args:
            encodings: Mapping with indexable "input_ids" and "attention_mask".
            labels: Sequence of labels, or an existing tensor.
        """
        self.encodings = encodings
        if isinstance(labels, torch.Tensor):
            # torch.tensor(tensor) emits a copy-construct warning; this is the
            # equivalent copy without the warning.
            self.labels = labels.clone().detach()
        else:
            self.labels = torch.tensor(labels)

    def __len__(self):
        """Return the number of labelled examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the model inputs and label stored at position `idx`."""
        return {
            "input_ids": self.encodings["input_ids"][idx],
            "attention_mask": self.encodings["attention_mask"][idx],
            "labels": self.labels[idx]
        }

# Create PyTorch datasets
train_dataset = WikiQADataset(train_encodings, train_labels)
dev_dataset = WikiQADataset(dev_encodings, dev_labels)
test_dataset = WikiQADataset(test_encodings, test_labels)

# Create DataLoaders; only the training loader is shuffled
batch_size = 8  # Reduced batch size
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# =========================
# Step 5: Model and Loss Initialization
# =========================
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Calculate class weights
positive_count = sum(train_labels)
negative_count = len(train_labels) - positive_count
total_count = len(train_labels)
# Class 1 is weighted by total / (2 * positives); class 0 stays at 1.0.
# NOTE(review): `negative_count` is computed but not used in the weights --
# confirm whether class 0 was meant to get total / (2 * negatives) as well.
class_weights = torch.tensor([1.0, total_count / (2.0 * positive_count)]).to(device)

# Number of passes over the training data. Defined before the scheduler so
# the linear decay spans exactly the steps the loop below will run.
epochs = 3

# Define loss function and optimizer
loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)
optimizer = AdamW(model.parameters(), lr=3e-5)
num_training_steps = len(train_loader) * epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# =========================
# Step 6: Training Loop with Iterative Checkpoints
# =========================
best_f1 = 0.0
best_checkpoint_path = None  # file holding the weights with the best dev F1
decision_threshold = 0.5  # Can adjust to optimize precision-recall tradeoff
print("Starting training...")

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(train_loader):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        lr_scheduler.step()  # the schedule advances once per batch
        total_loss += loss.item()
    # The printed loss is the sum over batches, not a per-batch average
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss:.4f}")

    # Evaluate after each epoch
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in dev_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            # Probability of class 1, compared against the decision threshold
            probas = torch.nn.functional.softmax(logits, dim=1)[:, 1]
            preds = (probas > decision_threshold).long()
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    # zero_division=0 reports 0.0 without a warning when no positive is predicted
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average="binary", zero_division=0
    )
    print(f"Dev Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")

    # Save checkpoint if F1 improves
    if f1 > best_f1:
        best_f1 = f1
        best_checkpoint_path = f"bert_checkpoint_epoch_{epoch + 1}.pt"
        torch.save(model.state_dict(), best_checkpoint_path)
        print(f"Checkpoint saved at epoch {epoch + 1}")

# Restore the best-scoring weights so that everything run after training uses
# the checkpointed model rather than whatever the last epoch produced.
if best_checkpoint_path is not None:
    model.load_state_dict(torch.load(best_checkpoint_path, map_location=device))
model.eval()

print("Training completed!")

# =========================
# Step 7: Threshold Optimization
# =========================
def find_best_threshold(model, loader, thresholds):
    """Return the decision threshold that maximizes binary F1 on ``loader``.

    The model is run over ``loader`` exactly once. The cached class-1
    probabilities are then re-thresholded for every candidate, instead of
    repeating the whole forward pass per threshold.

    Args:
        model: Callable returning an object with a ``.logits`` attribute when
            called as ``model(input_ids, attention_mask=...)``. Column 1 of
            the logits is treated as the positive class.
        loader: Iterable of batches, each a mapping with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors.
        thresholds: Iterable of candidate probability cut-offs.

    Returns:
        ``(best_threshold, best_f1)``. Falls back to ``(0.5, 0.0)`` when the
        loader yields no batches or no candidate reaches an F1 above zero.
    """
    best_threshold = 0.5
    best_f1 = 0.0

    # Make sure dropout is disabled while scoring, then put the model back
    # into whatever mode the caller had it in.
    was_training = getattr(model, "training", False)
    if was_training:
        model.eval()

    prob_chunks, label_chunks = [], []
    try:
        with torch.no_grad():
            for batch in loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                logits = model(input_ids, attention_mask=attention_mask).logits
                probas = torch.nn.functional.softmax(logits, dim=1)[:, 1]
                prob_chunks.append(probas.cpu())
                # Labels are only needed for the metric, so they stay off the device.
                label_chunks.append(batch["labels"].cpu())
    finally:
        if was_training:
            model.train()

    if not prob_chunks:
        # Nothing to score: keep the defaults rather than failing on an empty concat.
        return best_threshold, best_f1

    all_probas = torch.cat(prob_chunks)
    all_labels = torch.cat(label_chunks).numpy()

    for threshold in thresholds:
        all_preds = (all_probas > threshold).long().numpy()
        # zero_division=0 yields the same 0.0 score as the default but without
        # a warning when a threshold produces no positive predictions.
        _, _, f1, _ = precision_recall_fscore_support(
            all_labels, all_preds, average="binary", zero_division=0
        )
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
    return best_threshold, best_f1

# Sweep candidate thresholds on the dev split and keep the one with the best F1.
print("Finding best threshold...")
thresholds = np.arange(0.1, 0.9, 0.1)
best_threshold, best_f1 = find_best_threshold(
    model,
    dev_loader,
    thresholds,
)
print(f"Best Threshold: {best_threshold:.2f}, Best F1: {best_f1:.4f}")

# =========================
# Step 8: Final Evaluation
# =========================
# Score the test split with the threshold selected above and print a
# per-class report.
print("Evaluating on Test Data with Optimized Threshold...")
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for batch in test_loader:
        # Move the three tensors of this batch onto the model's device.
        input_ids, attention_mask, labels = (
            batch[key].to(device)
            for key in ("input_ids", "attention_mask", "labels")
        )
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Probability of class 1 for every example in the batch
        probas = logits.softmax(dim=1)[:, 1]
        preds = (probas > best_threshold).long()
        all_preds += list(preds.cpu().numpy())
        all_labels += list(labels.cpu().numpy())
print(classification_report(all_labels, all_preds))


Tokenizing Train Data...
Tokenizing Dev Data...
Tokenizing Test Data...
Starting training...
Epoch 1/3, Loss: 1134.5568
Dev Precision: 0.2206, Recall: 0.5500, F1: 0.3149
Checkpoint saved at epoch 1
Epoch 2/3, Loss: 876.1973
Dev Precision: 0.2596, Recall: 0.4357, F1: 0.3253
Checkpoint saved at epoch 2
Epoch 3/3, Loss: 502.3713
Dev Precision: 0.2567, Recall: 0.4786, F1: 0.3342
Checkpoint saved at epoch 3
Training completed!
Finding best threshold...
Best Threshold: 0.40, Best F1: 0.3356
Evaluating on Test Data with Optimized Threshold...
              precision    recall  f1-score   support

           0       0.97      0.90      0.93      5825
           1       0.18      0.45      0.26       291

    accuracy                           0.87      6116
   macro avg       0.57      0.67      0.59      6116
weighted avg       0.93      0.87      0.90      6116



In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_scheduler
from torch.optim import AdamW
from sklearn.metrics import classification_report, precision_recall_fscore_support
from transformers import logging
import numpy as np

# Only show error-level messages from the transformers library
logging.set_verbosity_error()

# =========================
# Step 1: Data Loading
# =========================
# Locations of the three WikiQA splits
train_file = "/content/WikiQA-train.tsv"
dev_file = "/content/WikiQA-dev.tsv"
test_file = "/content/WikiQA-test.tsv"

# Column names applied to every split
columns = ["question_id", "question", "document_title", "sentence", "label"]

# Read each tab-separated split. header=0 together with names=columns
# replaces the header row found in the file with the names above.
train_data, dev_data, test_data = (
    pd.read_csv(path, sep="\t", names=columns, header=0)
    for path in (train_file, dev_file, test_file)
)

# =========================
# Step 2: Set Static max_length
# =========================
static_max_length = 64  # Reduced sequence length for faster processing
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# =========================
# Step 3: Tokenization
# =========================
def tokenize_data(data, max_length):
    """Tokenize the question/sentence pairs of ``data`` into fixed-length tensors.

    Args:
        data: Table with ``"question"`` and ``"sentence"`` columns.
        max_length: Length every encoded pair is padded or truncated to.

    Returns:
        The output of the module-level ``tokenizer`` called on the two
        columns, with PyTorch tensors requested.
    """
    questions = data["question"].tolist()
    sentences = data["sentence"].tolist()
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_length,
        "return_tensors": "pt",
    }
    return tokenizer(questions, sentences, **encode_options)

# Encode every split with the same static sequence length
print("Tokenizing Train Data...")
train_encodings = tokenize_data(train_data, static_max_length)

print("Tokenizing Dev Data...")
dev_encodings = tokenize_data(dev_data, static_max_length)

print("Tokenizing Test Data...")
test_encodings = tokenize_data(test_data, static_max_length)

# Pull the "label" column of each split out as a plain Python list
train_labels, dev_labels, test_labels = (
    frame["label"].tolist() for frame in (train_data, dev_data, test_data)
)

# =========================
# Step 4: Custom Dataset and DataLoader
# =========================
class WikiQADataset(Dataset):
    """Dataset pairing pre-computed encodings with their labels.

    Each item is a dict with the ``"input_ids"`` and ``"attention_mask"``
    entries of ``encodings`` at the requested index, plus the matching
    label under ``"labels"``.
    """

    # Keys copied from the encodings into every sample, in this order
    _FEATURE_KEYS = ("input_ids", "attention_mask")

    def __init__(self, encodings, labels):
        # Labels are converted to a tensor once, up front.
        self.labels = torch.tensor(labels)
        self.encodings = encodings

    def __len__(self):
        # The dataset size is the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {key: self.encodings[key][idx] for key in self._FEATURE_KEYS}
        sample["labels"] = self.labels[idx]
        return sample

# Wrap each split's encodings and labels in a WikiQADataset
train_dataset, dev_dataset, test_dataset = (
    WikiQADataset(encodings, labels)
    for encodings, labels in (
        (train_encodings, train_labels),
        (dev_encodings, dev_labels),
        (test_encodings, test_labels),
    )
)

# Batch the datasets; only the training loader shuffles
batch_size = 8  # Reduced batch size
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_loader, test_loader = (
    DataLoader(split, batch_size=batch_size)
    for split in (dev_dataset, test_dataset)
)

# =========================
# Step 5: Model and Loss Initialization
# =========================
# Pick the GPU when one is available, then load a two-label BERT classifier onto it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.to(device)

# Class weights: class 0 keeps weight 1.0, class 1 is weighted by
# total / (1.5 * positives), so it grows as positives become rarer.
total_count = len(train_labels)
positive_count = sum(train_labels)
negative_count = total_count - positive_count
class_weights = torch.tensor(
    [1.0, total_count / (1.5 * positive_count)],
    device=device,
)

# Weighted loss, optimizer, and a linear learning-rate decay without warm-up
loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)
optimizer = AdamW(model.parameters(), lr=3e-5)
# The factor 3 is the epoch count; it has to match `epochs` in the training loop.
num_training_steps = 3 * len(train_loader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# =========================
# Step 6: Training Loop with Iterative Checkpoints
# =========================
epochs = 3
best_f1 = 0.0
best_epoch = None  # 1-based epoch whose checkpoint holds the best dev F1 so far
best_checkpoint_path = None  # File written for that epoch
decision_threshold = 0.5  # Default threshold for initial training
print("Starting training...")

for epoch in range(epochs):
    # ---- Train for one full pass over train_loader ----
    model.train()
    total_loss = 0
    for step, batch in enumerate(train_loader):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        # Labels are not passed to the model, so the class-weighted loss_fn
        # is used instead of the model's built-in loss.
        loss = loss_fn(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        lr_scheduler.step()  # Scheduler advances once per batch, not per epoch
        total_loss += loss.item()
    # NOTE: this is the SUM of per-batch losses, so it scales with the number of batches.
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss:.4f}")

    # ---- Evaluate on the dev split after each epoch ----
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in dev_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            # Probability of class 1, compared against the decision threshold
            probas = torch.nn.functional.softmax(logits, dim=1)[:, 1]
            preds = (probas > decision_threshold).long()
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average="binary")
    print(f"Dev Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")

    # Save checkpoint if F1 improves, and remember which one is the best
    if f1 > best_f1:
        best_f1 = f1
        best_epoch = epoch + 1
        best_checkpoint_path = f"bert_checkpoint_epoch_{best_epoch}.pt"
        torch.save(model.state_dict(), best_checkpoint_path)
        print(f"Checkpoint saved at epoch {epoch + 1}")

# The threshold search and test evaluation that follow use `model` directly.
# Without this reload they would run on the last epoch's weights even when an
# earlier epoch scored a higher dev F1.
if best_epoch is not None and best_epoch != epochs:
    model.load_state_dict(torch.load(best_checkpoint_path, map_location=device))
    model.eval()
    print(f"Restored best checkpoint from epoch {best_epoch}")

print("Training completed!")

# =========================
# Step 7: Threshold Optimization
# =========================
def find_best_threshold(model, loader, thresholds):
    """Return the decision threshold that maximizes binary F1 on ``loader``.

    The model is run over ``loader`` exactly once. The cached class-1
    probabilities are then re-thresholded for every candidate, instead of
    repeating the whole forward pass per threshold.

    Args:
        model: Callable returning an object with a ``.logits`` attribute when
            called as ``model(input_ids, attention_mask=...)``. Column 1 of
            the logits is treated as the positive class.
        loader: Iterable of batches, each a mapping with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors.
        thresholds: Iterable of candidate probability cut-offs.

    Returns:
        ``(best_threshold, best_f1)``. Falls back to ``(0.5, 0.0)`` when the
        loader yields no batches or no candidate reaches an F1 above zero.
    """
    best_threshold = 0.5
    best_f1 = 0.0

    # Make sure dropout is disabled while scoring, then put the model back
    # into whatever mode the caller had it in.
    was_training = getattr(model, "training", False)
    if was_training:
        model.eval()

    prob_chunks, label_chunks = [], []
    try:
        with torch.no_grad():
            for batch in loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                logits = model(input_ids, attention_mask=attention_mask).logits
                probas = torch.nn.functional.softmax(logits, dim=1)[:, 1]
                prob_chunks.append(probas.cpu())
                # Labels are only needed for the metric, so they stay off the device.
                label_chunks.append(batch["labels"].cpu())
    finally:
        if was_training:
            model.train()

    if not prob_chunks:
        # Nothing to score: keep the defaults rather than failing on an empty concat.
        return best_threshold, best_f1

    all_probas = torch.cat(prob_chunks)
    all_labels = torch.cat(label_chunks).numpy()

    for threshold in thresholds:
        all_preds = (all_probas > threshold).long().numpy()
        # zero_division=0 yields the same 0.0 score as the default but without
        # a warning when a threshold produces no positive predictions.
        _, _, f1, _ = precision_recall_fscore_support(
            all_labels, all_preds, average="binary", zero_division=0
        )
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
    return best_threshold, best_f1

# Sweep candidate thresholds on the dev split and keep the one with the best F1.
print("Finding best threshold...")
thresholds = np.arange(0.1, 0.9, 0.05)
best_threshold, best_f1 = find_best_threshold(
    model,
    dev_loader,
    thresholds,
)
print(f"Best Threshold: {best_threshold:.2f}, Best F1: {best_f1:.4f}")

# =========================
# Step 8: Final Evaluation
# =========================
# Score the test split with the threshold selected above and print a
# per-class report.
print("Evaluating on Test Data with Optimized Threshold...")
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for batch in test_loader:
        # Move the three tensors of this batch onto the model's device.
        input_ids, attention_mask, labels = (
            batch[key].to(device)
            for key in ("input_ids", "attention_mask", "labels")
        )
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Probability of class 1 for every example in the batch
        probas = logits.softmax(dim=1)[:, 1]
        preds = (probas > best_threshold).long()
        all_preds += list(preds.cpu().numpy())
        all_labels += list(labels.cpu().numpy())
print(classification_report(all_labels, all_preds))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Tokenizing Train Data...
Tokenizing Dev Data...
Tokenizing Test Data...


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Starting training...
Epoch 1/3, Loss: 1207.5452
Dev Precision: 0.2961, Recall: 0.4929, F1: 0.3700
Checkpoint saved at epoch 1
