In [None]:
# - transformers, datasets: for BERT fine-tuning
# - scikit-learn, pandas, matplotlib, seaborn for EDA and baseline models
# - accelerate optionally for faster Trainer
!pip install -q transformers datasets accelerate sentencepiece
!pip install -q scikit-learn pandas matplotlib seaborn
# Check GPU
import torch
print("Torch CUDA available:", torch.cuda.is_available())
print("Torch version:", torch.__version__)


Torch CUDA available: True
Torch version: 2.9.0+cu126


In [None]:
# Cell 2: Mount Google Drive (the dataset directory itself is configured in the next cell)

from google.colab import drive
from pathlib import Path

# Mount Google Drive at /content/drive so files stored in Drive can be read
# through ordinary filesystem paths.
# NOTE(review): presumably this import only resolves inside a Colab runtime — confirm
# before running the notebook elsewhere.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the WikiQA corpus folder inside the mounted Google Drive.
# Edit the folder name below if your Drive layout differs.
DATA_DIR = Path("/content/drive/MyDrive/WikiQACorpus")  # change this to match your folder

# Ensure the directory exists; this is a no-op when it is already present.
DATA_DIR.mkdir(exist_ok=True, parents=True)

# Echo the configured path and list what it currently contains.
print("DATA_DIR set to:", DATA_DIR)
print("Files present in DATA_DIR:")
for entry in DATA_DIR.iterdir():
    print("-", entry.name)

DATA_DIR set to: /content/drive/MyDrive/WikiQACorpus
Files present in DATA_DIR:
- WikiQA-dev.ref
- WikiQA-dev-filtered.ref
- WikiQA-dev.tsv
- LICENSE.pdf
- WikiQA-test.tsv
- eval.py
- README.txt
- Guidelines_Phase1.pdf
- WikiQASent.pos.ans.tsv
- WikiQA-train.txt
- WikiQA-train.ref
- Guidelines_Phase2.pdf
- WikiQA-dev.txt
- WikiQA-test.txt
- WikiQA-test.ref
- WikiQA-test-filtered.ref
- WikiQA-train.tsv
- WikiQA.tsv
- emnlp-table


In [None]:
# WikiQA variants: often a .tsv file or separate train/test .tsv. We'll search for files with "wiki" or "wikiqa" in the name.
import glob, pandas as pd

def find_files(directory, keywords=("wiki", "question", "answer")):
    """Return the sorted paths of likely dataset files directly inside ``directory``.

    Only files with a ``.tsv``, ``.csv``, ``.txt`` or ``.xml`` extension are
    considered, and a file is kept when its *file name* (not its full path)
    contains one of ``keywords``, compared case-insensitively.

    Args:
        directory: Folder to scan (``str`` or ``Path``); sub-folders are not searched.
        keywords: Substrings that mark a file name as relevant.

    Returns:
        Sorted list of matching paths as strings.
    """
    exts = ["*.tsv", "*.csv", "*.txt", "*.xml"]
    # Escape glob metacharacters in the directory so that only the extension
    # pattern acts as a wildcard.
    base = Path(glob.escape(str(directory)))
    hits = []
    for pattern in exts:
        hits += glob.glob(str(base / pattern))

    wanted = [k.lower() for k in keywords]

    def looks_relevant(path):
        # Match on the base name only: a parent folder such as "WikiQACorpus"
        # would otherwise make every file in it (README included) a hit.
        name = Path(path).name.lower()
        return any(k in name for k in wanted)

    return sorted(h for h in hits if looks_relevant(h))

# Scan DATA_DIR once; the loading cell below reads `candidates`.
candidates = find_files(DATA_DIR)
print("Detected files:", candidates)


Detected files: ['/content/drive/MyDrive/WikiQACorpus/README.txt', '/content/drive/MyDrive/WikiQACorpus/WikiQA-dev.tsv', '/content/drive/MyDrive/WikiQACorpus/WikiQA-dev.txt', '/content/drive/MyDrive/WikiQACorpus/WikiQA-test.tsv', '/content/drive/MyDrive/WikiQACorpus/WikiQA-test.txt', '/content/drive/MyDrive/WikiQACorpus/WikiQA-train.tsv', '/content/drive/MyDrive/WikiQACorpus/WikiQA-train.txt', '/content/drive/MyDrive/WikiQACorpus/WikiQA.tsv', '/content/drive/MyDrive/WikiQACorpus/WikiQASent.pos.ans.tsv']


In [None]:
# We'll attempt several common formats. Adjust if your dataset is structured differently.
import pandas as pd
from pathlib import Path

def load_wikiqa(paths):
    """Load question/candidate/label rows from delimited text files into one DataFrame.

    Files with a ``.tsv``, ``.csv`` or ``.txt`` suffix are parsed with pandas
    (tab-separated for ``.tsv``, comma-separated otherwise, malformed lines
    skipped; if that parse raises, the delimiter is sniffed instead). Columns
    are located by case-insensitive header name. A file without recognisable
    headers is accepted only when it has exactly three columns, which are then
    taken positionally as question, candidate, label. Any other layout is
    skipped rather than guessed, so files that carry no label column cannot
    inject mislabelled rows.

    Args:
        paths: Iterable of file paths (``str`` or ``Path``).

    Returns:
        DataFrame with columns ``question``, ``candidate`` and integer
        ``label``. Rows of all accepted files are concatenated in the order
        given, so overlapping files yield duplicate rows.

    Raises:
        FileNotFoundError: If none of the paths produced usable rows.
    """
    # Accepted (lower-cased) header names for each output column, in priority order.
    name_options = {
        'question': ['question', 'questiontext', 'q', 'questionbody'],
        'candidate': ['sentence', 'candidate', 'answer', 'sent'],
        'label': ['label', 'is_correct', 'isanswer', 'target'],
    }
    dfs = []
    for p in paths:
        p = Path(p)
        suffix = p.suffix.lower()
        if suffix not in (".tsv", ".csv", ".txt"):
            print("Skipping non-tabular file:", p)
            continue
        try:
            sep = '\t' if suffix == '.tsv' else ','
            df = pd.read_csv(p, sep=sep, encoding='utf-8', on_bad_lines='skip')
        except Exception:
            # Best-effort retry: let the python engine sniff the delimiter.
            df = pd.read_csv(p, sep=None, engine='python', on_bad_lines='skip')
        # Map normalised header -> original header for case-insensitive lookup.
        cols = {str(c).strip().lower(): c for c in df.columns}
        picked = {
            target: next((cols[o] for o in options if o in cols), None)
            for target, options in name_options.items()
        }
        if all(src is not None for src in picked.values()):
            renames = {src: target for target, src in picked.items()}
            dfs.append(df[list(renames)].rename(columns=renames))
        elif len(df.columns) == 3:
            # Positional fallback, restricted to exactly three columns: taking
            # the first three columns of a wider file would relabel unrelated
            # fields (ids, titles, ...) as question/candidate/label.
            df.columns = ['question', 'candidate', 'label']
            dfs.append(df)
        else:
            print("Skipping file (unknown format):", p)
    if not dfs:
        raise FileNotFoundError("No suitable data files found. Put WikiQA tsv/csv into /content/data or change DATA_DIR.")
    full = pd.concat(dfs, ignore_index=True)
    # Normalize label to binary 0/1 if needed
    labels = (
        full['label'].astype(str).str.strip().str.lower()
        .replace({'true': '1', 'false': '0', 'yes': '1', 'no': '0'})
    )
    try:
        full['label'] = labels.astype(int)
    except (ValueError, TypeError):
        # Some values are not integer literals: map known positive spellings to 1
        # and everything else to 0. '1.0' covers label columns that pandas read
        # as float (e.g. because of missing values).
        positives = ('1', 'true', 'yes', 'correct', '1.0')
        full['label'] = labels.map(lambda x: 1 if x in positives else 0)
    return full

# Load the auto-detected files, or stop with guidance when detection found nothing.
if not candidates:
    # Provide instruction to user if nothing auto-detected
    raise FileNotFoundError("No dataset files found in DATA_DIR. Upload WikiQA files into /content/data or update DATA_DIR variable.")
df = load_wikiqa(candidates)

# Report the row count and preview the first rows.
print("Loaded rows:", len(df))
df.head()


Skipping file (unknown format): /content/drive/MyDrive/WikiQACorpus/README.txt
Skipping file (unknown format): /content/drive/MyDrive/WikiQACorpus/WikiQA-dev.txt
Skipping file (unknown format): /content/drive/MyDrive/WikiQACorpus/WikiQA-test.txt
Skipping file (unknown format): /content/drive/MyDrive/WikiQACorpus/WikiQA-train.txt
Loaded rows: 59877


Unnamed: 0,question,candidate,label
0,How are epithelial tissues joined together?,Cross section of sclerenchyma fibers in plant ...,0
1,How are epithelial tissues joined together?,Microscopic view of a histologic specimen of h...,0
2,How are epithelial tissues joined together?,"In Biology , Tissue is a cellular organization...",0
3,How are epithelial tissues joined together?,A tissue is an ensemble of similar cells from ...,0
4,How are epithelial tissues joined together?,Organs are then formed by the functional group...,0


In [None]:
# Cell 5: Basic EDA: class balance, lengths, duplicates, missing
import numpy as np

# Class balance.
print("Label distribution:")
print(df['label'].value_counts())

# Whitespace-token counts per question and per candidate, stored as new columns on df.
for text_col, length_col in (('question', 'q_len'), ('candidate', 'c_len')):
    df[length_col] = df[text_col].astype(str).str.split().apply(len)

print("Question length (words):", df['q_len'].describe().to_dict())
print("Candidate length (words):", df['c_len'].describe().to_dict())

# Per-column null counts.
print("Missing values:")
print(df.isnull().sum())

# Rows that repeat an earlier (question, candidate) pair.
print("Duplicates (question+candidate):", df.duplicated(subset=['question','candidate']).sum())

# show some positive and negative examples (at most three of each, fixed seed)
for heading, label_value in (("\nPositive examples:", 1), ("\nNegative examples:", 0)):
    subset = df[df['label'] == label_value]
    print(heading)
    display(subset.sample(n=min(3, len(subset)), random_state=42)[['question','candidate','label']])


Label distribution:
label
0    56938
1     2939
Name: count, dtype: int64
Question length (words): {'count': 59877.0, 'mean': 6.791839938540675, 'std': 2.594578452674489, 'min': 1.0, '25%': 5.0, '50%': 6.0, '75%': 8.0, 'max': 21.0}
Candidate length (words): {'count': 59877.0, 'mean': 22.2720911201296, 'std': 14.02243502885227, 'min': 1.0, '25%': 14.0, '50%': 21.0, '75%': 28.0, 'max': 1577.0}
Missing values:
question     0
candidate    0
label        0
q_len        0
c_len        0
dtype: int64
Duplicates (question+candidate): 29412

Positive examples:


Unnamed: 0,question,candidate,label
46325,what radio station are the boston bruins on?,The network's flagship station is WBZ-FM /98.5...,1
13775,what freezes faster? hot or cold water?,"The Mpemba effect, named after Tanzanian stude...",1
51015,who are the characters in 90210 in season 3,Regular cast members for the season included S...,1



Negative examples:


Unnamed: 0,question,candidate,label
27263,when was the first automobile,The term motorcar has also been used in the co...,0
11107,How did Edgar Allan Poe die?,He is further credited with contributing to th...,0
57548,what is hijackthis 1.99.1,HijackThis does not remove or detect spyware; ...,0


In [None]:
import re
def clean_text(s):
    """Return ``s`` as a trimmed string with every whitespace run collapsed to one space.

    Values that ``pd.isna`` reports as missing become the empty string; any
    other value is converted with ``str`` first.
    """
    if pd.isna(s):
        return ""
    trimmed = str(s).strip()
    # basic normalization: collapse whitespace
    return re.sub(r'\s+', ' ', trimmed)

# Normalise both text columns.
for text_col in ('question', 'candidate'):
    df[text_col] = df[text_col].apply(clean_text)

# drop rows where either is empty, then optionally drop exact duplicates
has_text = (df['question'] != '') & (df['candidate'] != '')
df = (
    df[has_text]
    .reset_index(drop=True)
    .drop_duplicates(subset=['question','candidate'])
    .reset_index(drop=True)
)
print("After cleaning rows:", len(df))


After cleaning rows: 30456


In [None]:
#Create stratified train/validation/test splits
from sklearn.model_selection import train_test_split
RANDOM_STATE = 42

# Carve off the held-out test set first, then divide the remainder into
# train and validation; both steps stratify on the label.
train_val, test = train_test_split(
    df,
    test_size=0.1,
    random_state=RANDOM_STATE,
    stratify=df['label'],
)
train, val = train_test_split(
    train_val,
    test_size=0.1111,  # fraction of the remaining ~90%
    random_state=RANDOM_STATE,
    stratify=train_val['label'],
)
# (this yields ~80/10/10 split)
# NOTE(review): rows are split individually, so candidates belonging to the same
# question can land in different splits — confirm this is acceptable for evaluation.

print("Train/Val/Test sizes:", len(train), len(val), len(test))
print("Train label dist:\n", train['label'].value_counts(normalize=True))


Train/Val/Test sizes: 24364 3046 3046
Train label dist:
 label
0    0.951691
1    0.048309
Name: proportion, dtype: float64


In [None]:
# Baseline classical model: feature = TF-IDF of question + candidate combined
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, roc_auc_score

# Combine question and candidate as a single input (simple approach)
def join_q_c(row):
    """Concatenate a row's ``question`` and ``candidate`` values around a literal " [SEP] " marker."""
    question, candidate = row['question'], row['candidate']
    return question + " [SEP] " + candidate

# Joined "question [SEP] candidate" strings and label arrays for each split.
train_texts = train.apply(join_q_c, axis=1).tolist()
val_texts = val.apply(join_q_c, axis=1).tolist()
test_texts = test.apply(join_q_c, axis=1).tolist()
y_train = train['label'].values
y_val = val['label'].values
y_test = test['label'].values

# Uni+bigram TF-IDF features feeding a class-weighted logistic regression.
tfidf = TfidfVectorizer(max_features=30000, ngram_range=(1,2))
clf = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=RANDOM_STATE)

pipe = make_pipeline(tfidf, clf)
pipe.fit(train_texts, y_train)

# Evaluate on the validation split: hard predictions plus positive-class probabilities.
y_pred = pipe.predict(val_texts)
y_proba = pipe.predict_proba(val_texts)[:,1]
print("Validation classification report (TF-IDF baseline):")
print(classification_report(y_val, y_pred, digits=4))
try:
    print("Validation ROC-AUC:", roc_auc_score(y_val, y_proba))
except ValueError as exc:
    # roc_auc_score raises ValueError when it cannot compute a score (e.g. y_val
    # holds a single class); report that instead of silently dropping the metric.
    print("Validation ROC-AUC unavailable:", exc)


Validation classification report (TF-IDF baseline):
              precision    recall  f1-score   support

           0     0.9669    0.8965    0.9304      2899
           1     0.1620    0.3946    0.2297       147

    accuracy                         0.8723      3046
   macro avg     0.5645    0.6455    0.5800      3046
weighted avg     0.9280    0.8723    0.8966      3046

Validation ROC-AUC: 0.7053100646950743


In [None]:
# Cell 9: Quick inference example with the TF-IDF baseline (nothing is written to disk here)
sample = val.sample(1, random_state=RANDOM_STATE).iloc[0]
# Build the input with the same helper used for the training texts so the
# inference format cannot drift from the training format.
text = join_q_c(sample)
print("Sample Q:", sample['question'])
print("Sample candidate:", sample['candidate'])
print("True label:", sample['label'])
# Probability assigned to the positive class for this single pair.
print("Baseline predicted prob:", pipe.predict_proba([text])[0,1])


Sample Q: how big is the purdue greek system
Sample candidate: The university has also been highly influential in America's history of aviation , having established the first college credit offered in flight training , the first four-year bachelor's degree in aviation, and the first university airport ( Purdue University Airport ).
True label: 0
Baseline predicted prob: 0.23932726387200276


In [None]:
# Cell 10: Prepare datasets for transformers (use Hugging Face Dataset or simple tokenization)
from datasets import Dataset, DatasetDict

# preserve_index=False: the split frames keep their shuffled pandas index, which
# from_pandas would otherwise store as an extra `__index_level_0__` column.
train_ds = Dataset.from_pandas(train[['question','candidate','label']], preserve_index=False)
val_ds = Dataset.from_pandas(val[['question','candidate','label']], preserve_index=False)
test_ds = Dataset.from_pandas(test[['question','candidate','label']], preserve_index=False)
dataset_dict = DatasetDict({'train': train_ds, 'validation': val_ds, 'test': test_ds})
dataset_dict


DatasetDict({
    train: Dataset({
        features: ['question', 'candidate', 'label', '__index_level_0__'],
        num_rows: 24364
    })
    validation: Dataset({
        features: ['question', 'candidate', 'label', '__index_level_0__'],
        num_rows: 3046
    })
    test: Dataset({
        features: ['question', 'candidate', 'label', '__index_level_0__'],
        num_rows: 3046
    })
})

In [None]:
#Tokenize using a BERT model tokenizer. We'll use 'bert-base-uncased' as a strong baseline.
from transformers import AutoTokenizer
MODEL_NAME = "bert-base-uncased"  # change to roberta-base or distilbert if you'd like
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# We'll pair question and candidate as sentence pair inputs [question] [SEP] [candidate]
def preprocess(examples):
    """Tokenize a batch as (question, candidate) pairs, truncated and padded to 128 tokens."""
    questions = examples['question']
    answers = examples['candidate']
    return tokenizer(
        questions,
        answers,
        max_length=128,
        padding='max_length',
        truncation=True,
    )

# Tokenize every split, drop the raw text columns and expose the target as `labels`.
tokenized = (
    dataset_dict.map(preprocess, batched=True)
    .remove_columns(['question','candidate'])
    .rename_column('label','labels')
)
tokenized.set_format('torch')
tokenized


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/24364 [00:00<?, ? examples/s]

Map:   0%|          | 0/3046 [00:00<?, ? examples/s]

Map:   0%|          | 0/3046 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 24364
    })
    validation: Dataset({
        features: ['labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3046
    })
    test: Dataset({
        features: ['labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3046
    })
})

In [None]:
# Tokenize using a BERT model tokenizer. We'll use 'bert-base-uncased' as a strong baseline.
# NOTE(review): this cell repeats the preceding tokenization cell line for line
# (it reloads the tokenizer, redefines `preprocess` and rebuilds `tokenized` from
# `dataset_dict`); it looks like a leftover copy — confirm and remove one of the two.
from transformers import AutoTokenizer
MODEL_NAME = "bert-base-uncased"  # change to roberta-base or distilbert if you'd like
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# We'll pair question and candidate as sentence pair inputs [question] [SEP] [candidate]
def preprocess(examples):
    """Tokenize a batch of question/candidate pairs, truncating and padding to 128 tokens."""
    return tokenizer(examples['question'], examples['candidate'], truncation=True, padding='max_length', max_length=128)

# Tokenize every split, then drop the raw text columns, rename the target column
# to `labels`, and switch the output format to torch.
tokenized = dataset_dict.map(preprocess, batched=True)
tokenized = tokenized.remove_columns(['question','candidate'])
tokenized = tokenized.rename_column('label','labels')
tokenized.set_format('torch')
tokenized


Map:   0%|          | 0/24364 [00:00<?, ? examples/s]

Map:   0%|          | 0/3046 [00:00<?, ? examples/s]

Map:   0%|          | 0/3046 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 24364
    })
    validation: Dataset({
        features: ['labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3046
    })
    test: Dataset({
        features: ['labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3046
    })
})

In [None]:
# Cell 16: Notes and improvement ideas (human-readable comments)
# The triple-quoted string below is a bare expression used as a block comment; it is
# not the cell's last expression, so only the print() at the end produces output.
# NOTE(review): the notes mention a Trainer-based fine-tuning run, a model saved to
# ./bert-wikiqa-best and a pipeline() inference helper; none of these appear in the
# cells visible in this review — confirm those cells exist or update the notes.
"""
What we assumed and what might be missing from raw WikiQA:
- Some distributed releases of WikiQA may contain XML or special columns; loader tries to auto-detect common layouts.
- The dataset sometimes lacks a dedicated validation split. We created a stratified val/test split.
- Class imbalance: WikiQA tends to have far fewer positive examples (correct answers) than negatives.
  * We used class_weight in LR baseline and let the trainer load default loss. If imbalance is severe, consider:
    - oversampling positives in train set
    - using focal loss or class weights in training
    - using more negatives per question for training (hard negative mining)
- Text normalization: we only collapsed whitespace. For classical models you may remove stopwords, punctuation, or lemmatize.
- For transformer fine-tuning we relied on tokenizer handling punctuation; keep punctuation.
- Maximum sequence length: set to 128. If candidate sentences are long, consider 256 or use hierarchical models.
- Evaluation: reporting accuracy isn't sufficient for imbalanced tasks; we computed precision, recall, F1, ROC-AUC.
- Feature engineering: also consider using explicit features:
    - lexical overlap (word overlap, exact match count)
    - BM25 score between question and candidate
    - POS-tag or named-entity overlap features
  These can be concatenated to the model or used in ensemble.
- Cross-validation: for robust metrics, run k-fold (grouped by question id if available) to avoid leakage.
- Deployment: we saved the fine-tuned model to ./bert-wikiqa-best for inference.
- Reproducibility: set seeds for torch/numpy for exact reproducibility (not fully deterministic across hardware).
- Speed & cost: BERT fine-tuning requires GPU — Colab's free GPU is fine for a small dataset. For larger runs use a TPU or better GPU.

What we added:
- Auto-detection/loading for multiple file formats
- Stratified splits and simple cleaning
- Baseline TF-IDF+LogReg for quick sanity check
- BERT fine-tuning pipeline with evaluation and saving
- Inference helper using pipeline()

Possible next steps (research/production):
- Hard negative mining: sample negatives that are semantically close to the question
- Pairwise ranking loss (e.g., margin ranking) instead of classification for ranking use-cases
- Fine-tune a cross-encoder then distill to a bi-encoder for fast retrieval systems
"""
print("See comments above for improvements & dataset caveats.")


See comments above for improvements & dataset caveats.
