# Bloom's Taxonomy Classification Pipeline

This notebook consolidates the pipeline for classifying questions into Bloom's Taxonomy levels (BT1-BT6). It includes data preparation, classical models, neural networks, transformer-based models, ensemble, zero-shot, contrastive learning, multi-task learning, explainability, and evaluation.

## Requirements
Install the following dependencies:
```
torch==2.0.1
transformers==4.30.2
scikit-learn==1.3.0
xgboost==1.7.6
tensorflow==2.12.0
numpy==1.24.3
pandas==2.0.3
matplotlib==3.7.2
seaborn==0.12.2
lime==0.2.0.1
shap==0.42.0
captum==0.6.0
datasets==2.13.1
sentence-transformers==2.2.2
huggingface-hub==0.16.4
scipy==1.11.1
joblib==1.3.1
```
Save the list above as `requirements.txt`, then run `pip install -r requirements.txt` in your environment.

## 1. Data Preparation

In [None]:
import pandas as pd
from io import StringIO
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import os

# Sample CSV content (replace with full dataset)
csv_content = """Questions,Category
About what proportion of the population of the US is living on farms?,BT1
Correctly label the brain lobes indicated on the diagram below,BT1
Define compound interest.,BT1
...
"""

# Load CSV. header=0 discards the file's own header row so the columns can be
# renamed to the 'text' / 'label' names that every later cell reads.
df = pd.read_csv(StringIO(csv_content), names=['text', 'label'], header=0)

# Drop incomplete rows. A line with a single field (such as the "..." placeholder
# above) parses with a missing label, which would otherwise be encoded as an
# extra class and break the stratified split.
df = df.dropna(subset=['text', 'label']).reset_index(drop=True)

# Clean text: trim, lowercase and remove literal double quotes.
df['text'] = df['text'].str.strip().str.lower().str.replace('"', '', regex=False)

# Trim labels so that e.g. "BT1 " and "BT1" map to the same class.
df['label'] = df['label'].astype(str).str.strip()

# Encode labels as integers (LabelEncoder assigns ids in sorted label order).
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

# Split: 80/10/10 stratified. The 20% hold-out is halved into validation and test.
train, test = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)
val, test = train_test_split(test, test_size=0.5, stratify=test['label'], random_state=42)

# Save
os.makedirs('artifacts/data', exist_ok=True)
train.to_csv('artifacts/data/train.csv', index=False)
val.to_csv('artifacts/data/val.csv', index=False)
test.to_csv('artifacts/data/test.csv', index=False)

print("Data prepared and saved to artifacts/data/")

: 

## 2. Classical Models

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score
import os
import joblib
import numpy as np

# Load data
train = pd.read_csv('artifacts/data/train.csv')
val = pd.read_csv('artifacts/data/val.csv')
test = pd.read_csv('artifacts/data/test.csv')

# TFIDF: the vocabulary and IDF weights are fitted on the training split only;
# validation and test are transformed with the fitted vectorizer.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train['text'])
X_val = vectorizer.transform(val['text'])
X_test = vectorizer.transform(test['text'])

y_train = train['label']
y_val = val['label']
y_test = test['label']

# Logistic Regression
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)
lr_acc = accuracy_score(y_test, lr_pred)
lr_f1 = f1_score(y_test, lr_pred, average='weighted')

# SVM
svm = LinearSVC()
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_test)
svm_acc = accuracy_score(y_test, svm_pred)
svm_f1 = f1_score(y_test, svm_pred, average='weighted')

# XGBoost
xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)
xgb_acc = accuracy_score(y_test, xgb_pred)
xgb_f1 = f1_score(y_test, xgb_pred, average='weighted')

# Create both output directories before writing. np.save does not create
# missing parent directories, so 'artifacts/results' must exist first.
os.makedirs('artifacts/models', exist_ok=True)
os.makedirs('artifacts/results', exist_ok=True)

# Save models (the fitted vectorizer is needed to reuse any of them).
joblib.dump(lr, 'artifacts/models/lr.pkl')
joblib.dump(svm, 'artifacts/models/svm.pkl')
joblib.dump(xgb, 'artifacts/models/xgb.pkl')
joblib.dump(vectorizer, 'artifacts/models/tfidf.pkl')

# Save results. The dict is stored as a pickled 0-d object array.
results = {
    'lr': {'acc': lr_acc, 'f1': lr_f1, 'pred': lr_pred.tolist()},
    'svm': {'acc': svm_acc, 'f1': svm_f1, 'pred': svm_pred.tolist()},
    'xgb': {'acc': xgb_acc, 'f1': xgb_f1, 'pred': xgb_pred.tolist()}
}
np.save('artifacts/results/classical_results.npy', results)

print(f"Classical models trained. LR Acc: {lr_acc:.3f}, SVM Acc: {svm_acc:.3f}, XGB Acc: {xgb_acc:.3f}")

## 3. TextCNN

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
from collections import Counter
import numpy as np
import os

# Custom Dataset
class TextDataset(Dataset):
    """Map whitespace-tokenised texts to fixed-length index tensors.

    Each item is a pair ``(token_ids, label)``: words are looked up in
    ``vocab`` (falling back to the ``'<UNK>'`` entry), then the sequence is
    right-padded with the ``'<PAD>'`` entry or truncated to ``max_len``.
    """

    def __init__(self, texts, labels, vocab, max_len=100):
        self.texts, self.labels = texts, labels
        self.vocab, self.max_len = vocab, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        token_ids = [
            self.vocab.get(token, self.vocab['<UNK>'])
            for token in self.texts[idx].split()
        ]
        missing = self.max_len - len(token_ids)
        if missing > 0:
            # Too short: fill the tail with padding indices.
            token_ids.extend([self.vocab['<PAD>']] * missing)
        else:
            # Long enough: cut everything past max_len.
            del token_ids[self.max_len:]
        return torch.tensor(token_ids), torch.tensor(self.labels[idx])

# TextCNN Model
class TextCNN(nn.Module):
    """Convolutional text classifier over word embeddings.

    One convolution per kernel size slides over the embedded sequence; each
    feature map is max-pooled over time and the pooled features are
    concatenated, passed through dropout and projected to class logits.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding dimensionality.
        num_classes: Number of output logits.
        kernel_sizes: Window heights (in tokens) of the convolutions.
        num_filters: Output channels per convolution.
    """

    def __init__(self, vocab_size, embed_dim, num_classes, kernel_sizes=(3, 4, 5), num_filters=100):
        super().__init__()
        # Immutable copy: avoids a shared mutable default and accepts any iterable.
        kernel_sizes = tuple(kernel_sizes)
        # Longest window; forward() pads inputs that are shorter than this.
        self.max_kernel_size = max(kernel_sizes)
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (k, embed_dim)) for k in kernel_sizes
        ])
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(len(kernel_sizes) * num_filters, num_classes)

    def forward(self, x):
        """Return class logits of shape (batch, num_classes) for index tensor ``x`` of shape (batch, seq_len)."""
        deficit = self.max_kernel_size - x.size(1)
        if deficit > 0:
            # A convolution cannot run on a sequence shorter than its window, so
            # right-pad with the padding index (0, matching padding_idx above).
            x = nn.functional.pad(x, (0, deficit), value=0)
        x = self.embedding(x).unsqueeze(1)  # add the single input channel
        x = [torch.relu(conv(x)).squeeze(3) for conv in self.convs]
        # Max over time for every filter.
        x = [torch.max_pool1d(xi, xi.size(2)).squeeze(2) for xi in x]
        x = torch.cat(x, 1)
        x = self.dropout(x)
        return self.fc(x)

# Load data
train = pd.read_csv('artifacts/data/train.csv')
val = pd.read_csv('artifacts/data/val.csv')
test = pd.read_csv('artifacts/data/test.csv')

# Build vocab from the training split only. Sorting the unique words makes the
# word -> index mapping reproducible across runs (set iteration order is not),
# so the saved weights can be paired with a vocabulary rebuilt from train.csv.
words = ' '.join(train['text']).split()
vocab = {word: i+2 for i, word in enumerate(sorted(set(words)))}
vocab['<PAD>'] = 0
vocab['<UNK>'] = 1

# Dataset and DataLoader
train_dataset = TextDataset(train['text'].values, train['label'].values, vocab)
val_dataset = TextDataset(val['text'].values, val['label'].values, vocab)
test_dataset = TextDataset(test['text'].values, test['label'].values, vocab)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)

# Model, loss, optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = TextCNN(len(vocab), embed_dim=100, num_classes=6).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(10):
    model.train()
    for texts, labels in train_loader:
        texts, labels = texts.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    # Validation (reported only; it does not influence training)
    model.eval()
    val_preds, val_labels = [], []
    with torch.no_grad():
        for texts, labels in val_loader:
            texts, labels = texts.to(device), labels.to(device)
            outputs = model(texts)
            val_preds.extend(torch.argmax(outputs, dim=1).cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    val_acc = accuracy_score(val_labels, val_preds)
    print(f"Epoch {epoch+1}, Val Acc: {val_acc:.3f}")

# Test
model.eval()
test_preds, test_labels = [], []
with torch.no_grad():
    for texts, labels in test_loader:
        texts, labels = texts.to(device), labels.to(device)
        outputs = model(texts)
        test_preds.extend(torch.argmax(outputs, dim=1).cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

test_acc = accuracy_score(test_labels, test_preds)
test_f1 = f1_score(test_labels, test_preds, average='weighted')

# Save. np.save does not create missing parent directories, so make sure the
# results directory exists as well as the models directory.
os.makedirs('artifacts/models', exist_ok=True)
os.makedirs('artifacts/results', exist_ok=True)
torch.save(model.state_dict(), 'artifacts/models/textcnn.pt')
# Predictions are stored as plain ints, like the `.tolist()` results of other cells.
np.save('artifacts/results/textcnn_results.npy', {'acc': test_acc, 'f1': test_f1, 'pred': [int(p) for p in test_preds]})

print(f"TextCNN trained. Test Acc: {test_acc:.3f}")

## 4. BiLSTM

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import os

# Dataset
class TextDataset(Dataset):
    """Dataset yielding ``(index_tensor, label_tensor)`` pairs for raw texts.

    Texts are split on whitespace and encoded through ``vocab``; unknown words
    use the ``'<UNK>'`` index and every sequence is forced to ``max_len``
    entries by right-padding with ``'<PAD>'`` or by truncation.
    """

    def __init__(self, texts, labels, vocab, max_len=100):
        self.max_len = max_len
        self.vocab = vocab
        self.labels = labels
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def _encode(self, sentence):
        """Return the fixed-length list of vocabulary indices for one text."""
        encoded = []
        for token in sentence.split():
            encoded.append(self.vocab.get(token, self.vocab['<UNK>']))
        if len(encoded) >= self.max_len:
            return encoded[:self.max_len]
        padding = [self.vocab['<PAD>']] * (self.max_len - len(encoded))
        return encoded + padding

    def __getitem__(self, idx):
        features = torch.tensor(self._encode(self.texts[idx]))
        target = torch.tensor(self.labels[idx])
        return features, target

# BiLSTM Model
class BiLSTM(nn.Module):
    """Bidirectional LSTM classifier over right-padded index sequences.

    The sequence is packed by its true length so the LSTM never reads padding,
    and the final forward and backward hidden states are concatenated as the
    sentence representation. Reading the output at the last (padded) position
    would instead give a forward state that has consumed padding and a
    backward state that has seen a single step.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding dimensionality.
        hidden_dim: Hidden size per LSTM direction.
        num_classes: Number of output logits.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return logits of shape (batch, num_classes) for ``x`` of shape (batch, seq_len).

        Padding (index ``padding_idx``) is assumed to be trailing, so the
        number of non-padding entries is the sequence length.
        """
        # Clamp to 1 because packing rejects zero-length sequences; an all-padding
        # row is then treated as a single padding token.
        lengths = (x != self.embedding.padding_idx).sum(dim=1).clamp(min=1).cpu()
        embedded = self.embedding(x)
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.lstm(packed)
        # For the single bidirectional layer, h_n[-2] is the final forward state
        # and h_n[-1] the final backward state.
        features = torch.cat((h_n[-2], h_n[-1]), dim=1)
        return self.fc(self.dropout(features))

# Load data
train = pd.read_csv('artifacts/data/train.csv')
val = pd.read_csv('artifacts/data/val.csv')
test = pd.read_csv('artifacts/data/test.csv')

# Build vocab from the training split only. Sorting the unique words makes the
# word -> index mapping reproducible across runs (set iteration order is not),
# so the saved weights can be paired with a vocabulary rebuilt from train.csv.
words = ' '.join(train['text']).split()
vocab = {word: i+2 for i, word in enumerate(sorted(set(words)))}
vocab['<PAD>'] = 0
vocab['<UNK>'] = 1

# Dataset and DataLoader
train_dataset = TextDataset(train['text'].values, train['label'].values, vocab)
val_dataset = TextDataset(val['text'].values, val['label'].values, vocab)
test_dataset = TextDataset(test['text'].values, test['label'].values, vocab)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)

# Model, loss, optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BiLSTM(len(vocab), embed_dim=100, hidden_dim=128, num_classes=6).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(10):
    model.train()
    for texts, labels in train_loader:
        texts, labels = texts.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    # Validation (reported only; it does not influence training)
    model.eval()
    val_preds, val_labels = [], []
    with torch.no_grad():
        for texts, labels in val_loader:
            texts, labels = texts.to(device), labels.to(device)
            outputs = model(texts)
            val_preds.extend(torch.argmax(outputs, dim=1).cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    val_acc = accuracy_score(val_labels, val_preds)
    print(f"Epoch {epoch+1}, Val Acc: {val_acc:.3f}")

# Test
model.eval()
test_preds, test_labels = [], []
with torch.no_grad():
    for texts, labels in test_loader:
        texts, labels = texts.to(device), labels.to(device)
        outputs = model(texts)
        test_preds.extend(torch.argmax(outputs, dim=1).cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

test_acc = accuracy_score(test_labels, test_preds)
test_f1 = f1_score(test_labels, test_preds, average='weighted')

# Save. np.save does not create missing parent directories, so make sure the
# results directory exists as well as the models directory.
os.makedirs('artifacts/models', exist_ok=True)
os.makedirs('artifacts/results', exist_ok=True)
torch.save(model.state_dict(), 'artifacts/models/bilstm.pt')
# Predictions are stored as plain ints, like the `.tolist()` results of other cells.
np.save('artifacts/results/bilstm_results.npy', {'acc': test_acc, 'f1': test_f1, 'pred': [int(p) for p in test_preds]})

print(f"BiLSTM trained. Test Acc: {test_acc:.3f}")

## 5. HAN

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import os

# Dataset
class TextDataset(Dataset):
    """Torch dataset that turns each text into ``max_len`` vocabulary indices.

    Tokenisation is a plain whitespace split. Out-of-vocabulary words map to
    ``vocab['<UNK>']``; short sequences are filled with ``vocab['<PAD>']`` on
    the right and long ones are cut at ``max_len``.
    """

    def __init__(self, texts, labels, vocab, max_len=100):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        lookup = self.vocab
        limit = self.max_len
        ids = list(map(lambda w: lookup.get(w, lookup['<UNK>']), self.texts[idx].split()))
        shortfall = limit - len(ids)
        # Either pad up to the limit or keep only the first `limit` indices.
        ids = ids + [lookup['<PAD>']] * shortfall if shortfall > 0 else ids[:limit]
        label = self.labels[idx]
        return torch.tensor(ids), torch.tensor(label)

# HAN Model
class HAN(nn.Module):
    """Hierarchical-attention-style classifier over right-padded index sequences.

    A bidirectional GRU encodes the words, a feature-wise attention (softmax
    over the time axis) pools them into one vector, and that vector is fed as
    a one-element "sentence" sequence through a second GRU and attention.

    Padding positions are excluded from both the word GRU (via packing) and
    the word attention, so the output does not depend on how much padding a
    batch carries.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding dimensionality.
        hidden_dim: Hidden size per GRU direction.
        num_classes: Number of output logits.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.word_gru = nn.GRU(embed_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.word_attention = nn.Linear(hidden_dim * 2, hidden_dim * 2)
        self.sentence_gru = nn.GRU(hidden_dim * 2, hidden_dim, bidirectional=True, batch_first=True)
        self.sentence_attention = nn.Linear(hidden_dim * 2, hidden_dim * 2)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return logits of shape (batch, num_classes) for ``x`` of shape (batch, seq_len).

        Padding (index ``padding_idx``) is assumed to be trailing.
        """
        seq_len = x.size(1)
        # Clamp to 1 because packing rejects zero-length sequences; an all-padding
        # row keeps its first position so the softmax below stays finite.
        lengths = (x != self.embedding.padding_idx).sum(dim=1).clamp(min=1)
        positions = torch.arange(seq_len, device=x.device).unsqueeze(0)
        valid = positions < lengths.unsqueeze(1)  # (batch, seq_len)

        embedded = self.embedding(x)
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        word_states, _ = self.word_gru(packed)
        word_states, _ = nn.utils.rnn.pad_packed_sequence(
            word_states, batch_first=True, total_length=seq_len
        )

        # Word attention: one weight per (position, feature), normalised over time.
        word_scores = torch.tanh(self.word_attention(word_states))
        word_scores = word_scores.masked_fill(~valid.unsqueeze(-1), float('-inf'))
        word_weights = torch.softmax(word_scores, dim=1)
        sentence = torch.sum(word_states * word_weights, dim=1).unsqueeze(1)

        # Sentence level: the weights are normalised over the sentence axis, like
        # the word level. With the single sentence built above each weight is 1,
        # so this returns the sentence GRU output without the former
        # (batch, 2H, 2H) broadcast.
        sentence_states, _ = self.sentence_gru(sentence)
        sentence_scores = torch.tanh(self.sentence_attention(sentence_states))
        sentence_weights = torch.softmax(sentence_scores, dim=1)
        document = torch.sum(sentence_states * sentence_weights, dim=1)

        return self.fc(self.dropout(document))

# Load data
train = pd.read_csv('artifacts/data/train.csv')
val = pd.read_csv('artifacts/data/val.csv')
test = pd.read_csv('artifacts/data/test.csv')

# Build vocab from the training split only. Sorting the unique words makes the
# word -> index mapping reproducible across runs (set iteration order is not),
# so the saved weights can be paired with a vocabulary rebuilt from train.csv.
words = ' '.join(train['text']).split()
vocab = {word: i+2 for i, word in enumerate(sorted(set(words)))}
vocab['<PAD>'] = 0
vocab['<UNK>'] = 1

# Dataset and DataLoader
train_dataset = TextDataset(train['text'].values, train['label'].values, vocab)
val_dataset = TextDataset(val['text'].values, val['label'].values, vocab)
test_dataset = TextDataset(test['text'].values, test['label'].values, vocab)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)

# Model, loss, optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = HAN(len(vocab), embed_dim=100, hidden_dim=128, num_classes=6).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(10):
    model.train()
    for texts, labels in train_loader:
        texts, labels = texts.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    # Validation (reported only; it does not influence training)
    model.eval()
    val_preds, val_labels = [], []
    with torch.no_grad():
        for texts, labels in val_loader:
            texts, labels = texts.to(device), labels.to(device)
            outputs = model(texts)
            val_preds.extend(torch.argmax(outputs, dim=1).cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    val_acc = accuracy_score(val_labels, val_preds)
    print(f"Epoch {epoch+1}, Val Acc: {val_acc:.3f}")

# Test
model.eval()
test_preds, test_labels = [], []
with torch.no_grad():
    for texts, labels in test_loader:
        texts, labels = texts.to(device), labels.to(device)
        outputs = model(texts)
        test_preds.extend(torch.argmax(outputs, dim=1).cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

test_acc = accuracy_score(test_labels, test_preds)
test_f1 = f1_score(test_labels, test_preds, average='weighted')

# Save. np.save does not create missing parent directories, so make sure the
# results directory exists as well as the models directory.
os.makedirs('artifacts/models', exist_ok=True)
os.makedirs('artifacts/results', exist_ok=True)
torch.save(model.state_dict(), 'artifacts/models/han.pt')
# Predictions are stored as plain ints, like the `.tolist()` results of other cells.
np.save('artifacts/results/han_results.npy', {'acc': test_acc, 'f1': test_f1, 'pred': [int(p) for p in test_preds]})

print(f"HAN trained. Test Acc: {test_acc:.3f}")

## 6. Transformer Models (BERT)

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
import os

# Load data
train = pd.read_csv('artifacts/data/train.csv')
val = pd.read_csv('artifacts/data/val.csv')
test = pd.read_csv('artifacts/data/test.csv')

# Save as HuggingFace dataset (copies of the splits read by load_dataset below)
train.to_csv('artifacts/data/train_hf.csv', index=False)
val.to_csv('artifacts/data/val_hf.csv', index=False)
test.to_csv('artifacts/data/test_hf.csv', index=False)

# Load dataset
dataset = load_dataset('csv', data_files={
    'train': 'artifacts/data/train_hf.csv',
    'validation': 'artifacts/data/val_hf.csv',
    'test': 'artifacts/data/test_hf.csv'
})

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    """Tokenize a batch of examples, padding/truncating every text to 128 tokens."""
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
# The model's forward pass takes the targets under the name 'labels'.
tokenized_datasets = tokenized_datasets.rename_column('label', 'labels')
tokenized_datasets.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

# Model
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6)

# Training arguments: evaluate and checkpoint every epoch so the checkpoint with
# the best validation accuracy can be restored at the end.
training_args = TrainingArguments(
    output_dir='artifacts/models/bert',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='accuracy'
)

# Metrics
def compute_metrics(eval_pred):
    """Return accuracy and weighted F1 for a (logits, labels) evaluation pair."""
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='weighted')
    return {'accuracy': acc, 'f1': f1}

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    compute_metrics=compute_metrics
)

# Train
trainer.train()

# Test
test_results = trainer.predict(tokenized_datasets['test'])
test_preds = np.argmax(test_results.predictions, axis=-1)
test_acc = accuracy_score(test_results.label_ids, test_preds)
test_f1 = f1_score(test_results.label_ids, test_preds, average='weighted')

# Save. np.save does not create missing parent directories, so make sure the
# results directory exists as well as the models directory.
os.makedirs('artifacts/models', exist_ok=True)
os.makedirs('artifacts/results', exist_ok=True)
model.save_pretrained('artifacts/models/bert')
tokenizer.save_pretrained('artifacts/models/bert')
np.save('artifacts/results/bert_results.npy', {'acc': test_acc, 'f1': test_f1, 'pred': test_preds.tolist()})

print(f"BERT trained. Test Acc: {test_acc:.3f}")

## 7. Ensemble

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
import joblib
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import os

# Load data
test = pd.read_csv('artifacts/data/test.csv')
X_test = test['text'].values
y_test = test['label'].values

# Load classical models
vectorizer = joblib.load('artifacts/models/tfidf.pkl')
lr = joblib.load('artifacts/models/lr.pkl')
svm = joblib.load('artifacts/models/svm.pkl')
xgb = joblib.load('artifacts/models/xgb.pkl')

# Load transformer model
bert_model = AutoModelForSequenceClassification.from_pretrained('artifacts/models/bert')
bert_tokenizer = AutoTokenizer.from_pretrained('artifacts/models/bert')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bert_model.to(device)
bert_model.eval()

def _softmax(scores):
    """Turn each row of raw scores into a probability distribution."""
    shifted = scores - scores.max(axis=1, keepdims=True)  # numerical stability
    exp_scores = np.exp(shifted)
    return exp_scores / exp_scores.sum(axis=1, keepdims=True)

# TFIDF predictions
X_test_tfidf = vectorizer.transform(X_test)
lr_probs = lr.predict_proba(X_test_tfidf)
# LinearSVC exposes unbounded margins rather than probabilities; normalise them
# so they are on the same [0, 1] scale as the other models before averaging.
svm_probs = _softmax(svm.decision_function(X_test_tfidf))
xgb_probs = xgb.predict_proba(X_test_tfidf)

# BERT predictions
def get_bert_probs(texts, batch_size=32):
    """Return an (n_texts, n_classes) array of BERT class probabilities.

    Texts are processed in chunks of ``batch_size`` so the whole test set is
    never tokenized and pushed through the model as one batch.
    """
    texts = list(texts)
    chunks = []
    for start in range(0, len(texts), batch_size):
        inputs = bert_tokenizer(texts[start:start + batch_size], padding=True, truncation=True, max_length=128, return_tensors='pt')
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = bert_model(**inputs)
        chunks.append(torch.softmax(outputs.logits, dim=-1).cpu().numpy())
    if not chunks:
        return np.empty((0, bert_model.config.num_labels))
    return np.concatenate(chunks, axis=0)

bert_probs = get_bert_probs(X_test)

# Soft voting ensemble: unweighted mean of the four per-class distributions.
ensemble_probs = (lr_probs + svm_probs + xgb_probs + bert_probs) / 4
ensemble_preds = np.argmax(ensemble_probs, axis=1)

# Metrics
ensemble_acc = accuracy_score(y_test, ensemble_preds)
ensemble_f1 = f1_score(y_test, ensemble_preds, average='weighted')

# Save. np.save does not create missing parent directories.
os.makedirs('artifacts/results', exist_ok=True)
np.save('artifacts/results/ensemble_results.npy', {'acc': ensemble_acc, 'f1': ensemble_f1, 'pred': ensemble_preds.tolist()})

print(f"Ensemble trained. Test Acc: {ensemble_acc:.3f}")

## 8. T5 Prompting

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
import os

import re
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Load data
train = pd.read_csv('artifacts/data/train.csv')
val = pd.read_csv('artifacts/data/val.csv')
test = pd.read_csv('artifacts/data/test.csv')

# Format prompts
def format_prompts(df):
    """Return a copy of ``df`` with prompt-formatted 'text' and string 'labels'.

    The target string is ``BT{label + 1}``, i.e. integer label 0 becomes "BT1".
    The input frame is left unmodified.
    """
    df = df.copy()
    df['text'] = df['text'].apply(lambda x: f"Classify this question as BT level: {x}")
    df['labels'] = df['label'].apply(lambda x: f"BT{x+1}")
    return df

train = format_prompts(train)
val = format_prompts(val)
test = format_prompts(test)

# Save as HuggingFace dataset
train.to_csv('artifacts/data/train_t5.csv', index=False)
val.to_csv('artifacts/data/val_t5.csv', index=False)
test.to_csv('artifacts/data/test_t5.csv', index=False)

dataset = load_dataset('csv', data_files={
    'train': 'artifacts/data/train_t5.csv',
    'validation': 'artifacts/data/val_t5.csv',
    'test': 'artifacts/data/test_t5.csv'
})

# Tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-small')

def tokenize_function(examples):
    """Tokenize prompts and targets; padded target positions are set to -100."""
    inputs = tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)
    labels = tokenizer(examples['labels'], padding='max_length', truncation=True, max_length=10)
    # -100 is the index the loss ignores; without it the model is also trained
    # to emit the padding that follows every short target.
    inputs['labels'] = [
        [token if token != tokenizer.pad_token_id else -100 for token in sequence]
        for sequence in labels['input_ids']
    ]
    return inputs

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

# Model
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Training arguments. predict_with_generate makes evaluation/prediction return
# generated token ids (decodable text) instead of per-step logits.
training_args = Seq2SeqTrainingArguments(
    output_dir='artifacts/models/t5',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    predict_with_generate=True,
    generation_max_length=10
)

def _parse_level(text):
    """Return the 0-based level found in ``text`` ("BT3" -> 2), or -1 if none."""
    match = re.search(r'BT\s*([1-6])', text)
    return int(match.group(1)) - 1 if match else -1

def _decode_levels(token_ids):
    """Decode a batch of token ids into 0-based levels (-1 where unparseable)."""
    token_ids = np.asarray(token_ids)
    # Negative ids (-100 label padding) cannot be decoded; map them to the pad token.
    token_ids = np.where(token_ids < 0, tokenizer.pad_token_id, token_ids)
    return [_parse_level(text) for text in tokenizer.batch_decode(token_ids, skip_special_tokens=True)]

# Metrics
def compute_metrics(eval_pred):
    """Return accuracy and weighted F1; unparseable generations count as wrong."""
    predictions, labels = eval_pred
    preds = _decode_levels(predictions)
    labels = _decode_levels(labels)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='weighted')
    return {'accuracy': acc, 'f1': f1}

# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    compute_metrics=compute_metrics
)

# Train
trainer.train()

# Test
test_results = trainer.predict(tokenized_datasets['test'])
test_preds = _decode_levels(test_results.predictions)
test_labels = _decode_levels(test_results.label_ids)
test_acc = accuracy_score(test_labels, test_preds)
test_f1 = f1_score(test_labels, test_preds, average='weighted')

# Save. np.save does not create missing parent directories, so make sure the
# results directory exists as well as the models directory.
os.makedirs('artifacts/models', exist_ok=True)
os.makedirs('artifacts/results', exist_ok=True)
model.save_pretrained('artifacts/models/t5')
tokenizer.save_pretrained('artifacts/models/t5')
np.save('artifacts/results/t5_results.npy', {'acc': test_acc, 'f1': test_f1, 'pred': test_preds})

print(f"T5 trained. Test Acc: {test_acc:.3f}")

## 9. Zero-Shot Classification

In [None]:
from transformers import pipeline
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
import os

# Read the held-out split; only the test set is used because nothing is trained here.
test = pd.read_csv('artifacts/data/test.csv')
X_test, y_test = test['text'].values, test['label'].values

# NLI-based zero-shot classifier and the candidate label names.
# NOTE(review): the candidates are the opaque codes 'BT1'..'BT6'; presumably
# descriptive level names would give the model more to work with. Confirm the
# intended wording for each level before changing them.
classifier = pipeline('zero-shot-classification', model='facebook/bart-large-mnli')
labels = [f'BT{level}' for level in range(1, 7)]

# For every question keep the index of the top-ranked candidate label.
test_preds = [
    labels.index(classifier(question, candidate_labels=labels)['labels'][0])
    for question in X_test
]

# Score the predictions against the encoded labels.
test_f1 = f1_score(y_test, test_preds, average='weighted')
test_acc = accuracy_score(y_test, test_preds)

# Persist the metrics and predictions.
os.makedirs('artifacts/results', exist_ok=True)
zero_shot_results = {'acc': test_acc, 'f1': test_f1, 'pred': test_preds}
np.save('artifacts/results/zero_shot_results.npy', zero_shot_results)

print(f"Zero-shot trained. Test Acc: {test_acc:.3f}")

## 10. Contrastive Learning

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
import numpy as np
import os
import joblib

# Load data
train = pd.read_csv('artifacts/data/train.csv')
val = pd.read_csv('artifacts/data/val.csv')
test = pd.read_csv('artifacts/data/test.csv')

# Sentence embeddings from the pretrained encoder. The encoder itself is not
# trained in this cell; only the classifier on top of it is fitted.
model = SentenceTransformer('all-MiniLM-L6-v2')
X_train = model.encode(train['text'].values)
X_val = model.encode(val['text'].values)
X_test = model.encode(test['text'].values)

y_train = train['label'].values
y_val = val['label'].values
y_test = test['label'].values

# Logistic Regression on embeddings
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
test_preds = clf.predict(X_test)
test_acc = accuracy_score(y_test, test_preds)
test_f1 = f1_score(y_test, test_preds, average='weighted')

# Save. np.save does not create missing parent directories, so make sure the
# results directory exists even when this cell is run on its own.
os.makedirs('artifacts/models', exist_ok=True)
os.makedirs('artifacts/results', exist_ok=True)
joblib.dump(clf, 'artifacts/models/contrastive_lr.pkl')
joblib.dump(model, 'artifacts/models/sentence_transformer.pkl')
np.save('artifacts/results/contrastive_results.npy', {'acc': test_acc, 'f1': test_f1, 'pred': test_preds.tolist()})

print(f"Contrastive model trained. Test Acc: {test_acc:.3f}")

## 11. Multi-Task Learning

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import os

# Dataset with synthetic difficulty
class MultiTaskDataset(Dataset):
    """Dataset yielding ``(token_ids, label, difficulty)`` tensor triples.

    Texts are whitespace-split and encoded through ``vocab`` with ``'<UNK>'``
    as the fallback index; sequences are right-padded with ``'<PAD>'`` or
    truncated so that each has exactly ``max_len`` entries.
    """

    def __init__(self, texts, labels, difficulties, vocab, max_len=100):
        self.texts, self.labels, self.difficulties = texts, labels, difficulties
        self.vocab, self.max_len = vocab, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        token_ids = [
            self.vocab.get(token, self.vocab['<UNK>'])
            for token in self.texts[idx].split()
        ]
        gap = self.max_len - len(token_ids)
        if gap > 0:
            # Fill the remainder with padding indices.
            token_ids = token_ids + [self.vocab['<PAD>']] * gap
        else:
            # Keep only the first max_len indices.
            token_ids = token_ids[:self.max_len]
        sample = (
            torch.tensor(token_ids),
            torch.tensor(self.labels[idx]),
            torch.tensor(self.difficulties[idx]),
        )
        return sample

# Multi-task Model
class MultiTaskModel(nn.Module):
    """Shared BiLSTM encoder with two classification heads.

    The sequence is packed by its true length so the LSTM never reads padding,
    and the final forward and backward hidden states are concatenated as the
    shared representation. Reading the output at the last (padded) position
    would instead give a forward state that has consumed padding and a
    backward state that has seen a single step.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding dimensionality.
        hidden_dim: Hidden size per LSTM direction.
        num_classes: Number of logits of the main classification head.
        num_difficulties: Number of logits of the difficulty head.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, num_difficulties):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.fc_class = nn.Linear(hidden_dim * 2, num_classes)
        self.fc_difficulty = nn.Linear(hidden_dim * 2, num_difficulties)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return ``(class_logits, difficulty_logits)`` for ``x`` of shape (batch, seq_len).

        Padding (index ``padding_idx``) is assumed to be trailing, so the
        number of non-padding entries is the sequence length.
        """
        # Clamp to 1 because packing rejects zero-length sequences.
        lengths = (x != self.embedding.padding_idx).sum(dim=1).clamp(min=1).cpu()
        embedded = self.embedding(x)
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.lstm(packed)
        # h_n[-2] / h_n[-1]: final forward / backward states of the single layer.
        shared = self.dropout(torch.cat((h_n[-2], h_n[-1]), dim=1))
        class_output = self.fc_class(shared)
        difficulty_output = self.fc_difficulty(shared)
        return class_output, difficulty_output

# Load data
train = pd.read_csv('artifacts/data/train.csv')
val = pd.read_csv('artifacts/data/val.csv')
test = pd.read_csv('artifacts/data/test.csv')

# Synthetic difficulty (1-3). The values are random placeholders; a fixed seed
# keeps them identical from run to run.
rng = np.random.default_rng(42)
train['difficulty'] = rng.integers(1, 4, size=len(train))
val['difficulty'] = rng.integers(1, 4, size=len(val))
test['difficulty'] = rng.integers(1, 4, size=len(test))

# Build vocab from the training split only. Sorting the unique words makes the
# word -> index mapping reproducible across runs (set iteration order is not),
# so the saved weights can be paired with a vocabulary rebuilt from train.csv.
words = ' '.join(train['text']).split()
vocab = {word: i+2 for i, word in enumerate(sorted(set(words)))}
vocab['<PAD>'] = 0
vocab['<UNK>'] = 1

# Dataset and DataLoader. CrossEntropyLoss expects class indices in
# [0, num_difficulties), so the 1-3 difficulty values are shifted to 0-2 here;
# passing 3 as a target for a 3-way head raises an out-of-bounds error.
train_dataset = MultiTaskDataset(train['text'].values, train['label'].values, train['difficulty'].values - 1, vocab)
val_dataset = MultiTaskDataset(val['text'].values, val['label'].values, val['difficulty'].values - 1, vocab)
test_dataset = MultiTaskDataset(test['text'].values, test['label'].values, test['difficulty'].values - 1, vocab)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)

# Model, loss, optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MultiTaskModel(len(vocab), embed_dim=100, hidden_dim=128, num_classes=6, num_difficulties=3).to(device)
criterion_class = nn.CrossEntropyLoss()
criterion_difficulty = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(10):
    model.train()
    for texts, labels, difficulties in train_loader:
        texts, labels, difficulties = texts.to(device), labels.to(device), difficulties.to(device)
        optimizer.zero_grad()
        class_output, difficulty_output = model(texts)
        loss_class = criterion_class(class_output, labels)
        loss_difficulty = criterion_difficulty(difficulty_output, difficulties)
        # Both tasks contribute equally to the joint objective.
        loss = loss_class + loss_difficulty
        loss.backward()
        optimizer.step()

    # Validation (main task only; reported, not used for model selection)
    model.eval()
    val_preds, val_labels = [], []
    with torch.no_grad():
        for texts, labels, _ in val_loader:
            texts, labels = texts.to(device), labels.to(device)
            class_output, _ = model(texts)
            val_preds.extend(torch.argmax(class_output, dim=1).cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    val_acc = accuracy_score(val_labels, val_preds)
    print(f"Epoch {epoch+1}, Val Acc: {val_acc:.3f}")

# Test
model.eval()
test_preds, test_labels = [], []
with torch.no_grad():
    for texts, labels, _ in test_loader:
        texts, labels = texts.to(device), labels.to(device)
        class_output, _ = model(texts)
        test_preds.extend(torch.argmax(class_output, dim=1).cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

test_acc = accuracy_score(test_labels, test_preds)
test_f1 = f1_score(test_labels, test_preds, average='weighted')

# Save. np.save does not create missing parent directories, so make sure the
# results directory exists as well as the models directory.
os.makedirs('artifacts/models', exist_ok=True)
os.makedirs('artifacts/results', exist_ok=True)
torch.save(model.state_dict(), 'artifacts/models/multitask.pt')
# Predictions are stored as plain ints, like the `.tolist()` results of other cells.
np.save('artifacts/results/multitask_results.npy', {'acc': test_acc, 'f1': test_f1, 'pred': [int(p) for p in test_preds]})

print(f"Multi-task model trained. Test Acc: {test_acc:.3f}")

## 12. Explainability

In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from lime.lime_text import LimeTextExplainer
from captum.attr import IntegratedGradients
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
import os

# Load data
test = pd.read_csv('artifacts/data/test.csv')
X_test, y_test = test['text'].values, test['label'].values

# Load models
# joblib.load unpickles its input, so only point it at artifacts written by
# this pipeline.
vectorizer = joblib.load('artifacts/models/tfidf.pkl')
lr = joblib.load('artifacts/models/lr.pkl')

# Prefer the GPU when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Tokenizer and fine-tuned classifier are read from the same directory.
bert_tokenizer = AutoTokenizer.from_pretrained('artifacts/models/bert')
bert_model = AutoModelForSequenceClassification.from_pretrained('artifacts/models/bert')
bert_model = bert_model.to(device)
bert_model.eval()  # inference mode for the attribution runs

# LIME for Logistic Regression
explainer = LimeTextExplainer(class_names=['BT1', 'BT2', 'BT3', 'BT4', 'BT5', 'BT6'])


def lr_predict_proba(texts):
    """Return the logistic-regression class probabilities for raw *texts*.

    The texts are vectorised with the loaded TF-IDF ``vectorizer`` first, so
    this function can be handed to LIME as its classifier callback.
    """
    features = vectorizer.transform(texts)
    return lr.predict_proba(features)


# Explain the first five test questions, keeping six features for each.
# NOTE(review): with LIME's default ``labels=(1,)`` / ``as_list(label=1)`` the
# weights refer to class index 1 only, not the predicted class - confirm intent.
lime_explanations = [
    explainer.explain_instance(text, lr_predict_proba, num_features=6).as_list()
    for text in X_test[:5]
]

# Integrated Gradients for BERT
# Token ids are integers: they cannot carry gradients or be interpolated, so
# plain IntegratedGradients on `input_ids` does not work. Attribute at the
# input-embedding layer instead and sum over the hidden dimension to obtain
# one score per token.
from captum.attr import LayerIntegratedGradients


def bert_forward(inputs, attention_mask=None):
    """Return the classification logits of ``bert_model`` for a batch of token ids.

    Args:
        inputs: Tensor of token ids as produced by ``bert_tokenizer``.
        attention_mask: Optional mask aligned with ``inputs``. When omitted it
            is derived by treating every ``pad_token_id`` position as padding.

    Returns:
        The ``logits`` tensor of the model output.
    """
    if attention_mask is None:
        attention_mask = (inputs != bert_tokenizer.pad_token_id).long()
    outputs = bert_model(inputs, attention_mask=attention_mask)
    return outputs.logits


def _to_object_array(items):
    """Pack *items* into a 1-D object array holding one element per item.

    Filling element by element stops NumPy from trying to stack the items,
    which fails (or stringifies mixed tuples) when their shapes differ.
    """
    packed = np.empty(len(items), dtype=object)
    for position, item in enumerate(items):
        packed[position] = item
    return packed


ig = LayerIntegratedGradients(bert_forward, bert_model.get_input_embeddings())
ig_explanations = []
for text in X_test[:5]:
    encoded = bert_tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)
    # Reference input: the same length, but every position is the padding token.
    baseline_ids = torch.full_like(input_ids, bert_tokenizer.pad_token_id)
    # The real attention mask is passed explicitly so the all-padding baseline
    # is not evaluated with an all-zero mask.
    # NOTE(review): target=0 attributes every example to class index 0 rather
    # than to the predicted class - kept from the original, confirm intent.
    attributions = ig.attribute(
        input_ids,
        baselines=baseline_ids,
        additional_forward_args=(attention_mask,),
        target=0,
    )
    # (1, seq_len, hidden) -> (1, seq_len): one attribution score per token.
    token_scores = attributions.sum(dim=-1)
    ig_explanations.append(token_scores.detach().cpu().numpy())

# Save
# Sequence lengths and LIME feature counts vary per example, so both lists are
# stored as object arrays; read them back with np.load(..., allow_pickle=True).
os.makedirs('artifacts/explanations', exist_ok=True)
np.save('artifacts/explanations/lime_lr.npy', _to_object_array(lime_explanations))
np.save('artifacts/explanations/ig_bert.npy', _to_object_array(ig_explanations))

print("Explainability analysis completed.")

## 13. Evaluation

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import os

# Load data
test = pd.read_csv('artifacts/data/test.csv')
y_test = test['label'].values

# Encoded ids of the six classes (assumes the label encoder produced 0-5 -
# verify against the data-preparation step). Passing them explicitly keeps
# every confusion matrix 6x6 and comparable across models, even when a model
# never predicts one of the levels.
class_ids = list(range(6))

# Load results
models = ['classical', 'textcnn', 'bilstm', 'han', 'bert', 'ensemble', 't5', 'zero_shot', 'contrastive', 'multitask']
results = {}
# The loop variable is deliberately not called `model`: in a shared notebook
# namespace that would replace a trained network of the same name with a string.
for model_name in models:
    # Each file holds a pickled dict, hence allow_pickle=True; only load
    # artifacts produced by this pipeline.
    result = np.load(f'artifacts/results/{model_name}_results.npy', allow_pickle=True).item()
    results[model_name] = {
        'acc': result['acc'],
        'f1': result['f1'],
        'pred': result['pred'],
        'cm': confusion_matrix(y_test, result['pred'], labels=class_ids).tolist()
    }

# Save
os.makedirs('artifacts/evaluations', exist_ok=True)
np.save('artifacts/evaluations/all_results.npy', results)

print("Evaluation completed.")

## 14. Compare Results

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score

import os  # os.makedirs is used below; imported here so this cell runs on its own

# Load results
# The file holds a pickled dict keyed by model name, hence allow_pickle=True;
# only load artifacts produced by this pipeline.
results = np.load('artifacts/evaluations/all_results.npy', allow_pickle=True).item()
model_names = list(results)

# Create DataFrame
df = pd.DataFrame({
    'Model': model_names,
    'Accuracy': [results[m]['acc'] for m in model_names],
    'Weighted F1': [results[m]['f1'] for m in model_names],
})

# Add macro F1
# Recomputed from the stored predictions against the test labels.
test = pd.read_csv('artifacts/data/test.csv')
y_test = test['label'].values
df['Macro F1'] = [f1_score(y_test, results[m]['pred'], average='macro') for m in model_names]

# Generate LaTeX table
latex_table = df.to_latex(index=False, float_format="%.2f", caption="Model Comparison on Test Set", label="tab:model_comparison")

# Save LaTeX
# The table is wrapped in a minimal standalone document that loads booktabs.
os.makedirs('artifacts/results', exist_ok=True)
with open('artifacts/results/compare_results.tex', 'w', encoding='utf-8') as f:
    f.write("\\documentclass{article}\\usepackage{booktabs}\\begin{document}\n")
    f.write(latex_table)
    f.write("\\end{document}")

print("Comparison table generated. Run `latexmk -pdf artifacts/results/compare_results.tex` to compile.")
print(df)