# Setup

In [None]:
# Mount Google Drive at /content/drive so later cells can copy data and
# checkpoints to and from it. Only works inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
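# NOTE: a Hugging Face access token was hard-coded on this line and has been redacted. Revoke that token and read credentials from Colab secrets (HF_TOKEN) instead of committing them.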

In [None]:
# Copy every CSV from the qies_files folder on the mounted Drive into the
# current working directory. The relative ./drive path assumes the working
# directory is the one Drive was mounted under (/content).
!cp ./drive/MyDrive/qies_files/*.csv .

In [None]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.1.0-py3-none-any.whl.metadata (23 kB)
Downloading sentence_transformers-3.1.0-py3-none-any.whl (249 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m249.1/249.1 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.1.0


In [None]:
import os
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from transformers import AutoModel, AutoTokenizer, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score
from tqdm import tqdm
from torch.cuda.amp import autocast, GradScaler
import pandas as pd
from sentence_transformers import SentenceTransformer

# Kolmogorov-Arnold Network (KAN) based Model Architecture

In [None]:
# @title Dataset building
class StanceDataset(Dataset):
    """Map-style dataset that tokenizes (post, topic) pairs read from a CSV file.

    The CSV is loaded with pandas and is expected to provide the columns
    ``post``, ``topic_str`` and ``label``.

    Args:
        csv_file: Path (or file-like object) handed to ``pandas.read_csv``.
        tokenizer: Object exposing ``encode_plus(text, text_pair, ...)``.
        max_length: Length every encoded pair is padded / truncated to.
    """

    def __init__(self, csv_file, tokenizer, max_length=128):
        self.data = pd.read_csv(csv_file)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of rows in the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the encoded pair and label for row ``idx``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (both flattened to
            1-D) and ``stance`` (the label as a ``torch.long`` scalar tensor).
        """
        row = self.data.iloc[idx]
        label = int(row['label'])

        # The post is the first segment and the topic the second one.
        encoded = self.tokenizer.encode_plus(
            row['post'],
            row['topic_str'],
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # return_tensors='pt' yields a leading batch axis; flatten drops it.
        item = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['stance'] = torch.tensor(label, dtype=torch.long)
        return item

In [None]:
# @title QIES Arch
# class QIESModel(nn.Module):
#     def __init__(self, base_model, num_stances, embedding_dim, hidden_dim, context_dim):
#         super().__init__()
#         self.base_model = base_model
#         self.num_stances = num_stances
#         self.stance_embeddings = nn.Parameter(torch.randn(num_stances, embedding_dim))
#         nn.init.orthogonal_(self.stance_embeddings)

#         self.superposition = nn.Sequential(
#             nn.Linear(hidden_dim, num_stances),
#             nn.Softmax(dim=-1)
#         )

#         self.collapse = nn.Sequential(
#             nn.Linear(embedding_dim + context_dim, embedding_dim),
#             nn.ReLU(),
#             nn.Linear(embedding_dim, 1)
#         )

#         self.classifier = nn.Linear(hidden_dim + 1, num_stances)

#     def forward(self, input_ids, attention_mask):
#         base_output = self.base_model(input_ids, attention_mask=attention_mask)
#         hidden_state = base_output.last_hidden_state[:, 0, :]  # Use CLS token

#         weights = self.superposition(hidden_state)
#         superposition = torch.matmul(weights.unsqueeze(1), self.stance_embeddings).squeeze(1)

#         context = hidden_state  # Using hidden state as context
#         combined = torch.cat([superposition, context], dim=-1)
#         collapsed_score = self.collapse(combined).squeeze(-1)

#         logits = self.classifier(torch.cat([hidden_state, collapsed_score.unsqueeze(1)], dim=1))
#         return logits, collapsed_score, weights

class KANLayer(nn.Module):
    """Fourier-series KAN layer mapping ``inputdim`` features to ``outdim`` features.

    For each output ``j`` the layer computes

        y[j] = sum_i sum_k ( a[j, i, k] * cos(k * x[i]) + b[j, i, k] * sin(k * x[i]) ) + bias[j]

    with frequencies ``k = 1 .. gridsize``; ``a`` is ``fouriercoeffs[0]`` and
    ``b`` is ``fouriercoeffs[1]``.

    Args:
        inputdim: Size of the last axis of the input.
        outdim: Size of the last axis of the output.
        initial_gridsize: Number of frequencies the coefficient tensor is
            allocated for; also the starting value of ``gridsize_param``.
        addbias: When True, a bias of shape ``(1, outdim)`` is added.
    """

    def __init__(self, inputdim, outdim, initial_gridsize=5, addbias=True):
        super(KANLayer, self).__init__()
        self.addbias = addbias
        self.inputdim = inputdim
        self.outdim = outdim

        # Number of frequencies to use, kept as a float parameter. forward()
        # rounds it to an integer, so no gradient reaches it from the output.
        self.gridsize_param = nn.Parameter(torch.tensor(initial_gridsize, dtype=torch.float32))

        # Fourier coefficients as a learnable parameter with Xavier initialization
        self.fouriercoeffs = nn.Parameter(torch.empty(2, outdim, inputdim, initial_gridsize))
        nn.init.xavier_uniform_(self.fouriercoeffs)

        if self.addbias:
            self.bias = nn.Parameter(torch.zeros(1, outdim))

    def _active_gridsize(self):
        """Return the number of frequencies to use as a plain Python int.

        The value is clamped to ``[1, allocated]``. Without the upper bound a
        ``gridsize_param`` above the allocated size would build more
        frequencies than there are coefficients and the broadcast in
        ``forward`` would fail.
        """
        allocated = self.fouriercoeffs.shape[-1]
        requested = int(torch.clamp(self.gridsize_param, min=1).round().item())
        return min(requested, allocated)

    def forward(self, x):
        """Apply the layer to ``x`` of shape ``(..., inputdim)``.

        Returns:
            Tensor of shape ``(..., outdim)``.
        """
        gridsize = self._active_gridsize()
        outshape = x.shape[:-1] + (self.outdim,)
        # Collapse all leading axes into one batch axis.
        x = torch.reshape(x, (-1, self.inputdim))
        # Frequencies 1..gridsize, shaped to broadcast over (batch, out, in, k).
        k = torch.reshape(torch.arange(1, gridsize + 1, device=x.device), (1, 1, 1, gridsize))
        xrshp = torch.reshape(x, (x.shape[0], 1, x.shape[1], 1))
        c = torch.cos(k * xrshp)
        s = torch.sin(k * xrshp)
        # Sum over the input axis and the frequency axis.
        y = torch.sum(c * self.fouriercoeffs[0:1, :, :, :gridsize], (-2, -1))
        y += torch.sum(s * self.fouriercoeffs[1:2, :, :, :gridsize], (-2, -1))
        if self.addbias:
            y += self.bias
        y = torch.reshape(y, outshape)
        return y

class KANStanceClassifier(nn.Module):
    """Classification head built from two KAN layers with a ReLU in between.

    Args:
        input_dim: Feature size fed to the first KAN layer.
        hidden_dim: Output size of the first and input size of the second layer.
        num_stances: Number of logits produced.
    """

    def __init__(self, input_dim, hidden_dim, num_stances):
        super().__init__()
        self.layer1 = KANLayer(input_dim, hidden_dim)
        self.activation = nn.ReLU()
        self.layer2 = KANLayer(hidden_dim, num_stances)

    def forward(self, x):
        """Return the logits ``layer2(relu(layer1(x)))``."""
        hidden = self.activation(self.layer1(x))
        return self.layer2(hidden)


# Initialize Stance Embeddings with Multiple Statements
def load_pretrained_stance_embeddings(stance_statements, model_name='all-mpnet-base-v2'):
    """Encode statements with a sentence-transformers model.

    A new ``SentenceTransformer(model_name)`` is constructed on every call.

    Args:
        stance_statements: Sentences passed to ``SentenceTransformer.encode``.
        model_name: Name of the sentence-transformers model to load.

    Returns:
        ``torch.Tensor`` built from the encoder output.
    """
    encoder = SentenceTransformer(model_name)
    return torch.tensor(encoder.encode(stance_statements))

def initialize_stance_embeddings(num_stances, embedding_dim):
    """Build one initial embedding per stance from hand-written statements.

    The statements of stances ``0 .. num_stances - 1`` are encoded with
    ``load_pretrained_stance_embeddings`` and averaged per stance. If the
    encoder width differs from ``embedding_dim`` the averages are mapped to
    ``embedding_dim`` with a freshly initialized (untrained, unseeded) bias-free
    linear layer, so the projected values differ between runs.

    Args:
        num_stances: Number of stances to build, between 1 and 3. Stance ids
            are 0 (against), 1 (in favor), 2 (neutral).
        embedding_dim: Width of the returned embeddings.

    Returns:
        Tensor of shape ``(num_stances, embedding_dim)``.

    Raises:
        ValueError: If ``num_stances`` is outside the supported range.
    """
    stance_statements = {
        0: [
            "I strongly oppose this idea.",
            "I completely disagree with this position.",
            "I am against this topic.",
            "I do not support this."
        ],
        1: [
            "I fully support this idea.",
            "I agree with this position.",
            "I am in favor of this topic.",
            "I endorse this."
        ],
        2: [
            "I have a neutral stance on this matter.",
            "I neither agree nor disagree with this position.",
            "I see valid points on both sides.",
            "I am indifferent towards this."
        ]
    }

    if not 1 <= num_stances <= len(stance_statements):
        raise ValueError(
            f"num_stances must be between 1 and {len(stance_statements)}, got {num_stances}"
        )

    groups = [stance_statements[stance] for stance in range(num_stances)]

    # Encode every statement in a single call: the loader builds a new sentence
    # encoder per call, so calling it once per stance reloads the model each time.
    flat_statements = [statement for group in groups for statement in group]
    encoded = load_pretrained_stance_embeddings(flat_statements)

    # Average the rows that belong to each stance.
    stance_embeddings = []
    start = 0
    for group in groups:
        stop = start + len(group)
        stance_embeddings.append(encoded[start:stop].mean(dim=0))
        start = stop

    pretrained_embeddings = torch.stack(stance_embeddings)

    # Project to desired dimensionality if necessary
    if pretrained_embeddings.shape[1] != embedding_dim:
        projection = nn.Linear(pretrained_embeddings.shape[1], embedding_dim, bias=False)
        with torch.no_grad():
            pretrained_embeddings = projection(pretrained_embeddings)

    return pretrained_embeddings

class QIESModel(nn.Module):
    """Stance classifier with KAN heads on top of a transformer encoder.

    Pipeline of ``forward``:
      1. take the first-token hidden state of ``base_model``;
      2. ``superposition`` turns it into softmax weights over the stances;
      3. the weights mix ``stance_embeddings`` into one vector;
      4. ``collapse`` maps [mixed vector, hidden state] to a scalar score;
      5. ``classifier`` maps [hidden state, score] to stance logits.

    Args:
        base_model: Encoder called as ``base_model(input_ids, attention_mask=...)``
            whose result exposes ``last_hidden_state``.
        num_stances: Number of stance classes.
        embedding_dim: Width of each stance embedding.
        hidden_dim: Width of the encoder hidden state.
        context_dim: Width of the context vector. The context is the hidden
            state itself, so this has to equal ``hidden_dim``.
        stance_embeddings: Initial stance embeddings of shape
            ``(num_stances, embedding_dim)``.

    Raises:
        ValueError: If ``stance_embeddings`` does not have the expected shape.
    """

    def __init__(self, base_model, num_stances, embedding_dim, hidden_dim, context_dim, stance_embeddings):
        super().__init__()
        if tuple(stance_embeddings.shape) != (num_stances, embedding_dim):
            raise ValueError(
                f"stance_embeddings must have shape ({num_stances}, {embedding_dim}), "
                f"got {tuple(stance_embeddings.shape)}"
            )
        self.base_model = base_model
        self.num_stances = num_stances
        # Keep the values that were passed in. The previous in-place orthogonal
        # re-initialization replaced them with a random matrix and, because the
        # parameter shared storage with the argument, also modified the caller's
        # tensor. Cloning gives the parameter its own storage.
        self.stance_embeddings = nn.Parameter(stance_embeddings.detach().clone())

        self.superposition = nn.Sequential(
            KANLayer(hidden_dim, num_stances),
            nn.Softmax(dim=-1)
        )

        self.collapse = nn.Sequential(
            KANLayer(embedding_dim + context_dim, embedding_dim),
            nn.ReLU(),
            KANLayer(embedding_dim, 1)
        )

        # +1 because the collapsed score is appended to the hidden state.
        self.classifier = KANStanceClassifier(hidden_dim + 1, hidden_dim, num_stances)

    def forward(self, input_ids, attention_mask):
        """Return ``(logits, collapsed_score, weights)`` for a batch.

        Returns:
            logits: ``(batch, num_stances)`` classifier outputs.
            collapsed_score: ``(batch,)`` scalar produced by ``collapse``.
            weights: ``(batch, num_stances)`` softmax weights over the stances.
        """
        base_output = self.base_model(input_ids, attention_mask=attention_mask)
        hidden_state = base_output.last_hidden_state[:, 0, :]  # Use CLS token

        weights = self.superposition(hidden_state)
        # (batch, 1, stances) @ (stances, emb) -> (batch, 1, emb) -> (batch, emb)
        superposition = torch.matmul(weights.unsqueeze(1), self.stance_embeddings).squeeze(1)

        context = hidden_state  # Using hidden state as context
        combined = torch.cat([superposition, context], dim=-1)
        collapsed_score = self.collapse(combined).squeeze(-1)

        logits = self.classifier(torch.cat([hidden_state, collapsed_score.unsqueeze(1)], dim=1))
        return logits, collapsed_score, weights

In [None]:
#  @title Training and evaluation functions
# def train_qies(model, dataloader, optimizer, scheduler, scaler, device):
#     model.train()
#     total_loss = 0
#     for batch in tqdm(dataloader, desc="Training"):
#         optimizer.zero_grad()
#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         stance = batch['stance'].to(device)

#         with autocast():
#             logits, collapsed_score, weights = model(input_ids, attention_mask)
#             loss = F.cross_entropy(logits, stance)
#             entropy = -torch.sum(weights * torch.log(weights + 1e-10), dim=1).mean()
#             loss += 0.1 * entropy  # Encourage superposition

#         scaler.scale(loss).backward()
#         scaler.step(optimizer)
#         scaler.update()
#         scheduler.step()
#         total_loss += loss.item()
#     return total_loss / len(dataloader)

def train_qies(model, dataloader, optimizer, scheduler, scaler, device, l2_lambda=0.01):
    """Run one mixed-precision training pass over ``dataloader``.

    Per batch the loss is
    ``cross_entropy + 0.1 * entropy(weights) + l2_lambda * sum(norm(fouriercoeffs))``,
    where the last sum runs over all parameters whose name contains
    ``fouriercoeffs`` and uses the (unsquared) ``torch.norm``.

    Args:
        model: Module returning ``(logits, collapsed_score, weights)``.
        dataloader: Sized iterable of dicts with ``input_ids``,
            ``attention_mask`` and ``stance`` tensors.
        optimizer: Optimizer stepped through ``scaler``.
        scheduler: Learning-rate scheduler advanced once per applied step.
        scaler: Gradient scaler (``scale`` / ``step`` / ``update`` / ``get_scale``).
        device: Device the batch tensors are moved to.
        l2_lambda: Weight of the Fourier-coefficient norm penalty.

    Returns:
        Mean loss over the batches.

    Raises:
        ZeroDivisionError: If ``dataloader`` is empty.
    """
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader, desc="Training"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        stance = batch['stance'].to(device)

        with autocast():
            logits, collapsed_score, weights = model(input_ids, attention_mask)
            loss = F.cross_entropy(logits, stance)
            entropy = -torch.sum(weights * torch.log(weights + 1e-10), dim=1).mean()
            # NOTE(review): the original comment here read "Encourage
            # superposition", but adding the entropy to a minimized loss
            # penalizes spread-out weights, i.e. it pushes towards a peaked
            # distribution. Confirm the intended sign; behavior is unchanged.
            loss += 0.1 * entropy

            # Add L2 regularization for KAN layers
            l2_reg = 0.0
            for name, param in model.named_parameters():
                if 'fouriercoeffs' in name:
                    l2_reg += torch.norm(param)
            loss += l2_lambda * l2_reg

        scaler.scale(loss).backward()
        scale_before = scaler.get_scale()
        scaler.step(optimizer)
        scaler.update()
        # The scaler skips optimizer.step() when it finds inf/NaN gradients and
        # then lowers its scale in update(). Advance the schedule only when the
        # optimizer really stepped, so skipped steps do not consume warmup/decay.
        if scaler.get_scale() >= scale_before:
            scheduler.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate(model, dataloader, device):
    """Compute the macro-averaged F1 score of ``model`` on ``dataloader``.

    The model is switched to eval mode and run without gradient tracking;
    predictions are the argmax of the logits over dimension 1.

    Args:
        model: Callable returning ``(logits, collapsed_score, weights)``.
        dataloader: Iterable of dicts with ``input_ids``, ``attention_mask``
            and ``stance`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        Result of ``f1_score(labels, predictions, average='macro')``.
    """
    model.eval()
    predictions, references = [], []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            labels = batch['stance'].to(device)

            # Only the logits are needed for scoring.
            logits, _score, _weights = model(ids, mask)
            predicted = torch.argmax(logits, dim=1)
            predictions.extend(predicted.cpu().numpy())
            references.extend(labels.cpu().numpy())
    return f1_score(references, predictions, average='macro')

In [None]:
# @title Model initialization
# Hyperparameters
base_model_name = "microsoft/deberta-v3-large"
num_stances = 3           #FIRST LET'S SEE WITH 3 STANCES ONLY
embedding_dim = 1024
hidden_dim = 1024
context_dim = 1024
batch_size = 32
num_epochs = 5
learning_rate = 2e-5
max_length = 256

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load base model and tokenizer
base_model = AutoModel.from_pretrained(base_model_name)
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Load datasets
train_dataset = StanceDataset('./vast_train.csv', tokenizer, max_length)
val_dataset = StanceDataset('./vast_dev.csv', tokenizer, max_length)
test_dataset = StanceDataset('./vast_test.csv', tokenizer, max_length)
zero_dataset = StanceDataset('./zero_shot.csv', tokenizer, max_length)
few_dataset = StanceDataset('./few_shot.csv', tokenizer, max_length)

# Create dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, num_workers=4, pin_memory=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, num_workers=4, pin_memory=True)
zero_dataloader = DataLoader(zero_dataset, batch_size=batch_size, num_workers=4, pin_memory=True)
few_dataloader = DataLoader(few_dataset, batch_size=batch_size, num_workers=4, pin_memory=True)

# Initialize Stance Embeddings
stance_embeddings = initialize_stance_embeddings(num_stances, embedding_dim).to(device)

# Initialize model
model = QIESModel(base_model, num_stances, embedding_dim, hidden_dim, context_dim, stance_embeddings).to(device)

# Optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(0.1*total_steps), num_training_steps=total_steps)

# Mixed precision training
scaler = GradScaler()


file_name = 'best_qies_model_large_256.pth'

# Training loop
best_val_f1 = 0
for epoch in range(num_epochs):
    train_loss = train_qies(model, train_dataloader, optimizer, scheduler, scaler, device)
    val_f1 = evaluate(model, val_dataloader, device)
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val F1: {val_f1:.4f}")

    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        torch.save(model.state_dict(), file_name)
    else:
        print("No improvement in validation accuracy. Stopping this run.")
        break

# Evaluate on test set
!cp ./drive/MyDrive/qies_files/best_qies_model1.pth .
model.load_state_dict(torch.load(file_name))
val_f1_again = evaluate(model, val_dataloader, device)
print(f"Confirming Validation F1 Score: {val_f1_again:.4f}")
test_f1 = evaluate(model, test_dataloader, device)
print(f"Test F1 Score: {test_f1:.4f}")
few_shot_score = evaluate(model, few_dataloader, device)
print(f"Few Shot F1 Score: {few_shot_score:.4f}")
zero_shot_score = evaluate(model, zero_dataloader, device)
print(f"Zero Shot F1 Score: {zero_shot_score:.4f}")


#Unassign
! cp ./*.pth ./drive/MyDrive/qies_files/
print("Runtime Unassigned!")
from google.colab import runtime
runtime.unassign()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  scaler = GradScaler()
  self.pid = os.fork()
  with autocast():
Training: 100%|██████████| 422/422 [02:57<00:00,  2.37it/s]
Evaluating: 100%|██████████| 65/65 [00:27<00:00,  2.35it/s]


Epoch 1/5, Train Loss: 0.8812, Val F1: 0.7938


  self.pid = os.fork()
  with autocast():
Training: 100%|██████████| 422/422 [02:57<00:00,  2.38it/s]
Evaluating: 100%|██████████| 65/65 [00:27<00:00,  2.35it/s]


Epoch 2/5, Train Loss: 0.6204, Val F1: 0.8108


  self.pid = os.fork()
  with autocast():
Training: 100%|██████████| 422/422 [02:56<00:00,  2.38it/s]
Evaluating: 100%|██████████| 65/65 [00:27<00:00,  2.35it/s]


Epoch 3/5, Train Loss: 0.5218, Val F1: 0.8211


  self.pid = os.fork()
  with autocast():
Training: 100%|██████████| 422/422 [02:57<00:00,  2.38it/s]
Evaluating: 100%|██████████| 65/65 [00:27<00:00,  2.35it/s]


Epoch 4/5, Train Loss: 0.4455, Val F1: 0.8202
No improvement in validation accuracy. Stopping this run.


  model.load_state_dict(torch.load(file_name))
  self.pid = os.fork()
Evaluating: 100%|██████████| 65/65 [00:27<00:00,  2.35it/s]


Confirming Validation F1 Score: 0.8211


  self.pid = os.fork()
Evaluating: 100%|██████████| 94/94 [00:40<00:00,  2.34it/s]


Test F1 Score: 0.7930


  self.pid = os.fork()
Evaluating: 100%|██████████| 49/49 [00:20<00:00,  2.36it/s]


Few Shot F1 Score: 0.7724


  self.pid = os.fork()
Evaluating: 100%|██████████| 46/46 [00:19<00:00,  2.34it/s]


Zero Shot F1 Score: 0.8147
Runtime Unassigned!


# MLP baseline (linear layers in place of the KAN layers)

In [None]:
# @title Dataset building
class StanceDataset(Dataset):
    """Map-style dataset that tokenizes (post, topic) pairs read from a CSV file.

    The CSV is loaded with pandas and is expected to provide the columns
    ``post``, ``topic_str`` and ``label``.

    Args:
        csv_file: Path (or file-like object) handed to ``pandas.read_csv``.
        tokenizer: Object exposing ``encode_plus(text, text_pair, ...)``.
        max_length: Length every encoded pair is padded / truncated to.
    """

    def __init__(self, csv_file, tokenizer, max_length=128):
        self.data = pd.read_csv(csv_file)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of rows in the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the encoded pair and label for row ``idx``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (both flattened to
            1-D) and ``stance`` (the label as a ``torch.long`` scalar tensor).
        """
        row = self.data.iloc[idx]
        label = int(row['label'])

        # The post is the first segment and the topic the second one.
        encoded = self.tokenizer.encode_plus(
            row['post'],
            row['topic_str'],
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # return_tensors='pt' yields a leading batch axis; flatten drops it.
        item = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['stance'] = torch.tensor(label, dtype=torch.long)
        return item

In [None]:
# @title QIES Arch

class KANLayer(nn.Module):
    """Fourier-series KAN layer mapping ``inputdim`` features to ``outdim`` features.

    For each output ``j`` the layer computes

        y[j] = sum_i sum_k ( a[j, i, k] * cos(k * x[i]) + b[j, i, k] * sin(k * x[i]) ) + bias[j]

    with frequencies ``k = 1 .. gridsize``; ``a`` is ``fouriercoeffs[0]`` and
    ``b`` is ``fouriercoeffs[1]``.

    Args:
        inputdim: Size of the last axis of the input.
        outdim: Size of the last axis of the output.
        initial_gridsize: Number of frequencies the coefficient tensor is
            allocated for; also the starting value of ``gridsize_param``.
        addbias: When True, a bias of shape ``(1, outdim)`` is added.
    """

    def __init__(self, inputdim, outdim, initial_gridsize=5, addbias=True):
        super(KANLayer, self).__init__()
        self.addbias = addbias
        self.inputdim = inputdim
        self.outdim = outdim

        # Number of frequencies to use, kept as a float parameter. forward()
        # rounds it to an integer, so no gradient reaches it from the output.
        self.gridsize_param = nn.Parameter(torch.tensor(initial_gridsize, dtype=torch.float32))

        # Fourier coefficients as a learnable parameter with Xavier initialization
        self.fouriercoeffs = nn.Parameter(torch.empty(2, outdim, inputdim, initial_gridsize))
        nn.init.xavier_uniform_(self.fouriercoeffs)

        if self.addbias:
            self.bias = nn.Parameter(torch.zeros(1, outdim))

    def _active_gridsize(self):
        """Return the number of frequencies to use as a plain Python int.

        The value is clamped to ``[1, allocated]``. Without the upper bound a
        ``gridsize_param`` above the allocated size would build more
        frequencies than there are coefficients and the broadcast in
        ``forward`` would fail.
        """
        allocated = self.fouriercoeffs.shape[-1]
        requested = int(torch.clamp(self.gridsize_param, min=1).round().item())
        return min(requested, allocated)

    def forward(self, x):
        """Apply the layer to ``x`` of shape ``(..., inputdim)``.

        Returns:
            Tensor of shape ``(..., outdim)``.
        """
        gridsize = self._active_gridsize()
        outshape = x.shape[:-1] + (self.outdim,)
        # Collapse all leading axes into one batch axis.
        x = torch.reshape(x, (-1, self.inputdim))
        # Frequencies 1..gridsize, shaped to broadcast over (batch, out, in, k).
        k = torch.reshape(torch.arange(1, gridsize + 1, device=x.device), (1, 1, 1, gridsize))
        xrshp = torch.reshape(x, (x.shape[0], 1, x.shape[1], 1))
        c = torch.cos(k * xrshp)
        s = torch.sin(k * xrshp)
        # Sum over the input axis and the frequency axis.
        y = torch.sum(c * self.fouriercoeffs[0:1, :, :, :gridsize], (-2, -1))
        y += torch.sum(s * self.fouriercoeffs[1:2, :, :, :gridsize], (-2, -1))
        if self.addbias:
            y += self.bias
        y = torch.reshape(y, outshape)
        return y

class KANStanceClassifier(nn.Module):
    """Classification head built from two KAN layers with a ReLU in between.

    Args:
        input_dim: Feature size fed to the first KAN layer.
        hidden_dim: Output size of the first and input size of the second layer.
        num_stances: Number of logits produced.
    """

    def __init__(self, input_dim, hidden_dim, num_stances):
        super().__init__()
        self.layer1 = KANLayer(input_dim, hidden_dim)
        self.activation = nn.ReLU()
        self.layer2 = KANLayer(hidden_dim, num_stances)

    def forward(self, x):
        """Return the logits ``layer2(relu(layer1(x)))``."""
        hidden = self.activation(self.layer1(x))
        return self.layer2(hidden)


# Initialize Stance Embeddings with Multiple Statements
def load_pretrained_stance_embeddings(stance_statements, model_name='all-mpnet-base-v2'):
    """Encode statements with a sentence-transformers model.

    A new ``SentenceTransformer(model_name)`` is constructed on every call.

    Args:
        stance_statements: Sentences passed to ``SentenceTransformer.encode``.
        model_name: Name of the sentence-transformers model to load.

    Returns:
        ``torch.Tensor`` built from the encoder output.
    """
    encoder = SentenceTransformer(model_name)
    return torch.tensor(encoder.encode(stance_statements))

def initialize_stance_embeddings(num_stances, embedding_dim):
    """Build one initial embedding per stance from hand-written statements.

    The statements of stances ``0 .. num_stances - 1`` are encoded with
    ``load_pretrained_stance_embeddings`` and averaged per stance. If the
    encoder width differs from ``embedding_dim`` the averages are mapped to
    ``embedding_dim`` with a freshly initialized (untrained, unseeded) bias-free
    linear layer, so the projected values differ between runs.

    Args:
        num_stances: Number of stances to build, between 1 and 3. Stance ids
            are 0 (against), 1 (in favor), 2 (neutral).
        embedding_dim: Width of the returned embeddings.

    Returns:
        Tensor of shape ``(num_stances, embedding_dim)``.

    Raises:
        ValueError: If ``num_stances`` is outside the supported range.
    """
    stance_statements = {
        0: [
            "I strongly oppose this idea.",
            "I completely disagree with this position.",
            "I am against this topic.",
            "I do not support this."
        ],
        1: [
            "I fully support this idea.",
            "I agree with this position.",
            "I am in favor of this topic.",
            "I endorse this."
        ],
        2: [
            "I have a neutral stance on this matter.",
            "I neither agree nor disagree with this position.",
            "I see valid points on both sides.",
            "I am indifferent towards this."
        ]
    }

    if not 1 <= num_stances <= len(stance_statements):
        raise ValueError(
            f"num_stances must be between 1 and {len(stance_statements)}, got {num_stances}"
        )

    groups = [stance_statements[stance] for stance in range(num_stances)]

    # Encode every statement in a single call: the loader builds a new sentence
    # encoder per call, so calling it once per stance reloads the model each time.
    flat_statements = [statement for group in groups for statement in group]
    encoded = load_pretrained_stance_embeddings(flat_statements)

    # Average the rows that belong to each stance.
    stance_embeddings = []
    start = 0
    for group in groups:
        stop = start + len(group)
        stance_embeddings.append(encoded[start:stop].mean(dim=0))
        start = stop

    pretrained_embeddings = torch.stack(stance_embeddings)

    # Project to desired dimensionality if necessary
    if pretrained_embeddings.shape[1] != embedding_dim:
        projection = nn.Linear(pretrained_embeddings.shape[1], embedding_dim, bias=False)
        with torch.no_grad():
            pretrained_embeddings = projection(pretrained_embeddings)

    return pretrained_embeddings

# class QIESModel(nn.Module):
#     def __init__(self, base_model, num_stances, embedding_dim, hidden_dim, context_dim, stance_embeddings):
#         super().__init__()
#         self.base_model = base_model
#         self.num_stances = num_stances
#         self.stance_embeddings = nn.Parameter(stance_embeddings)
#         nn.init.orthogonal_(self.stance_embeddings)

#         self.superposition = nn.Sequential(
#             KANLayer(hidden_dim, num_stances),
#             nn.Softmax(dim=-1)
#         )

#         self.collapse = nn.Sequential(
#             KANLayer(embedding_dim + context_dim, embedding_dim),
#             nn.ReLU(),
#             KANLayer(embedding_dim, 1)
#         )

#         self.classifier = KANStanceClassifier(hidden_dim + 1, hidden_dim, num_stances)

#     def forward(self, input_ids, attention_mask):
#         base_output = self.base_model(input_ids, attention_mask=attention_mask)
#         hidden_state = base_output.last_hidden_state[:, 0, :]  # Use CLS token

#         weights = self.superposition(hidden_state)
#         superposition = torch.matmul(weights.unsqueeze(1), self.stance_embeddings).squeeze(1)

#         context = hidden_state  # Using hidden state as context
#         combined = torch.cat([superposition, context], dim=-1)
#         collapsed_score = self.collapse(combined).squeeze(-1)

#         logits = self.classifier(torch.cat([hidden_state, collapsed_score.unsqueeze(1)], dim=1))
#         return logits, collapsed_score, weights


class QIESModel(nn.Module):
    """Stance classifier with linear (MLP) heads on top of a transformer encoder.

    Pipeline of ``forward``:
      1. take the first-token hidden state of ``base_model``;
      2. ``superposition`` turns it into softmax weights over the stances;
      3. the weights mix ``stance_embeddings`` into one vector;
      4. ``collapse`` maps [mixed vector, hidden state] to a scalar score;
      5. ``classifier`` maps [hidden state, score] to stance logits.

    Args:
        base_model: Encoder called as ``base_model(input_ids, attention_mask=...)``
            whose result exposes ``last_hidden_state``.
        num_stances: Number of stance classes.
        embedding_dim: Width of each stance embedding.
        hidden_dim: Width of the encoder hidden state.
        context_dim: Width of the context vector. The context is the hidden
            state itself, so this has to equal ``hidden_dim``.
        stance_embeddings: Initial stance embeddings of shape
            ``(num_stances, embedding_dim)``.

    Raises:
        ValueError: If ``stance_embeddings`` does not have the expected shape.
    """

    def __init__(self, base_model, num_stances, embedding_dim, hidden_dim, context_dim, stance_embeddings):
        super().__init__()
        if tuple(stance_embeddings.shape) != (num_stances, embedding_dim):
            raise ValueError(
                f"stance_embeddings must have shape ({num_stances}, {embedding_dim}), "
                f"got {tuple(stance_embeddings.shape)}"
            )
        self.base_model = base_model
        self.num_stances = num_stances
        # Keep the values that were passed in. The previous in-place orthogonal
        # re-initialization replaced them with a random matrix and, because the
        # parameter shared storage with the argument, also modified the caller's
        # tensor. Cloning gives the parameter its own storage.
        self.stance_embeddings = nn.Parameter(stance_embeddings.detach().clone())

        self.superposition = nn.Sequential(
            nn.Linear(hidden_dim, num_stances),
            nn.Softmax(dim=-1)
        )

        self.collapse = nn.Sequential(
            nn.Linear(embedding_dim + context_dim, embedding_dim),
            nn.ReLU(),
            nn.Linear(embedding_dim, 1)
        )

        # +1 because the collapsed score is appended to the hidden state.
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim + 1, hidden_dim + 1),
            nn.ReLU(),
            nn.Linear(hidden_dim + 1, num_stances)
        )

    def forward(self, input_ids, attention_mask):
        """Return ``(logits, collapsed_score, weights)`` for a batch.

        Returns:
            logits: ``(batch, num_stances)`` classifier outputs.
            collapsed_score: ``(batch,)`` scalar produced by ``collapse``.
            weights: ``(batch, num_stances)`` softmax weights over the stances.
        """
        base_output = self.base_model(input_ids, attention_mask=attention_mask)
        hidden_state = base_output.last_hidden_state[:, 0, :]  # Use CLS token

        weights = self.superposition(hidden_state)
        # (batch, 1, stances) @ (stances, emb) -> (batch, 1, emb) -> (batch, emb)
        superposition = torch.matmul(weights.unsqueeze(1), self.stance_embeddings).squeeze(1)

        context = hidden_state  # Using hidden state as context
        combined = torch.cat([superposition, context], dim=-1)
        collapsed_score = self.collapse(combined).squeeze(-1)

        logits = self.classifier(torch.cat([hidden_state, collapsed_score.unsqueeze(1)], dim=1))
        return logits, collapsed_score, weights

In [None]:
#  @title Training and evaluation functions
def train_qies(model, dataloader, optimizer, scheduler, scaler, device):
    """Run one mixed-precision training pass over ``dataloader``.

    Per batch the loss is ``cross_entropy + 0.1 * entropy(weights)``.

    Args:
        model: Module returning ``(logits, collapsed_score, weights)``.
        dataloader: Sized iterable of dicts with ``input_ids``,
            ``attention_mask`` and ``stance`` tensors.
        optimizer: Optimizer stepped through ``scaler``.
        scheduler: Learning-rate scheduler advanced once per applied step.
        scaler: Gradient scaler (``scale`` / ``step`` / ``update`` / ``get_scale``).
        device: Device the batch tensors are moved to.

    Returns:
        Mean loss over the batches.

    Raises:
        ZeroDivisionError: If ``dataloader`` is empty.
    """
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader, desc="Training"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        stance = batch['stance'].to(device)

        with autocast():
            logits, collapsed_score, weights = model(input_ids, attention_mask)
            loss = F.cross_entropy(logits, stance)
            entropy = -torch.sum(weights * torch.log(weights + 1e-10), dim=1).mean()
            # NOTE(review): the original comment here read "Encourage
            # superposition", but adding the entropy to a minimized loss
            # penalizes spread-out weights, i.e. it pushes towards a peaked
            # distribution. Confirm the intended sign; behavior is unchanged.
            loss += 0.1 * entropy

        scaler.scale(loss).backward()
        scale_before = scaler.get_scale()
        scaler.step(optimizer)
        scaler.update()
        # The scaler skips optimizer.step() when it finds inf/NaN gradients and
        # then lowers its scale in update(). Advance the schedule only when the
        # optimizer really stepped, so skipped steps do not consume warmup/decay.
        if scaler.get_scale() >= scale_before:
            scheduler.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

# def train_qies(model, dataloader, optimizer, scheduler, scaler, device, l2_lambda=0.01):
#     model.train()
#     total_loss = 0
#     for batch in tqdm(dataloader, desc="Training"):
#         optimizer.zero_grad()
#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         stance = batch['stance'].to(device)

#         with autocast():
#             logits, collapsed_score, weights = model(input_ids, attention_mask)
#             loss = F.cross_entropy(logits, stance)
#             entropy = -torch.sum(weights * torch.log(weights + 1e-10), dim=1).mean()
#             loss += 0.1 * entropy  # Encourage superposition

#             # Add L2 regularization for KAN layers
#             l2_reg = 0.0
#             for name, param in model.named_parameters():
#                 if 'fouriercoeffs' in name:
#                     l2_reg += torch.norm(param)
#             loss += l2_lambda * l2_reg

#         scaler.scale(loss).backward()
#         scaler.step(optimizer)
#         scaler.update()
#         scheduler.step()
#         total_loss += loss.item()
#     return total_loss / len(dataloader)

def evaluate(model, dataloader, device):
    """Compute the macro-averaged F1 score of ``model`` on ``dataloader``.

    The model is switched to eval mode and run without gradient tracking;
    predictions are the argmax of the logits over dimension 1.

    Args:
        model: Callable returning ``(logits, collapsed_score, weights)``.
        dataloader: Iterable of dicts with ``input_ids``, ``attention_mask``
            and ``stance`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        Result of ``f1_score(labels, predictions, average='macro')``.
    """
    model.eval()
    predictions, references = [], []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            labels = batch['stance'].to(device)

            # Only the logits are needed for scoring.
            logits, _score, _weights = model(ids, mask)
            predicted = torch.argmax(logits, dim=1)
            predictions.extend(predicted.cpu().numpy())
            references.extend(labels.cpu().numpy())
    return f1_score(references, predictions, average='macro')

In [None]:
# @title Model initialization
# Hyperparameters
base_model_name = "microsoft/deberta-v3-large"
num_stances = 3           #FIRST LET'S SEE WITH 3 STANCES ONLY
embedding_dim = 1024
hidden_dim = 1024
context_dim = 1024
batch_size = 32
num_epochs = 5
learning_rate = 2e-5
max_length = 256

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load base model and tokenizer
base_model = AutoModel.from_pretrained(base_model_name)
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Load datasets
train_dataset = StanceDataset('./vast_train.csv', tokenizer, max_length)
val_dataset = StanceDataset('./vast_dev.csv', tokenizer, max_length)
test_dataset = StanceDataset('./vast_test.csv', tokenizer, max_length)
zero_dataset = StanceDataset('./zero_shot.csv', tokenizer, max_length)
few_dataset = StanceDataset('./few_shot.csv', tokenizer, max_length)

# Create dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, num_workers=4, pin_memory=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, num_workers=4, pin_memory=True)
zero_dataloader = DataLoader(zero_dataset, batch_size=batch_size, num_workers=4, pin_memory=True)
few_dataloader = DataLoader(few_dataset, batch_size=batch_size, num_workers=4, pin_memory=True)

# Initialize Stance Embeddings
stance_embeddings = initialize_stance_embeddings(num_stances, embedding_dim).to(device)

# Initialize model
model = QIESModel(base_model, num_stances, embedding_dim, hidden_dim, context_dim, stance_embeddings).to(device)

# Optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(0.1*total_steps), num_training_steps=total_steps)

# Mixed precision training
scaler = GradScaler()


file_name = 'best_qies_model_large_256_mlps.pth'

# Training loop
best_val_f1 = 0
for epoch in range(num_epochs):
    train_loss = train_qies(model, train_dataloader, optimizer, scheduler, scaler, device)
    val_f1 = evaluate(model, val_dataloader, device)
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val F1: {val_f1:.4f}")

    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        torch.save(model.state_dict(), file_name)
    else:
        print("No improvement in validation accuracy. Stopping this run.")
        break

# Evaluate on test set
!cp ./drive/MyDrive/qies_files/best_qies_model1.pth .
model.load_state_dict(torch.load(file_name))
val_f1_again = evaluate(model, val_dataloader, device)
print(f"Confirming Validation F1 Score: {val_f1_again:.4f}")
test_f1 = evaluate(model, test_dataloader, device)
print(f"Test F1 Score: {test_f1:.4f}")
few_shot_score = evaluate(model, few_dataloader, device)
print(f"Few Shot F1 Score: {few_shot_score:.4f}")
zero_shot_score = evaluate(model, zero_dataloader, device)
print(f"Zero Shot F1 Score: {zero_shot_score:.4f}")


#Unassign
! cp ./*.pth ./drive/MyDrive/qies_files/
print("Runtime Unassigned!")
from google.colab import runtime
runtime.unassign()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  scaler = GradScaler()
  self.pid = os.fork()
  with autocast():
Training: 100%|██████████| 422/422 [02:42<00:00,  2.60it/s]
Evaluating: 100%|██████████| 65/65 [00:26<00:00,  2.41it/s]


Epoch 1/5, Train Loss: 0.8361, Val F1: 0.7682


  self.pid = os.fork()
  with autocast():
Training: 100%|██████████| 422/422 [02:40<00:00,  2.62it/s]
Evaluating: 100%|██████████| 65/65 [00:26<00:00,  2.42it/s]


Epoch 2/5, Train Loss: 0.5370, Val F1: 0.8059


  self.pid = os.fork()
  with autocast():
Training: 100%|██████████| 422/422 [02:40<00:00,  2.62it/s]
Evaluating: 100%|██████████| 65/65 [00:26<00:00,  2.42it/s]


Epoch 3/5, Train Loss: 0.4136, Val F1: 0.7988
No improvement in validation accuracy. Stopping this run.


  model.load_state_dict(torch.load(file_name))
  self.pid = os.fork()
Evaluating: 100%|██████████| 65/65 [00:26<00:00,  2.41it/s]


Confirming Validation F1 Score: 0.8059


  self.pid = os.fork()
Evaluating: 100%|██████████| 94/94 [00:39<00:00,  2.41it/s]


Test F1 Score: 0.7879


  self.pid = os.fork()
Evaluating: 100%|██████████| 49/49 [00:20<00:00,  2.42it/s]


Few Shot F1 Score: 0.7745


  self.pid = os.fork()
Evaluating: 100%|██████████| 46/46 [00:19<00:00,  2.40it/s]


Zero Shot F1 Score: 0.8020
