In [5]:
!pip install -q sentence-transformers transformers torch torchvision torchaudio gensim scikit-learn joblib tqdm

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [1]:
import os
import re
import math
import joblib
import random
from tqdm import tqdm
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device:', device)

Device: cuda


In [2]:
DATA_PATH = '/content/SRIP-Dataset-Negation.csv' # change if needed
if not os.path.exists(DATA_PATH):
    print("Dataset not found at", DATA_PATH)
# If running interactively, prompt user to upload
from google.colab import files
print('Please upload the dataset file (SRIP-Dataset-Negation.csv)')
uploaded = files.upload()
for fn in uploaded:
    DATA_PATH = '/content/' + fn
    print('Loaded', DATA_PATH)


df = pd.read_csv(DATA_PATH)
print('Loaded rows:', len(df))
print(df.columns)

Dataset not found at /content/SRIP-Dataset-Negation.csv
Please upload the dataset file (SRIP-Dataset-Negation.csv)


Saving SRIP-Dataset-Negation.csv to SRIP-Dataset-Negation.csv
Loaded /content/SRIP-Dataset-Negation.csv
Loaded rows: 8575
Index(['Class', 'Sentence 1', 'Sentence 2', 'Similarity Score',
       'Negation Count S1', 'Negation Count S2', 'Variation Type'],
      dtype='object')


In [3]:
NEG_WORDS = set(["not","no","never","n't","none","nobody","nowhere","neither","nor","cannot","can't","without","hardly","rarely","seldom","nothing","n’t"])
CONJUNCTS = set(["and","or","but","yet","so","for","nor"])

def tokenize_simple(s):
    s = str(s)
    s = re.sub(r"[^A-Za-z0-9\s']", ' ', s)
    toks = s.lower().split()
    return toks

def negation_features(s):
    toks = tokenize_simple(s)
    neg_positions = [i for i,t in enumerate(toks) if t in NEG_WORDS]
    conj_positions = [i for i,t in enumerate(toks) if t in CONJUNCTS]
    neg_count = len(neg_positions)
    parity = 0 if neg_count % 2 == 0 else 1
    relation = 'none'
    if neg_positions and conj_positions:
        neg_idx = neg_positions[0]
        conj_idx = conj_positions[0]
        if neg_idx < conj_idx:
            relation = 'neg_before_conj'
        elif neg_idx > conj_idx:
            relation = 'neg_after_conj'
        else:
            relation = 'neg_on_conj'
    return neg_count, parity, relation

# Apply to dataset
rows = []
for i, r in df.iterrows():
    s1 = r['Sentence 1']
    s2 = r['Sentence 2']
    n1, p1, rel1 = negation_features(s1)
    n2, p2, rel2 = negation_features(s2)
    rows.append({
        'neg_count_s1': n1,
        'neg_count_s2': n2,
        'parity_s1': p1,
        'parity_s2': p2,
        'parity_cancel': int((n1 + n2) % 2 == 0),
        'neg_rel_s1': rel1,
        'neg_rel_s2': rel2
    })
feat_df = pd.DataFrame(rows)
df = pd.concat([df.reset_index(drop=True), feat_df], axis=1)

# Optional: inspect
print(df[['Sentence 1','neg_count_s1','neg_rel_s1']].head())

                                          Sentence 1  neg_count_s1 neg_rel_s1
0         The process is waiting in the ready queue.             0       none
1      The process isn't waiting in the ready queue.             0       none
2         The process is waiting in the ready queue.             0       none
3      The process isn't waiting in the ready queue.             0       none
4  The process isn't waiting in the not ready queue.             1       none


In [6]:
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
from gensim.models import Word2Vec
import torch.nn.functional as F

# 1) SBERT
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
print('Loaded SBERT; dim =', sbert_model.get_sentence_embedding_dimension())

# 2) DistilBERT and RoBERTa tokenizers + models for mean pooling
hf_models = {
    'distilbert': 'distilbert-base-uncased',
    'roberta': 'roberta-base'
}

hf_tokenizers = {}
hf_models_loaded = {}
for name, mname in hf_models.items():
    print('Loading', mname)
    hf_tokenizers[name] = AutoTokenizer.from_pretrained(mname)
    hf_models_loaded[name] = AutoModel.from_pretrained(mname)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Loaded SBERT; dim = 384
Loading distilbert-base-uncased


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Loading roberta-base


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
hf_tokenizers['bert'] = AutoTokenizer.from_pretrained('bert-base-uncased')
bert_model = AutoModel.from_pretrained('bert-base-uncased')
bert_model.to(device)
bert_model.eval()
hf_models_loaded['bert'] = bert_model
print('Loaded BERT-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Loaded BERT-base-uncased


In [8]:
import torch
import numpy as np
import pandas as pd
from gensim.models import Word2Vec

# --- 1. Define Device for PyTorch Operations ---
# This is crucial for fixing the RuntimeError
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
# ------------------------------------------------

def hf_mean_pool(texts, tokenizer, model, device, batch_size=16):
    """
    Computes mean-pooled embeddings for texts using a Hugging Face model.
    The model and input tensors are moved to the specified device.
    """
    embs = []

    # FIX: Ensure the model is on the specified device
    model.to(device)

    model.eval()
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            enc = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')

            # Move token tensors to the same device as the model (device)
            enc = {k: v.to(device) for k, v in enc.items()}

            outputs = model(**enc)
            last = outputs.last_hidden_state       # (bs, seq, dim)

            # attention_mask is already on 'device'
            mask = enc['attention_mask'].unsqueeze(-1)

            # Mean pooling calculation
            mean_pooled = (last * mask).sum(1) / mask.sum(1)

            # Move result back to CPU before appending
            embs.append(mean_pooled.cpu().numpy())

    return np.vstack(embs)

# 3) Word2Vec trained on corpus
# Assuming 'df' and 'tokenize_simple' are defined in the broader scope
all_sentences = pd.concat([df['Sentence 1'], df['Sentence 2']]).astype(str).apply(tokenize_simple).tolist()
w2v = Word2Vec(sentences=all_sentences, vector_size=200, window=5, min_count=1, epochs=40)

def w2v_embed(texts):
    embs = []
    for t in texts:
        toks = tokenize_simple(t)
        vecs = [w2v.wv[w] for w in toks if w in w2v.wv]
        if len(vecs) == 0:
            embs.append(np.zeros(w2v.vector_size))
        else:
            embs.append(np.mean(vecs, axis=0))
    return np.vstack(embs)

# Now compute embeddings
S1 = df['Sentence 1'].astype(str).tolist()
S2 = df['Sentence 2'].astype(str).tolist()

print('Computing SBERT embeddings...')
# Assuming sbert_model is initialized and already moved to the correct device
emb_sbert_s1 = sbert_model.encode(S1, show_progress_bar=True, convert_to_numpy=True)
emb_sbert_s2 = sbert_model.encode(S2, show_progress_bar=True, convert_to_numpy=True)

print('Computing BERT mean-pooled embeddings...')
# 🛠️ Pass the device argument
emb_bert_s1 = hf_mean_pool(S1, hf_tokenizers['bert'], hf_models_loaded['bert'], device=device, batch_size=16)
emb_bert_s2 = hf_mean_pool(S2, hf_tokenizers['bert'], hf_models_loaded['bert'], device=device, batch_size=16)

print('Computing DistilBERT mean-pooled embeddings...')
# 🛠️ Pass the device argument (where the original error occurred)
emb_distil_s1 = hf_mean_pool(S1, hf_tokenizers['distilbert'], hf_models_loaded['distilbert'], device=device, batch_size=16)
emb_distil_s2 = hf_mean_pool(S2, hf_tokenizers['distilbert'], hf_models_loaded['distilbert'], device=device, batch_size=16)

print('Computing RoBERTa mean-pooled embeddings...')
# 🛠️ Pass the device argument
emb_roberta_s1 = hf_mean_pool(S1, hf_tokenizers['roberta'], hf_models_loaded['roberta'], device=device, batch_size=16)
emb_roberta_s2 = hf_mean_pool(S2, hf_tokenizers['roberta'], hf_models_loaded['roberta'], device=device, batch_size=16)

print('Computing Word2Vec embeddings...')
emb_w2v_s1 = w2v_embed(S1)
emb_w2v_s2 = w2v_embed(S2)

# Concatenate embeddings
emb_s1 = np.concatenate([
    emb_sbert_s1,
    emb_distil_s1,
    emb_roberta_s1,
    emb_bert_s1,
    emb_w2v_s1
], axis=1)

emb_s2 = np.concatenate([
    emb_sbert_s2,
    emb_distil_s2,
    emb_roberta_s2,
    emb_bert_s2,
    emb_w2v_s2
], axis=1)

print('Final embedding dims:', emb_s1.shape, emb_s2.shape)

Using device: cuda
Computing SBERT embeddings...


Batches:   0%|          | 0/268 [00:00<?, ?it/s]

Batches:   0%|          | 0/268 [00:00<?, ?it/s]

Computing BERT mean-pooled embeddings...
Computing DistilBERT mean-pooled embeddings...
Computing RoBERTa mean-pooled embeddings...
Computing Word2Vec embeddings...
Final embedding dims: (8575, 2888) (8575, 2888)


In [9]:
import torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib
# Assuming emb_s1, emb_s2, and df are available from the previous steps

# --- 1. Dataset Class Definition ---

class NegationSimilarityDataset(Dataset):
    """
    PyTorch Dataset for Sentence Similarity task combining two embeddings
    and custom negation features.
    """
    def __init__(self, emb1, emb2, neg_features, targets=None):
        # Convert numpy arrays to PyTorch tensors and ensure float32 type
        self.emb1 = torch.tensor(emb1, dtype=torch.float32)
        self.emb2 = torch.tensor(emb2, dtype=torch.float32)
        self.neg = torch.tensor(neg_features, dtype=torch.float32)

        # Targets are optional (e.g., for inference/prediction)
        # Squeeze is important to remove any trailing dimension if y is (N, 1)
        self.targets = torch.tensor(targets, dtype=torch.float32).squeeze() if targets is not None else None

    def __len__(self):
        return len(self.emb1)

    def __getitem__(self, idx):
        # Returns the embeddings, negation features, and the target score (if available)
        if self.targets is None:
            return self.emb1[idx], self.emb2[idx], self.neg[idx]
        # Tuple order: (Sentence 1 Emb, Sentence 2 Emb, Negation Features, Target Score)
        return self.emb1[idx], self.emb2[idx], self.neg[idx], self.targets[idx]


# --- 2. Prepare and Scale Negation Features ---

# Select the numerical negation features you want to use
neg_features = df[['neg_count_s1','neg_count_s2','parity_cancel']].astype(float).values

# Initialize and fit the StandardScaler
scaler = StandardScaler()
neg_scaled = scaler.fit_transform(neg_features)

print(f"Negation features scaled successfully. Shape: {neg_scaled.shape}")

# Save the scaler object for later use on test/inference data
joblib.dump({'scaler': scaler}, 'neg_scaler.joblib')


# --- 3. Prepare Target Variable ---

# Target is typically the Similarity Score
y = df['Similarity Score'].astype(float).values


# --- 4. Train/Validation Split ---

# Split all features and the target simultaneously
X1_train, X1_val, X2_train, X2_val, neg_train, neg_val, y_train, y_val = train_test_split(
    emb_s1,          # Sentence 1 embeddings
    emb_s2,          # Sentence 2 embeddings
    neg_scaled,      # Scaled negation features
    y,               # Target scores
    test_size=0.15,  # 15% for validation set
    random_state=42
)

print(f"Train/Val split done. Train samples: {len(y_train)}, Validation samples: {len(y_val)}")


# --- 5. Create Dataset and DataLoader Instances ---

# Create Dataset instances
train_ds = NegationSimilarityDataset(X1_train, X2_train, neg_train, y_train)
val_ds = NegationSimilarityDataset(X1_val, X2_val, neg_val, y_val)

# Create DataLoader instances
# 🛠️ CRITICAL FIX: Set num_workers=0 to prevent multiprocessing AssertionErrors in notebooks
train_loader = DataLoader(train_ds, batch_size=64, shuffle=True, num_workers=0)
val_loader = DataLoader(val_ds, batch_size=128, shuffle=False, num_workers=0)

print("\nDataLoaders created with num_workers=0 (for stability):")
print(f"  Training batches (batch_size=64): {len(train_loader)}")
print(f"  Validation batches (batch_size=128): {len(val_loader)}")

Negation features scaled successfully. Shape: (8575, 3)
Train/Val split done. Train samples: 7288, Validation samples: 1287

DataLoaders created with num_workers=0 (for stability):
  Training batches (batch_size=64): 114
  Validation batches (batch_size=128): 11


In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F
# Assuming device, emb_s1, neg_scaled are defined in your environment

# --- Siamese BiLSTM Model ---

class SiameseBiLSTM(nn.Module):
    # Pass all model parameters to the __init__ method
    def __init__(self, input_dim, neg_dim, proj_dim, lstm_hidden, seq_len):
        super().__init__()
        self.seq_len = seq_len

        # Input validation for projection dimension
        assert proj_dim % seq_len == 0, 'proj_dim must be divisible by seq_len'
        self.chunk = proj_dim // seq_len # Size of the input to the BiLSTM

        # 1. Projection (from combined embedding size down to a size that can be chunked)
        self.proj = nn.Sequential(
            nn.Linear(input_dim, proj_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(proj_dim, proj_dim),
            nn.ReLU()
        )

        # 2. Bi-LSTM (used as the Siamese feature extractor)
        # batch_first=False means input is (seq_len, batch, feature_size)
        self.bilstm = nn.LSTM(
            input_size=self.chunk,
            hidden_size=lstm_hidden,
            batch_first=False,
            bidirectional=True
        )

        # 3. Final MLP (Classifier head)
        # Feature dimension calculation:
        # - f1, f2: 2 * lstm_hidden (from BiLSTM)
        # - absdiff, mult: 2 * lstm_hidden each
        # Total from Siamese part: (2*lstm_hidden) * 4
        combined_dim = (lstm_hidden * 2) * 4 + neg_dim

        self.mlp = nn.Sequential(
            nn.Linear(combined_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Linear(128, 1) # Final output is a single similarity score
        )

    def forward_once(self, x):
        # x: (batch, input_dim) - This is the input sentence embedding

        # Projection and ReLU activation
        p = self.proj(x) # (batch, proj_dim)

        # Reshape for BiLSTM input: (batch, seq_len, chunk_size)
        b = p.view(p.size(0), self.seq_len, self.chunk)

        # Permute to (seq_len, batch, chunk_size) for batch_first=False
        b = b.permute(1, 0, 2)

        # BiLSTM forward pass
        out, _ = self.bilstm(b) # out: (seq_len, batch, 2*lstm_hidden)

        # Take the feature vector from the last timestep
        feat = out[-1] # (batch, 2*lstm_hidden)
        return feat

    def forward(self, x1, x2, negf):
        # x1, x2: sentence embeddings
        # negf: negation features

        # Pass both sentences through the shared feature extractor (Siamese)
        f1 = self.forward_once(x1)
        f2 = self.forward_once(x2)

        # Compute interaction features
        absdiff = torch.abs(f1 - f2)
        mult = f1 * f2

        # Merge all features: f1, f2, diff, mult, negation features
        merged = torch.cat([f1, f2, absdiff, mult, negf], dim=1)

        # Pass through the final MLP classifier
        out = self.mlp(merged)

        # Return the output score, removing the dimension of size 1
        return out.squeeze(1)


# --- 4. Instantiate the Model ---
# Assuming these variables are correctly set up from your previous code:
input_dim = emb_s1.shape[1]    # e.g., 3072 if combining 4 BERT-like models + W2V
neg_dim = neg_scaled.shape[1]  # e.g., 3 features
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


model = SiameseBiLSTM(
    input_dim=input_dim,
    neg_dim=neg_dim,
    proj_dim=512,
    lstm_hidden=128,
    seq_len=8
).to(device) # Move the entire model to the defined device

print(model)
print(f"\nTotal parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

SiameseBiLSTM(
  (proj): Sequential(
    (0): Linear(in_features=2888, out_features=512, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.1, inplace=False)
    (3): Linear(in_features=512, out_features=512, bias=True)
    (4): ReLU()
  )
  (bilstm): LSTM(64, 128, bidirectional=True)
  (mlp): Sequential(
    (0): Linear(in_features=1027, out_features=512, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=512, out_features=128, bias=True)
    (4): ReLU()
    (5): Linear(in_features=128, out_features=1, bias=True)
  )
)

Total parameters: 2,532,609


In [11]:
import torch
import torch.nn as nn
import numpy as np
import math
import joblib
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error

criterion = nn.MSELoss()


def evaluate(model, loader, device):
    model.eval()
    preds = []
    trues = []
    with torch.no_grad():
        for batch in loader:
            x1, x2, neg, yb = batch
            x1 = x1.to(device)
            x2 = x2.to(device)
            neg = neg.to(device)
            yb = yb.to(device)

            out = model(x1, x2, neg)

            preds.append(out.detach().cpu().numpy())
            trues.append(yb.detach().cpu().numpy())

    preds = np.concatenate(preds)
    trues = np.concatenate(trues)

    rmse = math.sqrt(mean_squared_error(trues, preds))
    return rmse, preds, trues


# --- Main Training Loop ---

def train_model(model, train_loader, val_loader, optimizer, criterion, num_epochs, patience, device, scaler, w2v):
    best_rmse = float('inf')
    wait = 0

    train_loss_history = []
    val_rmse_history = []

    print(f"Starting training for {num_epochs} epochs on device: {device}")

    for epoch in range(1, num_epochs + 1):
        # --- Training Phase ---
        model.train()
        losses = []
        for x1, x2, neg, yb in tqdm(train_loader, desc=f'Train epoch {epoch}'):
            x1 = x1.to(device)
            x2 = x2.to(device)
            neg = neg.to(device)
            yb = yb.to(device)

            optimizer.zero_grad()
            out = model(x1, x2, neg)
            loss = criterion(out, yb)
            loss.backward()
            optimizer.step()

            losses.append(loss.item())

        train_loss = np.mean(losses)
        train_loss_history.append(train_loss)

        # --- Validation Phase ---
        val_rmse, _, _ = evaluate(model, val_loader, device)
        val_rmse_history.append(val_rmse)

        print(f'Epoch {epoch}: train_loss={train_loss:.4f} val_RMSE={val_rmse:.4f}')

        # --- Early Stopping and Checkpoint ---
        if val_rmse < best_rmse:
            best_rmse = val_rmse
            wait = 0
            torch.save(model.state_dict(), 'siamese_bilstm_negation_best.pt')
            joblib.dump({'scaler': scaler}, 'negation_scaler.joblib')
            print(f"   -> Model saved! Best RMSE: {best_rmse:.4f}")
        else:
            wait += 1
            if wait >= patience:
                print(f"Early stopping triggered after {patience} epochs without improvement.")
                break

    joblib.dump(w2v, 'w2v.joblib')
    print("\nTraining complete.")

    return train_loss_history, val_rmse_history

In [16]:
import numpy as np
import torch
import torch.nn as nn
import joblib
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score


# Define the training parameters from the code's context
NUM_EPOCHS = 30
PATIENCE = 4

# Initialize the Optimizer (Assuming model is already defined and moved to device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-5)
criterion = nn.MSELoss() # Assuming criterion is defined globally

# Call the training function
train_losses, val_rmses = train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    criterion=criterion,
    num_epochs=NUM_EPOCHS,
    patience=PATIENCE,
    device=device,
    scaler=scaler,
    w2v=w2v
)

print("\n--- Training Results ---")
print(f"Training Losses: {train_losses}")
print(f"Validation RMSEs: {val_rmses}")

try:
    # Use map_location to ensure compatibility regardless of where it was saved (GPU/CPU)
    model.load_state_dict(torch.load('siamese_bilstm_negation_best.pt', map_location=device))
    print("\nSuccessfully loaded best model weights for final evaluation.")
except FileNotFoundError:
    print("\nWARNING: Best model weights ('siamese_bilstm_negation_best.pt') not found.")
    print("Proceeding with the model's current (last epoch) state.")

final_rmse, val_preds, val_trues = evaluate(model, val_loader, device)

print(f"Final RMSE (from loaded best model): {final_rmse:.4f}")

# --- Classification Metrics (Using Threshold 55.0) ---

threshold = 55.0 # Using the requested threshold of 55.0

# 1. Binarize True Scores and Predictions based on the threshold
val_trues_binary = (val_trues > threshold).astype(int)
val_preds_binary = (val_preds > threshold).astype(int)

# 2. Calculate Classification Metrics
accuracy = accuracy_score(val_trues_binary, val_preds_binary)
precision = precision_score(val_trues_binary, val_preds_binary, zero_division=0)
recall = recall_score(val_trues_binary, val_preds_binary, zero_division=0)
f1 = f1_score(val_trues_binary, val_preds_binary, zero_division=0)

print(f"\n--- Classification Metrics (Threshold > {threshold}) ---")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")


Starting training for 30 epochs on device: cuda


Train epoch 1:   0%|          | 0/114 [00:00<?, ?it/s]

Epoch 1: train_loss=1751.0322 val_RMSE=22.6685
   -> Model saved! Best RMSE: 22.6685


Train epoch 2:   0%|          | 0/114 [00:00<?, ?it/s]

Epoch 2: train_loss=511.9361 val_RMSE=21.8359
   -> Model saved! Best RMSE: 21.8359


Train epoch 3:   0%|          | 0/114 [00:00<?, ?it/s]

Epoch 3: train_loss=355.5505 val_RMSE=16.3608
   -> Model saved! Best RMSE: 16.3608


Train epoch 4:   0%|          | 0/114 [00:00<?, ?it/s]

Epoch 4: train_loss=268.7439 val_RMSE=15.6206
   -> Model saved! Best RMSE: 15.6206


Train epoch 5:   0%|          | 0/114 [00:00<?, ?it/s]

Epoch 5: train_loss=246.9134 val_RMSE=15.5094
   -> Model saved! Best RMSE: 15.5094


Train epoch 6:   0%|          | 0/114 [00:00<?, ?it/s]

Epoch 6: train_loss=236.6486 val_RMSE=15.2935
   -> Model saved! Best RMSE: 15.2935


Train epoch 7:   0%|          | 0/114 [00:00<?, ?it/s]

Epoch 7: train_loss=226.6588 val_RMSE=15.0478
   -> Model saved! Best RMSE: 15.0478


Train epoch 8:   0%|          | 0/114 [00:00<?, ?it/s]

Epoch 8: train_loss=217.8476 val_RMSE=14.8068
   -> Model saved! Best RMSE: 14.8068


Train epoch 9:   0%|          | 0/114 [00:00<?, ?it/s]

Epoch 9: train_loss=206.9132 val_RMSE=14.2874
   -> Model saved! Best RMSE: 14.2874


Train epoch 10:   0%|          | 0/114 [00:00<?, ?it/s]

Epoch 10: train_loss=192.2293 val_RMSE=13.9044
   -> Model saved! Best RMSE: 13.9044


Train epoch 11:   0%|          | 0/114 [00:00<?, ?it/s]

Epoch 11: train_loss=177.8503 val_RMSE=13.4570
   -> Model saved! Best RMSE: 13.4570


Train epoch 12:   0%|          | 0/114 [00:00<?, ?it/s]

Epoch 12: train_loss=161.6315 val_RMSE=12.7535
   -> Model saved! Best RMSE: 12.7535


Train epoch 13:   0%|          | 0/114 [00:00<?, ?it/s]

Epoch 13: train_loss=144.4577 val_RMSE=12.0444
   -> Model saved! Best RMSE: 12.0444


Train epoch 14:   0%|          | 0/114 [00:00<?, ?it/s]

Epoch 14: train_loss=131.3885 val_RMSE=11.5822
   -> Model saved! Best RMSE: 11.5822


Train epoch 15:   0%|          | 0/114 [00:00<?, ?it/s]

Epoch 15: train_loss=119.2452 val_RMSE=11.3021
   -> Model saved! Best RMSE: 11.3021


Train epoch 16:   0%|          | 0/114 [00:00<?, ?it/s]

Epoch 16: train_loss=110.6574 val_RMSE=10.7635
   -> Model saved! Best RMSE: 10.7635


Train epoch 17:   0%|          | 0/114 [00:00<?, ?it/s]

Epoch 17: train_loss=103.7012 val_RMSE=11.0928


Train epoch 18:   0%|          | 0/114 [00:00<?, ?it/s]

Epoch 18: train_loss=94.7809 val_RMSE=10.2850
   -> Model saved! Best RMSE: 10.2850


Train epoch 19:   0%|          | 0/114 [00:00<?, ?it/s]

Epoch 19: train_loss=87.4032 val_RMSE=9.9283
   -> Model saved! Best RMSE: 9.9283


Train epoch 20:   0%|          | 0/114 [00:00<?, ?it/s]

Epoch 20: train_loss=80.9280 val_RMSE=9.5039
   -> Model saved! Best RMSE: 9.5039


Train epoch 21:   0%|          | 0/114 [00:00<?, ?it/s]

Epoch 21: train_loss=70.7227 val_RMSE=8.7516
   -> Model saved! Best RMSE: 8.7516


Train epoch 22:   0%|          | 0/114 [00:00<?, ?it/s]

Epoch 22: train_loss=60.8300 val_RMSE=8.1566
   -> Model saved! Best RMSE: 8.1566


Train epoch 23:   0%|          | 0/114 [00:00<?, ?it/s]

Epoch 23: train_loss=53.3202 val_RMSE=7.7063
   -> Model saved! Best RMSE: 7.7063


Train epoch 24:   0%|          | 0/114 [00:00<?, ?it/s]

Epoch 24: train_loss=45.7402 val_RMSE=6.9138
   -> Model saved! Best RMSE: 6.9138


Train epoch 25:   0%|          | 0/114 [00:00<?, ?it/s]

Epoch 25: train_loss=39.9052 val_RMSE=6.7280
   -> Model saved! Best RMSE: 6.7280


Train epoch 26:   0%|          | 0/114 [00:00<?, ?it/s]

Epoch 26: train_loss=35.4117 val_RMSE=6.3106
   -> Model saved! Best RMSE: 6.3106


Train epoch 27:   0%|          | 0/114 [00:00<?, ?it/s]

Epoch 27: train_loss=32.5335 val_RMSE=6.2442
   -> Model saved! Best RMSE: 6.2442


Train epoch 28:   0%|          | 0/114 [00:00<?, ?it/s]

Epoch 28: train_loss=30.9704 val_RMSE=6.3357


Train epoch 29:   0%|          | 0/114 [00:00<?, ?it/s]

Epoch 29: train_loss=28.9509 val_RMSE=5.6896
   -> Model saved! Best RMSE: 5.6896


Train epoch 30:   0%|          | 0/114 [00:00<?, ?it/s]

Epoch 30: train_loss=26.4746 val_RMSE=5.5610
   -> Model saved! Best RMSE: 5.5610

Training complete.

--- Training Results ---
Training Losses: [np.float64(1751.0321730228893), np.float64(511.9361371492085), np.float64(355.550487049839), np.float64(268.74392807274535), np.float64(246.91338107460425), np.float64(236.64860159890694), np.float64(226.65883462470873), np.float64(217.8475571347956), np.float64(206.91320372464364), np.float64(192.22930506656044), np.float64(177.85032627038788), np.float64(161.6315234669468), np.float64(144.45773917750307), np.float64(131.3885093822814), np.float64(119.24516457005551), np.float64(110.65742004126834), np.float64(103.70116524947316), np.float64(94.7808822497987), np.float64(87.4031840876529), np.float64(80.92797272665459), np.float64(70.72272344221149), np.float64(60.82998081675747), np.float64(53.320205772132205), np.float64(45.74016049033717), np.float64(39.905238820795425), np.float64(35.41172977915981), np.float64(32.53351835618939), np.flo

In [17]:
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from torch.utils.data import DataLoader, TensorDataset
from tqdm.auto import tqdm

In [25]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score, precision_score, recall_score
from transformers import AutoTokenizer, AutoModel
from peft import LoraConfig, get_peft_model, TaskType
from tqdm import tqdm
import math

# --- CONFIGURATION ---
MODEL_NAME = "distilbert-base-uncased"
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
THRESHOLD = 55.0  # Classification threshold
RANDOM_STATE = 42
TEST_SIZE = 0.15
BATCH_SIZE = 32
NUM_EPOCHS = 15
LORA_R = 8        # LoRA rank
LORA_ALPHA = 16   # LoRA scaling
LORA_DROPOUT = 0.1

# --- 1. DATASET CLASS (Text-based) ---

class DistilBERTTextDataset(Dataset):
    """Dataset for Sentence Similarity using raw text."""
    def __init__(self, sentences1, sentences2, targets=None):
        self.sentences1 = sentences1
        self.sentences2 = sentences2

        # --- FIX IS HERE: Explicitly converting to torch.float32 ---
        if targets is not None:
             self.targets = torch.tensor(targets, dtype=torch.float32)
        else:
             self.targets = None

    def __len__(self):
        return len(self.sentences1)

    def __getitem__(self, idx):
        if self.targets is None:
            return self.sentences1[idx], self.sentences2[idx]
        return self.sentences1[idx], self.sentences2[idx], self.targets[idx]

# --- 2. MODEL CLASS (LoRA + Regression Head) ---

class LoRADistilBERTRegressor(nn.Module):
    def __init__(self, model_name, lora_config):
        super().__init__()
        # Load the base model
        self.base_model = AutoModel.from_pretrained(model_name)

        # Apply LoRA
        self.lora_model = get_peft_model(self.base_model, lora_config)

        # Add a regression head on top of the CLS token output
        embedding_dim = self.lora_model.config.dim
        self.regressor = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(embedding_dim * 2, 512),  # Concatenated S1 and S2 embeddings
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(512, 1) # Output a single similarity score
        )

    def forward(self, input_ids_s1, attention_mask_s1, input_ids_s2, attention_mask_s2):
        # Get S1 embedding
        output_s1 = self.lora_model(input_ids=input_ids_s1, attention_mask=attention_mask_s1)
        # Use [CLS] token representation (first token)
        emb_s1 = output_s1.last_hidden_state[:, 0, :]

        # Get S2 embedding
        output_s2 = self.lora_model(input_ids=input_ids_s2, attention_mask=attention_mask_s2)
        emb_s2 = output_s2.last_hidden_state[:, 0, :]

        # Concatenate embeddings
        combined_emb = torch.cat((emb_s1, emb_s2), dim=-1)

        # Pass through the regression head
        score = self.regressor(combined_emb)
        return score.squeeze(-1) # Return shape (batch_size,)





In [26]:
def train_model_lora(model, train_loader, optimizer, criterion, tokenizer, num_epochs, device):
    model.train()
    print(f"LoRA training started on {device}")

    for epoch in range(1, num_epochs + 1):
        total_loss = 0
        for s1, s2, target in tqdm(train_loader, desc=f"Epoch {epoch} Training"):
            # Tokenize batch
            inputs_s1 = tokenizer(s1, padding=True, truncation=True, return_tensors="pt").to(device)
            inputs_s2 = tokenizer(s2, padding=True, truncation=True, return_tensors="pt").to(device)
            target = target.to(device)

            optimizer.zero_grad()

            outputs = model(
                inputs_s1['input_ids'], inputs_s1['attention_mask'],
                inputs_s2['input_ids'], inputs_s2['attention_mask']
            )

            loss = criterion(outputs, target)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch}: Train Loss = {avg_loss:.4f}")
        # In a real setup, you'd call evaluate_model_lora here and check for early stopping.


In [27]:



# --- 4. EVALUATION FUNCTION ---

def evaluate_model_lora(model, val_loader, tokenizer, device, threshold):
    model.eval()
    val_preds, val_trues = [], []

    with torch.no_grad():
        for s1, s2, target in tqdm(val_loader, desc="Validation"):
            inputs_s1 = tokenizer(s1, padding=True, truncation=True, return_tensors="pt").to(device)
            inputs_s2 = tokenizer(s2, padding=True, truncation=True, return_tensors="pt").to(device)

            outputs = model(
                inputs_s1['input_ids'], inputs_s1['attention_mask'],
                inputs_s2['input_ids'], inputs_s2['attention_mask']
            )

            val_preds.extend(outputs.cpu().numpy())
            val_trues.extend(target.numpy())

    val_preds = np.array(val_preds)
    val_trues = np.array(val_trues)

    # Regression Metric
    rmse = math.sqrt(mean_squared_error(val_trues, val_preds))

    # Classification Metrics
    val_trues_binary = (val_trues > threshold).astype(int)
    val_preds_binary = (val_preds > threshold).astype(int)

    accuracy = accuracy_score(val_trues_binary, val_preds_binary)
    precision = precision_score(val_trues_binary, val_preds_binary, zero_division=0)
    recall = recall_score(val_trues_binary, val_preds_binary, zero_division=0)
    f1 = f1_score(val_trues_binary, val_preds_binary, zero_division=0)

    return rmse, accuracy, precision, recall, f1



In [28]:
# --- 5. EXECUTION BLOCK ---

if __name__ == '__main__':
    # Load Data
    df = pd.read_csv('SRIP-Dataset-Negation.csv')

    X_s1 = df['Sentence 1'].values
    X_s2 = df['Sentence 2'].values
    y = df['Similarity Score'].values.astype(float)

    # Split Data (Text and Target)
    X_train_s1, X_val_s1, X_train_s2, X_val_s2, y_train, y_val = train_test_split(
        X_s1, X_s2, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )

    # Create Dataset and DataLoader
    train_ds = DistilBERTTextDataset(X_train_s1, X_train_s2, y_train)
    val_ds = DistilBERTTextDataset(X_val_s1, X_val_s2, y_val)

    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)

    # Initialize LoRA Model
    lora_config = LoraConfig(
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        target_modules=["q_lin", "v_lin"], # Common targets for DistilBERT self-attention
        lora_dropout=LORA_DROPOUT,
        bias="none",
        task_type=TaskType.FEATURE_EXTRACTION
    )

    model = LoRADistilBERTRegressor(MODEL_NAME, lora_config).to(DEVICE)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # Optimizer and Loss
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    criterion = nn.MSELoss()

    # Run Training
    train_model_lora(model, train_loader, optimizer, criterion, tokenizer, NUM_EPOCHS, DEVICE)

    # Run Evaluation
    rmse, acc, prec, rec, f1 = evaluate_model_lora(model, val_loader, tokenizer, DEVICE, THRESHOLD)

    print("\n--- LoRA DistilBERT Results ---")
    print(f"RMSE: {rmse:.4f}")
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1-Score: {f1:.4f}")

LoRA training started on cuda


Epoch 1 Training: 100%|██████████| 228/228 [00:18<00:00, 12.50it/s]


Epoch 1: Train Loss = 2317.1921


Epoch 2 Training: 100%|██████████| 228/228 [00:18<00:00, 12.37it/s]


Epoch 2: Train Loss = 673.4727


Epoch 3 Training: 100%|██████████| 228/228 [00:18<00:00, 12.45it/s]


Epoch 3: Train Loss = 506.1685


Epoch 4 Training: 100%|██████████| 228/228 [00:18<00:00, 12.62it/s]


Epoch 4: Train Loss = 458.8942


Epoch 5 Training: 100%|██████████| 228/228 [00:18<00:00, 12.63it/s]


Epoch 5: Train Loss = 377.2837


Epoch 6 Training: 100%|██████████| 228/228 [00:18<00:00, 12.50it/s]


Epoch 6: Train Loss = 319.3615


Epoch 7 Training: 100%|██████████| 228/228 [00:18<00:00, 12.44it/s]


Epoch 7: Train Loss = 286.9155


Epoch 8 Training: 100%|██████████| 228/228 [00:18<00:00, 12.59it/s]


Epoch 8: Train Loss = 268.5393


Epoch 9 Training: 100%|██████████| 228/228 [00:18<00:00, 12.52it/s]


Epoch 9: Train Loss = 253.9246


Epoch 10 Training: 100%|██████████| 228/228 [00:18<00:00, 12.64it/s]


Epoch 10: Train Loss = 245.1441


Epoch 11 Training: 100%|██████████| 228/228 [00:18<00:00, 12.58it/s]


Epoch 11: Train Loss = 240.0026


Epoch 12 Training: 100%|██████████| 228/228 [00:18<00:00, 12.63it/s]


Epoch 12: Train Loss = 234.2073


Epoch 13 Training: 100%|██████████| 228/228 [00:18<00:00, 12.62it/s]


Epoch 13: Train Loss = 230.0606


Epoch 14 Training: 100%|██████████| 228/228 [00:18<00:00, 12.61it/s]


Epoch 14: Train Loss = 224.8030


Epoch 15 Training: 100%|██████████| 228/228 [00:18<00:00, 12.53it/s]


Epoch 15: Train Loss = 222.6004


Validation: 100%|██████████| 41/41 [00:01<00:00, 25.08it/s]


--- LoRA DistilBERT Results ---
RMSE: 14.8680
Accuracy: 0.7218
Precision: 0.6031
Recall: 0.9491
F1-Score: 0.7375





In [41]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score, precision_score, recall_score
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset
from tqdm import tqdm
import math

# --- CONFIGURATION ---
SBERT_LORA_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2" # Base model used for training
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BATCH_SIZE = 128

# File Paths (from your previous saving step)
MODEL_LOAD_PATH = 'sbert_fine_tuned.pt'
TOKENIZER_LOAD_PATH = 'sbert_tokenizer_artifacts'

# Evaluation Parameters for STS-B (0-5 scale)
STS_B_CLASSIFICATION_THRESHOLD = 3.0
ORIGINAL_SCORE_MAX = 100.0 # Max score your fine-tuned model was trained on
STS_B_SCORE_MAX = 5.0      # Max score of the STS-B dataset


# --- 1. DATASET CLASS ---
class SBERTTextDataset(Dataset):
    def __init__(self, sentences1, sentences2, targets):
        self.sentences1 = sentences1
        self.sentences2 = sentences2
        self.targets = torch.tensor(targets, dtype=torch.float32)

    def __len__(self):
        return len(self.sentences1)

    def __getitem__(self, idx):
        return self.sentences1[idx], self.sentences2[idx], self.targets[idx]

# --- 2. MODEL CLASS (Required to load state_dict) ---
# NOTE: Ensure this SBERTRegressor definition EXACTLY matches your training definition.

class SBERTRegressor(nn.Module):
    def __init__(self, model_name):
        super().__init__()
        self.base_model = AutoModel.from_pretrained(model_name)
        embedding_dim = self.base_model.config.hidden_size

        self.regressor = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(embedding_dim * 2, 512),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(512, 1)
        )

    def _mean_pooling(self, model_output, attention_mask):
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return sum_embeddings / sum_mask

    def forward(self, input_ids_s1, attention_mask_s1, input_ids_s2, attention_mask_s2):
        output_s1 = self.base_model(input_ids=input_ids_s1, attention_mask=attention_mask_s1)
        emb_s1 = self._mean_pooling(output_s1, attention_mask_s1)

        output_s2 = self.base_model(input_ids=input_ids_s2, attention_mask=attention_mask_s2)
        emb_s2 = self._mean_pooling(output_s2, attention_mask_s2)

        combined_emb = torch.cat((emb_s1, emb_s2), dim=-1)
        score = self.regressor(combined_emb)
        return score.squeeze(-1)


# --- 3. EVALUATION FUNCTION (Handles Scaling) ---

def evaluate_model_sbert_sts(model, val_loader, tokenizer, device, threshold, original_max, sts_b_max):
    model.eval()
    val_preds, val_trues = [], []

    with torch.no_grad():
        for s1, s2, target in tqdm(val_loader, desc="STS-B Validation"):
            inputs_s1 = tokenizer(s1, padding=True, truncation=True, return_tensors="pt").to(device)
            inputs_s2 = tokenizer(s2, padding=True, truncation=True, return_tensors="pt").to(device)

            outputs = model(
                inputs_s1['input_ids'], inputs_s1['attention_mask'],
                inputs_s2['input_ids'], inputs_s2['attention_mask']
            )

            val_preds.extend(outputs.cpu().numpy())
            val_trues.extend(target.numpy())

    val_preds = np.array(val_preds)
    val_trues = np.array(val_trues)

    # 1. Scale Predictions: Convert model output (0-100 scale) to STS-B scale (0-5)
    val_preds_scaled = (val_preds / original_max) * sts_b_max

    # 2. Binarization
    val_trues_binary = (val_trues > threshold).astype(int)
    val_preds_binary = (val_preds_scaled > threshold).astype(int)

    # Metrics
    rmse = math.sqrt(mean_squared_error(val_trues, val_preds_scaled))
    accuracy = accuracy_score(val_trues_binary, val_preds_binary)
    precision = precision_score(val_trues_binary, val_preds_binary, zero_division=0)
    recall = recall_score(val_trues_binary, val_preds_binary, zero_division=0)
    f1 = f1_score(val_trues_binary, val_preds_binary, zero_division=0)

    return rmse, accuracy, precision, recall, f1


# --- 4. EXECUTION BLOCK ---

if __name__ == '__main__':
    # Load the STS-B validation dataset
    ds = load_dataset("mteb/stsbenchmark-sts")
    val_df = ds['validation'].to_pandas()

    # Prepare Data
    X_val_s1 = val_df['sentence1'].values
    X_val_s2 = val_df['sentence2'].values
    # CORRECTED LINE: Using 'score' instead of 'label'
    y_val = val_df['score'].values

    # Create DataLoader
    val_ds = SBERTTextDataset(X_val_s1, X_val_s2, y_val)
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)

    # --- MODEL LOADING ---

    # 1. Initialize Tokenizer (Load from saved artifact)
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_LOAD_PATH)

    # 2. Initialize Model Structure
    model = SBERTRegressor(SBERT_LORA_MODEL_NAME).to(DEVICE)

    # 3. Load Saved Weights
    model.load_state_dict(torch.load(MODEL_LOAD_PATH))
    print(f"\nSuccessfully loaded model weights from: {MODEL_LOAD_PATH}")

    # --- END MODEL LOADING ---

    # Run Evaluation
    rmse, acc, prec, rec, f1 = evaluate_model_sbert_sts(
        model,
        val_loader,
        tokenizer,
        DEVICE,
        STS_B_CLASSIFICATION_THRESHOLD,
        ORIGINAL_SCORE_MAX,
        STS_B_SCORE_MAX
    )

    print("\n--- Fine-tuned SBERT STS-B Validation Results ---")
    print(f"Binarization Threshold: > {STS_B_CLASSIFICATION_THRESHOLD:.1f} (on 0-5 scale)")
    print("-" * 35)
    print(f"RMSE (0-5 scale): {rmse:.4f}")
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1-Score: {f1:.4f}")


Successfully loaded model weights from: sbert_fine_tuned.pt


STS-B Validation: 100%|██████████| 12/12 [00:05<00:00,  2.12it/s]


--- Fine-tuned SBERT STS-B Validation Results ---
Binarization Threshold: > 3.0 (on 0-5 scale)
-----------------------------------
RMSE (0-5 scale): 1.7724
Accuracy: 0.4580
Precision: 0.3736
Recall: 0.7348
F1-Score: 0.4953





In [29]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score, precision_score, recall_score
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import math

# --- CONFIGURATION ---
SBERT_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
THRESHOLD = 55.0  # Classification threshold
RANDOM_STATE = 42
TEST_SIZE = 0.15
BATCH_SIZE = 32
NUM_EPOCHS = 15
LEARNING_RATE = 2e-5

# --- 1. DATASET CLASS (Text-based, standard) ---

class SBERTTextDataset(Dataset):
    def __init__(self, sentences1, sentences2, targets=None):
        self.sentences1 = sentences1
        self.sentences2 = sentences2

        # Ensure targets are converted to torch.float32 to avoid dtype errors
        if targets is not None:
             self.targets = torch.tensor(targets, dtype=torch.float32)
        else:
             self.targets = None

    def __len__(self):
        return len(self.sentences1)

    def __getitem__(self, idx):
        if self.targets is None:
            return self.sentences1[idx], self.sentences2[idx]
        return self.sentences1[idx], self.sentences2[idx], self.targets[idx]

# --- 2. MODEL CLASS (SBERT with Mean Pooling and Regression Head) ---

class SBERTRegressor(nn.Module):
    def __init__(self, model_name):
        super().__init__()
        # Load the base transformer model
        self.base_model = AutoModel.from_pretrained(model_name)

        embedding_dim = self.base_model.config.hidden_size # e.g., 768 for MPNet-Base

        # Add a regression head on top of the concatenated sentence embeddings
        self.regressor = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(embedding_dim * 2, 512),  # Concatenated S1 and S2 embeddings
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(512, 1) # Output a single similarity score
        )

    def _mean_pooling(self, model_output, attention_mask):
        """Standard SBERT pooling layer."""
        token_embeddings = model_output[0] # First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return sum_embeddings / sum_mask

    def forward(self, input_ids_s1, attention_mask_s1, input_ids_s2, attention_mask_s2):
        # Get S1 embedding
        output_s1 = self.base_model(input_ids=input_ids_s1, attention_mask=attention_mask_s1)
        emb_s1 = self._mean_pooling(output_s1, attention_mask_s1)

        # Get S2 embedding
        output_s2 = self.base_model(input_ids=input_ids_s2, attention_mask=attention_mask_s2)
        emb_s2 = self._mean_pooling(output_s2, attention_mask_s2)

        # Concatenate embeddings
        combined_emb = torch.cat((emb_s1, emb_s2), dim=-1)

        # Pass through the regression head
        score = self.regressor(combined_emb)
        return score.squeeze(-1) # Return shape (batch_size,)








In [30]:
# --- 3. TRAINING FUNCTION ---

def train_model_sbert(model, train_loader, optimizer, criterion, tokenizer, num_epochs, device):
    model.train()
    print(f"SBERT fine-tuning started on {device}")

    for epoch in range(1, num_epochs + 1):
        total_loss = 0
        for s1, s2, target in tqdm(train_loader, desc=f"Epoch {epoch} Training"):
            # Tokenize batch
            inputs_s1 = tokenizer(s1, padding=True, truncation=True, return_tensors="pt").to(device)
            inputs_s2 = tokenizer(s2, padding=True, truncation=True, return_tensors="pt").to(device)
            target = target.to(device)

            optimizer.zero_grad()

            outputs = model(
                inputs_s1['input_ids'], inputs_s1['attention_mask'],
                inputs_s2['input_ids'], inputs_s2['attention_mask']
            )

            loss = criterion(outputs, target)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch}: Train Loss = {avg_loss:.4f}")



In [31]:
# --- 4. EVALUATION FUNCTION ---

def evaluate_model_sbert(model, val_loader, tokenizer, device, threshold):
    model.eval()
    val_preds, val_trues = [], []

    with torch.no_grad():
        for s1, s2, target in tqdm(val_loader, desc="Validation"):
            inputs_s1 = tokenizer(s1, padding=True, truncation=True, return_tensors="pt").to(device)
            inputs_s2 = tokenizer(s2, padding=True, truncation=True, return_tensors="pt").to(device)

            outputs = model(
                inputs_s1['input_ids'], inputs_s1['attention_mask'],
                inputs_s2['input_ids'], inputs_s2['attention_mask']
            )

            val_preds.extend(outputs.cpu().numpy())
            val_trues.extend(target.numpy())

    val_preds = np.array(val_preds)
    val_trues = np.array(val_trues)

    # Regression Metric
    rmse = math.sqrt(mean_squared_error(val_trues, val_preds))

    # Classification Metrics
    val_trues_binary = (val_trues > threshold).astype(int)
    val_preds_binary = (val_preds > threshold).astype(int)

    accuracy = accuracy_score(val_trues_binary, val_preds_binary)
    precision = precision_score(val_trues_binary, val_preds_binary, zero_division=0)
    recall = recall_score(val_trues_binary, val_preds_binary, zero_division=0)
    f1 = f1_score(val_trues_binary, val_preds_binary, zero_division=0)

    return rmse, accuracy, precision, recall, f1

In [32]:
# --- 5. EXECUTION BLOCK ---

if __name__ == '__main__':
    # Load Data
    df = pd.read_csv('SRIP-Dataset-Negation.csv')

    X_s1 = df['Sentence 1'].values
    X_s2 = df['Sentence 2'].values
    y = df['Similarity Score'].values

    # Split Data (Text and Target)
    X_train_s1, X_val_s1, X_train_s2, X_val_s2, y_train, y_val = train_test_split(
        X_s1, X_s2, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )

    # Create Dataset and DataLoader
    train_ds = SBERTTextDataset(X_train_s1, X_train_s2, y_train)
    val_ds = SBERTTextDataset(X_val_s1, X_val_s2, y_val)

    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)

    # Initialize SBERT Model
    model = SBERTRegressor(SBERT_MODEL_NAME).to(DEVICE)
    tokenizer = AutoTokenizer.from_pretrained(SBERT_MODEL_NAME)

    # Optimizer and Loss
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
    criterion = nn.MSELoss()

    # Run Training
    train_model_sbert(model, train_loader, optimizer, criterion, tokenizer, NUM_EPOCHS, DEVICE)

    # Run Evaluation (assuming model has been trained)
    rmse, acc, prec, rec, f1 = evaluate_model_sbert(model, val_loader, tokenizer, DEVICE, THRESHOLD)

    print("\n--- SBERT Fine-tuning Results ---")
    print(f"RMSE: {rmse:.4f}")
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1-Score: {f1:.4f}")

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

SBERT fine-tuning started on cuda


Epoch 1 Training: 100%|██████████| 228/228 [00:57<00:00,  3.97it/s]


Epoch 1: Train Loss = 2868.6367


Epoch 2 Training: 100%|██████████| 228/228 [00:56<00:00,  4.03it/s]


Epoch 2: Train Loss = 1854.8655


Epoch 3 Training: 100%|██████████| 228/228 [00:56<00:00,  4.01it/s]


Epoch 3: Train Loss = 952.3732


Epoch 4 Training: 100%|██████████| 228/228 [00:56<00:00,  4.02it/s]


Epoch 4: Train Loss = 528.6017


Epoch 5 Training: 100%|██████████| 228/228 [00:56<00:00,  4.01it/s]


Epoch 5: Train Loss = 296.8747


Epoch 6 Training: 100%|██████████| 228/228 [00:56<00:00,  4.03it/s]


Epoch 6: Train Loss = 228.5762


Epoch 7 Training: 100%|██████████| 228/228 [00:56<00:00,  4.00it/s]


Epoch 7: Train Loss = 196.2132


Epoch 8 Training: 100%|██████████| 228/228 [00:56<00:00,  4.01it/s]


Epoch 8: Train Loss = 170.1005


Epoch 9 Training: 100%|██████████| 228/228 [00:56<00:00,  4.00it/s]


Epoch 9: Train Loss = 153.7622


Epoch 10 Training: 100%|██████████| 228/228 [00:56<00:00,  4.01it/s]


Epoch 10: Train Loss = 138.3753


Epoch 11 Training: 100%|██████████| 228/228 [00:56<00:00,  4.01it/s]


Epoch 11: Train Loss = 128.1619


Epoch 12 Training: 100%|██████████| 228/228 [00:56<00:00,  4.01it/s]


Epoch 12: Train Loss = 115.5384


Epoch 13 Training: 100%|██████████| 228/228 [00:56<00:00,  4.03it/s]


Epoch 13: Train Loss = 109.7051


Epoch 14 Training: 100%|██████████| 228/228 [00:56<00:00,  4.01it/s]


Epoch 14: Train Loss = 98.9545


Epoch 15 Training: 100%|██████████| 228/228 [00:56<00:00,  4.00it/s]


Epoch 15: Train Loss = 90.7190


Validation: 100%|██████████| 41/41 [00:03<00:00, 12.96it/s]


--- SBERT Fine-tuning Results ---
RMSE: 12.7327
Accuracy: 0.8353
Precision: 0.7760
Recall: 0.8434
F1-Score: 0.8083





In [34]:
MODEL_SAVE_PATH = 'sbert_fine_tuned.pt'
torch.save(model.state_dict(), MODEL_SAVE_PATH)

    # 2. Save the tokenizer (necessary for later loading and processing new data)
TOKENIZER_SAVE_PATH = 'sbert_tokenizer_artifacts'
tokenizer.save_pretrained(TOKENIZER_SAVE_PATH)

print(f"\nModel weights saved to: {MODEL_SAVE_PATH}")
print(f"Tokenizer saved to: {TOKENIZER_SAVE_PATH}")


Model weights saved to: sbert_fine_tuned.pt
Tokenizer saved to: sbert_tokenizer_artifacts
