In [1]:
# %pip install --upgrade torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# %pip install transformers[torch]==4.30.2
# %pip install accelerate -U
# %pip install optuna
%pip install fastai spacy
%python -m spacy download en_core_web_sm


Collecting fastai
  Downloading fastai-2.7.18-py3-none-any.whl.metadata (9.1 kB)
Collecting spacy
  Downloading spacy-3.8.2.tar.gz (1.3 MB)
     ---------------------------------------- 0.0/1.3 MB ? eta -:--:--
     ---------------------------------------- 1.3/1.3 MB 7.4 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'error'
Note: you may need to restart the kernel to use updated packages.


  error: subprocess-exited-with-error
  
  × pip subprocess to install build dependencies did not run successfully.
  │ exit code: 1
  ╰─> [95 lines of output]
      Ignoring numpy: markers 'python_version >= "3.9"' don't match your environment
      Collecting setuptools
        Downloading setuptools-75.3.2-py3-none-any.whl.metadata (6.9 kB)
      Collecting cython<3.0,>=0.25
        Downloading Cython-0.29.37-py2.py3-none-any.whl.metadata (3.1 kB)
      Collecting cymem<2.1.0,>=2.0.2
        Downloading cymem-2.0.11.tar.gz (10 kB)
        Installing build dependencies: started
        Installing build dependencies: finished with status 'done'
        Getting requirements to build wheel: started
        Getting requirements to build wheel: finished with status 'done'
        Preparing metadata (pyproject.toml): started
        Preparing metadata (pyproject.toml): finished with status 'done'
      Collecting preshed<3.1.0,>=3.0.2
        Downloading preshed-3.0.9-cp38-cp38-win_amd64.w

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader, Dataset
import torch
from tqdm import tqdm

# 1. Load your dataset
# The CSV must provide a `clean_text` column (model input) and an
# `encoded_label` column (classification target).
DATA_PATH = "/content/combined_dataset.csv"
df = pd.read_csv(DATA_PATH)
texts = df["clean_text"].tolist()
labels = df["encoded_label"].tolist()

# 2. Train-test split
# 80/20 split with a fixed seed; `stratify=labels` keeps the class
# proportions the same in both partitions.
X_train_texts, X_test_texts, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)

# 3. Load tokenizer and model
# The encoder is only used for feature extraction, so it is switched to
# eval mode. The device is chosen at runtime: an unconditional `.cuda()`
# raises on machines without a usable GPU, whereas this falls back to CPU.
MODEL_NAME = "distilbert-base-uncased"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
bert_model = AutoModel.from_pretrained(MODEL_NAME).eval().to(DEVICE)

# 4. Custom dataset
class TextDataset(Dataset):
    """Map-style dataset that tokenizes all texts once, at construction time.

    Args:
        texts: Collection of strings handed to ``tokenizer`` in a single call.
        labels: Per-example labels; ``labels[i]`` is returned alongside the
            encoding at index ``i``.
        tokenizer: Callable invoked as ``tokenizer(texts, truncation=True,
            padding=True, max_length=max_length, return_tensors="pt")``. Its
            result must expose ``.items()`` yielding indexable values.
        max_length: Truncation length forwarded to the tokenizer.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Tokenize the whole corpus up front so __getitem__ is a cheap lookup.
        tokenizer_kwargs = dict(
            truncation=True,
            padding=True,
            max_length=max_length,
            return_tensors="pt",
        )
        self.encodings = tokenizer(texts, **tokenizer_kwargs)
        self.labels = labels

    def __len__(self):
        """Number of examples, defined by the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the encoded fields for example ``idx`` plus a ``labels`` tensor."""
        sample = {}
        for field_name, field_values in self.encodings.items():
            sample[field_name] = field_values[idx]
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split in a TextDataset; both share the same tokenizer and rely on
# the default max_length.
train_dataset = TextDataset(texts=X_train_texts, labels=y_train, tokenizer=tokenizer)
test_dataset = TextDataset(texts=X_test_texts, labels=y_test, tokenizer=tokenizer)

# 5. Extract CLS embeddings
def extract_cls_embeddings(model, dataset, batch_size=16):
    """Run ``model`` over ``dataset`` and collect the first-token embeddings.

    Batches are moved to whatever device the model's parameters live on, so
    the function works for CPU and GPU models alike (the inputs are no longer
    forced onto CUDA).

    Args:
        model: Callable accepting ``input_ids`` and ``attention_mask`` keyword
            arguments and returning an object with a ``last_hidden_state``
            tensor indexed as ``[batch, position, feature]``.
        dataset: Dataset whose items are dicts containing ``"input_ids"`` and
            ``"attention_mask"`` tensors.
        batch_size: Number of examples per forward pass.

    Returns:
        np.ndarray with one row per example, holding the hidden state at
        position 0 (the [CLS] token).

    Raises:
        ValueError: If ``dataset`` yields no batches.
    """
    # Follow the model's device rather than assuming CUDA. If the model
    # exposes no parameters, fall back to CUDA when available, else CPU.
    params = model.parameters() if hasattr(model, "parameters") else ()
    first_param = next(iter(params), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    dataloader = DataLoader(dataset, batch_size=batch_size)
    embeddings = []
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Extracting embeddings"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            cls_embeds = outputs.last_hidden_state[:, 0, :]  # [CLS] token
            embeddings.append(cls_embeds.cpu().numpy())

    if not embeddings:
        # np.vstack([]) would raise an opaque ValueError; say what went wrong.
        raise ValueError("dataset is empty; no embeddings to extract")
    return np.vstack(embeddings)

# Embed both splits with the same encoder; default batch size applies.
X_train = extract_cls_embeddings(model=bert_model, dataset=train_dataset)
X_test = extract_cls_embeddings(model=bert_model, dataset=test_dataset)



Extracting embeddings: 100%|██████████| 190/190 [00:10<00:00, 18.34it/s]
Extracting embeddings: 100%|██████████| 48/48 [00:02<00:00, 19.23it/s]


In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training embeddings only, then apply those same
# statistics to both splits so no test-set information reaches the fit.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# 6. Compute Pearson kernel
def pearson_kernel(X1, X2):
    """Pairwise Pearson correlation between the rows of ``X1`` and ``X2``.

    Each row is centered by its own mean; the kernel entry ``(i, j)`` is the
    dot product of centered row ``i`` of ``X1`` with centered row ``j`` of
    ``X2``, divided by the product of their norms.

    Args:
        X1: 2-D array with one sample per row.
        X2: 2-D array with one sample per row and the same number of columns
            as ``X1``.

    Returns:
        Array of shape ``(X1.shape[0], X2.shape[0])``.
    """
    def _center_rows(mat):
        # Subtract each row's own mean (axis=1), keeping the 2-D shape.
        return mat - mat.mean(axis=1, keepdims=True)

    rows_a = _center_rows(X1)
    rows_b = _center_rows(X2)

    covariance = np.dot(rows_a, rows_b.T)

    norms_a = np.linalg.norm(rows_a, axis=1)
    norms_b = np.linalg.norm(rows_b, axis=1)
    scale = norms_a[:, np.newaxis] * norms_b[np.newaxis, :]

    # The small constant keeps the division finite for constant rows.
    return covariance / (scale + 1e-8)

# Gram matrix of the training set against itself (used for fitting).
K_train = pearson_kernel(X1=X_train, X2=X_train)
# Rows are test samples, columns are training samples (used for prediction).
K_test = pearson_kernel(X1=X_test, X2=X_train)


In [None]:
# 7. Train SVM
# kernel="precomputed" means the classifier receives the train-vs-train
# kernel matrix in place of raw feature vectors.
svm = SVC(kernel="precomputed").fit(K_train, y_train)

from sklearn.metrics import classification_report

# 8. Predict and evaluate
# Predictions come from the test-vs-train kernel matrix.
y_pred = svm.predict(K_test)

# Compute both summaries first, then print them in the same order as before.
f1 = f1_score(y_true=y_test, y_pred=y_pred, average="macro")
report_text = classification_report(y_true=y_test, y_pred=y_pred, digits=4)

print(f"\n🎯 Pearson-SVM F1 Score (macro): {f1:.4f}")
print("\n📋 Classification Report:\n")
print(report_text)



🎯 Pearson-SVM F1 Score (macro): 0.8044

📋 Classification Report:

              precision    recall  f1-score   support

         0.0     0.8093    0.8608    0.8343       424
         1.0     0.8084    0.7433    0.7745       335

    accuracy                         0.8090       759
   macro avg     0.8089    0.8021    0.8044       759
weighted avg     0.8089    0.8090    0.8079       759

