In [None]:
# ## 1. Install Required Packages 
# !pip install transformers torch sklearn pandas tqdm

In [1]:
# ## 2. Import Libraries
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from transformers import BertTokenizer, BertModel
from tqdm import tqdm

  from pandas.core import (


In [2]:
# ## 3. Load the Dataset
data_path = r"C:\Users\yozev\PycharmProjects\Probing-Slang-Ambiguity-in-LLM\using_claude\manual_slang_dataset.csv"

df = pd.read_csv(data_path)
# Rename columns for clarity
df = df.rename(columns={"sentence": "text", "binary": "label"})
print("Total examples:", len(df))
display(df.head())

Total examples: 736


Unnamed: 0,text,label
0,"That new song is absolutely fire, I can't stop...",1
1,The fire department responded quickly to the h...,0
2,"Your outfit is so bad, everyone's going to be ...",1
3,I got a bad grade on my chemistry test yesterday,0
4,"She killed that performance, the crowd went wild",1


In [3]:
# ## 4. Split into Train and Test Sets
train_df, test_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df["label"]
)
print(f"Train size: {len(train_df)}, Test size: {len(test_df)}")

Train size: 588, Test size: 148


In [4]:
# ## 5. Initialize BERT Tokenizer & Model (for feature extraction only)
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
bert_model = BertModel.from_pretrained(model_name)
bert_model.eval()  # disable dropout, etc.

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
bert_model.to(device)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [5]:
# ## 6. Helper Function to Extract [CLS] Embeddings
class SentDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=128):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        enc = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )
        return {
            "input_ids": enc["input_ids"].flatten(),
            "attention_mask": enc["attention_mask"].flatten(),
        }


def extract_cls_embeddings(
    sentences, tokenizer, model, device, batch_size=16, max_length=128
):
    """
    Given a list of sentences, return a NumPy array of shape
    (num_sentences, hidden_size), where each row is BERT's [CLS] embedding.
    """
    dataset = SentDataset(sentences, tokenizer, max_length)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    all_embs = []
    with torch.no_grad():
        for batch in tqdm(loader, desc="Extracting embeddings", leave=False):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # `pooler_output` is the [CLS] embedding after a tanh
            cls_emb = outputs.pooler_output
            all_embs.append(cls_emb.cpu())

    all_embs = torch.cat(all_embs, dim=0)  # shape: (N, hidden_size)
    return all_embs.numpy()

In [6]:
# ## 7. Extract Embeddings for Train and Test
X_train = extract_cls_embeddings(
    train_df["text"].tolist(), tokenizer, bert_model, device
)
X_test = extract_cls_embeddings(
    test_df["text"].tolist(), tokenizer, bert_model, device
)
y_train = train_df["label"].values
y_test = test_df["label"].values

print("Train embeddings shape:", X_train.shape)
print("Test embeddings shape:", X_test.shape)

                                                                      

Train embeddings shape: (588, 768)
Test embeddings shape: (148, 768)




In [7]:
# ## 8. Train a Logistic Regression Classifier
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

In [8]:
# ## 9. Evaluate on the Test Set
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print(f"\nAccuracy (BERT features + Logistic Regression): {acc:.4f}")
print("Confusion Matrix:\n", cm)



Accuracy (BERT features + Logistic Regression): 0.9865
Confusion Matrix:
 [[72  2]
 [ 0 74]]
