In [2]:
import pandas as pd
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score

# Paths to your filtered TSV files
SLANG_FILE = r"C:\Users\yozev\PycharmProjects\Probing-Slang-Ambiguity-in-LLM\opensub\data\slang_OpenSub_filtered.tsv"
NON_SLANG_FILE = r"C:\Users\yozev\PycharmProjects\Probing-Slang-Ambiguity-in-LLM\opensub\data\slang_OpenSub_negatives_filtered.tsv"

# 1) Load both datasets
df_slang = pd.read_csv(SLANG_FILE, sep="\t", dtype=str)
df_nonslang = pd.read_csv(NON_SLANG_FILE, sep="\t", dtype=str)

# 2) Assign labels: 1 = slang, 0 = non-slang
df_slang["label"] = 1
df_nonslang["label"] = 0

# 3) Concatenate into one DataFrame
df_all = pd.concat([df_slang, df_nonslang], ignore_index=True)

# 4) Extract sentences and true labels
sentences = df_all["SENTENCE"].tolist()
true_labels = df_all["label"].tolist()

# 5) Load tokenizer & model (PyTorch) for zero-shot (MNLI) classification
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-mnli")
model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-large-mnli")
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(device)

candidate_labels = ["slang", "non-slang"]

def zero_shot_classify(sentence: str, labels: list[str]) -> int:
    """
    Returns 1 if 'slang' is predicted, 0 if 'non-slang' is predicted.
    We encode each (sentence, hypothesis) pair, run through the MNLI head,
    and pick the label whose entailment logit is highest.
    """
    # Build hypotheses like "This example is slang." and "This example is non-slang."
    hypotheses = [f"This example is {lbl}." for lbl in labels]
    # Tokenize: pair each hypothesis with the same sentence
    encoded = tokenizer(
        [sentence] * len(hypotheses),
        hypotheses,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=256
    ).to(device)

    with torch.no_grad():
        outputs = model(**encoded)
        logits = outputs.logits  # shape: (len(labels), 3)
        # For BART-MNLI, the label mapping is [contradiction, neutral, entailment]
        entail_logits = logits[:, 2]  # get "entailment" logit for each hypothesis
        probs = F.softmax(entail_logits, dim=0)  # softmax over the entailment scores
        best_idx = torch.argmax(probs).item()

    return 1 if labels[best_idx] == "slang" else 0

# 6) Run all sentences through zero-shot classifier
predicted_labels = []
for sent in sentences:
    pred = zero_shot_classify(sent, candidate_labels)
    predicted_labels.append(pred)

# 7) Compute and print accuracy
acc = accuracy_score(true_labels, predicted_labels)
print(f"Accuracy: {acc:.4f}")


ImportError: cannot import name 'sym_float' from 'torch' (unknown location)

In [3]:
!conda install -y pytorch torchvision torchaudio cudatoolkit=11.8 -c pytorch --debug


^C


In [1]:
import torch
print(torch.__version__, torch.version.cuda, torch.cuda.is_available())

AttributeError: module 'torch' has no attribute '__version__'