In [9]:
# Cell 1: Imports
import pandas as pd
from transformers import pipeline
from tqdm.auto import tqdm


In [10]:
# Cell 2: Load the CSV and extract the sentences to classify
# NOTE(review): this is an absolute, machine-specific path; point it at your
# own copy of the file before running.
csv_path = r"C:\Users\yozev\PycharmProjects\Slang_in_LLMs\filtered_slang.csv"
df = pd.read_csv(csv_path)

if "sentence" not in df.columns:
    raise ValueError("Your CSV must have a column named 'sentence'.")

# Drop missing values *before* casting: astype(str) on its own would turn NaN
# into the literal string "nan", which would then be fed to the classifiers
# and counted as a sentence.
sentences = df["sentence"].dropna().astype(str).tolist()


In [11]:
# Cell 3: Build the two zero-shot classifiers and the candidate labels

def _zero_shot(model_name):
    """Build a zero-shot-classification pipeline for `model_name` with framework="pt"."""
    return pipeline(
        "zero-shot-classification",
        model=model_name,
        framework="pt",  # request the PyTorch backend
    )


# 1) English MNLI checkpoint
bart_nli = _zero_shot("facebook/bart-large-mnli")

# 2) XNLI checkpoint
xlmroberta_nli = _zero_shot("joeddav/xlm-roberta-large-xnli")

# Each classifier picks between these two candidate labels.
labels = ["contains slang", "does not contain slang"]

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/734 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [12]:
# Cell 4: Classify every sentence with both models and collect the results

def _top_label(classifier, text):
    """Return the first entry of the "labels" list `classifier` produces for `text`."""
    return classifier(text, candidate_labels=labels)["labels"][0]


# Every sentence is treated as truly containing slang, so the reference
# label is the same for all rows.
truth = "contains slang"

results = []
for sent in tqdm(sentences, desc="Classifying sentences"):
    pred_bart = _top_label(bart_nli, sent)
    pred_xlm = _top_label(xlmroberta_nli, sent)

    # One record per sentence: both predictions plus whether each matches truth.
    results.append(
        dict(
            sentence=sent,
            bart_pred=pred_bart,
            xlmroberta_pred=pred_xlm,
            ground_truth=truth,
            bart_correct=(pred_bart == truth),
            xlmroberta_correct=(pred_xlm == truth),
        )
    )

results_df = pd.DataFrame(results)


Classifying sentences:   0%|          | 0/238 [00:00<?, ?it/s]

In [13]:
# Cell 5: Report the share of rows each model got right
# The *_correct columns are booleans, so summing them counts the matches.
total = len(results_df)
correct_counts = results_df[["bart_correct", "xlmroberta_correct"]].sum()

bart_acc = correct_counts["bart_correct"] / total
xlmroberta_acc = correct_counts["xlmroberta_correct"] / total

print(f"BART-MNLI accuracy:       {bart_acc:.2%}")
print(f"XLM-RoBERTa-XNLI accuracy: {xlmroberta_acc:.2%}")


BART-MNLI accuracy:       98.74%
XLM-RoBERTa-XNLI accuracy: 90.34%


In [14]:
# Cell 6 (optional): Preview the first ten result rows
results_df.head(n=10)

Unnamed: 0,sentence,bart_pred,xlmroberta_pred,ground_truth,bart_correct,xlmroberta_correct
0,1. I can smell your dank all the way over here...,contains slang,contains slang,contains slang,True,True
1,"""you tool""",contains slang,contains slang,contains slang,True,True
2,He was telling me his trash about my sense of ...,contains slang,contains slang,contains slang,True,True
3,Nathan says Tom’s phone voice is touch; it sta...,contains slang,contains slang,contains slang,True,True
4,The budget for a trade show booth this year is...,contains slang,contains slang,contains slang,True,True
5,Person 1: Ey yo bish why u not at scool tis mo...,contains slang,contains slang,contains slang,True,True
6,i wanna get a ripped body,contains slang,contains slang,contains slang,True,True
7,"""Safe Jo.\r\nSafe Steve.\r\nSafe.\r\nYeah safe...",contains slang,contains slang,contains slang,True,True
8,Megan gave Simon head.,contains slang,contains slang,contains slang,True,True
9,Yo these pills you grabbed are pressed eh... i...,contains slang,contains slang,contains slang,True,True
