In [None]:
# =========================================================
# 📓 CELL 1 – Environment & API key  (bullet-proof)
# =========================================================
!pip install -qU "openai>=1.25.0" pandas==2.2.2 scikit-learn==1.6.1 backoff tqdm --no-warn-conflicts

import os, json, time
import pandas as pd
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import openai, backoff
from google.colab import userdata

# Read the API key through google.colab's userdata helper instead of
# hard-coding it in the notebook.
openai.api_key = userdata.get("OPENAI_API_KEY")

# ---- Inference settings --------------------------------------------------
# All three values are passed straight to the chat-completions call in
# classify_sentence.
MODEL_ID = "gpt-4o"  # model name sent with every request
TEMPERATURE = 0  # sampling temperature
MAX_TOKENS = 3  # cap on generated tokens; the prompt asks for a single word


In [None]:
# =========================================================
# 📓 CELL 2 – Load & label the datasets
# =========================================================
# Tab-separated inputs: POS_PATH is loaded with label 1 (slang),
# NEG_PATH with label 0 (no slang).
POS_PATH: str = "/content/slang_OpenSub_filtered.tsv"
NEG_PATH: str = "/content/slang_OpenSub_negatives_filtered.tsv"

# The sentence column is located by name: exact "SENTENCE" first, then any
# case-insensitive spelling (e.g. 'sentence'), and only then the first column.
def read_tsv(path, label):
    """Load one tab-separated file and attach a constant class label.

    Parameters
    ----------
    path : str or path-like
        Location of the TSV file; its first row is read as the header.
    label : int
        Value written to every row of the ``label`` column.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``sentence`` (str) and ``label``.
    """
    # Read every field verbatim as text: with the defaults, pandas would turn
    # sentences such as "None", "NA" or "null" into NaN (later stringified as
    # "nan") and would reparse numeric-looking text ("007" -> 7).
    df = pd.read_csv(path, sep="\t", dtype=str, keep_default_na=False)

    if "SENTENCE" in df.columns:
        col = "SENTENCE"
    else:
        # Case-insensitive lookup so that 'sentence' / 'Sentence' is found
        # even when it is not the first column.
        by_name = [c for c in df.columns if str(c).strip().lower() == "sentence"]
        col = by_name[0] if by_name else df.columns[0]

    return pd.DataFrame({"sentence": df[col].astype(str), "label": label})

# Label convention: 1 = sentence contains slang, 0 = it does not.
pos_df = read_tsv(POS_PATH, label=1)
neg_df = read_tsv(NEG_PATH, label=0)

# Stack positives on top of negatives and renumber the rows from 0.
data = pd.concat((pos_df, neg_df), ignore_index=True)

print(
    "Loaded {} slang and {} non-slang sentences ({} total).".format(
        len(pos_df), len(neg_df), len(data)
    )
)
data.head()


Loaded 1424 slang and 2193 non-slang sentences (3617 total).


Unnamed: 0,sentence,label
0,Or maybe we could go somewhere and you could b...,1
1,"I know it's a bit long, but you understand why.",1
2,He'll be all over the heat.,1
3,"I waited until you broke up with her, but me too.",1
4,"Sammy only had one eye, but he cut quite a fig...",1


In [None]:
# =========================================================
# 📓 CELL 3 – Prompt template & helper to query GPT-4o
# =========================================================
# System instructions sent with every request: the reply must be the bare
# word 'yes' or 'no'. The four sentences are joined with single spaces.
SYSTEM_PROMPT = " ".join((
    "You are an expert linguistic classifier.",
    "For each user sentence, respond with a single lowercase word:",
    "'yes' if the sentence CONTAINS slang, 'no' if it does NOT.",
    "Return ONLY 'yes' or 'no' – no additional text.",
))

def build_messages(sentence: str):
    """Return the chat payload for one sentence.

    The list holds the shared system instructions followed by a single user
    turn containing the sentence with surrounding whitespace removed.
    """
    system_msg = {"role": "system", "content": SYSTEM_PROMPT}
    user_msg = {
        "role": "user",
        "content": "Classify this sentence:\n" + sentence.strip(),
    }
    return [system_msg, user_msg]

# Retry with exponential back-off, at most 5 attempts in total.
# NOTE: in openai>=1.x `openai.Timeout` is the httpx timeout *configuration*
# class, not an exception; putting it in this tuple makes exception matching
# fail with TypeError instead of retrying. `openai.APITimeoutError` is the
# exception raised when a request times out.
@backoff.on_exception(backoff.expo,
                      (openai.RateLimitError, openai.APIError, openai.APITimeoutError),
                      max_tries=5)
def classify_sentence(sentence: str) -> str:
    """Ask the chat model whether `sentence` contains slang.

    Parameters
    ----------
    sentence : str
        Text to classify; it is wrapped into a prompt by `build_messages`.

    Returns
    -------
    str
        "yes" if the normalised reply starts with "y", otherwise "no"
        (this includes empty or unexpected replies).

    Raises
    ------
    openai.APIError
        Re-raised by the back-off decorator once all attempts have failed.
    """
    resp = openai.chat.completions.create(
        model        = MODEL_ID,
        messages     = build_messages(sentence),
        temperature  = TEMPERATURE,
        max_tokens   = MAX_TOKENS,
        top_p        = 1,
        timeout      = 30,   # per-request timeout, in seconds
    )
    # `content` is optional in the response schema; treat a missing reply as
    # empty text rather than crashing on None.lower().
    raw = resp.choices[0].message.content or ""
    # Drop whitespace and surrounding quotes/punctuation so that a reply such
    # as "'yes'" or "Yes." is still recognised.
    answer = raw.strip().strip("'\"`.").lower()
    return "yes" if answer.startswith("y") else "no"


In [None]:
# =========================================================
# 📓 CELL 4 – Run inference (3 617 sentences, one API call each; the "≈ 3 min" estimate assumed 3 100)
# =========================================================
# Register tqdm's progress_* methods on pandas objects so the loop below
# shows a progress bar.
tqdm.pandas()

# classify_sentence is called once per row, in order; the answers are stored
# in a new "pred" column of `data`.
sentence_col = data["sentence"]
data["pred"] = sentence_col.progress_apply(classify_sentence)


  0%|          | 0/3617 [00:00<?, ?it/s]

In [None]:
# =========================================================
# 📓 CELL 5 – Evaluate
# =========================================================
# Express the integer labels in the same "yes"/"no" vocabulary that
# classify_sentence returns, so both series are directly comparable.
y_true = data["label"].map({1: "yes", 0: "no"})
y_pred = data["pred"]

print("Accuracy:", accuracy_score(y_true, y_pred))
print("\nClassification report:\n",
      classification_report(y_true, y_pred, digits=3))

# confusion_matrix puts the TRUE class on rows and the PREDICTED class on
# columns, both in `labels` order. With labels=["yes", "no"] and "yes" as the
# positive class the layout is therefore [[TP, FN], [FP, TN]].
cm = confusion_matrix(y_true, y_pred, labels=["yes", "no"])
print("\nConfusion matrix (rows = actual, cols = predicted) [TP FN; FP TN]:\n", cm)


Accuracy: 0.7379043406137683

Classification report:
               precision    recall  f1-score   support

          no      0.714     0.947     0.814      2193
         yes      0.835     0.416     0.556      1424

    accuracy                          0.738      3617
   macro avg      0.775     0.682     0.685      3617
weighted avg      0.762     0.738     0.712      3617


Confusion matrix [TP FP; FN TN]:
 [[ 593  831]
 [ 117 2076]]


In [None]:
# =========================================================
# 📓 CELL 6 – Save misclassified examples
# =========================================================
# 1️⃣  Attach the ground-truth label in plain language
data["actual"] = data["label"].map({1: "yes", 0: "no"})

# Select the rows where prediction and ground truth disagree, keep only the
# three columns worth exporting, and give them self-explanatory names.
is_wrong = data["pred"].ne(data["actual"])
misclassified = data.loc[is_wrong, ["sentence", "pred", "actual"]].rename(
    columns={
        "sentence": "sentence_text",
        "pred": "model_prediction",
        "actual": "ground_truth",
    }
)

print(f"❌ Misclassifications: {len(misclassified)} / {len(data)}")

# Write the errors to a CSV under /content, without the row index.
out_path = "/content/misclassified_examples.csv"
misclassified.to_csv(out_path, index=False)
print(f"Saved → {out_path}")

❌ Misclassifications: 948 / 3617
Saved → /content/misclassified_examples.csv
