In [None]:
%pip install --quiet ibm-watsonx-ai python-dotenv tqdm scikit-learn




In [None]:
#Imports & auth setup
import os
import json
import re
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from dotenv import load_dotenv
from getpass import getpass

# WatsonX 
from ibm_watsonx_ai import Credentials, APIClient
from ibm_watsonx_ai.foundation_models import ModelInference
from ibm_watsonx_ai.foundation_models.schema import TextGenParameters

In [None]:
# Authentication when running locally (e.g. VS Code).
# Settings come from the process environment after loading a .env file; only
# the API key falls back to an interactive prompt when it is missing or empty.
load_dotenv()

api_key = os.getenv("WATSONX_API_KEY")
if not api_key:
    api_key = getpass("IBM Cloud API key: ")
url = os.getenv("WATSONX_URL")
project_id = os.getenv("WATSONX_PROJECT_ID")

# Build the watsonx client from the collected settings.
creds = Credentials(api_key=api_key, url=url)
client = APIClient(project_id=project_id, credentials=creds)





In [None]:
# Authentication for Google Colab.
# Environment variables (if present) take precedence, so running this cell
# after the .env-based cell does not re-prompt for the key or silently switch
# to a different URL / project. Without those variables the behaviour is the
# same as before: prompt for the key and use the defaults below.
import os
from getpass import getpass
from ibm_watsonx_ai import Credentials, APIClient

# 1) Load the API key from the environment, or prompt for it.
api_key    = os.getenv("WATSONX_API_KEY") or getpass("IBM Cloud API key: ")
# 2) Endpoint and project: environment first, original literals as fallback.
url        = os.getenv("WATSONX_URL") or "https://us-south.ml.cloud.ibm.com"
project_id = os.getenv("WATSONX_PROJECT_ID") or "e92879d6-e36f-4374-b05f-98a704b287f2"

creds  = Credentials(url=url, api_key=api_key)
client = APIClient(credentials=creds, project_id=project_id)

In [None]:
# Instantiate the LLM.
# Generation settings: at most 64 new tokens, temperature 0.0, and "}" as the
# only stop sequence.
params = TextGenParameters(
    stop_sequences=["}"],
    max_new_tokens=64,
    temperature=0.0,
)

# Inference handle bound to the authenticated client created earlier.
model = ModelInference(
    model_id="ibm/granite-13b-instruct-v2",
    params=params,
    api_client=client,
)

In [None]:
# Load the raw CSV files (train split, test split, test labels), in that order.
train_df, test_df, tl_df = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "test_labels.csv")
)

# Names of the six binary label columns used throughout the notebook.
label_cols = [
    'toxic', 'severe_toxic', 'obscene',
    'threat', 'insult', 'identity_hate',
]

def clean_text(text):
    """Normalise a raw comment into lowercase, letters-only text.

    Steps, in order: lowercase, replace ``<...>`` tag-like spans and http(s)
    URLs with a space, replace every character that is not ``a-z`` or
    whitespace with a space, then collapse whitespace runs and trim.

    Parameters
    ----------
    text : str or missing value
        The raw comment. ``None`` and float ``NaN`` (what pandas yields for an
        empty CSV cell) are treated as an empty comment; any other non-string
        value is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned text, possibly empty.
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)

    text = text.lower()
    text = re.sub(r"<.*?>", " ", text)
    text = re.sub(r"https?://\S+", " ", text)
    text = re.sub(r"[^a-z\s]", " ", text)
    return re.sub(r"\s+", " ", text).strip()

# Normalise the comment text of both splits, overwriting the original column.
for frame in (train_df, test_df):
    frame['comment_text'] = frame['comment_text'].apply(clean_text)

In [None]:
# Prompt-tuning subset: 5000 training rows drawn with a fixed seed so the same
# rows are selected on every run; the index is renumbered from 0.
sample_df = (
    train_df
    .sample(n=5000, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Keep only the test rows that were scored.
# A row counts as scored when none of its label columns equals -1.
scored = (tl_df[label_cols] != -1).all(axis=1)
scored_ids = tl_df.loc[scored, 'id']

is_scored_comment = test_df['id'].isin(scored_ids)
eval_df = test_df[is_scored_comment].reset_index(drop=True)

# Ground-truth matrix, row-aligned with eval_df by looking labels up by id.
labels_by_id = tl_df.set_index('id')
y_true = labels_by_id.loc[eval_df['id'], label_cols].values.astype(int)

In [None]:
# JSON template listing every label key with a false default; it is
# substituted into the {keys} placeholder of the prompt below.
KEYS_JSON = json.dumps({k: False for k in label_cols})

# Few-shot prompt template for the six-label task. It is filled in with
# str.format(keys=..., text=...), so literal braces in the example outputs are
# doubled ({{ }}) to survive formatting as single braces.
SYSTEM_PROMPT = """
You are a toxicity classifier. You will:
1) Reason step by step whether the text is toxic, obscene, a threat, insult, etc.
2) Then output ONLY the final JSON object with these six keys (true or false):

{keys}

Examples:

TEXT: "COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK"
THOUGHTS: Strong profanity, personal attack → toxic, severe_toxic, obscene, insult.
OUTPUT: {{"toxic": true, "severe_toxic": true, "obscene": true, "threat": false, "insult": true, "identity_hate": false}}

TEXT: "Stupid piece of shit! Stop deleting my stuff, asshole, go die and fall in a hole!"
THOUGHTS: Very insulting, obscene, but no direct threat → toxic, severe_toxic, obscene, insult.
OUTPUT: {{"toxic": true, "severe_toxic": true, "obscene": true, "threat": false, "insult": true, "identity_hate": false}}

TEXT: "You are gay or antisemitian?"
THOUGHTS: Hate slur → toxic, obscene, insult, identity_hate.
OUTPUT: {{"toxic": true, "severe_toxic": false, "obscene": true, "threat": false, "insult": true, "identity_hate": true}}

TEXT: "Hi! I am back again! Last warning! Stop undoing my edits or die"
THOUGHTS: “or die” is a threat → toxic + threat.
OUTPUT: {{"toxic": true, "severe_toxic": false, "obscene": false, "threat": true, "insult": false, "identity_hate": false}}

TEXT: "FUCK YOUR FILTHY MOTHER IN THE ASS"
THOUGHTS: Profanity + insult → toxic, obscene, insult.
OUTPUT: {{"toxic": true, "severe_toxic": false, "obscene": true, "threat": false, "insult": true, "identity_hate": false}}

Now classify this comment:
TEXT: {text}
"""


In [None]:
# Work on a 2% seeded sample of the scored test rows to keep LLM runtime low.
eval_small = (
    eval_df
    .sample(frac=0.02, random_state=42)
    .reset_index(drop=True)
)
print(f"Running LLM on {len(eval_small)} / {len(eval_df)} comments")

In [None]:


# Classify every comment in eval_small with the six-label prompt.

def _flag_to_int(value):
    """Convert one JSON value emitted by the model into 0 or 1.

    Strings are compared textually because ``bool("false")`` is True in
    Python; without this a quoted "false" would become a positive prediction.
    """
    if isinstance(value, str):
        return int(value.strip().lower() in ("true", "1", "yes"))
    return int(bool(value))


preds = []
n_unparsed = 0  # responses that did not yield a JSON object
for txt in tqdm(eval_small['comment_text'], desc="LLM classify"):
    prompt = SYSTEM_PROMPT.format(keys=KEYS_JSON, text=txt)
    resp   = model.generate(prompt)
    raw    = resp['results'][0]['generated_text'].strip()

    # Keep only the span from the first '{' to the last '}' so that any text
    # around the JSON object is dropped before parsing.
    start, end = raw.find('{'), raw.rfind('}')
    if start != -1 and end > start:
        raw = raw[start:end + 1]

    try:
        parsed = json.loads(raw)
    except ValueError:  # json.JSONDecodeError is a ValueError
        parsed = None

    if isinstance(parsed, dict):
        d = parsed
    else:
        # Best-effort fallback: score the comment as all-negative, but count
        # it so the failure rate is visible instead of hidden in the metrics.
        n_unparsed += 1
        d = {}

    preds.append([_flag_to_int(d.get(k, False)) for k in label_cols])

# reshape keeps the array 2-D (n_comments, n_labels) even when preds is empty.
y_pred_small = np.array(preds, dtype=int).reshape(-1, len(label_cols))
if n_unparsed:
    print(f"Warning: {n_unparsed} / {len(preds)} responses were not valid JSON and were scored as all-negative")

In [None]:
# Evaluate the six-label predictions against the ground truth for eval_small.
from sklearn.metrics import classification_report, roc_auc_score

# Ground-truth rows are looked up by id so they line up with eval_small.
y_true_small = tl_df.set_index('id').loc[eval_small['id'], label_cols].values.astype(int)
print(classification_report(y_true_small, y_pred_small, target_names=label_cols, zero_division=0))

# Note: the AUC below is computed from 0/1 predictions, not from scores.
print("ROC-AUC per label:")
for i, lbl in enumerate(label_cols):
    # ROC-AUC is undefined when the sample holds a single class for a label,
    # which can happen for rare labels in a small sample; report it instead
    # of aborting the cell.
    if np.unique(y_true_small[:, i]).size < 2:
        print(f"  {lbl:15s}: n/a (only one class present in y_true)")
        continue
    auc = roc_auc_score(y_true_small[:, i], y_pred_small[:, i])
    print(f"  {lbl:15s}: {auc:.4f}")

# ---- APPENDIX: single‐label “toxic” evaluation ----
Here I try using only a single label (`toxic`).

In [None]:
# Few-shot prompt for the single-label ("toxic") task.
# KEYS_SINGLE_JSON is the JSON template substituted into {keys}; the template
# is filled in with str.format(keys=..., text=...), so literal braces in the
# example outputs are doubled ({{ }}).
# Every OUTPUT example must be valid JSON: the model imitates these lines and
# its reply is parsed with json.loads.
# NOTE(review): the run loop visible in this notebook formats
# SYSTEM_PROMPT_SINGLE_CHAIN, not this template — confirm which is intended.
KEYS_SINGLE_JSON = json.dumps({"toxic": False})
SYSTEM_PROMPT_SINGLE = """
You are a toxicity classifier. You will:
1) Reason step by step whether the text is toxic, obscene, a threat, insult, etc.
2) Then output ONLY the final JSON object with this key (true or false):
{keys}

Examples:

TEXT: "COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK"
THOUGHTS: Strong profanity, personal attack → toxic.
OUTPUT: {{"toxic": true}}

TEXT: "Stupid piece of shit! Stop deleting my stuff, asshole, go die and fall in a hole!"
THOUGHTS: Very insulting, obscene, → toxic.
OUTPUT: {{"toxic": true}}

TEXT: "You are gay or antisemitian?"
THOUGHTS: Hate slur → toxic.
OUTPUT: {{"toxic": true}}

TEXT: "Hi! I am back again! Last warning! Stop undoing my edits or die"
THOUGHTS: “or die” is a threat → toxic.
OUTPUT: {{"toxic": true}}

TEXT: "FUCK YOUR FILTHY MOTHER IN THE ASS"
THOUGHTS: Profanity + insult → toxic.
OUTPUT: {{"toxic": true}}

TEXT: "I love cats!"
THOUGHTS: No abusive or harassing content → non-toxic.
OUTPUT: {{"toxic": false}}

TEXT: "I'm sorry to say this, but I have to fail this article's GAN. The several major problems that  brought up in this article's previous GA assessment have not been issued"
THOUGHTS: No abusive or harassing content → non-toxic.
OUTPUT: {{"toxic": false}}


Now classify this comment:
TEXT: {text}
"""

In [None]:

# Few-shot + chain-of-thought prompt for the single-label ("toxic") task.
# KEYS_SINGLE_JSON is the JSON template substituted into {keys}; the template
# is filled in with str.format(keys=..., text=...), so literal braces in the
# example outputs are doubled ({{ }}).
# Every OUTPUT example must be valid JSON: the model imitates these lines and
# its reply is parsed with json.loads.
KEYS_SINGLE_JSON = json.dumps({"toxic": False})
SYSTEM_PROMPT_SINGLE_CHAIN = """
You are a toxicity classifier. You will:
1) Reason step by step whether the text is toxic, obscene, a threat, insult, etc.
2) Then output ONLY the final JSON object with this key (true or false):
{keys}

Examples:

TEXT: "COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK"
THOUGHTS: Strong profanity, personal attack → toxic.
OUTPUT: {{"toxic": true}}

TEXT: "Stupid piece of shit! Stop deleting my stuff, asshole, go die and fall in a hole!"
THOUGHTS: Very insulting, obscene, → toxic.
OUTPUT: {{"toxic": true}}

TEXT: "You are gay or antisemitian?"
THOUGHTS: Hate slur → toxic.
OUTPUT: {{"toxic": true}}

TEXT: "Hi! I am back again! Last warning! Stop undoing my edits or die"
THOUGHTS: “or die” is a threat → toxic.
OUTPUT: {{"toxic": true}}

TEXT: "FUCK YOUR FILTHY MOTHER IN THE ASS"
THOUGHTS: Profanity + insult → toxic.
OUTPUT: {{"toxic": true}}

TEXT: "I love cats!"
THOUGHTS: No abusive or harassing content → non-toxic.
OUTPUT: {{"toxic": false}}

TEXT: "I'm sorry to say this, but I have to fail this article's GAN. The several major problems that  brought up in this article's previous GA assessment have not been issued"
THOUGHTS: No abusive or harassing content → non-toxic.
OUTPUT: {{"toxic": false}}


Now classify this comment:
TEXT: {text}
"""

In [None]:
# Run the single-label ("toxic") prompt on the same eval_small subset.
preds_single = []
n_unparsed_single = 0  # responses that did not yield a JSON object
for txt in tqdm(eval_small['comment_text'], desc="LLM classify (toxic-only)"):
    prompt = SYSTEM_PROMPT_SINGLE_CHAIN.format(keys=KEYS_SINGLE_JSON, text=txt)
    resp   = model.generate(prompt)
    raw    = resp['results'][0]['generated_text'].strip()

    # Keep only the span from the first '{' to the last '}' so that any text
    # around the JSON object is dropped before parsing.
    start, end = raw.find('{'), raw.rfind('}')
    if start != -1 and end > start:
        raw = raw[start:end + 1]

    try:
        parsed = json.loads(raw)
    except ValueError:  # json.JSONDecodeError is a ValueError
        parsed = None

    if isinstance(parsed, dict):
        flag = parsed.get("toxic", False)
        # bool("false") is True in Python, so quoted booleans are compared
        # textually instead of being passed straight to bool().
        if isinstance(flag, str):
            flag = flag.strip().lower() in ("true", "1", "yes")
        toxic_flag = bool(flag)
    else:
        # Best-effort fallback: score as non-toxic, but count the failure so
        # it is visible instead of hidden in the metrics.
        n_unparsed_single += 1
        toxic_flag = False

    preds_single.append(int(toxic_flag))

y_pred_toxic = np.array(preds_single, dtype=int)
if n_unparsed_single:
    print(f"Warning: {n_unparsed_single} / {len(preds_single)} responses were not valid JSON and were scored as non-toxic")

In [None]:
# True labels and metrics for just the "toxic" label.
from sklearn.metrics import classification_report, roc_auc_score

# Ground-truth values are looked up by id so they line up with eval_small.
y_true_toxic = (
    tl_df.set_index('id')
         .loc[eval_small['id'], "toxic"]
         .astype(int)
         .values
)

print("=== Single-label ‘toxic’ ===")
# labels=[0, 1] pins the class order for target_names and keeps the report
# working when only one class occurs in the sample and the predictions.
print(classification_report(y_true_toxic, y_pred_toxic, labels=[0, 1], target_names=["non-toxic","toxic"], zero_division=0))

# ROC-AUC is undefined when y_true holds a single class; report that instead
# of aborting the cell. The score is computed from 0/1 predictions.
if np.unique(y_true_toxic).size < 2:
    print("ROC-AUC: n/a (only one class present in y_true)")
else:
    print(f"ROC-AUC: {roc_auc_score(y_true_toxic, y_pred_toxic):.4f}")