In [1]:
# load feature importance
import pandas as pd


# Read the per-label feature-importance table. The separator is a regex that
# accepts ';', ',' or a tab, which is why the (slower) python engine is needed.
# NOTE(review): because ',' is itself a delimiter here, a file that really uses
# decimal commas (e.g. "0,42") would be split into extra columns before the
# comma-to-dot replacement below ever runs — confirm the export format.
df_importance = pd.read_csv("feature_importance.csv", delimiter=";|,|\t", engine="python")

# clean column names
df_importance.columns = df_importance.columns.str.strip().str.lower()

# replace comma with dot and convert to float (for Danish decimal format)
df_importance['coefficient'] = (
    df_importance['coefficient']
    .astype(str)
    .str.replace(',', '.', regex=False)  # literal replacement, not a pattern
    .astype(float)
)

# Structure as dict: {label: {term: coefficient}}
# Iterating the groups directly (instead of groupby(...).apply(...)) yields the
# same mapping without triggering pandas' deprecation warning about apply()
# operating on the grouping column.
feature_map = {
    label: dict(zip(group["term"], group["coefficient"]))
    for label, group in df_importance.groupby("label")
}


import re

def get_trigger_terms(comment: str, label: str, feature_map: dict, top_n: int = 5):
    """Return the words of ``comment`` that carry weight for ``label``.

    Args:
        comment: Raw comment text. It is lower-cased and split into ``\\w+``
            tokens; duplicates are collapsed.
        label: Key to look up in ``feature_map``.
        feature_map: Mapping ``{label: {term: coefficient}}``.
        top_n: Maximum number of terms to return.

    Returns:
        Up to ``top_n`` matching terms, ordered by descending coefficient.
        Terms with equal coefficients are ordered alphabetically so the result
        is reproducible (the tokens come from a set, whose iteration order
        varies between interpreter runs). An empty list is returned when the
        comment is empty or the label has no entry in ``feature_map``.
    """
    label_weights = feature_map.get(label, {})
    if not comment or not label_weights:
        return []

    tokens = set(re.findall(r"\b\w+\b", comment.lower()))
    matches = [token for token in tokens if token in label_weights]
    # Highest coefficient first; the token itself breaks ties deterministically.
    matches.sort(key=lambda token: (-label_weights[token], token))
    return matches[:top_n]  # limit to top-N terms

  .apply(lambda g: dict(zip(g["term"], g["coefficient"])))


In [2]:
#imports
# built-in
from typing import TypedDict, Optional
from pydantic import Field

# langgraph
from langgraph.graph import StateGraph, START, END
from langchain_core.runnables.graph import MermaidDrawMethod

# local
from src.llm import LLMCaller
from src.model import predict_toxicity
from dotenv import load_dotenv
import os



python-dotenv could not parse statement starting at line 1
python-dotenv could not parse statement starting at line 2
python-dotenv could not parse statement starting at line 4
python-dotenv could not parse statement starting at line 6
python-dotenv could not parse statement starting at line 7
python-dotenv could not parse statement starting at line 8
python-dotenv could not parse statement starting at line 1
python-dotenv could not parse statement starting at line 2
python-dotenv could not parse statement starting at line 4
python-dotenv could not parse statement starting at line 6
python-dotenv could not parse statement starting at line 7
python-dotenv could not parse statement starting at line 8


In [3]:
# Authentication settings (local / VS Code runs).
# Merge the entries of a local .env file into the process environment first.
load_dotenv()

# Pull the three WX_* settings from the environment in one pass.
# Each name is bound to None when its variable is not set.
WX_API_KEY, WX_PROJECT_ID, WX_API_URL = (
    os.environ.get(variable_name)
    for variable_name in ("WX_API_KEY", "WX_PROJECT_ID", "WX_API_URL")
)

python-dotenv could not parse statement starting at line 1
python-dotenv could not parse statement starting at line 2
python-dotenv could not parse statement starting at line 4
python-dotenv could not parse statement starting at line 6
python-dotenv could not parse statement starting at line 7
python-dotenv could not parse statement starting at line 8


In [4]:
from src.llm import LLMCaller

# LLM client used by the explanation step; credentials and endpoint are the
# WX_* values read from the environment.
# NOTE(review): the captured explanation further down ends mid-sentence, which
# may be the 100-token generation cap — confirm before raising it.
model = LLMCaller(
    model_id="watsonx/ibm/granite-3-2-8b-instruct",
    api_url=WX_API_URL,
    project_id=WX_PROJECT_ID,
    api_key=WX_API_KEY,
    params=dict(temperature=0.1, top_p=0.5, max_new_tokens=100),
)

In [5]:
from typing import TypeVar, Any
import litellm
from litellm.types.utils import ModelResponse, Message
from litellm import completion
from instructor import from_litellm, Mode
from pydantic import BaseModel, create_model

class BaseResponse(BaseModel):
    """Default structured-response schema: a single string field, ``answer``."""
    answer: str

# Type variable bounded by pydantic's BaseModel; used in the annotations below.
ResponseType = TypeVar("ResponseType", bound=BaseModel)

class LLMCaller:
    """Wrapper around LiteLLM / Instructor bound to one model configuration.

    Stores the credentials, endpoint, model id and default generation
    parameters given at construction. ``invoke`` returns a parsed Pydantic
    object through the Instructor-patched client; ``chat`` returns the raw
    LiteLLM completion response.
    """

    def __init__(self, api_key: str, project_id: str, api_url: str, model_id: str, params: dict[str, Any]):
        """Store the connection settings and build the Instructor client.

        Args:
            api_key: Sent as ``apikey`` with every request.
            project_id: Sent as ``project_id`` with every request.
            api_url: Sent as ``api_base`` with every request.
            model_id: Model identifier passed as ``model``.
            params: Default generation parameters, merged into every request
                made by ``invoke`` and ``chat``.
        """
        self.api_key = api_key
        self.project_id = project_id
        self.api_url = api_url
        self.model_id = model_id
        self.params = params

        # Module-level LiteLLM flag: this affects every LiteLLM call in the
        # process, not just calls made through this instance.
        litellm.drop_params = True
        self.client = from_litellm(completion, mode=Mode.JSON)

    def _request_options(self, overrides: dict[str, Any]) -> dict[str, Any]:
        """Merge the instance's default params with per-call keyword overrides."""
        # Per-call values win over the defaults supplied at construction.
        return {**(self.params or {}), **overrides}

    def create_response_model(self, title: str, fields: dict) -> type[BaseResponse]:
        """Create a ``BaseResponse`` subclass named ``title`` with extra ``fields``.

        ``fields`` is forwarded to ``pydantic.create_model`` as keyword
        arguments, so it must follow that function's field-definition format.
        """
        return create_model(title, **fields, __base__=BaseResponse)

    def invoke(self, prompt: str, response_model: type[ResponseType] = BaseResponse, **kwargs) -> ResponseType:
        """Send ``prompt`` as a single user message and parse the structured reply.

        Args:
            prompt: Instruction text; a sentence naming the expected response
                model is appended.
            response_model: Pydantic model class the reply is parsed into.
            **kwargs: Extra request options; they override the instance's
                default ``params`` on conflict.

        Returns:
            An instance of ``response_model``.
        """
        # Name the model class itself. type(response_model) would render the
        # metaclass, which tells the LLM nothing about the expected object.
        schema_name = getattr(response_model, "__name__", str(response_model))
        response = self.client.chat.completions.create(
            model=self.model_id,
            messages=[{"role": "user", "content": f"{prompt}\n\nProvide your answer as an object of {schema_name}"}],
            project_id=self.project_id,
            apikey=self.api_key,
            api_base=self.api_url,
            response_model=response_model,
            **self._request_options(kwargs),
        )
        return response

    def chat(self, messages: list[dict[str, str] | Message], **kwargs) -> ModelResponse:
        """Run a plain LiteLLM completion over ``messages``.

        ``**kwargs`` override the instance's default ``params`` on conflict.
        """
        return completion(
            model=self.model_id,
            project_id=self.project_id,
            apikey=self.api_key,
            api_base=self.api_url,
            messages=messages,
            **self._request_options(kwargs),
        )

In [6]:
import joblib
import numpy as np

# load lr_cv_tuned
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load model artifacts that come from a trusted source.
model_path = "models/lr_cv_tuned.joblib"
bow_model = joblib.load(model_path)

def predict_toxicity(comment: str):
    """
    Predict toxic labels and confidence for a comment using the loaded BoW model.
    Also identifies trigger terms based on global feature importance.

    Args:
        comment: Raw comment text; passed to the model as a one-element batch.

    Returns:
        A tuple ``(label_str, confidence, triggers)``:
          * ``label_str`` - comma-separated predicted labels, or ``"non-toxic"``
            when no label is predicted.
          * ``confidence`` - the highest probability among the predicted labels,
            or ``1 - max(probabilities)`` for a non-toxic result.
          * ``triggers`` - de-duplicated trigger terms across the predicted
            labels, in the order they were ranked (at most five per label).

    Progress and results are printed to stdout as a side effect.
    """
    print("🧠 [MODEL] Running lr_cv_tuned.joblib on comment:")
    print(f"   \"{comment}\"")

    pred = bow_model.predict([comment])[0]       # e.g. [1, 0, 1, 0, 0, 0]
    probas = bow_model.predict_proba([comment])  # shape: (1, 6)

    class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    toxic_labels = []
    toxic_probs = []
    all_triggers = []

    for i, val in enumerate(pred):
        if val != 1:
            continue

        label = class_names[i]
        prob = probas[0][i]
        toxic_labels.append(label)
        toxic_probs.append(prob)
        print(f"   🔹 Label predicted: {label} (probability: {prob:.2f})")

        # Reuse the shared helper instead of repeating its tokenise/rank logic;
        # its default top_n of 5 matches the previous inline limit.
        label_triggers = get_trigger_terms(comment, label, feature_map)
        if label_triggers:
            print(f"   🔍 Trigger terms for '{label}': {', '.join(label_triggers)}")
            all_triggers.extend(label_triggers)

    if toxic_labels:
        label_str = ", ".join(toxic_labels)
        confidence = max(toxic_probs)
        print(f"✅ [MODEL RESULT] Predicted toxic: {label_str} (max confidence: {confidence:.2f})")
    else:
        label_str = "non-toxic"
        confidence = 1 - max(probas[0])
        print(f"✅ [MODEL RESULT] Predicted non-toxic (confidence: {confidence:.2f})")

    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # ranking survives and the result is stable across runs (a set is neither).
    return label_str, confidence, list(dict.fromkeys(all_triggers))




In [7]:
from typing import TypedDict, Optional
from pydantic import Field
from langgraph.graph import StateGraph, START, END


# define State for the agent
class CommentState(TypedDict):
    """State passed between the graph nodes for one comment."""
    # Input text to classify.
    comment: str
    # Predicted label string, filled in by the model node.
    label: Optional[str]
    # Confidence reported alongside the label.
    confidence: Optional[float]
    # Trigger terms returned by the model node. Declared here so the value has
    # a channel in the graph state.
    # NOTE(review): LangGraph appears to discard update keys that are not part
    # of the schema — confirm against the installed version.
    triggers: Optional[list[str]]
    # LLM-generated explanation of the prediction.
    explanation: Optional[str]
    # When true, the display node prints a summary.
    verbose: bool

# define Node Functions
def run_model(state: CommentState):
    """Graph node: classify ``state["comment"]`` with ``predict_toxicity``.

    Returns a state update holding the predicted ``label``, its ``confidence``
    and the ``triggers`` reported by the classifier.
    """
    print("\n🚀 [AGENT NODE] Running model node...")
    predicted_label, score, trigger_terms = predict_toxicity(state["comment"])
    return dict(label=predicted_label, confidence=score, triggers=trigger_terms)

def explain_prediction(state: CommentState):
    """Graph node: ask the LLM to explain the classifier's prediction.

    Builds a prompt from the comment, the predicted label and its confidence.
    If the state carries a non-empty ``triggers`` list, those terms are added
    to the prompt so the explanation can refer to the words the classifier
    actually matched; without them the prompt text is unchanged.

    Returns:
        A state update ``{"explanation": <LLM answer>}``.
    """
    # .get(): the key is absent unless the state schema carries it through.
    trigger_terms = state.get("triggers") or []
    trigger_note = ""
    if trigger_terms:
        trigger_note = (
            "\n    Words from the comment that the model weights most for the predicted label(s): "
            f"{', '.join(trigger_terms)}\n"
        )

    # A missing confidence would make the ':.2f' format spec raise a TypeError.
    confidence = state.get("confidence")
    confidence_text = f"{confidence:.2f}" if confidence is not None else "unknown"

    # With an empty trigger_note the placeholder line renders as the same
    # single blank line the prompt had before.
    prompt = f"""
    A machine learning model classified the following comment as '{state['label']}' with {confidence_text} confidence.

    Comment: "{state['comment']}"
{trigger_note}
    Please explain step-by-step why this label is appropriate.
    Mention specific words or tone that could influence the model.
    """
    response = model.invoke(prompt)
    return {"explanation": response.answer}

def display_result(state: CommentState):
    """Graph node: print a summary of the comment, prediction and explanation.

    Nothing is printed unless ``state["verbose"]`` is truthy; a missing
    ``verbose`` key is treated as ``False`` rather than raising ``KeyError``
    at the very end of the run. Missing or ``None`` label, confidence or
    explanation values are rendered as placeholders instead of raising.

    Returns:
        An empty dict (this node makes no state changes).
    """
    if not state.get("verbose", False):
        return {}

    label = state.get("label")
    confidence = state.get("confidence")
    label_text = label.upper() if label is not None else "UNKNOWN"
    confidence_text = f"{confidence:.2f}" if confidence is not None else "n/a"

    separator = "=" * 60
    print(separator)
    print(f"📝 Comment: {state['comment']}")
    print(f"📣 Prediction: {label_text} ({confidence_text} confidence)")
    print(f"🔍 Explanation:\n{state.get('explanation')}")
    print(separator)
    return {}

# Assemble the graph: three nodes executed strictly one after another.
graph = StateGraph(CommentState)

# Register the nodes in execution order.
for _node_name, _node_fn in (
    ("run_model", run_model),
    ("explain", explain_prediction),
    ("display", display_result),
):
    graph.add_node(_node_name, _node_fn)

# Connect each stage to the next: START -> run_model -> explain -> display -> END.
_stages = (START, "run_model", "explain", "display", END)
for _upstream, _downstream in zip(_stages, _stages[1:]):
    graph.add_edge(_upstream, _downstream)

compiled_graph = graph.compile()

In [8]:
# Example comments, each tagged with how the classifier handled it on the Reddit predictions.
sample_comment = "wtf is grandpa talking about?  Get him his medicine and put him to bed!" # true toxic (from the Reddit predictions)
sample_comment1 = "Playing for Real Madrid really seems to inflate egos massively doesn't it?" # true non-toxic (from the Reddit predictions)
sample_comment2 = "I’m not trying to get banned, but sleezy fucktard is the phrase that comes to mind." # false negative


import time

# Small delay: IBM limits how many requests can be sent within an 8-second window.
time.sleep(1.0)

# Run the full graph on one sample; verbose=True makes the display node print the summary.
result = compiled_graph.invoke({
    "comment": sample_comment2,
    "verbose": True
})


🚀 [AGENT NODE] Running model node...
🧠 [MODEL] Running lr_cv_tuned.joblib on comment:
   "I’m not trying to get banned, but sleezy fucktard is the phrase that comes to mind."
✅ [MODEL RESULT] Predicted non-toxic (confidence: 0.95)
📝 Comment: I’m not trying to get banned, but sleezy fucktard is the phrase that comes to mind.
📣 Prediction: NON-TOXIC (0.95 confidence)
🔍 Explanation:
The comment is classified as 'non-toxic' due to the following reasons:

1. Contextual understanding: The model likely understands the context in which the phrase 'sleezy fucktard' is used. In this case, it is used to express frustration or disdain towards a specific individual, rather than as a direct insult or attack on a group or individual's characteristics.

2. Tone and intent: The comment begins with 'I’m not trying to get banned,' which indicates the user's awareness of community guidelines and their intention not to violate them. This tone suggests a level of self-restraint and consideration, which can