In [1]:
!pip install -q transformers torch gradio sentencepiece


Can't locate pip.pm in @INC (you may need to install the pip module) (@INC entries checked: /usr/lib/perl5/site_perl /usr/share/perl5/site_perl /usr/lib/perl5/vendor_perl /usr/share/perl5/vendor_perl /usr/lib/perl5/core_perl /usr/share/perl5/core_perl) at /c/Dwimperl/perl/bin/pip line 5.
BEGIN failed--compilation aborted at /c/Dwimperl/perl/bin/pip line 5.


In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import json


In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import json


In [4]:
MODEL_ID = "Vrandan/Comment-Moderation"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
)

model.eval()


`torch_dtype` is deprecated! Use `dtype` instead!


Loading weights:   0%|          | 0/104 [00:00<?, ?it/s]

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSelfAttention(
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [5]:
moderation_model = pipeline(
    task="text-classification",
    model=model,
    tokenizer=tokenizer,
    top_k=None,
    device=0 if torch.cuda.is_available() else -1
)


In [6]:
POLICIES = {
    "HATE_SPEECH": {
        "severity": "MEDIUM",
        "definition": "Hostile or demeaning language directed at individuals or groups."
    },
    "HARASSMENT": {
        "severity": "MEDIUM",
        "definition": "Aggressive or abusive language targeting a person."
    },
    "VIOLENCE": {
        "severity": "HIGH",
        "definition": "Threats or encouragement of physical harm."
    },
    "SELF_HARM": {
        "severity": "HIGH",
        "definition": "References that encourage or glorify self-harm."
    },
    "SEXUAL_CONTENT": {
        "severity": "HIGH",
        "definition": "Sexually explicit or inappropriate material."
    }
}


In [7]:
SEVERITY_ACTION = {
    "LOW": "ALLOW",
    "MEDIUM": "RESTRICT",
    "HIGH": "DISALLOW"
}


In [8]:
LABEL_POLICY_MAP = {
    "H": "HATE_SPEECH",
    "HR": "HARASSMENT",
    "V": "VIOLENCE",
    "V2": "VIOLENCE",
    "H2": "VIOLENCE",
    "SH": "SELF_HARM",
    "S": "SEXUAL_CONTENT",
    "S3": "SEXUAL_CONTENT"
}


In [9]:
def get_signals(text: str) -> dict:
    predictions = moderation_model(text)[0]
    return {p["label"]: round(p["score"], 3) for p in predictions}


In [10]:
def detect_violations(signals: dict, threshold: float = 0.1) -> list:
    violations = []

    for label, score in signals.items():
        if score >= threshold and label in LABEL_POLICY_MAP:
            policy = LABEL_POLICY_MAP[label]
            violations.append({
                "policy": policy,
                "severity": POLICIES[policy]["severity"],
                "confidence": score,
                "evidence": label
            })

    return violations


In [11]:
def decide_action(violations: list):
    if not violations:
        return "ALLOW", "Content is safe and compliant."

    rank = {"LOW": 1, "MEDIUM": 2, "HIGH": 3}
    most_severe = max(violations, key=lambda x: rank[x["severity"]])

    return (
        SEVERITY_ACTION[most_severe["severity"]],
        f"Violation of {most_severe['policy']} policy detected."
    )


In [12]:
def format_response(text, decision, violations, reasoning):
    return {
        "input_text": text,
        "final_decision": decision,
        "violations": violations,
        "reasoning": reasoning,
        "explainability": {
            "policy_based": True,
            "model_as_signal": True
        }
    }


In [13]:
def moderate_text(text: str) -> dict:
    signals = get_signals(text)
    violations = detect_violations(signals)
    decision, reasoning = decide_action(violations)
    return format_response(text, decision, violations, reasoning)


In [14]:
moderate_text("I hope you have a great day")


{'input_text': 'I hope you have a great day',
 'final_decision': 'ALLOW',
 'violations': [],
 'reasoning': 'Content is safe and compliant.',
 'explainability': {'policy_based': True, 'model_as_signal': True}}

In [15]:
moderate_text("I hate you and you deserve pain")


{'input_text': 'I hate you and you deserve pain',
 'final_decision': 'DISALLOW',
 'violations': [{'policy': 'HATE_SPEECH',
   'severity': 'MEDIUM',
   'confidence': 0.423,
   'evidence': 'H'},
  {'policy': 'VIOLENCE',
   'severity': 'HIGH',
   'confidence': 0.316,
   'evidence': 'V'},
  {'policy': 'HARASSMENT',
   'severity': 'MEDIUM',
   'confidence': 0.281,
   'evidence': 'HR'},
  {'policy': 'SELF_HARM',
   'severity': 'HIGH',
   'confidence': 0.19,
   'evidence': 'SH'}],
 'reasoning': 'Violation of VIOLENCE policy detected.',
 'explainability': {'policy_based': True, 'model_as_signal': True}}

In [16]:
# user_text = input("Enter text for moderation: ")
# result = moderate_text(user_text)
# print(json.dumps(result, indent=2))


In [17]:
import gradio as gr

def ui_moderation(text):
    return moderate_text(text)

demo = gr.Interface(
    fn=ui_moderation,
    inputs=gr.Textbox(lines=4, placeholder="Enter text here"),
    outputs=gr.JSON(),
    title="Explainable Content Moderation System",
    description="Policy-driven moderation with transparent decision logic"
)

demo.launch(share=True)


* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://46c7779b5070212ac6.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


