In [1]:
!pip install -q langchain transformers accelerate sentencepiece
!pip install -q langchain-community

In [2]:
!pip install -q gradio transformers torch accelerate sentencepiece

In [3]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    pipeline
)




In [4]:
MODEL_NAME = "Vrandan/Comment-Moderation"

In [5]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
)

model.eval()


`torch_dtype` is deprecated! Use `dtype` instead!


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [7]:
moderation_pipeline = pipeline(
    task="text-classification",
    model=model,
    tokenizer=tokenizer,
    return_all_scores=True,
    device=0 if torch.cuda.is_available() else -1
)


Device set to use cpu


In [8]:
test_text = "I hope you have a great day!"

output = moderation_pipeline(test_text)
output


[[{'label': 'S', 'score': 0.01796688884496689},
  {'label': 'H', 'score': 0.01684916950762272},
  {'label': 'V', 'score': 0.017463263124227524},
  {'label': 'HR', 'score': 0.01866951957345009},
  {'label': 'SH', 'score': 0.018865317106246948},
  {'label': 'S3', 'score': 0.008961008861660957},
  {'label': 'H2', 'score': 0.0071882931515574455},
  {'label': 'V2', 'score': 0.010438548400998116},
  {'label': 'OK', 'score': 0.9388498067855835}]]

In [9]:
test_text = "hate her, gonna beat her!"

output = moderation_pipeline(test_text)
output


[[{'label': 'S', 'score': 0.1803099662065506},
  {'label': 'H', 'score': 0.7862440943717957},
  {'label': 'V', 'score': 0.6674680113792419},
  {'label': 'HR', 'score': 0.5422320365905762},
  {'label': 'SH', 'score': 0.15856768190860748},
  {'label': 'S3', 'score': 0.14313942193984985},
  {'label': 'H2', 'score': 0.458370178937912},
  {'label': 'V2', 'score': 0.17443899810314178},
  {'label': 'OK', 'score': 0.047905780375003815}]]

In [10]:
SEVERITY_LEVELS = {
    "LOW": {
        "description": "Mild or contextual issues with no direct harm"
    },
    "MEDIUM": {
        "description": "Potentially harmful content requiring restriction or warning"
    },
    "HIGH": {
        "description": "Clear harmful or unsafe content requiring removal"
    }
}

In [11]:
MODERATION_POLICIES = {
    "HATE_SPEECH": {
        "description": "Content that targets individuals or groups based on protected characteristics with abusive or demeaning language.",
        "severity": "MEDIUM",
        "examples": [
            "insulting a protected group",
            "calling for harm against a community"
        ]
    },

    "HARASSMENT": {
        "description": "Abusive or threatening language directed at an individual or group.",
        "severity": "MEDIUM",
        "examples": [
            "personal insults",
            "threatening tone without violence"
        ]
    },

    "VIOLENCE": {
        "description": "Content that promotes, glorifies, or threatens physical harm.",
        "severity": "HIGH",
        "examples": [
            "threats of physical attack",
            "encouraging violence"
        ]
    },

    "SELF_HARM": {
        "description": "Content that promotes or encourages self-harm or suicide.",
        "severity": "HIGH",
        "examples": [
            "encouraging self-harm",
            "suicide ideation"
        ]
    },

    "SEXUAL_CONTENT": {
        "description": "Explicit sexual content not suitable for general audiences.",
        "severity": "HIGH",
        "examples": [
            "explicit sexual descriptions"
        ]
    },

    "SAFE_CONTENT": {
        "description": "Content that does not violate any policy.",
        "severity": "LOW",
        "examples": [
            "neutral conversation",
            "positive statements"
        ]
    }
}


In [12]:
DECISION_RULES = {
    "LOW": "ALLOW",
    "MEDIUM": "RESTRICT",
    "HIGH": "DISALLOW"
}

In [13]:
def build_reasoning_prompt(user_text, model_scores, policies):
    """
    Constructs a policy-aware reasoning prompt.
    """

    policy_descriptions = "\n".join(
        [f"- {k}: {v['description']}" for k, v in policies.items()]
    )

    prompt = f"""
You are a content moderation reasoning system.

Your task:
1. Analyze the user-generated content.
2. Compare it strictly against the moderation policies below.
3. Identify which policy (if any) is violated.
4. Explain the reasoning using policy language only.
5. If no policy is violated, clearly state that the content is safe.

Moderation Policies:
{policy_descriptions}

Model Signals (for reference, not final decision):
{model_scores}

User Content:
\"\"\"{user_text}\"\"\"

Respond in a structured and concise manner.
"""
    return prompt

In [14]:
def get_model_signals(text):
    """
    Runs the Hugging Face moderation model
    and returns label-score pairs.
    """
    results = moderation_pipeline(text)[0]

    signals = {item["label"]: round(item["score"], 4) for item in results}
    return signals

In [15]:
user_text = "I hate all of you and you deserve pain."

model_signals = get_model_signals(user_text)

reasoning_prompt = build_reasoning_prompt(
    user_text=user_text,
    model_scores=model_signals,
    policies=MODERATION_POLICIES
)

print(reasoning_prompt)


You are a content moderation reasoning system.

Your task:
1. Analyze the user-generated content.
2. Compare it strictly against the moderation policies below.
3. Identify which policy (if any) is violated.
4. Explain the reasoning using policy language only.
5. If no policy is violated, clearly state that the content is safe.

Moderation Policies:
- HATE_SPEECH: Content that targets individuals or groups based on protected characteristics with abusive or demeaning language.
- HARASSMENT: Abusive or threatening language directed at an individual or group.
- VIOLENCE: Content that promotes, glorifies, or threatens physical harm.
- SELF_HARM: Content that promotes or encourages self-harm or suicide.
- SEXUAL_CONTENT: Explicit sexual content not suitable for general audiences.
- SAFE_CONTENT: Content that does not violate any policy.

Model Signals (for reference, not final decision):
{'S': 0.047, 'H': 0.2959, 'V': 0.24, 'HR': 0.2156, 'SH': 0.198, 'S3': 0.032, 'H2': 0.0598, 'V2': 0.0481,

In [16]:
def extract_policy_violation(model_signals, policies, threshold=0.1):
    violated_policies = []

    for model_label, score in model_signals.items():
        if score >= threshold and model_label in MODEL_TO_POLICY_MAP:
            policy_name = MODEL_TO_POLICY_MAP[model_label]

            violated_policies.append({
                "policy": policy_name,
                "severity": policies[policy_name]["severity"],
                "confidence": round(score, 3),
                "evidence_label": model_label
            })

    return violated_policies

In [17]:
def make_moderation_decision(violations, decision_rules):
    """
    Determines final moderation decision based on highest severity.
    """
    if not violations:
        return "ALLOW", "No policy violations detected."

    # Highest severity wins
    severity_rank = {"LOW": 1, "MEDIUM": 2, "HIGH": 3}
    top_violation = max(
        violations,
        key=lambda x: severity_rank[x["severity"]]
    )

    decision = decision_rules[top_violation["severity"]]
    reasoning = (
        f"Content violates {top_violation['policy']} policy "
        f"with {top_violation['severity']} severity."
    )

    return decision, reasoning


In [18]:
def build_response(
    user_text,
    decision,
    violations,
    reasoning,
):
    """
    Returns structured, explainable moderation output.
    """
    return {
        "input_text": user_text,
        "decision": decision,
        "violated_policies": violations,
        "reasoning": reasoning,
        "explainability": {
            "policy_driven": True,
            "model_used_as_signal": True,
            "human_readable_reasoning": True
        }
    }


In [19]:
def moderate_content(text):
    # Step 1: Get model signals
    model_signals = get_model_signals(text)

    # Step 2: Extract policy violations
    violations = extract_policy_violation(
        model_signals,
        MODERATION_POLICIES
    )

    # Step 3: Make final decision
    decision, reasoning = make_moderation_decision(
        violations,
        DECISION_RULES
    )

    # Step 4: Build structured output
    response = build_response(
        text,
        decision,
        violations,
        reasoning,
    )

    return response


In [20]:
text = "I hate all of you and you deserve pain."
raw_output = moderation_pipeline(text)
print("Raw model output:")
print(raw_output)

MODEL_TO_POLICY_MAP = {
    "H": "HATE_SPEECH",
    "HR": "HARASSMENT",
    "V": "VIOLENCE",
    "SH": "SELF_HARM",
    "S": "SEXUAL_CONTENT",
    "S3": "SEXUAL_CONTENT", # S3 usually indicates more severe sexual content, mapping to general sexual policy for now.
    "H2": "VIOLENCE",        # H2 indicates hate speech with a threat, which aligns with the VIOLENCE policy.
    "V2": "VIOLENCE",        # V2 indicates graphic violence, mapping to the VIOLENCE policy.
}

result_fixed = moderate_content(text)
print("\nModeration result with corrected policy mapping:")
print(result_fixed)

Raw model output:
[[{'label': 'S', 'score': 0.046954259276390076}, {'label': 'H', 'score': 0.2959465980529785}, {'label': 'V', 'score': 0.23995475471019745}, {'label': 'HR', 'score': 0.21563826501369476}, {'label': 'SH', 'score': 0.19797274470329285}, {'label': 'S3', 'score': 0.032017480581998825}, {'label': 'H2', 'score': 0.0597618892788887}, {'label': 'V2', 'score': 0.04814678058028221}, {'label': 'OK', 'score': 0.17626331746578217}]]

Moderation result with corrected policy mapping:
{'input_text': 'I hate all of you and you deserve pain.', 'decision': 'DISALLOW', 'violated_policies': [{'policy': 'HATE_SPEECH', 'severity': 'MEDIUM', 'confidence': 0.296, 'evidence_label': 'H'}, {'policy': 'VIOLENCE', 'severity': 'HIGH', 'confidence': 0.24, 'evidence_label': 'V'}, {'policy': 'HARASSMENT', 'severity': 'MEDIUM', 'confidence': 0.216, 'evidence_label': 'HR'}, {'policy': 'SELF_HARM', 'severity': 'HIGH', 'confidence': 0.198, 'evidence_label': 'SH'}], 'reasoning': 'Content violates VIOLENCE p

In [21]:
sample_text = "I like you so much you are my favourite"
result = moderate_content(sample_text)
result

{'input_text': 'I like you so much you are my favourite',
 'decision': 'ALLOW',
 'violated_policies': [],
 'reasoning': 'No policy violations detected.',
 'explainability': {'policy_driven': True,
  'model_used_as_signal': True,
  'human_readable_reasoning': True}}

In [22]:
sample_text = "I hate your content. It gives me so much pain watching"
result = moderate_content(sample_text)
result

{'input_text': 'I hate your content. It gives me so much pain watching',
 'decision': 'DISALLOW',
 'violated_policies': [{'policy': 'SEXUAL_CONTENT',
   'severity': 'HIGH',
   'confidence': 0.306,
   'evidence_label': 'S'},
  {'policy': 'HARASSMENT',
   'severity': 'MEDIUM',
   'confidence': 0.1,
   'evidence_label': 'HR'},
  {'policy': 'SELF_HARM',
   'severity': 'HIGH',
   'confidence': 0.16,
   'evidence_label': 'SH'}],
 'reasoning': 'Content violates SEXUAL_CONTENT policy with HIGH severity.',
 'explainability': {'policy_driven': True,
  'model_used_as_signal': True,
  'human_readable_reasoning': True}}

In [23]:
sample_text = "Wow thats ammazing"
result = moderate_content(sample_text)
result

{'input_text': 'Wow thats ammazing',
 'decision': 'ALLOW',
 'violated_policies': [],
 'reasoning': 'No policy violations detected.',
 'explainability': {'policy_driven': True,
  'model_used_as_signal': True,
  'human_readable_reasoning': True}}

You can use the built-in `input()` function in Python to get input directly from the user in a Colab code cell. When you run the cell, a text box will appear for the user to type their input.

In [24]:
user_text_input = input("Please enter the text you want to moderate: ")

# Now you can use 'user_text_input' with your moderation function
moderation_result = moderate_content(user_text_input)
print("\nModeration Result:")
print(moderation_result)

Please enter the text you want to moderate: i hate it

Moderation Result:
{'input_text': 'i hate it', 'decision': 'DISALLOW', 'violated_policies': [{'policy': 'HATE_SPEECH', 'severity': 'MEDIUM', 'confidence': 0.119, 'evidence_label': 'H'}, {'policy': 'HARASSMENT', 'severity': 'MEDIUM', 'confidence': 0.103, 'evidence_label': 'HR'}, {'policy': 'SELF_HARM', 'severity': 'HIGH', 'confidence': 0.111, 'evidence_label': 'SH'}], 'reasoning': 'Content violates SELF_HARM policy with HIGH severity.', 'explainability': {'policy_driven': True, 'model_used_as_signal': True, 'human_readable_reasoning': True}}


In [25]:
import gradio as gr

def gradio_moderation_interface(text):
    result = moderate_content(text)
    return result

demo = gr.Interface(
    fn=gradio_moderation_interface,
    inputs=gr.Textbox(
        lines=4,
        placeholder="Enter text for moderation"
    ),
    outputs="json",
    title="Policy-Driven LLM Content Moderation",
    description="LLM-assisted, policy-based moderation with explainable reasoning"
)

demo.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://6f29c41d6f531066ef.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [26]:
# Import the Python SDK
import google.generativeai as genai
# Used to securely store your API key
from google.colab import userdata

# Configure the API key
GOOGLE_API_KEY=userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

print("Gemini API configured.")

Gemini API configured.


In [27]:
# Initialize the Gemini GenerativeModel
# Using 'gemini-pro' for general text generation
gemini_reasoning_model = genai.GenerativeModel('gemini-pro')

print("Gemini Generative Model initialized.")

Gemini Generative Model initialized.


In [28]:
def moderate_content(text):
    # Step 1: Get model signals
    model_signals = get_model_signals(text)

    # Step 2: Extract policy violations
    violations = extract_policy_violation(
        model_signals,
        MODERATION_POLICIES
    )

    # Step 3: Make final decision
    decision, reasoning = make_moderation_decision(
        violations,
        DECISION_RULES
    )

    # Step 4: Build structured output
    response = build_response(
        text,
        decision,
        violations,
        reasoning,
    )

    # Step 5: Generate Gemini explanation if violations exist
    if violations:
        # Build a more comprehensive prompt for Gemini
        detailed_reasoning_prompt = build_reasoning_prompt(
            user_text=text,
            model_scores=model_signals,
            policies=MODERATION_POLICIES
        )

        # Add a final instruction for Gemini to explain the decision
        detailed_reasoning_prompt += f"\n\nBased on the above, explain the moderation decision '{decision}' for the user content, specifically mentioning any triggering words/phrases and relating them to the violated policies ({', '.join(v['policy'] for v in violations)})."

        try:
            gemini_response = gemini_reasoning_model.generate_content(detailed_reasoning_prompt)
            response["gemini_explanation"] = gemini_response.text
        except Exception as e:
            response["gemini_explanation"] = f"Error generating Gemini explanation: {e}"
    else:
        response["gemini_explanation"] = "No specific reasoning generated as no policy violations were detected."

    return response

In [29]:
def build_gemini_explanation_prompt(moderation_result):
    """
    Builds a safe, policy-faithful prompt for Gemini
    to explain the moderation outcome.
    """

    return f"""
You are an AI assistant explaining a content moderation decision
to a non-technical user.

IMPORTANT RULES:
- Do NOT change the decision.
- Do NOT introduce new policies.
- Do NOT add assumptions.
- Explain ONLY based on the information provided.

Moderation Decision:
{moderation_result["decision"]}

Reasoning (policy-based):
{moderation_result["reasoning"]}

Violations:
{moderation_result["violated_policies"]}

Explain clearly and politely why this content was allowed or disallowed.
"""


In [30]:
import json
import time

user_input = input("Please enter the text you want to moderate: ")

moderation_result = moderate_content(user_input)

print("\n--- Detailed Moderation Result ---")
print(json.dumps(moderation_result, indent=2))

print("\n--- Gemini Explanation ---")

gemini_prompt = build_gemini_explanation_prompt(moderation_result)

try:
    print("Calling Gemini...")
    start_time = time.time()

    response = gemini_reasoning_model.generate_content(gemini_prompt)

    print("Gemini responded in", round(time.time() - start_time, 2), "seconds\n")
    print(response.text)

except Exception as e:
    print("Gemini explanation unavailable.")
    print("Fallback explanation:")
    print(moderation_result["reasoning"])


Please enter the text you want to moderate: I hate sleping

--- Detailed Moderation Result ---
{
  "input_text": "I hate sleping",
  "decision": "DISALLOW",
  "violated_policies": [
    {
      "policy": "SEXUAL_CONTENT",
      "severity": "HIGH",
      "confidence": 0.108,
      "evidence_label": "S"
    },
    {
      "policy": "HATE_SPEECH",
      "severity": "MEDIUM",
      "confidence": 0.26,
      "evidence_label": "H"
    },
    {
      "policy": "VIOLENCE",
      "severity": "HIGH",
      "confidence": 0.14,
      "evidence_label": "V"
    },
    {
      "policy": "HARASSMENT",
      "severity": "MEDIUM",
      "confidence": 0.227,
      "evidence_label": "HR"
    }
  ],
  "reasoning": "Content violates SEXUAL_CONTENT policy with HIGH severity.",
  "explainability": {
    "policy_driven": true,
    "model_used_as_signal": true,
    "human_readable_reasoning": true
  },
  "gemini_explanation": "Error generating Gemini explanation: Invalid leading whitespace, reserved character(s

In [31]:
# Explanation templates for policy-based reasoning

POSITIVE_EXPLANATION = (
    "The content was reviewed against all moderation policies and was found to be safe. "
    "It does not contain hate speech, harassment, violence, self-harm references, "
    "or any other restricted content."
)

POLICY_EXPLANATION_TEMPLATES = {
    "HATE_SPEECH": (
        "The content contains hostile or demeaning language directed at a person or group, "
        "which violates the Hate Speech policy."
    ),
    "HARASSMENT": (
        "The content includes abusive or aggressive language directed at an individual, "
        "which falls under the Harassment policy."
    ),
    "VIOLENCE": (
        "The content expresses or promotes physical harm or threats, such as wishing pain "
        "or injury, which violates the Violence policy."
    ),
    "SELF_HARM": (
        "The content references or encourages self-harm, which violates the Self-Harm policy."
    ),
    "SEXUAL_CONTENT": (
        "The content contains sexually explicit or inappropriate references that are "
        "not suitable for general audiences."
    )
}


In [32]:
def generate_policy_explanation(decision, violations):
    """
    Generates a human-readable explanation for both
    allowed and disallowed content using policy templates.
    """

    # Case 1: Content is allowed
    if decision == "ALLOW":
        return POSITIVE_EXPLANATION

    # Case 2: Content is disallowed or restricted
    explanations = []

    for v in violations:
        policy = v["policy"]
        if policy in POLICY_EXPLANATION_TEMPLATES:
            explanations.append(POLICY_EXPLANATION_TEMPLATES[policy])

    # Remove duplicates while preserving order
    explanations = list(dict.fromkeys(explanations))

    return " ".join(explanations)


In [33]:
import gradio as gr

def gradio_moderation_interface(text):
    result = moderate_content(text)
    return result

demo = gr.Interface(
    fn=gradio_moderation_interface,
    inputs=gr.Textbox(
        lines=4,
        placeholder="Enter text for moderation"
    ),
    outputs="json",
    title="Policy-Driven LLM Content Moderation",
    description="LLM-assisted, policy-based moderation with explainable reasoning"
)

demo.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://f2b7de7bce2633730c.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [34]:
import json

def gradio_moderation_interface(text):
    result = moderate_content(text)
    full_result_json = json.dumps(result, indent=2)
    gemini_explanation = result.get("gemini_explanation", "No explanation available.")
    return full_result_json, gemini_explanation


**Reasoning**:
Now that the `gradio_moderation_interface` function is updated to return two distinct outputs (JSON string and Gemini explanation), the Gradio `Interface` needs to be modified to handle these two outputs using `gr.JSON` and `gr.Textbox` components respectively.



In [35]:
import gradio as gr

def gradio_moderation_interface(text):
    result = moderate_content(text)
    full_result_json = json.dumps(result, indent=2)
    gemini_explanation = result.get("gemini_explanation", "No explanation available.")
    return full_result_json, gemini_explanation

demo = gr.Interface(
    fn=gradio_moderation_interface,
    inputs=gr.Textbox(
        lines=4,
        placeholder="Enter text for moderation"
    ),
    outputs=[
        gr.JSON(label="Full Moderation Result"),
        gr.Textbox(label="Gemini Explanation", lines=10)
    ],
    title="Policy-Driven LLM Content Moderation",
    description="LLM-assisted, policy-based moderation with explainable reasoning"
)

demo.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://74481bce78e9248363.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


