### Real-Time Content Moderation System
This notebook presents a real-time content moderation pipeline that integrates toxicity detection and fact verification models. It acts as a safeguard before LLM-generated content is shown to users.

### Step 1: Install Required Libraries
Install the required libraries before proceeding:

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel; -q trims the download log.
%pip install -q transformers torch

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

### Step 2: Import Libraries

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

### Step 3: Define the Safety Ensemble Class
This class loads a toxicity detection model and a natural language inference (NLI) model used for fact checking, and provides a single `analyze` method that runs both on a piece of text.

In [None]:
class SafetyEnsemble:
    """Toxicity and veracity screening built on two sequence classifiers.

    Both models consume encodings produced by one shared tokenizer.
    ``analyze`` returns the toxicity probability, the NLI label with its
    probabilities, and a ``block`` flag that is set when the toxicity
    probability exceeds ``toxicity_threshold`` or the NLI label is
    ``"contradiction"``.
    """

    # Hugging Face Hub identifiers of the checkpoints loaded in __init__.
    TOXICITY_CHECKPOINT = "facebook/roberta-hate-speech-dynabench-r4-target"
    FACTCHECK_CHECKPOINT = "ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli"
    TOKENIZER_CHECKPOINT = "roberta-base"

    # Output index -> label for the fact-check model.
    # NOTE(review): hard-coded order; confirm against the checkpoint's config.id2label.
    NLI_LABELS = {0: "entailment", 1: "neutral", 2: "contradiction"}

    # Encodings are truncated to this many tokens.
    MAX_LENGTH = 512

    def __init__(self, hf_token=None, toxicity_threshold=0.9):
        """
        Initialize safety models with optional Hugging Face token

        Args:
            hf_token: Optional Hugging Face access token forwarded to every
                ``from_pretrained`` call.
            toxicity_threshold: Toxicity probability above which ``analyze``
                sets ``block`` to True.
        """
        self.toxicity_threshold = toxicity_threshold

        # `token` is the current keyword; `use_auth_token` is deprecated.
        # Load toxicity detection model
        self.toxicity_model = AutoModelForSequenceClassification.from_pretrained(
            self.TOXICITY_CHECKPOINT,
            token=hf_token
        )

        # Load fact verification model (natural language inference)
        self.factcheck_model = AutoModelForSequenceClassification.from_pretrained(
            self.FACTCHECK_CHECKPOINT,
            token=hf_token
        )

        # Load a shared tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.TOKENIZER_CHECKPOINT,
            token=hf_token
        )

    def analyze(self, text, evidence=None):
        """
        Analyze text for toxicity and factual accuracy

        Args:
            text: The text to screen.
            evidence: Optional reference text. When given, the NLI model
                receives the pair (evidence, text), i.e. evidence as the first
                sequence and ``text`` as the second. When omitted, the NLI
                model is run on ``text`` alone.
                NOTE(review): an NLI label for a single sequence has no
                premise to compare against, so treat it with caution.

        Returns:
            dict with keys ``toxicity_risk`` (float), ``fact_accuracy``
            (``label`` plus rounded ``probabilities``) and ``block`` (bool).
        """
        text_inputs = self._encode(text)
        # The toxicity model always sees only the text being screened.
        nli_inputs = text_inputs if evidence is None else self._encode(evidence, text)

        tox_score = self._get_toxicity_score(text_inputs)
        veracity, veracity_probs = self._get_veracity(nli_inputs)

        return {
            "toxicity_risk": tox_score,
            "fact_accuracy": {
                "label": veracity,
                "probabilities": veracity_probs
            },
            "block": tox_score > self.toxicity_threshold or veracity == "contradiction"
        }

    def _encode(self, text, text_pair=None):
        """Tokenize one sequence (or a pair) into truncated PyTorch tensors."""
        return self.tokenizer(
            text,
            text_pair=text_pair,
            return_tensors="pt",
            truncation=True,
            max_length=self.MAX_LENGTH,
        )

    def _get_toxicity_score(self, inputs):
        """Return probability of toxicity (0 to 1)"""
        self.toxicity_model.eval()
        with torch.no_grad():
            logits = self.toxicity_model(**inputs).logits

        if logits.shape[1] == 1:
            # Single-logit head: squash the lone logit into a probability.
            return torch.sigmoid(logits).item()
        # Multi-class head: class index 1 of the first row is taken as the toxic class.
        return torch.softmax(logits, dim=1)[0][1].item()

    def _get_veracity(self, inputs):
        """Use an NLI model to assess veracity

        Returns:
            Tuple of (label, probabilities): the label of the highest-scoring
            class and the per-class probabilities rounded to 4 decimals.
        """
        self.factcheck_model.eval()
        with torch.no_grad():
            outputs = self.factcheck_model(**inputs)

        probs = torch.softmax(outputs.logits, dim=1)[0]
        label_idx = torch.argmax(probs).item()

        return self.NLI_LABELS[label_idx], [round(p.item(), 4) for p in probs]

### Step 4: Define a Dummy LLM for Testing

In [None]:
class DummyLLM:
    """Stand-in language model used to exercise the pipeline without a real LLM."""

    def generate(self, prompt: str):
        """Return a canned reply that echoes ``prompt`` under the ``generated_text`` key."""
        canned_reply = f"This is a safe response to: '{prompt}'."
        return dict(generated_text=canned_reply)


### Step 5: Define the Moderation Pipeline

In [None]:
def main():
    """Run an interactive moderation demo on text typed by the user.

    Each non-empty input is analyzed by ``SafetyEnsemble``; the scores are
    printed, and the dummy LLM is only asked for a response when the analysis
    does not flag the input. Typing ``quit`` or ``exit`` ends the loop, as
    does closing stdin or interrupting the prompt.

    NOTE(review): only the user's prompt is screened here; the generated text
    is printed without being analyzed.
    """
    # Same 0.9 cut-off that SafetyEnsemble() applies by default when it
    # computes analysis['block']; used only to word the block reason.
    toxicity_threshold = 0.9

    safety = SafetyEnsemble()
    llm = DummyLLM()

    print("Safety Analysis Demo (type 'quit' to exit)")

    while True:
        try:
            prompt = input("\nEnter text to analyze: ").strip()
        except (EOFError, KeyboardInterrupt):
            # stdin was closed or the prompt was interrupted: leave quietly
            # instead of surfacing a traceback.
            print()
            break

        if prompt.lower() in ('quit', 'exit'):
            break
        if not prompt:
            # Nothing to moderate; do not spend two model passes on "".
            continue

        analysis = safety.analyze(prompt)
        veracity = analysis['fact_accuracy']
        probs = veracity['probabilities']

        print("\nAnalysis Results:")
        print(f"Toxicity Risk: {analysis['toxicity_risk']:.4f}")
        print(f"Fact Accuracy: {veracity['label']}")
        print(f"  Probabilities: [E: {probs[0]:.2f}, "
              f"N: {probs[1]:.2f}, "
              f"C: {probs[2]:.2f}]")

        if analysis['block']:
            print("\n❌ Blocked - Reason:", _block_reason(analysis, toxicity_threshold))
        else:
            response = llm.generate(prompt)
            print(f"\n✅ Allowed - Generated response: {response['generated_text']}")


def _block_reason(analysis, toxicity_threshold):
    """Word the reason a flagged analysis result was blocked."""
    reasons = []
    if analysis['toxicity_risk'] > toxicity_threshold:
        reasons.append("High toxicity risk")
    if analysis['fact_accuracy']['label'] == "contradiction":
        # Lower-case when it follows the toxicity reason in the same sentence.
        reasons.append("factual contradiction" if reasons else "Factual contradiction")
    # A blocked result that matches neither check is reported as a contradiction.
    return " and ".join(reasons) or "Factual contradiction"


### Step 6: Run the Main Function

In [None]:
# Start the interactive demo only when this code runs as the entry-point
# module, not when it is imported from elsewhere.
# NOTE(review): inside a notebook kernel this guard presumably always holds,
# so executing this cell launches the blocking input loop — confirm.
if __name__ == "__main__":
    main()