In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np

# Load the pretrained tokenizer and sequence-classification model by name.
# NOTE: from_pretrained() fetches the weights on first use and caches them.
model_name = "unitary/toxic-bert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Human-readable names for the model's output logits, in logit order.
# The checkpoint emits: toxic, severe_toxic, obscene, threat, insult,
# identity_hate -- so 'threat' belongs at index 3 and 'identity_attack' at
# index 5 (the previous list had these two swapped).
# NOTE(review): confirm against model.config.id2label if the checkpoint changes.
labels = ['toxicity', 'severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack']

def predict_toxicity(text, threshold=0.5):
    """Classify ``text`` as "appropriate" or "inappropriate".

    The text is tokenized with the module-level ``tokenizer`` and run through
    the module-level ``model`` with gradient tracking disabled. A sigmoid is
    applied to each logit, giving one independent score per entry in
    ``labels``.

    Args:
        text: Input to score. Only the first row of the model output is
            inspected, so if a batch is passed just its first item decides
            the result.
        threshold: A label is flagged when its score is strictly greater than
            this value. Defaults to 0.5, the original hard-coded cut-off.

    Returns:
        "inappropriate" if at least one label scores above ``threshold``,
        otherwise "appropriate".
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)

    # Sigmoid (not softmax): each label is scored on its own, so several
    # labels can be high at once. tolist() yields plain Python floats.
    scores = torch.sigmoid(outputs.logits)[0].tolist()
    results = dict(zip(labels, scores))

    # Any single label above the threshold is enough to flag the text.
    toxic_flags = [label for label, score in results.items() if score > threshold]

    return "inappropriate" if toxic_flags else "appropriate"


  from .autonotebook import tqdm as notebook_tqdm


In [16]:
# Smoke-test the classifier on a few sample sentences; one verdict per line.
sample_texts = (
    "very bad wesbite and good content",
    "bad person for wesbite",
    "You are a good !",
)
for sample in sample_texts:
    print(predict_toxicity(sample))


appropriate
appropriate
appropriate
