In [1]:
import re

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Sentence-level moderation classifier (used by moderate_text) and its tokenizer
model_name = "KoalaAI/Text-Moderation"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# toxic-BERT text-classification pipeline (used by mask_toxic_words),
# asked to report the score of every label rather than only the top one
moderation_model = pipeline(
    task="text-classification",
    model="unitary/toxic-bert",
    return_all_scores=True,
)

Device set to use cpu


In [16]:
# Example usage: run the moderation check on a single sentence.
# NOTE: moderate_text is defined in a later cell of this notebook, so that
# cell has to be executed before this one.
text_input = "I don't like you, but I like to offend my brother, sometimes fuck my life"
moderation_result = moderate_text(text_input)
print(moderation_result)

Label: H - Probability: 0.0695
Label: H2 - Probability: 0.0464
Label: HR - Probability: 0.1206
Label: OK - Probability: 0.0458
Label: S - Probability: 0.0521
Label: S3 - Probability: 0.0266
Label: SH - Probability: 0.4109
Label: V - Probability: 0.1711
Label: V2 - Probability: 0.0570
('H', 0.06954085826873779)
{'H': 0.06954085826873779, 'H2': 0.046366188675165176, 'HR': 0.12059610337018967, 'OK': 0.04579153656959534, 'S': 0.05209412798285484, 'S3': 0.026556119322776794, 'SH': 0.4109174311161041, 'V': 0.17109625041484833, 'V2': 0.057041388005018234}
False


In [23]:
# Sample sentences used to spot-check moderate_text
test_data = [
    "This sucks, you are the worst",
    "How wonderful! but I will kill you",
    "Fuck you, I hope you have a great day",
    "Hindus are devil",
    "c++ is so complex, it is going to be the end of me."
]


# Print the value returned by moderate_text for each sample
# (moderate_text itself also prints its per-label probabilities)
for test in test_data:
    print(moderate_text(test))

Label: H - Probability: 0.0325
Label: H2 - Probability: 0.0038
Label: HR - Probability: 0.0147
Label: OK - Probability: 0.8934
Label: S - Probability: 0.0046
Label: S3 - Probability: 0.0031
Label: SH - Probability: 0.0305
Label: V - Probability: 0.0122
Label: V2 - Probability: 0.0052
This sucks, you are the worst
{'H': 0.03250180929899216, 'H2': 0.0038343053311109543, 'HR': 0.014703861437737942, 'OK': 0.8933942914009094, 'S': 0.0045925891026854515, 'S3': 0.0031032410915941, 'SH': 0.03048069030046463, 'V': 0.012174390256404877, 'V2': 0.005214716773480177}
True
Label: H - Probability: 0.3169
Label: H2 - Probability: 0.1724
Label: HR - Probability: 0.1184
Label: OK - Probability: 0.0159
Label: S - Probability: 0.0223
Label: S3 - Probability: 0.0109
Label: SH - Probability: 0.0412
Label: V - Probability: 0.2729
Label: V2 - Probability: 0.0291
How wonderful! but I will kill you
{'H': 0.3168570399284363, 'H2': 0.17242419719696045, 'HR': 0.1184321865439415, 'OK': 0.015861589461565018, 'S': 0.

In [28]:
# Pick CUDA when PyTorch can see a GPU, otherwise fall back to the CPU.
# NOTE(review): none of the cells visible here move the models or their
# inputs to this device - confirm whether that is intended.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

Using device: cpu


In [32]:
# Function to mask toxic words in a sentence
def mask_toxic_words(sentence, threshold=0.7):
    """
    Replace every word flagged as toxic with asterisks of the same length.

    The sentence is cut on whitespace and each word is scored on its own by
    the module-level `moderation_model` pipeline. A word whose "toxic" score
    reaches `threshold` is replaced by one asterisk per character; punctuation
    attached to a word is scored and masked together with it.

    NOTE: a later cell of this notebook defines `mask_toxic_words` again with
    the keyword `thresholdMask` instead of `threshold`; whichever cell was
    executed last is the definition in effect.

    Args:
        sentence (str): The text to censor.
        threshold (float): Minimum "toxic" score (inclusive) at which a word
            is masked.

    Returns:
        str: The sentence with flagged words masked. Whitespace between words
            (repeated spaces, tabs, newlines, leading/trailing blanks) is
            returned exactly as it was given.
    """
    # The capturing group makes re.split hand back the whitespace runs as
    # well, so joining the pieces reproduces the original spacing exactly.
    pieces = re.split(r"(\s+)", sentence)
    masked_pieces = []

    for piece in pieces:
        # Separators (and the empty strings re.split emits at the edges) are
        # copied through without spending a model call on them.
        if not piece or piece.isspace():
            masked_pieces.append(piece)
            continue

        result = moderation_model(piece)
        # result[0] is read as a list of {"label", "score"} entries; default
        # to 0 when no entry is labelled "toxic".
        toxic_score = next(
            (entry["score"] for entry in result[0] if entry["label"] == "toxic"),
            0,
        )

        if toxic_score >= threshold:
            masked_pieces.append("*" * len(piece))
        else:
            masked_pieces.append(piece)

    return "".join(masked_pieces)

In [33]:
# Spot-check the word masker; as the last expression of the cell, the
# returned string is what the notebook displays.
mask_toxic_words("Fuck my life in 2 pieces this is my last resort. Suffocation is shit but my life is dick")

'**** my life in 2 pieces this is my last resort. Suffocation is **** but my life is ****'

In [69]:
# NOTE: this cell repeats the model-loading cell near the top of the notebook
# and rebinds the same four names (model_name, tokenizer, model, moderation_model).

# Sentence-level moderation classifier (used by moderate_text) and its tokenizer
model_name = "KoalaAI/Text-Moderation"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# toxic-BERT text-classification pipeline (used by mask_toxic_words),
# asked to report the score of every label rather than only the top one
moderation_model = pipeline(
    task="text-classification",
    model="unitary/toxic-bert",
    return_all_scores=True,
)

Device set to use cpu


In [3]:
def moderate_text(text, thresholdOK=0.4, verbose=True):
    """
    Decide whether `text` is safe according to the moderation classifier.

    The text is tokenized with the module-level `tokenizer` and scored by the
    module-level `model`; the logits are converted to per-label probabilities
    with a softmax. The text is considered safe when the probability of the
    "OK" label is at least `thresholdOK`.

    Args:
        text (str): The input text to moderate.
        thresholdOK (float): Minimum probability of the "OK" label for the
            text to be considered safe.
        verbose (bool): When True (the default), print one
            "Label: <label> - Probability: <p>" line per label, in the order
            of the model's label ids.

    Returns:
        bool: True if the text is considered safe, False otherwise.

    Raises:
        KeyError: If the model's labels do not include "OK".
    """
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt")

    # Inference only, so skip gradient tracking
    with torch.no_grad():
        logits = model(**inputs).logits

    # A single text goes in, so take the first (only) row of the batch
    # explicitly instead of relying on squeeze() to drop the batch dimension.
    probabilities = logits.softmax(dim=-1)[0].tolist()

    # Pair each probability with the label of its class id
    id2label = model.config.id2label
    label_prob_pairs = [(id2label[idx], prob) for idx, prob in enumerate(probabilities)]

    if verbose:
        for label, probability in label_prob_pairs:
            print(f"Label: {label} - Probability: {probability:.4f}")

    scores = dict(label_prob_pairs)

    # A NaN probability compares False here, so it is reported as unsafe.
    return scores["OK"] >= thresholdOK

# Function to mask toxic words in a sentence
def mask_toxic_words(sentence, thresholdMask=0.7):
    """
    Replace every word flagged as toxic with asterisks of the same length.

    The sentence is cut on whitespace and each word is scored on its own by
    the module-level `moderation_model` pipeline. A word whose "toxic" score
    reaches `thresholdMask` is replaced by one asterisk per character;
    punctuation attached to a word is scored and masked together with it.

    Args:
        sentence (str): The text to censor.
        thresholdMask (float): Minimum "toxic" score (inclusive) at which a
            word is masked.

    Returns:
        str: The sentence with flagged words masked. Whitespace between words
            (repeated spaces, tabs, newlines, leading/trailing blanks) is
            returned exactly as it was given.
    """
    # The capturing group makes re.split hand back the whitespace runs as
    # well, so joining the pieces reproduces the original spacing exactly.
    pieces = re.split(r"(\s+)", sentence)
    masked_pieces = []

    for piece in pieces:
        # Separators (and the empty strings re.split emits at the edges) are
        # copied through without spending a model call on them.
        if not piece or piece.isspace():
            masked_pieces.append(piece)
            continue

        result = moderation_model(piece)
        # result[0] is read as a list of {"label", "score"} entries; default
        # to 0 when no entry is labelled "toxic".
        toxic_score = next(
            (entry["score"] for entry in result[0] if entry["label"] == "toxic"),
            0,
        )

        if toxic_score >= thresholdMask:
            masked_pieces.append("*" * len(piece))
        else:
            masked_pieces.append(piece)

    return "".join(masked_pieces)

In [4]:
def process_text_pipeline(text, moderate_thresholdOK=0.2, mask_threshold=0.7):
    """
    Moderate a text and, only if it passes, censor its toxic words.

    Args:
        text (str): The input text.
        moderate_thresholdOK (float): OK label threshold for `moderate_text` function.
        mask_threshold (float): Toxicity threshold for `mask_toxic_words` function.

    Returns:
        dict: `{"status": "unsafe", "censored_text": ""}` when `moderate_text`
            rejects the text; otherwise `{"status": "safe", "censored_text": ...}`
            where the value is the output of `mask_toxic_words`.
    """
    # Step 1: Moderate the text as a whole
    if not moderate_text(text, thresholdOK=moderate_thresholdOK):
        # Rejected text is not echoed back, not even in censored form
        return {
            "status": "unsafe",
            "censored_text": ""
        }

    # Step 2: Only text judged safe reaches this point; mask its toxic words
    return {
        "status": "safe",
        "censored_text": mask_toxic_words(text, thresholdMask=mask_threshold)
    }

In [68]:
# Run the full pipeline (moderation first, then word masking) on one sentence
# using the default thresholds of process_text_pipeline.
example_text = "You are such an idiot and a fool! but smart and crazy good, You are the best person in this world! you have the best  smile"
pipeline_result = process_text_pipeline(example_text)
print(pipeline_result)

Label: H - Probability: 0.5862
Label: H2 - Probability: 0.0364
Label: HR - Probability: 0.2002
Label: OK - Probability: 0.0666
Label: S - Probability: 0.0131
Label: S3 - Probability: 0.0078
Label: SH - Probability: 0.0167
Label: V - Probability: 0.0583
Label: V2 - Probability: 0.0147
{'status': 'unsafe', 'censored_text': ''}


In [58]:
# Sample inputs for process_text_pipeline; the last entry is a comma-separated
# word list rather than a sentence.
data=[
    "You are such an idiot and a fool!",
    "This is awesome but that message was bullshit!",
    "Don't think that I give 2 fucks about your life",
    "I love and hate my life",
    "Give up on your dreams and die",
    "Hindus are devil",
    "Muslims are evil",
    "bad, fuck, dick, suck, piss, bitch, fucker, licker, annoying, simp, lol, nerd, cheese, butter, frick"
]

# Print the pipeline's result dict for each sample, using default thresholds
for text in data:
    print(process_text_pipeline(text))

Label: H - Probability: 0.6480
Label: H2 - Probability: 0.0630
Label: HR - Probability: 0.1376
Label: OK - Probability: 0.0347
Label: S - Probability: 0.0118
Label: S3 - Probability: 0.0077
Label: SH - Probability: 0.0088
Label: V - Probability: 0.0753
Label: V2 - Probability: 0.0131
{'status': 'unsafe', 'censored_text': 'You are such an ***** and a *****'}
Label: H - Probability: 0.0033
Label: H2 - Probability: 0.0004
Label: HR - Probability: 0.0007
Label: OK - Probability: 0.9913
Label: S - Probability: 0.0006
Label: S3 - Probability: 0.0004
Label: SH - Probability: 0.0016
Label: V - Probability: 0.0010
Label: V2 - Probability: 0.0007
{'status': 'safe', 'censored_text': 'This is awesome but that message was *********'}
Label: H - Probability: 0.0211
Label: H2 - Probability: 0.0010
Label: HR - Probability: 0.0051
Label: OK - Probability: 0.9616
Label: S - Probability: 0.0019
Label: S3 - Probability: 0.0011
Label: SH - Probability: 0.0038
Label: V - Probability: 0.0032
Label: V2 - Prob