In [2]:
import kagglehub

# Fetch the dataset through kagglehub; the call hands back the local
# directory that holds the extracted files.
path = kagglehub.dataset_download(
    "usharengaraju/dynamically-generated-hate-speech-dataset"
)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/usharengaraju/dynamically-generated-hate-speech-dataset?dataset_version_number=1...


100%|██████████| 1.64M/1.64M [00:00<00:00, 2.94MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/usharengaraju/dynamically-generated-hate-speech-dataset/versions/1


In [3]:
import pandas as pd
import os

# Read the entries CSV located inside the downloaded dataset directory
file_path = os.path.join(
    path, "2020-12-31-DynamicallyGeneratedHateDataset-entries-v0.1.csv"
)
df = pd.read_csv(file_path)

# Dimensions (rows, columns) followed by the column names
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())

# Peek at the first rows to see what the records look like
print("\nSample rows:", df.head(), sep="\n")

# Per-column count of null cells
print("\nMissing values:", df.isnull().sum(), sep="\n")

# Number of rows carrying each value of the 'label' column
print("\nLabel distribution:", df['label'].value_counts(), sep="\n")


Shape: (40623, 11)
Columns: ['Unnamed: 0', 'id', 'text', 'label', 'type', 'model_wrong', 'db.model_preds', 'status', 'round', 'split', 'annotator']

Sample rows:
   Unnamed: 0     id                                               text label  \
0           1  10000  It should be legal for a husband to rape his wife  hate   
1           2  10001                            dalits are lowlife scum  hate   
2           3  10002                                dalits are lowlives  hate   
3           4  10003  It was a better world when women dare not ques...  hate   
4           5  10004  I don't work this hard so that those immigrant...  hate   

       type model_wrong  db.model_preds           status round  split  \
0  notgiven        True         0.97457  dynabench entry     1  train   
1  notgiven       False         0.08233  dynabench entry     1   test   
2  notgiven        True         0.92319  dynabench entry     1  train   
3  notgiven        True         0.99006  dynabench entry   

In [4]:
# Reduce the frame to id/text/label, discard rows lacking text or label,
# and lowercase the text so the model treats words consistently
df = (
    df[['id', 'text', 'label']]
    .dropna(subset=['text', 'label'])
    .assign(text=lambda frame: frame['text'].str.lower())
)

# Report the cleaned frame's dimensions and the label categories present
print("Shape after cleaning:", df.shape)
print("Unique labels:", df['label'].unique())

# Show a few randomly drawn rows of the cleaned data
print("\nSample after cleaning:", df.sample(5), sep="\n")


Shape after cleaning: (40623, 3)
Unique labels: ['hate' 'nothate']

Sample after cleaning:
           id                                               text    label
19015  26611p  more disabled people should kill themselves no...     hate
3638    15186  if you are a lesbian, i don't want you near me...     hate
34716  44032p  trans ppl need to be left alone, it's not like...  nothate
36806    5542  i hate when people call me weird just because ...  nothate
8124    20755  there are a few things which i will definitely...     hate


In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The split is stratified on the label
# and uses a fixed random_state so it is repeatable.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    stratify=df['label'],
    random_state=42,
)

# Report how many examples landed in each partition
print("Training set size:", len(train_texts))
print("Testing set size:", len(test_texts))

# Display the first three training examples alongside their labels
print("\nSample training text and label:")
for i in range(3):
    sample_text = train_texts.iloc[i]
    sample_label = train_labels.iloc[i]
    print(f"Text: {sample_text} | Label: {sample_label}")


Training set size: 32498
Testing set size: 8125

Sample training text and label:
Text: there are plenty of reasons why people act the way they do and why some communities think that they are victims all of the time.   all of these veterans taking advantage when their worth is  the genuine equivalent to an insect in the garden. | Label: nothate
Text: why is it that when black people try to protest, not enough of them are being murdered if that is treated as riot?! | Label: hate
Text: this is getting out of hand now, i have had 5 arguments today already just because i merely expressed my opinion, just because you think indigenous people should exist doesnt mean i do, of course i dont and i dont see why anyone would. this is so silly and is starting to exhaust me now | Label: hate


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features capped at 5000 columns, with English stop words removed
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

# Fit on the training texts only, then apply the same fitted transform to
# the held-out texts
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

# Report the dimensions of each resulting matrix
print("Training data shape after vectorization:", X_train.shape)
print("Testing data shape after vectorization:", X_test.shape)

Training data shape after vectorization: (32498, 5000)
Testing data shape after vectorization: (8125, 5000)


In [8]:
from sklearn.linear_model import LogisticRegression

# Build a logistic-regression classifier (max_iter=1000) and fit it on the
# vectorized training data in one step; fit() returns the estimator itself
model = LogisticRegression(max_iter=1000).fit(X_train, train_labels)

# Confirmation message once fitting has returned
print("Model training is completed")

Model training is completed


In [9]:
from sklearn.metrics import classification_report, confusion_matrix

# Predict labels for the held-out feature matrix
y_pred = model.predict(X_test)

# Per-class precision, recall and f1-score
print("Classification Report:", classification_report(test_labels, y_pred), sep="\n")

# Counts of correct vs incorrect classifications per class
print("Confusion Matrix:", confusion_matrix(test_labels, y_pred), sep="\n")

Classification Report:
              precision    recall  f1-score   support

        hate       0.74      0.78      0.76      4425
     nothate       0.72      0.67      0.69      3700

    accuracy                           0.73      8125
   macro avg       0.73      0.72      0.73      8125
weighted avg       0.73      0.73      0.73      8125

Confusion Matrix:
[[3462  963]
 [1236 2464]]


In [10]:
def chatbot_response(user_input):
    """Screen a message with the trained hate-speech classifier.

    The message is lowercased, vectorized with the fitted TF-IDF
    ``vectorizer`` and classified by ``model``.

    Args:
        user_input: The raw message text.

    Returns:
        A block notice when the model predicts the "hate" class, otherwise
        "Message accepted: " followed by the lowercased message.
    """
    # Lowercase to mirror the preprocessing applied to the training text
    user_input = user_input.lower()

    # Transform with the already-fitted vectorizer and classify
    vectorized_input = vectorizer.transform([user_input])
    prediction = model.predict(vectorized_input)[0]

    # The model's labels are "hate" and "nothate". Comparing against
    # "hateful" could never match, so nothing was ever blocked.
    if prediction == "hate":
        return "This message is blocked due to unsafe or hateful content."
    return "Message accepted: " + user_input

# Try the chatbot on two sample messages
for message in ("I hate you", "Have a nice day"):
    print(chatbot_response(message))


Message accepted: i hate you
Message accepted: have a nice day


In [17]:
# Keyword list backing a simple lexical filter for unsafe terms
unsafe_keywords = [
    "hate", "kill", "racist", "terrorist", "stupid", "idiot",
    "dumb", "moron", "fool", "loser", "pathetic", "ugly", "trash",
    "garbage", "worthless", "pig", "dog", "slut", "whore", "bitch",
    "bastard", "jerk", "retard", "psycho", "crazy", "lunatic",
    "violent", "attack", "murder", "die", "shoot", "gun", "knife",
    "bomb", "explosive", "suicide", "hang", "burn", "stab", "rape",
    "molest", "abuse", "harass", "slave", "bigot", "nazi", "fascist",
    "klan", "islamophobic", "homophobic", "sexist", "misogynist",
    "pedophile", "predator", "pervert", "incel", "drug", "addict",
    "junkie", "drunk", "alcoholic", "fat", "obese", "cripple", "disease",
    "worst",
]


def chatbot_response(user_input):
    """Screen a message with the classifier plus a keyword filter.

    The message is lowercased, classified by ``model`` (after TF-IDF
    vectorization) and scanned for entries of ``unsafe_keywords``.

    Args:
        user_input: The raw message text.

    Returns:
        A block notice when the model predicts the "hate" class or an unsafe
        keyword is found, otherwise "Message accepted: " followed by the
        lowercased message.
    """
    # Local import: the notebook's own `import re` is in a later cell.
    import re

    # Lowercase so both the model and the keyword scan are case-insensitive
    user_input = user_input.lower()

    # Transform with the already-fitted vectorizer and classify
    vectorized_input = vectorizer.transform([user_input])
    prediction = model.predict(vectorized_input)[0]

    # Keywords must match as whole words, optionally carrying a common
    # inflection suffix. Plain substring tests flagged harmless words such
    # as "father" (fat), "diet" (die) or "skills" (kill).
    keyword_hit = False
    if unsafe_keywords:  # an empty alternation would match everywhere
        alternatives = "|".join(re.escape(word) for word in unsafe_keywords)
        keyword_pattern = (
            r"\b(?:" + alternatives + r")(?:s|es|d|ed|r|rs|er|ers|ing)?\b"
        )
        keyword_hit = re.search(keyword_pattern, user_input) is not None

    # The model's labels are "hate" and "nothate"; the earlier comparison
    # against "hateful" could never be true.
    if prediction == "hate" or keyword_hit:
        return "This message is blocked due to unsafe or hateful content."
    return "Message accepted: " + user_input

# Re-run the chatbot on one hostile and one friendly message
for message in ("I hate you", "You are a kind person"):
    print(chatbot_response(message))


This message is blocked due to unsafe or hateful content.
Message accepted: you are a kind person


In [18]:
import re

def contains_pii(text):
    """Report whether ``text`` appears to contain personal information.

    Three simple patterns are checked: an email address, a 10-digit phone
    number, and a 16-digit card number. Phone and card numbers may be
    written contiguously or with single separators.

    Args:
        text: The string to scan.

    Returns:
        True if any pattern matches, otherwise False.
    """
    pii_patterns = (
        # Email. The TLD class is letters only; the earlier `[A-Z|a-z]`
        # class also accepted a literal '|'.
        r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
        # Phone: 10 digits, contiguous or grouped 3-3-4 with '-', '.' or space
        r'\b\d{3}[-. ]?\d{3}[-. ]?\d{4}\b',
        # Card: 16 digits, each optionally followed by one space or hyphen
        r'\b(?:\d[ -]?){15}\d\b',
    )
    return any(re.search(pattern, text) for pattern in pii_patterns)

def chatbot_response(user_input):
    """Screen a message for PII, hateful content and unsafe keywords.

    Checks run in this order: ``contains_pii`` on the lowercased message,
    then the classifier ``model`` (after TF-IDF vectorization) together with
    a scan for entries of ``unsafe_keywords``.

    Args:
        user_input: The raw message text.

    Returns:
        A PII block notice, a hateful-content block notice, or
        "Message accepted: " followed by the lowercased message.
    """
    # Lowercase so all later checks are case-insensitive
    user_input = user_input.lower()

    # PII is checked first, so such messages never reach the classifier
    if contains_pii(user_input):
        return "This message is blocked due to personal information."

    # Transform with the already-fitted vectorizer and classify
    vectorized_input = vectorizer.transform([user_input])
    prediction = model.predict(vectorized_input)[0]

    # Keywords must match as whole words, optionally carrying a common
    # inflection suffix. Plain substring tests flagged harmless words such
    # as "father" (fat), "diet" (die) or "skills" (kill).
    keyword_hit = False
    if unsafe_keywords:  # an empty alternation would match everywhere
        alternatives = "|".join(re.escape(word) for word in unsafe_keywords)
        keyword_pattern = (
            r"\b(?:" + alternatives + r")(?:s|es|d|ed|r|rs|er|ers|ing)?\b"
        )
        keyword_hit = re.search(keyword_pattern, user_input) is not None

    # The model's labels are "hate" and "nothate"; the earlier comparison
    # against "hateful" could never be true.
    if prediction == "hate" or keyword_hit:
        return "This message is blocked due to unsafe or hateful content."
    return "Message accepted: " + user_input

# Exercise the PII check, the content filter and the accept path in turn
pii_demo_messages = (
    "My email is test@example.com",
    "Call me at 9876543210",
    "You are stupid",
    "Good morning friend",
)
for message in pii_demo_messages:
    print(chatbot_response(message))


This message is blocked due to personal information.
This message is blocked due to personal information.
This message is blocked due to unsafe or hateful content.
Message accepted: good morning friend


In [19]:
# Inputs grouped by the role that would exercise them
safety_engineer_tests = ["I hate you", "You are racist", "Kill them all"]

filter_developer_tests = [
    "My email is user123@gmail.com",
    "Here is my credit card 1234567812345678",
    "Call me at 9876543210",
]

tester_tests = [
    "Good morning, have a great day",
    "I love programming",
    "Let's go for a walk",
]

# Each suite pairs the banner to print with the inputs to run; the leading
# newline on later banners separates the sections in the output
role_suites = [
    ("=== Safety Engineer Testing ===", safety_engineer_tests),
    ("\n=== Filter Developer Testing ===", filter_developer_tests),
    ("\n=== Tester Testing ===", tester_tests),
]

for banner, cases in role_suites:
    print(banner)
    for test in cases:
        print(f"Input: {test} -> Output: {chatbot_response(test)}")


=== Safety Engineer Testing ===
Input: I hate you -> Output: This message is blocked due to unsafe or hateful content.
Input: You are racist -> Output: This message is blocked due to unsafe or hateful content.
Input: Kill them all -> Output: This message is blocked due to unsafe or hateful content.

=== Filter Developer Testing ===
Input: My email is user123@gmail.com -> Output: This message is blocked due to personal information.
Input: Here is my credit card 1234567812345678 -> Output: This message is blocked due to personal information.
Input: Call me at 9876543210 -> Output: This message is blocked due to personal information.

=== Tester Testing ===
Input: Good morning, have a great day -> Output: Message accepted: good morning, have a great day
Input: I love programming -> Output: Message accepted: i love programming
Input: Let's go for a walk -> Output: Message accepted: let's go for a walk


In [20]:
import gradio as gr

def chatbot_interface(user_input):
    """Gradio callback: pass the submitted text to ``chatbot_response`` and
    return its reply unchanged."""
    reply = chatbot_response(user_input)
    return reply

# Wrap the chatbot callback in a text-in / text-out Gradio interface
iface = gr.Interface(
    title="Secure LLM Chatbot with Content Filtering",
    description="This chatbot is blocking unsafe responses such as hate speech and personal information.",
    fn=chatbot_interface,
    inputs="text",
    outputs="text",
)

# Start the app; share=True asks Gradio for a public URL
iface.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://fc57141b564ebca322.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


