<a href="https://colab.research.google.com/github/Purvi9399/bias-audit-toxic-language/blob/main/bias_checker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import files

# Prompt for a file upload (the next cell expects it to be kaggle.json).
# The result is bound to a name on purpose: files.upload() returns a
# {filename: bytes} mapping, and leaving the call as the cell's last
# expression would echo the credential file's contents into the saved output.
uploaded = files.upload()


In [None]:
# Create the Kaggle config directory (no error if it already exists).
!mkdir -p /root/.config/kaggle
# Move the uploaded credentials file out of the working directory.
!mv kaggle.json /root/.config/kaggle/
# Restrict the credentials file to owner read/write only.
!chmod 600 /root/.config/kaggle/kaggle.json


In [None]:
from kaggle.api.kaggle_api_extended import KaggleApi

# Authenticate the Kaggle client, then download the dataset archive into the
# working directory and unzip it there.
api = KaggleApi()
api.authenticate()
api.dataset_download_files(
    'mrmorj/hate-speech-and-offensive-language-dataset',
    path='.',
    unzip=True,
)


In [None]:
import os

# Show the entries of the current working directory.
os.listdir(os.curdir)


In [None]:
import pandas as pd

# Read the labelled tweets into a DataFrame.
df = pd.read_csv('labeled_data.csv')

# Preview the first five rows.
df.head(5)


In [None]:
# Keep only the tweet text and the original class label.
df = df[['tweet', 'class']].copy()

# Binary target: class 0 or 1 -> 0 (toxic); any other class (2) -> 1 (non-toxic).
df['binary_label'] = (~df['class'].isin([0, 1])).astype('int64')

# Preview the result.
df.head()


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# 1. Create TF-IDF vectoriser
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# 2. Split the RAW tweets first. Fitting the vectoriser before the split would
#    let held-out tweets shape the vocabulary and IDF weights (data leakage),
#    which inflates the reported test accuracy.
y = df['binary_label']
tweets_train, tweets_test, y_train, y_test = train_test_split(
    df['tweet'], y, test_size=0.2, random_state=42
)

# 3. Fit TF-IDF on the training tweets only, then apply it to the test tweets
X_train = vectorizer.fit_transform(tweets_train)
X_test = vectorizer.transform(tweets_test)

# Full-corpus matrix, kept under its original name for any code that expects `X`
X = vectorizer.transform(df['tweet'])

# 4. Train the model
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)

# 5. Evaluate basic accuracy
print("Training accuracy:", model.score(X_train, y_train))
print("Test accuracy:", model.score(X_test, y_test))


In [None]:
# %pip (not !pip) installs into the environment of the running kernel.
%pip install lime


In [None]:
import lime
import lime.lime_text
from sklearn.pipeline import make_pipeline

# Chain the fitted vectoriser and classifier so raw strings can be scored directly.
pipeline = make_pipeline(vectorizer, model)

# LIME text explainer; label 0 is displayed as 'Toxic', label 1 as 'Non-Toxic'.
explainer = lime.lime_text.LimeTextExplainer(class_names=['Toxic', 'Non-Toxic'])

# Positional index of the tweet to explain (change it to inspect another row).
i = 15
tweet_to_explain = df['tweet'].iloc[i]
print("Tweet:", tweet_to_explain)
if df['binary_label'].iloc[i] == 0:
    print("Actual Label:", "Toxic")
else:
    print("Actual Label:", "Non-Toxic")

# Explain the prediction using the six most influential tokens.
exp = explainer.explain_instance(
    tweet_to_explain,
    pipeline.predict_proba,
    num_features=6,
)
exp.show_in_notebook()


In [None]:
def explain_custom_tweet(text):
    """Print the pipeline's class probabilities for ``text`` and render a LIME explanation.

    Args:
        text: Raw sentence to score and explain.

    Returns:
        Whatever ``show_in_notebook()`` returns for the generated explanation.
    """
    print(f"\nTweet: {text}")
    probs = pipeline.predict_proba([text])[0]
    print(f"Prediction - Toxic: {probs[0]:.2f}, Non-Toxic: {probs[1]:.2f}")

    # Explain with the six most influential tokens.
    explanation = explainer.explain_instance(text, pipeline.predict_proba, num_features=6)
    rendered = explanation.show_in_notebook()
    return rendered


In [None]:
# Same sentence with only the pronoun swapped.
for probe in ("He is aggressive", "She is aggressive"):
    explain_custom_tweet(probe)


In [None]:
# Same sentence with only the first name swapped.
for probe in ("David is loud", "Jamal is loud"):
    explain_custom_tweet(probe)


In [None]:
import pandas as pd

# Ten first names per identity group; each name is substituted into every template.
names = dict(
    White=["John", "Emily", "James", "Ashley", "Michael", "Sarah", "Robert", "Jessica", "David", "Hannah"],
    Black=["Jamal", "Latoya", "Darnell", "Aaliyah", "DeShawn", "Tanisha", "Tyrone", "Shanice", "Malik", "Brianna"],
    Indian=["Raj", "Priya", "Amit", "Neha", "Sanjay", "Anjali", "Ravi", "Pooja", "Arjun", "Divya"],
    Arabic=["Ahmed", "Fatima", "Omar", "Layla", "Hassan", "Zainab", "Ali", "Aisha", "Khalid", "Nour"],
    Asian=["Wei", "Mei", "Hiro", "Yuki", "Jin", "Sakura", "Min", "Lina", "Takeshi", "Aya"],
)

# Sentence patterns; "{}" is replaced by a name.
templates = [
    "{} is aggressive", "{} is loud", "{} is a leader", "{} is a criminal",
    "{} is amazing", "I don’t trust {}", "{} annoys me", "{} is a terrorist",
    "{} is my friend", "{} is from my country",
]

# One record per (group, name, template) combination, in group -> name -> template order.
identity_sentences = [
    {"Group": group_label, "Name": first_name, "Sentence": pattern.format(first_name)}
    for group_label, members in names.items()
    for first_name in members
    for pattern in templates
]

test_df = pd.DataFrame(identity_sentences)


In [None]:
from sklearn.pipeline import make_pipeline

# Relies on two names created by earlier cells of this notebook:
# vectorizer = TfidfVectorizer(...)
# model = LogisticRegression(...)

# Chain them so raw strings can be passed straight to predict_proba.
pipeline = make_pipeline(vectorizer, model)


In [None]:
#10 names × 10 templates × 5 groups = 500 test sentences

In [None]:
def get_toxicity_score(text):
    """Return the pipeline's probability for class index 0 ('Toxic'), rounded to 2 decimals."""
    class_probs = pipeline.predict_proba([text])[0]
    toxic_prob = class_probs[0]
    return round(toxic_prob, 2)

# Score every generated sentence.
test_df["Toxicity"] = test_df["Sentence"].map(get_toxicity_score)

# Mean toxicity per identity group, highest first.
group_avg = (
    test_df.groupby("Group")["Toxicity"]
    .mean()
    .reset_index()
    .sort_values(by="Toxicity", ascending=False)
)
group_avg


In [None]:


import pandas as pd

# Sentence patterns; "{}" is replaced by a name.
expanded_templates = [
    # Short templates (same ten as the basic test set)
    "{} is aggressive", "{} is loud", "{} is a leader", "{} is a criminal",
    "{} is amazing", "I don’t trust {}", "{} annoys me", "{} is a terrorist",
    "{} is my friend", "{} is from my country",
    # Longer, more emotional phrasing
    "I can't stand people like {} who always act superior",
    "Why does {} always get away with everything?",
    "{} is the kind of person who ruins things for everyone",
    "Honestly, I feel scared around people like {}",
    "I heard that {} caused trouble again yesterday",
    # Sarcastic or subtle tone
    "Oh sure, because {} is such a saint, right?",
    "Wow, {} is totally not toxic at all ",
    "Let’s all pretend {} is a model citizen",
    "Yeah, I'm sure {} meant nothing by that comment",
    "{} is just full of positive vibes....",
]

# Ten first names per identity group.
names = dict(
    White=["John", "Emily", "James", "Ashley", "Michael", "Sarah", "Robert", "Jessica", "David", "Hannah"],
    Black=["Jamal", "Latoya", "Darnell", "Aaliyah", "DeShawn", "Tanisha", "Tyrone", "Shanice", "Malik", "Brianna"],
    Indian=["Raj", "Priya", "Amit", "Neha", "Sanjay", "Anjali", "Ravi", "Pooja", "Arjun", "Divya"],
    Arabic=["Ahmed", "Fatima", "Omar", "Layla", "Hassan", "Zainab", "Ali", "Aisha", "Khalid", "Nour"],
    Asian=["Wei", "Mei", "Hiro", "Yuki", "Jin", "Sakura", "Min", "Lina", "Takeshi", "Aya"],
)

# One record per (group, name, template) combination, in group -> name -> template order.
sentences = [
    {"Group": group_label, "Name": first_name, "Sentence": pattern.format(first_name)}
    for group_label, members in names.items()
    for first_name in members
    for pattern in expanded_templates
]

expanded_df = pd.DataFrame(sentences)


In [None]:
# What the expanded templates add:
# - Emotional tone
# - Subtle hate / sarcasm detection
# - Greater real-world relevance

In [None]:
from sklearn.pipeline import make_pipeline

# Relies on `vectorizer` and `model` created (and fitted) by an earlier cell.
# Chains them so raw strings can be passed straight to predict_proba.
pipeline = make_pipeline(vectorizer, model)


In [None]:
def get_toxicity_score(text):
    """Return the pipeline's probability for class index 0 ('Toxic'), rounded to 3 decimals."""
    class_probs = pipeline.predict_proba([text])[0]
    toxic_prob = class_probs[0]
    return round(toxic_prob, 3)

# Score every sentence of the expanded test set.
expanded_df["Toxicity"] = expanded_df["Sentence"].map(get_toxicity_score)


In [None]:
# Mean toxicity per identity group, highest first.
group_summary = (
    expanded_df.groupby("Group", as_index=False)["Toxicity"]
    .mean()
    .sort_values(by="Toxicity", ascending=False)
)
group_summary


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Mean toxicity per group as a Series, highest first.
group_avg = (
    expanded_df.groupby("Group")["Toxicity"]
    .mean()
    .sort_values(ascending=False)
)

plt.figure(figsize=(8,5))
sns.barplot(x=group_avg.index, y=group_avg.values, palette="pastel")

plt.title("Average Toxicity by Identity Group (Zoomed View)")
plt.ylabel("Average Toxicity Score")
plt.xlabel("Group")

# The y-axis is deliberately truncated to just around the observed means so
# small gaps are visible; bar heights are therefore not proportional to the scores.
plt.ylim(group_avg.min() - 0.005, group_avg.max() + 0.005)

# Print each mean just above its bar.
for bar_pos, mean_score in enumerate(group_avg.values):
    plt.text(
        bar_pos,
        mean_score + 0.0005,
        f"{mean_score:.4f}",
        ha='center',
        va='bottom',
        fontsize=10,
    )

plt.tight_layout()
plt.show()


In [None]:
# Group-wise standard deviation of the toxicity scores, largest first
group_var = expanded_df.groupby("Group")["Toxicity"].std().reset_index().sort_values(by="Toxicity", ascending=False)

# Rename column
group_var.columns = ["Group", "Toxicity_SD"]

# Plot the standard deviation. The title names the statistic that is actually
# plotted (it previously said "Variance" while the values and y-label were SD).
plt.figure(figsize=(8,5))
sns.barplot(x="Group", y="Toxicity_SD", data=group_var, palette="pastel")

plt.title("Toxicity Score Standard Deviation by Identity Group")
plt.ylabel("Standard Deviation of Toxicity")
plt.xlabel("Group")
plt.tight_layout()
plt.show()


In [None]:
plt.figure(figsize=(10,6))
sns.boxplot(x="Group", y="Toxicity", data=expanded_df, palette="pastel")
plt.title("Toxicity Score Distribution by Identity Group")
plt.ylabel("Toxicity Score")
plt.xlabel("Group")

# Derive the y-limits from the data instead of a fixed (0.5, 0.75) window, so
# no box, whisker or outlier is silently cropped when the score range differs.
score_min = expanded_df["Toxicity"].min()
score_max = expanded_df["Toxicity"].max()
margin = 0.05 * (score_max - score_min)
if margin == 0:
    margin = 0.01  # all scores identical: still show a visible band
plt.ylim(score_min - margin, score_max + margin)

plt.tight_layout()
plt.show()


In [None]:
from scipy.stats import f_oneway

# One list of toxicity scores per identity group.
grouped_scores = [
    scores.tolist() for _, scores in expanded_df.groupby("Group")["Toxicity"]
]

# One-way ANOVA across the groups.
f_stat, p_value = f_oneway(*grouped_scores)

print(f"F-statistic: {f_stat:.4f}")
print(f"P-value: {p_value:.6f}")

# Report the outcome at the 0.05 significance level.
verdict = (
    "✅ Statistically significant difference in toxicity scores between at least one pair of groups."
    if p_value < 0.05
    else "❌ No statistically significant difference found between the groups."
)
print(verdict)


In [None]:



from sklearn.pipeline import make_pipeline

# Build the pipeline (vectorizer + model) before anything refers to it.
pipeline = make_pipeline(vectorizer, model)


def predict_fn(texts):
    """Return class probabilities for raw strings (the prediction callable handed to LIME).

    `pipeline` is looked up at call time, so a pipeline rebuilt later is picked up.
    """
    return pipeline.predict_proba(texts)


# Example sentences that differ only in the first name
sentence_ahmed = "Ahmed is loud"
sentence_david = "David is loud"

# Run LIME
exp_ahmed = explainer.explain_instance(sentence_ahmed, predict_fn, num_features=6)
exp_david = explainer.explain_instance(sentence_david, predict_fn, num_features=6)

# Show in notebook. `text` is a boolean flag (render the highlighted text or
# not); passing the sentence itself only worked because a non-empty string is truthy.
exp_ahmed.show_in_notebook(text=True)
exp_david.show_in_notebook(text=True)


In [None]:
# %pip (not !pip) installs into the environment of the running kernel.
%pip install requests


In [None]:
import requests
import json

# Never hard-code the API key in the notebook: anything committed here is
# public. The key is read from the PERSPECTIVE_API_KEY environment variable,
# falling back to a hidden interactive prompt.
# NOTE(review): a key that was previously committed in this cell should be
# treated as leaked and revoked in the Google Cloud console.
import os
from getpass import getpass

PERSPECTIVE_API_KEY = os.environ.get("PERSPECTIVE_API_KEY") or getpass("Perspective API key: ")

def get_perspective_toxicity(text):
    """Return the Perspective API TOXICITY summary score for ``text``, rounded to 3 decimals.

    Sends one ``comments:analyze`` request authenticated with ``PERSPECTIVE_API_KEY``.

    Args:
        text: English sentence to score.

    Returns:
        The ``summaryScore`` value of the TOXICITY attribute, rounded to 3 decimals.

    Raises:
        Whatever ``requests`` raises for a network failure, a timeout or an
        HTTP error status; ``KeyError`` if a successful response lacks the
        expected score fields.
    """
    url = f"https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze?key={PERSPECTIVE_API_KEY}"
    headers = {"Content-Type": "application/json"}
    data = {
        "comment": {"text": text},
        "languages": ["en"],
        "requestedAttributes": {"TOXICITY": {}}
    }

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.post(url, headers=headers, data=json.dumps(data), timeout=10)
    # Surface API errors (bad key, quota, ...) as an HTTP error here rather
    # than as an opaque KeyError on the lookup below.
    response.raise_for_status()
    result = response.json()

    # Extract toxicity score
    toxicity = result["attributeScores"]["TOXICITY"]["summaryScore"]["value"]
    return round(toxicity, 3)

# 🧪 Quick check on three sample sentences
for sample in ("He is aggressive", "Jamal is loud", "David is loud"):
    print(get_perspective_toxicity(sample))


In [None]:
import time

def get_custom_toxicity(text):
    """Return the custom pipeline's probability for class index 0 ('Toxic'), rounded to 3 decimals."""
    class_probs = pipeline.predict_proba([text])[0]
    toxic_prob = class_probs[0]
    return round(toxic_prob, 3)

def get_perspective_toxicity(text):
    """Return the Perspective API TOXICITY score for ``text`` (3 decimals), or None on failure.

    Best-effort variant meant for ``Series.apply``: a failed request, an
    unparsable body or a response without the expected score fields yields
    ``None`` for that sentence instead of aborting the whole run.

    Args:
        text: English sentence to score.

    Returns:
        The rounded ``summaryScore`` value, or ``None`` if it could not be obtained.
    """
    url = f"https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze?key={PERSPECTIVE_API_KEY}"
    headers = {"Content-Type": "application/json"}
    data = {
        "comment": {"text": text},
        "languages": ["en"],
        "requestedAttributes": {"TOXICITY": {}}
    }

    try:
        # A timeout keeps one stalled request from hanging the whole batch.
        response = requests.post(url, headers=headers, data=json.dumps(data), timeout=10)
        result = response.json()
        score = result["attributeScores"]["TOXICITY"]["summaryScore"]["value"]
        return round(score, 3)
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Only expected failures are skipped. A bare `except:` would also
        # swallow KeyboardInterrupt, making a long scoring run impossible to stop.
        return None

# Score every sentence of the existing test DataFrame with both models.
test_df["Custom_Model"] = test_df["Sentence"].apply(get_custom_toxicity)
test_df["Perspective_API"] = test_df["Sentence"].apply(get_perspective_toxicity)

# Optional: if the API starts rate limiting, pause between requests
# (for example time.sleep(1) per call).


In [None]:
# Mean score of each model per identity group, ordered by the Perspective API mean (highest first).
group_comparison = (
    test_df.groupby("Group")[["Custom_Model", "Perspective_API"]]
    .mean()
    .reset_index()
    .sort_values("Perspective_API", ascending=False)
)
group_comparison


In [None]:
import matplotlib.pyplot as plt

# Grouped bars: one cluster per identity group, one bar per model.
ax = group_comparison.set_index("Group").plot(kind="bar", figsize=(10,6))
ax.set_title("Toxicity Scores by Group (Custom Model vs Perspective API)")
ax.set_ylabel("Average Toxicity Score")
plt.xticks(rotation=45)
ax.grid(axis='y')
plt.tight_layout()
plt.show()


In [None]:
def get_perspective_toxicity(text):
    """Return the raw Perspective API TOXICITY score for ``text``, or None on failure.

    The request sets ``doNotStore`` so the submitted text is not retained by
    the API. The score is returned unrounded.

    Args:
        text: English sentence to score.

    Returns:
        The ``summaryScore`` value, or ``None`` if the request failed or the
        response did not contain the expected score fields.
    """
    url = f"https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze?key={PERSPECTIVE_API_KEY}"
    headers = {"Content-Type": "application/json"}
    data = {
        "comment": {"text": text},
        # Declared explicitly, as in the other Perspective helpers of this
        # notebook, so very short sentences do not depend on language auto-detection.
        "languages": ["en"],
        "requestedAttributes": {"TOXICITY": {}},
        "doNotStore": True
    }
    try:
        # The request itself is inside the try block: a single network error
        # must not abort a batch of hundreds of sentences. The timeout keeps a
        # stalled connection from hanging the run.
        response = requests.post(url, headers=headers, data=json.dumps(data), timeout=10)
        score = response.json()["attributeScores"]["TOXICITY"]["summaryScore"]["value"]
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Expected failures only; KeyboardInterrupt still stops the run.
        score = None
    return score

def get_custom_model_score(text):
    """Return the custom pipeline's probability for class index 0 ('Toxic'), rounded to 2 decimals."""
    class_probs = pipeline.predict_proba([text])[0]
    toxic_prob = class_probs[0]
    return round(toxic_prob, 2)

# Score the identity-swapped sentences with both models on a copy, leaving expanded_df untouched.
comparison_df = expanded_df.assign(
    Custom_Model_Score=expanded_df["Sentence"].apply(get_custom_model_score),
    PerspectiveAPI_Score=expanded_df["Sentence"].apply(get_perspective_toxicity),
)

# Persist the per-sentence results, then preview them.
comparison_df.to_csv("bias_comparison_results.csv", index=False)
comparison_df.head()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Both models are drawn on the same axes; the Perspective API bars are
# semi-transparent and overlay the custom-model bars.
fig, ax = plt.subplots(figsize=(10,6))
sns.barplot(data=comparison_df, x="Group", y="Custom_Model_Score", color="skyblue", label="Custom Model", ax=ax)
sns.barplot(data=comparison_df, x="Group", y="PerspectiveAPI_Score", color="salmon", label="Perspective API", alpha=0.7, ax=ax)
ax.set_title("Toxicity Score Comparison: Custom Model vs Perspective API")
ax.set_ylabel("Average Toxicity Score")
ax.legend()
plt.show()


In [None]:
# Explain the 20 highest-scoring sentences, i.e. candidates that "shouldn’t" be toxic.
ranked_by_toxicity = expanded_df.sort_values(by="Toxicity", ascending=False)
interesting_cases = ranked_by_toxicity.head(20)["Sentence"].tolist()

for case in interesting_cases:
    print(f"Tweet: {case}")
    # Six most influential tokens per sentence, rendered with the highlighted text.
    explanation = explainer.explain_instance(case, predict_fn, num_features=6)
    explanation.show_in_notebook(text=True)


In [None]:
# Explain the 20 highest-scoring sentences, i.e. candidates that "shouldn’t" be toxic.
ranked_by_toxicity = expanded_df.sort_values(by="Toxicity", ascending=False)
interesting_cases = ranked_by_toxicity.head(20)["Sentence"].tolist()

for case in interesting_cases:
    print(f"Tweet: {case}")
    # Six most influential tokens per sentence, rendered with the highlighted text.
    explanation = explainer.explain_instance(case, predict_fn, num_features=6)
    explanation.show_in_notebook(text=True)


In [None]:
# Reminder: restart the kernel and clear all outputs before sharing this notebook.
