In [11]:
!pip install groq



In [12]:
import os
import pandas as pd
import random
from typing import Dict, List
from groq import Groq

In [13]:
from kaggle_secrets import UserSecretsClient

# Look the key up through the secrets client instead of hard-coding it in the notebook.
GROQ_API_KEY = (
    UserSecretsClient()
    .get_secret("GROQ_API_KEY")
)

In [14]:
# Model used whenever a caller does not pass one explicitly
DEFAULT_MODEL = "llama-3.1-70b-versatile"

# Publish the key through the process environment, then build the client with
# no explicit arguments (the Groq SDK falls back to the GROQ_API_KEY variable).
os.environ["GROQ_API_KEY"] = GROQ_API_KEY
client = Groq()

In [15]:
# Chat-message helpers
def assistant(content: str):
    """Wrap `content` in a chat message dict carrying the "assistant" role."""
    return dict(role="assistant", content=content)

def user(content: str):
    """Wrap `content` in a chat message dict carrying the "user" role."""
    return dict(role="user", content=content)

def chat_completion(
    messages: List[Dict],
    model=DEFAULT_MODEL,
    temperature: float = 0.7,
    top_p: float = 0.9,
) -> str:
    """Request a chat completion from the module-level `client` and return its text.

    Args:
        messages: Chat messages (role/content dicts) forwarded unchanged.
        model: Model identifier forwarded to the API; defaults to DEFAULT_MODEL.
        temperature: Sampling temperature forwarded to the API.
        top_p: Nucleus-sampling value forwarded to the API.

    Returns:
        The `message.content` of the first choice in the response.
    """
    request_kwargs = dict(
        messages=messages,
        model=model,
        temperature=temperature,
        top_p=top_p,
    )
    completion = client.chat.completions.create(**request_kwargs)
    first_choice = completion.choices[0]
    return first_choice.message.content

def generate_tweet(prompt: str) -> str:
    """Send `prompt` as a single user message and return the completion text."""
    conversation = [user(prompt)]
    return chat_completion(conversation)

## Data Generation 1

In [None]:
# Define prompts for each category.
# Keys of `cyberbullying_prompts` are the category names; values are the full
# instruction text sent to the model. Every prompt here already ends with the
# "Return only the tweet, no extra texts." instruction.
cyberbullying_prompts = {
    "gender": "Generate a fictional tweet where someone is being negatively targeted due to their gender. This could include offensive stereotypes, assumptions, or exclusionary language. Return only the tweet, no extra texts.",
    "ethnicity": "Generate a fictional tweet where someone is being negatively targeted due to their ethnicity. This could include offensive stereotypes, assumptions, or exclusionary language. Return only the tweet, no extra texts.",
    "religion": "Generate a fictional tweet where someone is being negatively targeted due to their religion. This could include offensive stereotypes, assumptions, or exclusionary language. Return only the tweet, no extra texts.",
    "age": "Generate a fictional tweet where someone is being negatively targeted due to their age. This could include offensive stereotypes, assumptions, or exclusionary language. Return only the tweet, no extra texts.",
    "personal": "Generate a fictional tweet where someone is personally attacked with harsh and offensive language aimed at their appearance, intelligence, or behavior. Avoid references to gender, ethnicity, religion, or age. Return only the tweet, no extra texts.",
    "other": "Generate a fictional tweet with cyberbullying content, targeting someone with offensive stereotypes or exclusionary language for any other reason besides gender, ethnicity, religion, or age. Return only the tweet, no extra texts.",
}

# Single prompt used for the neutral (non-cyberbullying) class.
non_cyberbullying_prompt = "Generate a fictional tweet discussing a social topic, but without any offensive language or targeted harassment. Make it neutral and respectful. Return only the tweet, no extra texts."


In [None]:
# Generate `num_cyberbullying_per_category` tweets for every entry in
# `cyberbullying_prompts` (20 per category with the current setting).
# Each record is a dict with the raw model output under "tweet" and a string "label".
data = []
num_cyberbullying_per_category = 20

for category, prompt in cyberbullying_prompts.items():
    for _ in range(num_cyberbullying_per_category):
        tweet = generate_tweet(prompt)
        data.append({"tweet": tweet, "label": category})  # Use category as label

# Generate `num_non_cyberbullying` (80) non-cyberbullying tweets from the single neutral prompt
num_non_cyberbullying = 80
for _ in range(num_non_cyberbullying):
    tweet = generate_tweet(non_cyberbullying_prompt)
    data.append({"tweet": tweet, "label": "non-cyberbullying"})  # Label for non-cyberbullying

In [6]:
# Show full cell contents when displaying frames (None = no truncation; set an int to cap the width)
pd.options.display.max_colwidth = None

In [None]:
# Turn the list of generated records into a DataFrame and preview the first rows
df = pd.DataFrame(data=data)
df.head(n=5)

In [None]:
# Report the dataset size and how many tweets fall under each label
label_counts = df["label"].value_counts()
total_data_points = df.shape[0]

print("Total Data Points:", total_data_points)
print("\nCount of Tweets by Label:")
print(label_counts)

In [None]:
# Shuffle every row with a fixed seed, then renumber the index from 0
shuffled_df = (
    df
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Preview the first few shuffled rows
shuffled_df.head(n=5)

In [None]:
# First 10 shuffled rows whose label is the string 'non-cyberbullying'
# (labels in this frame are category names, not 0/1 flags).
non_cyberbullying_examples = shuffled_df[shuffled_df['label'] == 'non-cyberbullying'].head(10)  # 10 non-cyberbullying tweets
non_cyberbullying_examples

In [None]:
# First 10 shuffled rows whose label is anything other than 'non-cyberbullying',
# i.e. rows from any of the cyberbullying categories.
cyberbullying_examples = shuffled_df[shuffled_df['label'] != 'non-cyberbullying'].head(10)  # 10 cyberbullying tweets (any category)
cyberbullying_examples

In [None]:
# Persist the (unshuffled) dataset as CSV in the Kaggle working directory, without the row index
output_path = "/kaggle/working/cyberbullying_dataset.csv"
df.to_csv(path_or_buf=output_path, index=False)
print(f"Data generation complete. Saved to {output_path}.")

## Data Generation 2

### Tasks

1. More categories of cyberbullying.
2. Try using different prompts.
3. More datapoints
4. Use of profanity
5. Better model?
   

In [16]:
# Load the profanity word list CSV from the attached Kaggle input and display it
path = "/kaggle/input/the-obscenity-list/profanity_en.csv"
profinity_df = pd.read_csv(filepath_or_buffer=path)
profinity_df

Unnamed: 0,text,canonical_form_1,canonical_form_2,canonical_form_3,category_1,category_2,category_3,severity_rating,severity_description
0,69,69,,,sexual anatomy / sexual acts,,,1.0,Mild
1,@55,ass,,,sexual anatomy / sexual acts,,,1.0,Mild
2,@ssfcker,fuck,ass,,sexual anatomy / sexual acts,sexual orientation / gender,,2.8,Severe
3,@ssfucker,fuck,ass,,sexual anatomy / sexual acts,sexual orientation / gender,,2.8,Severe
4,@ssfvcker,fuck,ass,,sexual anatomy / sexual acts,sexual orientation / gender,,2.4,Strong
...,...,...,...,...,...,...,...,...,...
1593,wnker,wank,,,sexual anatomy / sexual acts,other / general insult,,1.0,Mild
1594,wop,wop,,,racial / ethnic slurs,,,2.4,Strong
1595,wophead,wop,,,racial / ethnic slurs,,,2.2,Strong
1596,zip in the wire,zipperhead,,,racial / ethnic slurs,,,1.4,Mild


In [8]:
# Preview the first five rows of the profanity list
profinity_df.head(n=5)

Unnamed: 0,text,canonical_form_1,canonical_form_2,canonical_form_3,category_1,category_2,category_3,severity_rating,severity_description
0,69,69,,,sexual anatomy / sexual acts,,,1.0,Mild
1,@55,ass,,,sexual anatomy / sexual acts,,,1.0,Mild
2,@ssfcker,fuck,ass,,sexual anatomy / sexual acts,sexual orientation / gender,,2.8,Severe
3,@ssfucker,fuck,ass,,sexual anatomy / sexual acts,sexual orientation / gender,,2.8,Severe
4,@ssfvcker,fuck,ass,,sexual anatomy / sexual acts,sexual orientation / gender,,2.4,Strong


In [None]:
# Distinct values found in the primary category column
profinity_df["category_1"].unique()

In [19]:
# Second-round prompt sets.
# NOTE: this rebinds `cyberbullying_prompts`, replacing the six-entry dict defined
# for Data Generation 1. Unlike those prompts, the strings below do not contain a
# "Return only the tweet" instruction themselves.
cyberbullying_prompts = {
    # Categories introduced in this round
    "political and cultural hostility": "Write a fictional tweet that fiercely criticizes a specific political stance, suggesting it is dangerous.",
    "exclusion and ostracism": "Write a fictional tweet that criticizes a person indirectly, implying disapproval without naming them.",
    "disinformation and misinformation": "Create a fictional tweet that spreads a false rumor about a public figure, presenting it as if it's true.",
    "fan-based bullying": "Write a fictional tweet attacking the fans of a rival celebrity, suggesting their fandom is misplaced or naive.",
    "sexual harassment and inappropriate content": "Create a fictional tweet that makes sexualized comments about someone's appearance, using suggestive language.",
    "cyberstalking": "Draft a fictional tweet repeatedly tagging someone while referencing their recent activities in a way that feels invasive.",
    "identity theft and impersonation": "Compose a fictional tweet from an account impersonating a public figure, intending to spread misinformation.",
    "offensive humor and meme-based bullying": "Generate a fictional tweet using a meme to mock someone’s appearance or actions in a cruel or demeaning way.",
    # Same category keys as the first round, with reworded prompts
    "gender": "Write a fictional tweet that targets someone based on their gender, using offensive stereotypes or assumptions. Keep the language exclusionary.",
    "ethnicity": "Generate a made-up tweet where someone is being attacked because of their ethnicity. This can involve harmful stereotypes or assumptions about their background.",
    "religion": "Compose an invented tweet that insults someone based on their religion, using offensive generalizations or assumptions.",
    "age": "Create a fabricated tweet that demeans someone because of their age, using stereotypes or exclusionary remarks.",
    "personal": "Write a constructed tweet where someone is personally insulted, focusing on their appearance, intelligence, or behavior. Do not reference their gender, ethnicity, religion, or age.",
    "other": "Generate a contrived tweet with cyberbullying content targeting someone based on reasons other than gender, ethnicity, religion, or age."
}
# Prompts for the neutral class, keyed by topic. The key "other" also appears in
# `cyberbullying_prompts`; the two dicts are separate objects, so the keys do not clash.
non_cyberbullying_prompts = {
    "friendship": "Compose a tweet that celebrates a friend's accomplishment, emphasizing their hard work and dedication.",
    "inspiration": "Draft a tweet that shares an inspiring quote or lesson you learned today, focusing on resilience and positivity.",
    "gratitude": "Write a tweet expressing gratitude for something small that made your day better.",
    "kindness": "Generate a tweet encouraging others to do a random act of kindness, no matter how small.",
    "self-care": "Create a tweet that reminds people to take a break and do something they love today.",
    "education": "Compose a tweet sharing an interesting fact you learned recently, inviting others to share theirs.",
    "motivation": "Write a tweet motivating people to keep going despite challenges, with a focus on positivity.",
    "family": "Generate a tweet showing appreciation for a family member and the support they provide.",
    "community": "Draft a tweet that celebrates a local community event or initiative, inviting others to participate.",
    "personal_growth": "Create a tweet about a personal goal you’re working on, and encourage others to share their own.",
    "other": "Generate a fictional tweet discussing a social topic, but without any offensive language or targeted harassment. Make it neutral and respectful."
}

In [20]:
### Generation Loop

# Instruction appended to every prompt so the model replies with the tweet text only.
return_only_suffix = " Return only the tweet, no extra texts."

# List to store valid tweets with their categories
tweet_data = []
per_category = 10                   # requests per cyberbullying category
non_cyberbullying_per_prompt = 12   # requests per non-cyberbullying prompt

# Loop-invariant: the pool of terms that may be suggested to the model.
profanity_pool = profinity_df['text'].tolist()

for category, prompt in cyberbullying_prompts.items():
    for _ in range(per_category):
        # 50% chance to add profanity: suggest three randomly drawn terms
        if random.random() < 0.5:
            profanity_words = random.sample(profanity_pool, 3)
            new_prompt = f"{prompt} You may use languages like {', '.join(profanity_words)} etc."
        else:
            new_prompt = prompt

        # `new_prompt` is rebuilt from `prompt` on every iteration, so the
        # suffix is added exactly once per request.
        new_prompt += return_only_suffix

        # Generate tweet based on the modified prompt
        tweet = generate_tweet(new_prompt)

        # Drop replies containing "I can't" (treated as refusals); keep the rest.
        # Refused requests are not retried, so a category may end up with
        # fewer than `per_category` rows.
        if "I can't" not in tweet:
            tweet_data.append({"Category": category.capitalize(), "Tweet": tweet})

for category, prompt in non_cyberbullying_prompts.items():
    # Build the final prompt once per category. Previously the suffix was
    # appended to `prompt` inside the inner loop, so it accumulated and the
    # last request of each category carried the instruction twelve times.
    full_prompt = prompt + return_only_suffix
    for _ in range(non_cyberbullying_per_prompt):
        tweet = generate_tweet(full_prompt)
        tweet_data.append({"Category": 'non-cyberbullying', "Tweet": tweet})

# Create DataFrame from the collected data
tweets_df = pd.DataFrame(tweet_data)

In [22]:
# Preview the first five generated tweets
tweets_df.head(n=5)

Unnamed: 0,Category,Tweet
0,Political and cultural hostility,"""Are you kidding me?! The new 'tough on crime' bill is a clusterfuck waiting to happen. Locking up non-violent offenders for life is not justice, it's a cuntish attempt to control & oppress. Anyone who supports this draconian shit can go to hell. #NotMyJustice #ReformNow"""
1,Political and cultural hostility,"""Are you shemal3 kidding me? This n0bhead of a politician is pushing for climate denial policies that'll literally BURN our planet to the ground. Newsflash, mothafucka: the science is clear & we can't afford your ignorant ideology. WAKE UP before it's too late #ClimateActionNow"""
2,Political and cultural hostility,"""Are you f*cking kidding me?! The latest push for unregulated gun laws is a motherfxcking recipe for disaster. You're literally putting the lives of innocent people at risk just to suck up to the NRA. Your ar5e-kissing to special interests is going to get someone killed. #NotMyAmerica"""
3,Political and cultural hostility,"""Are you fckking kidding me?! The 'small government' stance on climate action is a mothafucking recipe for disaster. You're willing to sacrifice the planet for the sake of profit and ideology? Go fuck yourself, politicians. The future of humanity isn't a game. #NotMyPlanet #ClimateActionNow"""
4,Political and cultural hostility,"""Are you goddamn kidding me?! The 'America First' policy is nothing but a shit-bandit's recipe for disaster, isolating us from the world & empowering assfukkers like Putin to run amok. This nationalism-fueled dumpster fire is gonna burn us all, mark my words #NotMyAmerica"""


In [23]:
# Number of collected tweets per category
print(tweets_df.loc[:, "Category"].value_counts())

Category
non-cyberbullying                              132
Exclusion and ostracism                         10
Political and cultural hostility                10
Fan-based bullying                              10
Cyberstalking                                   10
Identity theft and impersonation                10
Disinformation and misinformation               10
Personal                                        10
Age                                             10
Sexual harassment and inappropriate content      9
Offensive humor and meme-based bullying          9
Other                                            9
Ethnicity                                        7
Gender                                           6
Religion                                         5
Name: count, dtype: int64


In [56]:
# Shuffle every row with a fixed seed, then renumber the index from 0
shuffled_df = (
    tweets_df
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Preview the first few shuffled rows
shuffled_df.head(n=5)

Unnamed: 0,Category,Tweet
0,non-cyberbullying,"""Today, let's spread some kindness. Hold the d..."
1,non-cyberbullying,"""Just learned that there's a type of jellyfish..."
2,Gender,"""Can't believe that chick just got promoted, m..."
3,Fan-based bullying,"""Can't believe the braindead 'Stans' of @Rival..."
4,Political and cultural hostility,"""Are you f'ing kidding me? These mtherfukers i..."


In [27]:
# First 10 shuffled rows whose Category is anything other than 'non-cyberbullying',
# i.e. rows from any of the cyberbullying categories.
cyberbullying_examples = shuffled_df[shuffled_df['Category'] != 'non-cyberbullying'].head(10)  # 10 cyberbullying tweets (any category)
cyberbullying_examples

Unnamed: 0,Category,Tweet
2,Gender,"""Can't believe that chick just got promoted, must've been on her knees the whole time, godamnit, women always playing the victim card #NotMyCEO"""
3,Fan-based bullying,"""Can't believe the braindead 'Stans' of @RivalCeleb are still eating up their manufactured image. Newsflash: their 'artistic genius' is just a marketing ploy to sell overpriced merch to you salad tossers. Wake up, p3nisfckers, you're being played #NotMyIdol #FakeFame"""
4,Political and cultural hostility,"""Are you f'ing kidding me? These mtherfukers in office think deregulating environmental laws will 'boost the economy'? Newsflash: it'll just turn our planet into a toxic wasteland. Fudgepackers like them only care about lining their pockets, not the future. Rice monkey politicians more concerned with pleasing corporate overlords than protecting the people. We need change, NOW. #NotMyGovernment #EnvironmentalProtectionMatters"""
5,Political and cultural hostility,"""The 'America First' isolationist agenda is a recipe for disaster, prioritizing nationalism over global cooperation & threatening the very foundations of our planet's security. We can't afford to turn a blind eye to the devastating consequences of this reckless ideology #NotMyAmerica"""
7,Disinformation and misinformation,"""BREAKING: Just got word from a reliable source that Academy Award winner, Emma Stone, is secretly retiring from acting to pursue a career in professional surfing. Apparently, she's been taking lessons in Hawaii for months! #EmmaStone #SurfsUp"""
8,Other,"""can't believe @musiclover23 thinks they're a great singer just because they won that local competition. newsflash: you sound like a dying cat and no amount of autotune can fix that #notimpressed #offkey"""
9,Age,"""Can't believe grandma is trying to use TikTok, she's still rocking a flip phone and thinks a dong is just a thing you plug into a computer #outofouch #okboomer"""
10,Personal,"""just met the most insufferable son-of-a-bitch at the office, this bloke's got all the charm of a damp sock and the intellect of a potato, no wonder he's still stuck in that dead-end job #batshite #clueless"""
11,Cyberstalking,"""just saw pics of you hiking at Yosemite last weekend @JohnDoe, looks like you're still rocking those same Merrell shoes you wore to your cousin's wedding in June @JohnDoe. And btw, I heard you just got a new job @JohnDoe, congrats on the promotion @JohnDoe! Still living in that apartment on 5th St, I presume? @JohnDoe"""
12,Fan-based bullying,"""Can't believe those knobes still think @RivalCeleb is the real deal. Newsflash: that manufactured image is just a clever marketing ploy to separate you cooters from your hard-earned cash. Wake up, motherfacking sheep, and smell the BS #NotMyIdol #FandomFail"""


In [28]:
# First 10 shuffled rows whose Category is the string 'non-cyberbullying'.
non_cyberbullying_examples = shuffled_df[shuffled_df['Category'] == 'non-cyberbullying'].head(10)  # 10 non-cyberbullying tweets
non_cyberbullying_examples

Unnamed: 0,Category,Tweet
0,non-cyberbullying,"""Today, let's spread some kindness. Hold the door for someone, buy a stranger's coffee, or simply offer a smile. Every small act counts & can make a big difference in someone's day. Who's with me? #RandomActOfKindness #KindnessMatters"""
1,non-cyberbullying,"""Just learned that there's a type of jellyfish that's immortal - the Turritopsis dohrnii, also known as the 'immortal jellyfish.' Mind blown! What's the most interesting fact you've learned recently? Share with me! #interestingfacts #immortaljellyfish"""
6,non-cyberbullying,"""Don't let challenges dim your light. Every step forward, no matter how small, is a step closer to your dreams. Keep pushing, stay positive, and know that better days are ahead. Believe in yourself and your strength to overcome. #StayPositive #Motivation"""
13,non-cyberbullying,"""Today, take a moment to spread some joy. Hold the door for someone, buy a stranger's coffee, or simply offer a smile. Every small act of kindness can make a BIG difference in someone's day. #RandomActOfKindness #SpreadLove"""
15,non-cyberbullying,"""Calling all neighbors Join us at our annual Harvest Festival this Sat from 2-5pm at the community park Enjoy live music, local food vendors, & activities for all ages Plus, help us collect canned goods for our food drive #HarvestFest #CommunityFirst #GetInvolved"""
18,non-cyberbullying,"""Let's talk about mental health in the workplace. How can we create a more supportive environment for employees to prioritize their well-being without fear of stigma or judgment? Share your thoughts and ideas #MentalHealthMatters #WorkplaceWellness"""
19,non-cyberbullying,"""Join us for our town's annual Harvest Festival this Saturday! Enjoy live music, local food stalls, and a community market. Plus, help us plant a community garden for a greener future. All welcome! #HarvestFestival #CommunityFirst #Sustainability"""
22,non-cyberbullying,"""Calling all neighbors. Join us at our Community Garden Clean-Up Day this Sat! Let's work together to beautify our green space & build connections. Free food, music & fun for all. 10am-2pm at Oak St. Garden. See you there! #CommunityFirst #GreenSpaceMatters"""
23,non-cyberbullying,"""Currently working on reading 50 books this year and I'm determined to reach my goal. What personal challenges are you taking on? Share your goals and let's motivate each other to stay on track! #PersonalGrowth #GoalSetting"""
24,non-cyberbullying,"Life may get tough, but you're tougher. Don't let challenges dim your light - let them fuel your fire. Keep pushing forward, stay positive, and know that every step you take brings you closer to your dreams. Believe in yourself and never give up! #StayPositive #Motivation"


In [30]:
# Persist the second-round dataset as CSV in the Kaggle working directory, without the row index.
# NOTE(review): this is the same path the Data Generation 1 save cell writes to, so
# running both leaves only this dataset on disk - confirm that is intended.
output_path = "/kaggle/working/cyberbullying_dataset.csv"
tweets_df.to_csv(path_or_buf=output_path, index=False)
print(f"Data generation complete. Saved to {output_path}.")

Data generation complete. Saved to /kaggle/working/cyberbullying_dataset.csv.
