In [2]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import streamlit as st

In [5]:
# Load the tweet dataset from a CSV file located relative to the working directory.
df = pd.read_csv("cyberbullying_tweets.csv")

In [6]:
# Preview the first rows to check the column layout (tweet_text, cyberbullying_type).
df.head()

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",not_cyberbullying
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying


In [7]:
# Preview the last rows; compared with head() this suggests the file is grouped by label.
df.tail()

Unnamed: 0,tweet_text,cyberbullying_type
47687,"Black ppl aren't expected to do anything, depe...",ethnicity
47688,Turner did not withhold his disappointment. Tu...,ethnicity
47689,I swear to God. This dumb nigger bitch. I have...,ethnicity
47690,Yea fuck you RT @therealexel: IF YOURE A NIGGE...,ethnicity
47691,Bro. U gotta chill RT @CHILLShrammy: Dog FUCK ...,ethnicity


In [8]:
# Dataset dimensions as (rows, columns).
df.shape

(47692, 2)

In [9]:
# Count missing values per column before any text processing.
df.isnull().sum()

tweet_text            0
cyberbullying_type    0
dtype: int64

In [10]:
# Show column dtypes, non-null counts, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47692 entries, 0 to 47691
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   tweet_text          47692 non-null  object
 1   cyberbullying_type  47692 non-null  object
dtypes: object(2)
memory usage: 745.3+ KB


In [11]:
# Summarise both text columns (count, unique, top, freq); a unique count below
# the row count for tweet_text means some tweets appear more than once.
df.describe()

Unnamed: 0,tweet_text,cyberbullying_type
count,47692,47692
unique,46017,6
top,RT @sailorhg: the intro for my hardware hackin...,religion
freq,2,7998


In [12]:
# Make sure the NLTK corpora used by the cleaning step are available.
# Each corpus is downloaded only when it cannot be found locally, so the cell is
# safe to re-run and a fresh environment does not fail later with LookupError.
for resource_path, package_name in (
    ("corpora/stopwords", "stopwords"),
    ("corpora/wordnet", "wordnet"),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name, quiet=True)


In [13]:
# Shared text-normalisation objects used by the cleaning function.
stemmer = PorterStemmer()  # NOTE(review): not referenced by any cell visible here
lemmatizer = WordNetLemmatizer()
# English stop words held in a set for fast membership checks.
stop_words = set(stopwords.words('english'))

In [14]:
def clean_text(text):
    """Normalise raw text into a space-separated string of lemmatised tokens.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lower-cased and split on whitespace, English stop words are
    dropped, and each remaining token is lemmatised.

    Args:
        text: Raw input string.

    Returns:
        The cleaned tokens joined by single spaces; an empty string when no
        token survives the filtering.
    """
    letters_only = re.sub(r'[^a-zA-Z]', ' ', text)
    tokens = letters_only.lower().split()
    # Stop words are filtered on the raw lower-cased token, before lemmatisation.
    kept = (lemmatizer.lemmatize(token) for token in tokens if token not in stop_words)
    return " ".join(kept)

In [15]:
# Store the cleaned version of each tweet in a new column, keeping the raw text intact.
df['clean_text'] = df['tweet_text'].apply(clean_text)

In [16]:
# Compare raw and cleaned text side by side for the first rows.
df[['tweet_text', 'clean_text']].head()

Unnamed: 0,tweet_text,clean_text
0,"In other words #katandandre, your food was cra...",word katandandre food crapilicious mkr
1,Why is #aussietv so white? #MKR #theblock #ImA...,aussietv white mkr theblock imacelebrityau tod...
2,@XochitlSuckkks a classy whore? Or more red ve...,xochitlsuckkks classy whore red velvet cupcake
3,"@Jason_Gio meh. :P thanks for the heads up, b...",jason gio meh p thanks head concerned another ...
4,@RudhoeEnglish This is an ISIS account pretend...,rudhoeenglish isi account pretending kurdish a...


In [17]:
# Build TF-IDF features from the cleaned tweets.
tfidf = TfidfVectorizer(max_features=5000)  # Cap the vocabulary at 5000 terms

# Fit the vocabulary/IDF weights and transform the text in one step.
# NOTE(review): this is fitted on every row, before the train/test split below,
# so the vocabulary and IDF weights also see the rows that end up in the test set.
X_tfidf = tfidf.fit_transform(df['clean_text'])

# Convert to a DataFrame with one column per vocabulary term.
# NOTE(review): .toarray() materialises the whole matrix densely (one row per
# tweet, one column per term), which is memory-heavy for a mostly-zero matrix.
X_tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf.get_feature_names_out())

# Check transformed features
print(X_tfidf_df.head())

    aa  aalwuhaib   ab  abc  abdul  ability  able  ableist  abortion  abroad  \
0  0.0        0.0  0.0  0.0    0.0      0.0   0.0      0.0       0.0     0.0   
1  0.0        0.0  0.0  0.0    0.0      0.0   0.0      0.0       0.0     0.0   
2  0.0        0.0  0.0  0.0    0.0      0.0   0.0      0.0       0.0     0.0   
3  0.0        0.0  0.0  0.0    0.0      0.0   0.0      0.0       0.0     0.0   
4  0.0        0.0  0.0  0.0    0.0      0.0   0.0      0.0       0.0     0.0   

   ...  zaibatsunews  zappe  zero   zh  zimmerman  zionist  zoe  zone  zxbzv  \
0  ...           0.0    0.0   0.0  0.0        0.0      0.0  0.0   0.0    0.0   
1  ...           0.0    0.0   0.0  0.0        0.0      0.0  0.0   0.0    0.0   
2  ...           0.0    0.0   0.0  0.0        0.0      0.0  0.0   0.0    0.0   
3  ...           0.0    0.0   0.0  0.0        0.0      0.0  0.0   0.0    0.0   
4  ...           0.0    0.0   0.0  0.0        0.0      0.0  0.0   0.0    0.0   

   zython  
0     0.0  
1     0.0  
2 

In [18]:
# Confirm the feature table has one row per tweet and one column per retained term.
print(X_tfidf_df.shape)

(47692, 5000)


In [19]:
# Use the sparse TF-IDF matrix as the feature set instead of its dense DataFrame
# copy: it holds the same values, needs far less memory when split and fitted,
# and is the same kind of input that `tfidf.transform` produces at prediction
# time (so the classifier is not fitted with column names it never sees again).
X = X_tfidf
y = df['cyberbullying_type']  # Target labels

# Split into train & test sets; stratify keeps the label proportions equal in both.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Fit the classifier that the prediction UI refers to as `model`.
# max_iter is raised above the library default to give the solver room to
# converge on a 5000-dimensional, six-class problem.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Report performance on the held-out 20%.
y_pred = model.predict(X_test)
print(f"Test accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))

In [20]:
# One-sentence, user-facing verdict for each predicted label.
# The UI looks entries up by the predicted label, so the keys are expected to
# match the values of the `cyberbullying_type` column exactly.
category_descriptions = {
    "not_cyberbullying": "The input text does not contain cyberbullying content.",
    "gender": "The input text contains cyberbullying based on gender.",
    "religion": "The input text contains cyberbullying based on religion.",
    "other_cyberbullying": "The input text contains other forms of cyberbullying.",
    "age": "The input text contains cyberbullying based on age.",
    "ethnicity": "The input text contains cyberbullying based on ethnicity."
}

In [21]:
# Longer explanation of each category, shown beneath the verdict in the UI.
# Uses the same keys as `category_descriptions`; both are indexed by the predicted label.
category_definitions = {
    "not_cyberbullying": "Cyberbullying is not detected in the given input.",
    "gender": "Gender-based cyberbullying involves targeting someone based on their gender identity, using sexist remarks, stereotypes, or discrimination.",
    "religion": "Religion-based cyberbullying involves attacking or mocking someone due to their religious beliefs, often leading to hate speech.",
    "other_cyberbullying": "This category includes various forms of cyberbullying that do not fit into specific categories like age, gender, or religion.",
    "age": "Age-based cyberbullying targets individuals based on their age, often discriminating against younger or older groups.",
    "ethnicity": "Ethnicity-based cyberbullying involves discrimination, stereotypes, or offensive comments directed at a person’s ethnic background."
}

In [22]:
# Streamlit front end for classifying a single piece of text.
# NOTE: Streamlit only behaves as an app when this code is executed with
# `streamlit run <script>.py`; run inside a notebook kernel it just emits warnings.
# Requires `clean_text`, the fitted `tfidf` vectorizer, both category
# dictionaries, and a fitted classifier named `model` to exist before this point.
st.title("🔍 Cyberbullying Detection System")

st.write("Enter a tweet or text below to check if it contains cyberbullying.")

user_input = st.text_area("Enter your text here:")

if st.button("Detect Cyberbullying"):
    if not user_input.strip():
        st.warning("⚠️ Please enter some text to analyze.")
    else:
        # Apply the same cleaning used on the training data so features line up.
        cleaned_text = clean_text(user_input)

        if not cleaned_text:
            # Only stop words, digits, or punctuation were entered: the feature
            # vector would be all zeros and any predicted label would be
            # meaningless, so ask for more text instead of reporting a category.
            st.warning("⚠️ The text has no analyzable words after cleaning. Please enter a longer sentence.")
        else:
            transformed_text = tfidf.transform([cleaned_text])
            predicted_label = model.predict(transformed_text)[0]

            st.subheader(f"The input value is **{predicted_label}**.")
            st.write(category_descriptions[predicted_label])
            st.write(f"📌 **What is {predicted_label}?** {category_definitions[predicted_label]}")


2025-03-06 14:46:01.945 
  command:

    streamlit run c:\Users\theja\anaconda3\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
2025-03-06 14:46:01.954 Session state does not function when running a script without `streamlit run`


In [23]:
pip install tk


Note: you may need to restart the kernel to use updated packages.
