# **Loading the data**

In [None]:
import pandas as pd

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Load the dataset.
# NOTE: on_bad_lines="skip" drops malformed rows without raising or warning,
# so the row count reported below can be smaller than the number of lines in
# the CSV file.
df = pd.read_csv("/content/cb_multi_labeled_balanced.csv", delimiter=",", on_bad_lines="skip", engine="python")

# Display basic info
print(df.head())
# info() writes its summary itself and returns None; wrapping it in print()
# adds a stray "None" line to the output.
df.info()
print(df['label'].value_counts())  # Check class distribution


                                                text              label
0  @ZubearSays Any real nigga isn't letting this ...     ethnicity/race
1  @MoradoSkittle @prolifejewess @DAConsult @Kell...  not_cyberbullying
2        the only thing i wish, i wish a nigga would     ethnicity/race
3  You saudias are not friends of Muslim idiots c...           religion
4  @JaydenT2399 @TractorLaw @holmes_gael @erconge...           religion
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63317 entries, 0 to 63316
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    63317 non-null  object
 1   label   63317 non-null  object
dtypes: object(2)
memory usage: 989.5+ KB
None
label
not_cyberbullying    31784
gender/sexual        10842
ethnicity/race       10673
religion             10018
Name: count, dtype: int64


In [None]:
# Sanity check: dimensions of the loaded frame as (rows, columns).
df.shape


(63317, 3)

# **Data Preprocessing**
We'll clean the text data before feeding it into a machine learning model.

1. Convert text to lowercase.

2. Remove special characters & numbers.

3. Remove stopwords (common words like "the", "is", "and").

4. Apply lemmatization (convert words to their root form).

In [None]:
!pip install spacy scikit-learn
!python -m spacy download en_core_web_sm  # Download English NLP model


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m49.6 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# List the column names of the loaded frame.
print(df.columns)

Index(['text', 'label'], dtype='object')


In [None]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

# Load spaCy's small English pipeline; `nlp` is the callable that the
# text-cleaning function below runs on each document.
nlp = spacy.load("en_core_web_sm")


In [None]:
def clean_text_spacy(text):
    """Lowercase `text` and return its kept lemmas joined by single spaces.

    Tokens flagged as stop words and tokens that are not purely alphabetic
    are dropped. A later cell in this notebook defines `clean_text_spacy`
    again; once that cell runs, its definition replaces this one.
    """
    lemmas = []
    for token in nlp(text.lower()):
        if token.is_stop or not token.is_alpha:
            continue  # skip stop words and anything containing non-letters
        lemmas.append(token.lemma_)
    return " ".join(lemmas)


## **Step 2: Text Preprocessing (Cleaning the Text)**

1.  Remove special characters & numbers
2.  Tokenization (split text into words)
3.  Stopword removal (remove common words like "the", "is", etc.)
4.  Lemmatization (convert words to base form, e.g., "running" → "run")

In [None]:
import spacy
import pandas as pd
import re

# Load English NLP model.
# The cleaning function only reads each token's lemma and stop-word flag, so
# the dependency parser and the named-entity recognizer are switched off to
# avoid running them on every document.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [None]:
# Text cleaning function using SpaCy
def clean_text_spacy(text):
    """Normalize one raw document into a space-separated string of lemmas.

    Steps: lowercase, delete every character that is not an ASCII letter or
    whitespace, run the spaCy pipeline, then keep the lemma of each token
    that is neither a stop word nor pure whitespace.

    Args:
        text: Raw document. Non-string values (for example NaN from a
            missing CSV cell) are treated as an empty document.

    Returns:
        The cleaned text; an empty string when nothing survives cleaning.
    """
    if not isinstance(text, str):
        # A missing value would otherwise fail on .lower().
        return ""
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters & numbers
    doc = nlp(text)  # Process text with SpaCy
    # Deleting characters can leave runs of spaces behind, which spaCy keeps
    # as whitespace tokens; skip them so the output has no stray blanks.
    words = [token.lemma_ for token in doc if not token.is_stop and not token.is_space]
    return " ".join(words)

In [None]:
# Run every raw text through the cleaner and store the result in a new column
df['clean_text'] = df['text'].map(clean_text_spacy)

In [None]:
# Preview the raw text next to its cleaned counterpart (first five rows)
preview_columns = ['text', 'clean_text']
df[preview_columns].head()


Unnamed: 0,text,clean_text
0,@ZubearSays Any real nigga isn't letting this ...,zubearsay real nigga not let happen
1,@MoradoSkittle @prolifejewess @DAConsult @Kell...,moradoskittle prolifejewess daconsult kellyyod...
2,"the only thing i wish, i wish a nigga would",thing wish wish nigga
3,You saudias are not friends of Muslim idiots c...,saudia friend muslim idiot cheapless people
4,@JaydenT2399 @TractorLaw @holmes_gael @erconge...,jaydent tractorlaw holmesgael erconger acyn ma...


# **Step 3: Convert Text to Numerical Format**

**Applying TF-IDF Vectorization**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000)  # Cap the vocabulary at 5000 terms

# Fit and transform the clean text.
# The result stays the SciPy sparse matrix that fit_transform returns:
# calling .toarray() would allocate a dense (n_documents x 5000) float64
# array that is almost entirely zeros.
# NOTE(review): the vectorizer is fitted on the whole dataset before the
# train/test split that follows, so test documents influence the vocabulary
# and the IDF weights.
X = vectorizer.fit_transform(df['clean_text'])

# Convert to a DataFrame backed by sparse columns, so the preview does not
# need a dense copy either
X_df = pd.DataFrame.sparse.from_spmatrix(X, columns=vectorizer.get_feature_names_out())

# Show first few rows
X_df.head()


Unnamed: 0,aa,aalwuhaib,aaron,ab,abandon,abc,abduct,abdul,abi,abide,...,yu,yup,zero,zion,zionist,zoelkiflitaher,zombie,zone,zoobear,zoom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# **Step 4: Encode Labels & Splitting Data**

Convert text labels into numerical format (so the model can understand).

Split data into training and testing sets (to evaluate performance).

### **Encode Labels (Multiclass Classification)**

Since we have multiple classes (not_cyberbullying, ethnicity/race, gender/sexual, religion), we will use Label Encoding to convert them into numbers.

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Fit the encoder on the label column and get one integer code per row
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['label'])

# Build and print the class-name -> integer-code lookup
class_mapping = {
    class_name: class_code
    for class_name, class_code in zip(
        label_encoder.classes_,
        label_encoder.transform(label_encoder.classes_),
    )
}
print("Class Mapping:", class_mapping)


Class Mapping: {'ethnicity/race': np.int64(0), 'gender/sexual': np.int64(1), 'not_cyberbullying': np.int64(2), 'religion': np.int64(3)}


# **Split Data into Train & Test Sets**

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on y and
# uses a fixed seed so that it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report the dimensions of each part
print(f"Training Data Shape: {X_train.shape} {y_train.shape}")
print(f"Testing Data Shape: {X_test.shape} {y_test.shape}")


Training Data Shape: (50653, 5000) (50653,)
Testing Data Shape: (12664, 5000) (12664,)


# **Step 5: Train a Classification Model**

Since this is a text classification task, we will:

Convert text into TF-IDF features (numerical format).

Train a Logistic Regression model (simple but effective).

Evaluate the model’s performance.

### **Convert Text into TF-IDF Features**

Before training, we need to convert text data into numerical form using TF-IDF (Term Frequency-Inverse Document Frequency).

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF vectorizer; it is fitted in a later cell, on the training split only
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Cap the vocabulary at 5000 terms



In [None]:
from sklearn.model_selection import train_test_split

# Assuming df['clean_text'] contains the preprocessed text
X = df['clean_text']  # Features (cleaned text)
y = df['label']       # Labels (categories)

# Split into training (80%) and testing (20%) sets.
# stratify=y keeps the class proportions approximately equal in both parts:
# the classes are imbalanced (not_cyberbullying is about half of the rows),
# and the earlier split in this notebook is stratified as well.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Convert to string type (to avoid errors)
X_train = X_train.astype(str)
X_test = X_test.astype(str)


In [None]:
# Fit the vectorizer on the training text only, then reuse that fitted
# state to transform the test text
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
# Print the (rows, features) dimensions of both TF-IDF matrices
print(f"TF-IDF Train Shape: {X_train_tfidf.shape}")
print(f"TF-IDF Test Shape: {X_test_tfidf.shape}")


TF-IDF Train Shape: (50653, 5000)
TF-IDF Test Shape: (12664, 5000)


## **Training a Logistic Regression Model**

In [None]:
from sklearn.linear_model import LogisticRegression

# Build the classifier (at most 1000 solver iterations) and fit it on the
# training TF-IDF features in one expression; fit() returns the estimator
model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Label every held-out document
y_pred = model.predict(X_test_tfidf)


# **Evaluating Model Performance**

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Share of test documents that received the correct label
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy: {:.2f}".format(accuracy))

# Per-class precision, recall and F1
report_text = classification_report(y_test, y_pred)
print("Classification Report:\n", report_text)


Model Accuracy: 0.99
Classification Report:
                    precision    recall  f1-score   support

   ethnicity/race       0.99      0.98      0.99      2101
    gender/sexual       0.99      0.99      0.99      2170
not_cyberbullying       0.98      1.00      0.99      6361
         religion       1.00      0.97      0.98      2032

         accuracy                           0.99     12664
        macro avg       0.99      0.98      0.99     12664
     weighted avg       0.99      0.99      0.99     12664



In [None]:
import numpy as np  # Import NumPy

threshold = 0.5  # Minimum top-class probability required to commit to a label

# Nothing earlier in the notebook assigns `probabilities`, so they are
# computed here: score a few example sentences with the fitted classifier.
threshold_texts = [
    "I hate you, you are so dumb!",
    "Have a great day",
    "You belong to a different religion.",
]
# Clean the sentences the same way as the training text before vectorizing
threshold_features = tfidf_vectorizer.transform([clean_text_spacy(t) for t in threshold_texts])
probabilities = model.predict_proba(threshold_features)

predicted_labels = []
for prob in probabilities:
    if np.max(prob) < threshold:
        predicted_labels.append("Uncertain")  # Flag uncertain cases
    else:
        # Columns of predict_proba follow model.classes_, so index that array
        # directly rather than going through the separately fitted
        # label_encoder (the model was trained on the string labels).
        predicted_labels.append(str(model.classes_[np.argmax(prob)]))

print(predicted_labels)


['ethnicity/race', 'not_cyberbullying', 'not_cyberbullying']


# **Save the Model**

In [None]:
import joblib

# Persist the trained model and the TF-IDF vectorizer, one file each
for fitted_object, target_file in (
    (model, "cyberbully_model.pkl"),
    (tfidf_vectorizer, "tfidf_vectorizer.pkl"),
):
    joblib.dump(fitted_object, target_file)

print("Model and vectorizer saved successfully!")


Model and vectorizer saved successfully!


# **Load the Model**

In [None]:
# Read the trained model and the TF-IDF vectorizer back from disk, in that order.
# NOTE(review): joblib files are pickle-based; only load files from a trusted source.
model, tfidf_vectorizer = (
    joblib.load(artifact_file)
    for artifact_file in ("cyberbully_model.pkl", "tfidf_vectorizer.pkl")
)

print("Model and vectorizer loaded successfully!")

Model and vectorizer loaded successfully!


# **Test on New Data**

In [None]:
# Sample text for prediction
sample_texts = [
    "I hate you, you are so dumb!",
    "Have a great day",
    "You belong to a different religion.",
]

# The vectorizer was fitted on text cleaned by clean_text_spacy, so new text
# has to go through the same cleaning before it is vectorized
sample_texts_clean = [clean_text_spacy(text) for text in sample_texts]

# Transform the cleaned text using the loaded TF-IDF vectorizer
sample_texts_tfidf = tfidf_vectorizer.transform(sample_texts_clean)

# Make predictions
predictions = model.predict(sample_texts_tfidf)

# Display results (show the original wording, not the cleaned form)
for text, label in zip(sample_texts, predictions):
    print(f"Text: {text} → Predicted Label: {label}")


Text: I hate you, you are so dumb! → Predicted Label: ethnicity/race
Text: Have a great day → Predicted Label: not_cyberbullying
Text: You belong to a different religion. → Predicted Label: not_cyberbullying


# **Model Training Using Random Forest Classifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees with a fixed seed, fitted on the training features
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_tfidf, y_train)

# Label the held-out documents
y_pred_rf = rf_model.predict(X_test_tfidf)

# Score the predictions: overall accuracy, then the per-class report
from sklearn.metrics import accuracy_score, classification_report
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)
print("Random Forest Accuracy:", rf_accuracy)
print("Classification Report:\n", rf_report)


Random Forest Accuracy: 0.9947883765003158
Classification Report:
                    precision    recall  f1-score   support

   ethnicity/race       0.99      1.00      1.00      2101
    gender/sexual       1.00      0.99      1.00      2170
not_cyberbullying       0.99      1.00      1.00      6361
         religion       1.00      0.98      0.99      2032

         accuracy                           0.99     12664
        macro avg       1.00      0.99      0.99     12664
     weighted avg       0.99      0.99      0.99     12664



# **Saving the Model**

In [None]:
import joblib

# Persist the trained Random Forest model and the TF-IDF vectorizer, one file each
rf_artifacts = {
    "cyberbully_rf_model.pkl": rf_model,
    "tfidf_vectorizer.pkl": tfidf_vectorizer,
}
for target_file, fitted_object in rf_artifacts.items():
    joblib.dump(fitted_object, target_file)

print("Model and vectorizer saved successfully!")


Model and vectorizer saved successfully!


# **Load the Model**

In [None]:
# Read the saved Random Forest model and TF-IDF vectorizer back, in that order.
# NOTE(review): joblib files are pickle-based; only load files from a trusted source.
loaded_model, loaded_vectorizer = map(
    joblib.load,
    ("cyberbully_rf_model.pkl", "tfidf_vectorizer.pkl"),
)

print("Model and vectorizer loaded successfully!")


Model and vectorizer loaded successfully!


# **Test the Model on New Data**

In [None]:
def predict_text(text):
    """Predict the label of a single piece of raw text.

    Args:
        text: Raw, uncleaned input string.

    Returns:
        The label that `loaded_model` predicts for this text.
    """
    # The vectorizer was fitted on text cleaned by clean_text_spacy, so the
    # same cleaning is applied here; otherwise inflected forms in the input
    # would not match the lemmas in its vocabulary.
    cleaned_text = clean_text_spacy(text)

    # Transform the cleaned text using the loaded TF-IDF vectorizer
    text_tfidf = loaded_vectorizer.transform([cleaned_text])

    # Predict the label using the loaded model
    predicted_label = loaded_model.predict(text_tfidf)[0]

    return predicted_label

# Example inputs to run through predict_text
new_texts = [
    "I hate you!",
    "You are so dumb and stupid.",
    "Have a great day!",
    "You belong to a different religion."
]

# Print each example next to the label predicted for it
for sample in new_texts:
    sample_label = predict_text(sample)
    print(f"Text: {sample} → Predicted Label: {sample_label}")


Text: I hate you! → Predicted Label: not_cyberbullying
Text: You are so dumb and stupid. → Predicted Label: ethnicity/race
Text: Have a great day! → Predicted Label: not_cyberbullying
Text: You belong to a different religion. → Predicted Label: religion


In [None]:
from google.colab import files

# Download the saved model file first, then the vectorizer file
for artifact_name in ("cyberbully_rf_model.pkl", "tfidf_vectorizer.pkl"):
    files.download(artifact_name)
