In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Load the labelled tweets. Only the read itself can raise FileNotFoundError,
# so it is the only statement kept inside the try body.
try:
    df = pd.read_csv('cyberbullying_tweets(ML).csv')
except FileNotFoundError:
    print("Error: The file 'cyberbullying_tweets(ML).csv' was not found.")
    print("Please make sure the dataset file is in the same directory as the script.")
    # Stop with a non-zero status so the failure is visible to whatever ran
    # the script; the bare exit() helper reports success (status 0) and is
    # only installed by the `site` module.
    raise SystemExit(1)
else:
    print("Dataset loaded successfully.")
    print("Original dataset shape:", df.shape)
    print("First 5 rows of the dataset:")
    print(df.head())


print("\nChecking for null values...")
null_counts = df.isnull().sum()
print(null_counts)

# Rows missing either the tweet or its label are unusable for training, so
# remove them here instead of only reporting the counts.
if null_counts.sum() == 0:
    print("No null values found.")
else:
    df = df.dropna(subset=['tweet_text', 'cyberbullying_type']).reset_index(drop=True)
    print("Rows with missing tweet text or label dropped. Dataset shape is now:", df.shape)


print("\nStarting text preprocessing...")

# Fetch the NLTK data used below: the English stop-word list and the Punkt
# tokenizer models behind nltk.word_tokenize. Newer NLTK releases load Punkt
# from the 'punkt_tab' package while older ones use 'punkt', so request both;
# already-installed packages are simply skipped.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource, quiet=True)

# A set gives O(1) membership tests when filtering tokens.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# Matches tokens made up solely of ASCII letters; compiled once for reuse.
_ALPHA_TOKEN = re.compile(r"[a-zA-Z]+")


def preprocess_text(text):
    """
    Normalize one raw tweet into a space-separated string of stemmed words.

    Steps, in the order they are applied:
    - Convert the text to lower case
    - Tokenize it with nltk.word_tokenize
    - Keep only tokens consisting entirely of ASCII letters (drops
      punctuation, digits and mixed tokens)
    - Remove English stop words
    - Stem the remaining tokens with the Porter stemmer

    Args:
        text: The raw tweet. Non-string values (for example NaN or None from
            a missing cell) are treated as an empty document.

    Returns:
        The processed tokens joined by single spaces; "" when nothing
        survives the filters.
    """
    # A missing cell arrives as float NaN/None and has no .lower(); treat it
    # as an empty document instead of raising AttributeError.
    if not isinstance(text, str):
        return ""

    tokens = nltk.word_tokenize(text.lower())

    # Stop words are filtered before stemming because the stop-word list
    # holds unstemmed forms.
    return " ".join(
        stemmer.stem(token)
        for token in tokens
        if _ALPHA_TOKEN.fullmatch(token) and token not in stop_words
    )

# Clean every tweet and store the result alongside the raw text.
df['processed_text'] = df['tweet_text'].map(preprocess_text)

print("Text preprocessing complete.")
print("First 5 rows with processed text:")
processed_preview = df[['tweet_text', 'processed_text']].head()
print(processed_preview)



# Model input is the cleaned tweet text; the target is the bullying category.
y = df['cyberbullying_type']
X = df['processed_text']

# Two text representations to compare; each vocabulary is capped at 5000
# terms to limit features for efficiency.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
count_vectorizer = CountVectorizer(max_features=5000)

print("\nIndependent feature (X) and dependent feature (y) are selected.")


def train_and_evaluate_model(vectorizer, X, y):
    """
    Train four classifiers on vectorized text and report their test scores.

    The documents are split 80/20 (stratified on ``y``, ``random_state=42``)
    *before* vectorization, and the vectorizer is fitted on the training
    documents only. Fitting it on the full corpus would let the test
    documents influence the vocabulary (and the IDF weights for TF-IDF),
    leaking test information into training and inflating the scores.

    For every model the accuracy, confusion matrix and classification report
    are printed.

    Args:
        vectorizer: Text vectorizer exposing ``fit_transform`` and
            ``transform`` (e.g. CountVectorizer or TfidfVectorizer). It is
            fitted in place.
        X: Preprocessed documents, one string per sample.
        y: Class labels aligned with ``X``.

    Returns:
        dict mapping each model's display name to its test-set accuracy.
    """
    print(f"\n--- Evaluating models with {vectorizer.__class__.__name__} ---")

    # Split the raw text first so the held-out documents stay unseen.
    X_train_text, X_test_text, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Learn the vocabulary from the training documents only, then project
    # the test documents onto that same vocabulary.
    X_train = vectorizer.fit_transform(X_train_text)
    X_test = vectorizer.transform(X_test_text)

    print("Data split into training and testing sets.")
    print(f"Training data shape: {X_train.shape}")
    print(f"Test data shape: {X_test.shape}")

    models = {
        "Multinomial Naive Bayes": MultinomialNB(),
        "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "Random Forest": RandomForestClassifier(random_state=42),
    }

    results = {}

    for model_name, model in models.items():
        print(f"\n--- Training and evaluating: {model_name} ---")

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        cm = confusion_matrix(y_test, y_pred)
        # zero_division=0 scores classes with no predicted samples as 0
        # instead of emitting a warning.
        cr = classification_report(y_test, y_pred, zero_division=0)

        results[model_name] = accuracy

        print(f"Accuracy: {accuracy:.4f}")
        print("Confusion Matrix:")
        print(cm)
        print("Classification Report:")
        print(cr)

    return results

# Run the full model comparison once per text representation.
results_count_vec = train_and_evaluate_model(count_vectorizer, X, y)
results_tfidf_vec = train_and_evaluate_model(tfidf_vectorizer, X, y)

# Pick the highest-accuracy (model name, accuracy) pair for each vectorizer.
best_model_count_vec, best_accuracy_count_vec = max(
    results_count_vec.items(), key=lambda entry: entry[1]
)
best_model_tfidf_vec, best_accuracy_tfidf_vec = max(
    results_tfidf_vec.items(), key=lambda entry: entry[1]
)

print("\n\n--- FINAL REPORT ---")
print("\nBest performing models for each vectorizer:")
print(f"Count Vectorizer: '{best_model_count_vec}' with accuracy {best_accuracy_count_vec:.4f}")
print(f"TF-IDF Vectorizer: '{best_model_tfidf_vec}' with accuracy {best_accuracy_tfidf_vec:.4f}")

# Count Vectorizer is declared the overall winner only when strictly better;
# a tie is reported in favour of TF-IDF.
count_vectorizer_wins = best_accuracy_count_vec > best_accuracy_tfidf_vec
if not count_vectorizer_wins:
    print(f"\nOverall, the best model is '{best_model_tfidf_vec}' using TF-IDF Vectorizer with an accuracy of {best_accuracy_tfidf_vec:.4f}.")
else:
    print(f"\nOverall, the best model is '{best_model_count_vec}' using Count Vectorizer with an accuracy of {best_accuracy_count_vec:.4f}.")


Dataset loaded successfully.
Original dataset shape: (47692, 2)
First 5 rows of the dataset:
                                          tweet_text cyberbullying_type
0  In other words #katandandre, your food was cra...  not_cyberbullying
1  Why is #aussietv so white? #MKR #theblock #ImA...  not_cyberbullying
2  @XochitlSuckkks a classy whore? Or more red ve...  not_cyberbullying
3  @Jason_Gio meh. :P  thanks for the heads up, b...  not_cyberbullying
4  @RudhoeEnglish This is an ISIS account pretend...  not_cyberbullying

Checking for null values...
tweet_text            0
cyberbullying_type    0
dtype: int64
No null values found.

Starting text preprocessing...
Text preprocessing complete.
First 5 rows with processed text:
                                          tweet_text  \
0  In other words #katandandre, your food was cra...   
1  Why is #aussietv so white? #MKR #theblock #ImA...   
2  @XochitlSuckkks a classy whore? Or more red ve...   
3  @Jason_Gio meh. :P  thanks for the heads 