In [2]:
import re
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from nltk.stem import WordNetLemmatizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.decomposition import NMF

In [3]:
# Load the tweet dataset; the path is relative, so the CSV must sit in the
# notebook's working directory. Later cells read the 'tweet_text' and
# 'cyberbullying_type' columns from this frame.
df = pd.read_csv("cyberbullying_tweets.csv")
def clean_text(text):
    """Return ``text`` with every ``@``-prefixed word-character run removed.

    Only the ``@`` and the word characters directly following it are dropped;
    surrounding whitespace and punctuation are left in place.
    """
    mention = re.compile(r'[@]\w+')
    return mention.sub('', text)

# Add a cleaned copy of each tweet (clean_text applied row by row) as a new
# column, leaving the original 'tweet_text' column untouched.
df['tweet_text_clean'] = df['tweet_text'].apply(clean_text)

In [4]:
# Features are the cleaned tweets; the target is the cyberbullying category.
X = df['tweet_text_clean']
y = df['cyberbullying_type']

# NOTE(review): WordNetLemmatizer presumably needs the NLTK WordNet corpus to
# be available locally — confirm before running on a fresh environment.
lemmatizer = WordNetLemmatizer()


def _lemmatize_tweet(tweet):
    """Lower-case a tweet, split it on whitespace and lemmatize each token.

    Tokens are lemmatized with the lemmatizer's default arguments and joined
    back together with single spaces.
    """
    tokens = tweet.lower().split()
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)


X_lemmatized = X.apply(_lemmatize_tweet)

In [5]:
def run_model(random_state_value):
    """Train and score one bag-of-words logistic-regression classifier.

    The module-level ``X_lemmatized`` / ``y`` are shuffled and split 80/20
    using ``random_state_value`` as the split seed. A pipeline of
    ``CountVectorizer`` followed by ``LogisticRegression(max_iter=1000)`` is
    fitted on the training part and evaluated on the held-out part.

    Args:
        random_state_value: Seed passed to ``train_test_split``.

    Returns:
        A ``(accuracy, pipeline)`` tuple: the test-set accuracy and the
        fitted pipeline (steps named ``'vectorizer'`` and ``'model'``).
    """
    split = train_test_split(
        X_lemmatized,
        y,
        test_size=0.2,
        shuffle=True,
        random_state=random_state_value,
    )
    X_train, X_test, y_train, y_test = split

    steps = [
        ('vectorizer', CountVectorizer()),
        ('model', LogisticRegression(max_iter=1000)),
    ]
    classifier = Pipeline(steps)
    classifier.fit(X_train, y_train)

    test_accuracy = accuracy_score(y_test, classifier.predict(X_test))
    return test_accuracy, classifier

n_runs = 10

# Train one model per split seed (0 .. n_runs - 1), using every available core.
results = Parallel(n_jobs=-1)(delayed(run_model)(seed) for seed in range(n_runs))

# Each result is an (accuracy, fitted pipeline) pair; separate the two parts.
accuracies = np.array([score for score, _pipeline in results])
pipelines = [fitted for _score, fitted in results]

print(f'Accuracies over {n_runs} runs: {accuracies}')
print(f'Mean Accuracy: {np.mean(accuracies)}')
print(f'Standard Deviation: {np.std(accuracies)}')

# Keep the run whose accuracy lies nearest to the mean as the representative
# pipeline for the inspection cells that follow.
distance_to_mean = np.abs(accuracies - accuracies.mean())
closest_index = distance_to_mean.argmin()
closest_accuracy = accuracies[closest_index]
pipe = pipelines[closest_index]

# The run number is reported 1-based, i.e. split seed + 1.
print(f"\nClosest Pipeline Accuracy: {closest_accuracy} (Run: {closest_index + 1})")

Accuracies over 10 runs: [0.8260824  0.82838872 0.8190586  0.81727644 0.8257679  0.8190586
 0.81570395 0.82293741 0.82755006 0.82524374]
Mean Accuracy: 0.8227067826816228
Standard Deviation: 0.004337544644101728

Closest Pipeline Accuracy: 0.8229374148233567 (Run: 8)


In [21]:
# Inspect the selected pipeline: for every class, list the five vocabulary
# words that carry the largest logistic-regression coefficients.
words = pipe['vectorizer'].get_feature_names_out()
classes = pipe['model'].classes_
coefs = pipe['model'].coef_

# Per coefficient row, vocabulary indices ordered from smallest to largest weight.
sorted_coef_indexes = np.argsort(coefs, axis=1)

for row, label in enumerate(classes):
    # The tail of an ascending sort holds the five largest coefficients,
    # so the strongest word is printed last.
    strongest_words = words[sorted_coef_indexes[row, -5:]]
    print("\n", label)
    print(" -> " + ", ".join(strongest_words))


 age
 -> schools, bullied, bullies, bully, school

 ethnicity
 -> coon, dumb, colored, nigga, nigger

 gender
 -> female, sexist, notsexist, rape, feminazi

 not_cyberbullying
 -> daesh, mosul, andre, beatdown, mkr

 other_cyberbullying
 -> harassment, code, bullied, idiot, blameonenotall

 religion
 -> muslims, mohammed, islam, muslim, christian


In [22]:
def print_words_in_topics(nmf, vectorizer, n_top_words=5):
    """Print the highest-weighted vocabulary words of every topic.

    For each row of ``nmf.components_`` a ``Topic <idx>`` header is printed,
    followed by the selected words one per line and a blank line. Words are
    listed in ascending weight order, so the strongest word comes last. One
    extra blank line is printed after the final topic.

    Args:
        nmf: Fitted model exposing ``components_`` (one row of word weights
            per topic).
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``.
        n_top_words: Number of words to print per topic. Defaults to 5; if a
            topic has fewer entries than this, all of them are printed.

    Raises:
        ValueError: If ``n_top_words`` is negative.
    """
    if n_top_words < 0:
        raise ValueError("n_top_words must be non-negative")

    words = vectorizer.get_feature_names_out()
    for idx, topic in enumerate(nmf.components_):
        print(f"Topic {idx}")
        ranked = topic.argsort()
        # Use an explicit start index rather than ranked[-n_top_words:],
        # because a slice starting at -0 would select the whole vocabulary.
        start = max(len(ranked) - n_top_words, 0)
        for i in ranked[start:]:
            print(words[i])
        print()
    print()
    
# Unsupervised topic model over the cleaned tweets: a binary bag-of-words
# (English stop words removed) factorised by NMF into 6 components.
# NOTE(review): 6 presumably mirrors the number of label classes — confirm.
# random_state is pinned so that the factorisation, and therefore the printed
# topics, are reproducible across kernel restarts.
vectorizer_nmf_pipeline = Pipeline([
    ('vectorizer', CountVectorizer(binary=True, stop_words='english')),
    ('nmf', NMF(n_components=6, random_state=0)),
])
X_nmf = vectorizer_nmf_pipeline.fit_transform(df['tweet_text_clean'])

In [22]:
# Show the top words of each NMF topic, using the fitted 'nmf' and
# 'vectorizer' steps of the topic-model pipeline.
print_words_in_topics(vectorizer_nmf_pipeline['nmf'], vectorizer_nmf_pipeline['vectorizer'])

Topic 0
girls
girl
school
bullied
high

Topic 1
obama
ass
nigger
dumb
fuck

Topic 2
funny
joke
jokes
gay
rape

Topic 3
muslims
just
don
people
like

Topic 4
middle
like
bullies
school
bully

Topic 5
women
sexist
mkr
http
rt


