Imports

In [33]:
import re
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from nltk.stem import WordNetLemmatizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.decomposition import NMF
import matplotlib.pyplot as plt

Reading the CSV and removing @-mentions

In [34]:
# Load the tweets; the path is resolved relative to the notebook's working directory.
df = pd.read_csv("cyberbullying_tweets.csv")
def clean_text(text):
    """Strip @-mentions from a tweet.

    Every ``@`` that is immediately followed by one or more word characters is
    deleted together with those characters. Surrounding whitespace is left as
    is, so the result can contain consecutive spaces; a lone ``@`` is kept.
    """
    mention_pattern = re.compile(r'[@]\w+')
    return mention_pattern.sub('', text)

# Element-wise cleaning of the raw tweet text into a new column (the raw column is kept).
df['tweet_text_clean'] = df['tweet_text'].map(clean_text)

Lemmatization

In [35]:
# Features come from the mention-free text, targets from the `cyberbullying_type` column.
X = df['tweet_text_clean']
y = df['cyberbullying_type']

# NOTE(review): WordNetLemmatizer presumably needs the NLTK 'wordnet' corpus to be
# available locally — confirm it is downloaded in this environment.
lemmatizer = WordNetLemmatizer()


def _lemmatize_tweet(tweet):
    """Lower-case a tweet, lemmatize each whitespace-separated token, re-join with single spaces."""
    tokens = tweet.lower().split()
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)


X_lemmatized = X.apply(_lemmatize_tweet)

Preparing a function to train and evaluate the model

In [36]:
def run_model(random_state_value, texts=None, labels=None, test_size=0.2):
    """Train and score a bag-of-words logistic-regression classifier on one random split.

    Parameters
    ----------
    random_state_value : int
        Seed for the shuffled train/test split.
    texts, labels : optional
        Documents and their labels. When omitted, the notebook-level
        ``X_lemmatized`` and ``y`` are used (the original behaviour). Passing
        them explicitly makes the call independent of whatever those globals
        currently hold.
    test_size : float, default 0.2
        Fraction of the data held out for evaluation.

    Returns
    -------
    tuple
        ``(accuracy, pipe)``: accuracy on the held-out split and the fitted
        ``Pipeline`` (CountVectorizer -> LogisticRegression).
    """
    if texts is None:
        texts = X_lemmatized
    if labels is None:
        labels = y

    X_train, X_test, y_train, y_test = train_test_split(
        texts, labels, test_size=test_size, shuffle=True, random_state=random_state_value
    )
    pipe = Pipeline([
        ('vectorizer', CountVectorizer()),
        ('model', LogisticRegression(max_iter=1000)),
    ])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    return accuracy_score(y_test, y_pred), pipe

Running models and printing accuracies

In [None]:
n_runs = 10

# One independent fit per split seed (0 .. n_runs-1), dispatched through joblib.
results = Parallel(n_jobs=-1)(delayed(run_model)(i) for i in range(n_runs))

# Each result is an (accuracy, fitted pipeline) pair.
accuracies = np.array([result[0] for result in results])
pipelines = [result[1] for result in results]

print(f'Accuracies {accuracies}')
print(f'Mean Accuracy {np.mean(accuracies)}')

# Keep the run whose accuracy is closest to the mean as the "typical" model
# that the following cells inspect.
closest_index = np.abs(accuracies - np.mean(accuracies)).argmin()
pipe = pipelines[closest_index]
closest_accuracy = accuracies[closest_index]

# Fixed: the original f-string had a stray ")" after the value.
print(f"\nClosest Accuracy {closest_accuracy}")

Seeing most relevant words for each class

In [None]:
words = pipe['vectorizer'].get_feature_names_out()
coefs = pipe['model'].coef_
classes = pipe['model'].classes_

# Ascending argsort along each row, so the last five columns index the largest coefficients.
# NOTE(review): indexing coefs by class position assumes one coefficient row per
# class (i.e. more than two classes) — confirm for this dataset.
sorted_coef_indexes = np.argsort(coefs, axis=1)

for row, label in enumerate(classes):
    print("\n", label)
    # Listed from the weakest to the strongest of the five.
    top5words = words[sorted_coef_indexes[row, -5:]]
    print(" -> " + ", ".join(top5words))

Assessing dataset size

In [None]:
train_accuracies = []
test_accuracies = []
num_samples = 100  # number of dataset fractions evaluated between 10% and 100%
dataset_fractions = list(np.linspace(0.1, 1, num_samples))

# Lemmatize the whole corpus once: the per-tweet result does not depend on the
# sampled fraction, so doing it inside the loop only repeated the same work.
df_lemmatized = df.assign(
    tweet_text_lemmatized=df['tweet_text_clean'].apply(
        lambda text: ' '.join(lemmatizer.lemmatize(token) for token in text.lower().split())
    )
)

for frac in dataset_fractions:
    # Same seed for every fraction, so only the sample size varies.
    df_sampled = df_lemmatized.sample(frac=frac, random_state=42)

    # Loop-local names (curve_*) are used on purpose: rebinding X, y, X_lemmatized
    # or pipe here would silently change what run_model and the top-words cell
    # see if those cells are re-run afterwards.
    curve_X_train, curve_X_test, curve_y_train, curve_y_test = train_test_split(
        df_sampled['tweet_text_lemmatized'],
        df_sampled['cyberbullying_type'],
        test_size=0.2,
        random_state=42,
    )

    # Unlike run_model, this pipeline drops English stop words.
    curve_pipe = Pipeline([
        ('vectorizer', CountVectorizer(stop_words='english')),
        ('model', LogisticRegression(max_iter=1000)),
    ])
    curve_pipe.fit(curve_X_train, curve_y_train)

    train_accuracies.append(accuracy_score(curve_y_train, curve_pipe.predict(curve_X_train)))
    test_accuracies.append(accuracy_score(curve_y_test, curve_pipe.predict(curve_X_test)))

In [None]:
# Train vs. test accuracy as a function of the fraction of the dataset used.
fig, ax = plt.subplots()
ax.plot(dataset_fractions, train_accuracies, label='Train Accuracy')
ax.plot(dataset_fractions, test_accuracies, label='Test Accuracy')
ax.set_xlabel('Dataset Fraction')
ax.set_ylabel('Accuracy')
ax.set_title('Learning Curve')
ax.legend()
plt.show()

Analysing topics

In [None]:
# Fixed split of the cleaned (not lemmatized) tweets for the topic analysis.
X_train, X_test, y_train, y_test = train_test_split(
    df['tweet_text_clean'],
    df['cyberbullying_type'],
    test_size=0.2,
    random_state=0,
)

n_topics = 8

# Binary word-presence counts (English stop words removed) factorized into n_topics components.
topic_steps = [
    ('vectorizer', CountVectorizer(binary=True, stop_words='english')),
    ('nmf', NMF(n_components=n_topics, random_state=0)),
]
vectorizer_nmf_pipeline = Pipeline(topic_steps)

# Per-tweet topic weights for the training split.
X_train_topics = vectorizer_nmf_pipeline.fit_transform(X_train)

def print_words_in_topics(model, vectorizer, n_words=10):
    """Print, for every topic, its ``n_words`` highest-weighted vocabulary terms.

    Parameters
    ----------
    model
        Object exposing ``components_``; each row holds one topic's weights
        over the vocabulary.
    vectorizer
        Object exposing ``get_feature_names_out()``, giving the term for each
        column of ``components_``.
    n_words : int, default 10
        Number of terms shown per topic, strongest first.
    """
    vocabulary = vectorizer.get_feature_names_out()
    for topic_number, weights in enumerate(model.components_):
        # argsort is ascending; reverse it and keep the first n_words indices.
        strongest_first = weights.argsort()[::-1][:n_words]
        top_terms = " ".join(vocabulary[j] for j in strongest_first)
        print(f"Topic {topic_number}: ", top_terms)

# Show the top terms of each topic learned by the fitted NMF step.
print_words_in_topics(
    vectorizer_nmf_pipeline.named_steps['nmf'],
    vectorizer_nmf_pipeline.named_steps['vectorizer'],
)

In [41]:
# Project the held-out tweets onto the topics learned from the training split.
X_test_topics = vectorizer_nmf_pipeline.transform(X_test)

# Index of the highest-weighted topic per test tweet (ties resolve to the lowest index).
most_prominent_topics = X_test_topics.argmax(axis=1)

In [42]:
topic_classifiers = {}

# Dominant topic of every training tweet. This does not depend on the topic
# being processed, so it is computed once instead of once per loop iteration.
train_dominant_topics = np.argmax(X_train_topics, axis=1)

for topic_idx in range(n_topics):
    # Training tweets whose dominant topic is this one.
    topic_train_mask = train_dominant_topics == topic_idx
    X_train_topic = X_train[topic_train_mask]
    y_train_topic = y_train[topic_train_mask]

    # NOTE(review): fit presumably raises if a topic receives no tweets or only
    # a single class — confirm every topic is populated for this dataset.
    classifier = Pipeline([
        ('vectorizer', CountVectorizer(binary=True, stop_words='english')),
        ('model', LogisticRegression(max_iter=1000))
    ])
    classifier.fit(X_train_topic, y_train_topic)

    topic_classifiers[topic_idx] = classifier

In [None]:
from sklearn.metrics import accuracy_score

# (topic index, accuracy) pairs for every topic that has at least one test tweet.
topic_accuracies = []

for topic_idx in range(n_topics):
    topic_test_mask = most_prominent_topics == topic_idx
    X_test_topic = X_test[topic_test_mask]
    y_test_topic = y_test[topic_test_mask]

    # Nothing to score when no test tweet was assigned to this topic.
    if len(X_test_topic) == 0:
        continue

    # Score the tweets with the classifier trained for their own topic.
    y_pred_topic = topic_classifiers[topic_idx].predict(X_test_topic)
    topic_accuracies.append((topic_idx, accuracy_score(y_test_topic, y_pred_topic)))

for topic_idx, accuracy in topic_accuracies:
    print(f"Topic {topic_idx} Accuracy: {accuracy:.2f}")

Combining the topic words with the per-topic accuracy

In [None]:
def print_words_in_topics(model, vectorizer, n_words=10):
    """Return one space-joined string of top terms per topic.

    Despite its name, this version prints nothing. It shadows the earlier
    definition of the same name in this notebook, which printed the topics.

    Parameters
    ----------
    model
        Object exposing ``components_``; each row holds one topic's weights
        over the vocabulary.
    vectorizer
        Object exposing ``get_feature_names_out()``.
    n_words : int, default 10
        Number of terms kept per topic, strongest first.

    Returns
    -------
    list of str
        Entry ``k`` describes topic ``k``.
    """
    feature_names = vectorizer.get_feature_names_out()
    return [
        # argsort is ascending; reverse it and keep the first n_words indices.
        " ".join(feature_names[i] for i in topic.argsort()[::-1][:n_words])
        for topic in model.components_
    ]

# Human-readable label (top terms) for each topic.
topic_names = print_words_in_topics(
    vectorizer_nmf_pipeline.named_steps['nmf'],
    vectorizer_nmf_pipeline.named_steps['vectorizer'],
)

# (topic index, accuracy) pairs for every topic that has at least one test tweet.
topic_accuracies = []

for topic_idx in range(n_topics):
    topic_test_mask = most_prominent_topics == topic_idx
    X_test_topic = X_test[topic_test_mask]
    y_test_topic = y_test[topic_test_mask]

    # Topics with no assigned test tweets are left out of the report.
    if len(X_test_topic) == 0:
        continue

    y_pred_topic = topic_classifiers[topic_idx].predict(X_test_topic)
    topic_accuracies.append((topic_idx, accuracy_score(y_test_topic, y_pred_topic)))

for topic_idx, accuracy in topic_accuracies:
    print(f"Topic {topic_idx} - Accuracy: {accuracy:.2f} - words: ({topic_names[topic_idx]})")