In [None]:
import pickle as pk
import pandas as pd
import matplotlib.pyplot as plt
import pandas.plotting as pdplt
import numpy as np
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Seed handed to the train/test split so the partition is reproducible.
SEED = 1928

# Quantile of the score distribution used as the labeling cut-off.
THRESHOLD_QUANTILE = 0.5

# Stop-word list name passed to the vectorizer.
STOPWORDS_LANGUAGE = "english"

# Input data.
DATASET_PATH = "datasets/ruddit_with_text.csv"

# Output artifacts: statistics figure and pickled model components.
STATS_PATH = "stats/stats.png"
VECTORIZER_PATH = "models/vectorizer.pickle"
CLASSIFIER_PATH = "models/classifier.pickle"

# Data setup

In [None]:
# Columns removed from the dataset right after loading.
UNUSED_COLUMNS = ["post_id", "comment_id", "url"]

# Columns the rest of the notebook reads: comment text and its score.
COMMENT_COLUMN = "txt"
SCORE_COLUMN = "offensiveness_score"

# Class labels assigned by comparing a score against the threshold.
NOT_OFFENSIVE_LABEL = "not_offensive"
OFFENSIVE_LABEL = "offensive"

In [None]:
# Loads the CSV at DATASET_PATH and previews its first rows.
dataset = pd.read_csv(filepath_or_buffer=DATASET_PATH)
dataset.head(n=5)

In [None]:
# Drops the columns listed in UNUSED_COLUMNS.
# Rebinding the name (instead of mutating in place) together with
# errors="ignore" keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError for columns that are already gone.
dataset = dataset.drop(columns=UNUSED_COLUMNS, errors="ignore")

In [None]:
# Drops deleted comments.
# The "[deleted]" placeholder is turned into NaN through an explicit column
# assignment: calling replace(..., inplace=True) on dataset[COMMENT_COLUMN]
# is chained assignment, which pandas warns about and which no longer
# updates `dataset` once Copy-on-Write is enabled.
dataset[COMMENT_COLUMN] = dataset[COMMENT_COLUMN].replace("[deleted]", np.nan)
# Removes every row whose comment text is missing (including the rows just
# marked above).
dataset = dataset.dropna(subset=[COMMENT_COLUMN])
dataset.head()

In [None]:
# Computes the labeling threshold as the THRESHOLD_QUANTILE quantile of the
# score column and prints it.
scores = dataset[SCORE_COLUMN]
threshold = np.quantile(scores, THRESHOLD_QUANTILE)
print("threshold:", threshold)

In [None]:
# Histogram of the scores with the labeling threshold drawn as a vertical
# line. An explicit Axes is used and both axes are labeled so the figure can
# be read on its own.
score_fig, score_ax = plt.subplots()
score_ax.hist(dataset[SCORE_COLUMN], bins="auto")
score_ax.axvline(threshold, color="k")
score_ax.set_xlabel(SCORE_COLUMN)
score_ax.set_ylabel("Number of comments")
_ = score_ax.set_title("Offensiveness Score Distribution")

# Training

In [None]:
# Labels each comment: scores strictly above the threshold are offensive,
# everything else is not offensive.
y = (dataset[SCORE_COLUMN] > threshold).map(
    {True: OFFENSIVE_LABEL, False: NOT_OFFENSIVE_LABEL}
)

# Splits the raw text *before* vectorizing so the vocabulary is learned from
# the training comments only; fitting the vectorizer on the full dataset
# would let held-out comments shape the feature space the model is later
# evaluated on.
comments_train, comments_test, y_train, y_test = train_test_split(
    dataset[COMMENT_COLUMN], y, train_size=0.8, random_state=SEED
)

# Bag-of-words counts: fit on the training text, then reuse the fitted
# vocabulary to encode the test text.
vectorizer = CountVectorizer(stop_words=STOPWORDS_LANGUAGE)
x_train = vectorizer.fit_transform(comments_train)
x_test = vectorizer.transform(comments_test)

In [None]:
# Fits a multinomial Naive Bayes classifier on the training split.
classifier = MultinomialNB().fit(X=x_train, y=y_train)

# Results

In [None]:
# Predicts on the test split and builds the two result tables.
classes = classifier.classes_
y_pred = classifier.predict(x_test)

# classification_report as a DataFrame, transposed so each row of the dict
# output becomes a table row.
report = pd.DataFrame(
    classification_report(y_test, y_pred, output_dict=True)
).transpose()

# Row and column labels are supplied at construction time because
# DataFrame.set_axis no longer accepts inplace=True (removed in pandas 2.0).
# labels=classes pins the matrix ordering to the same labels used for the
# axes, so the table stays aligned even if a class is missing from the
# test split.
confusion = pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=classes),
    index=classes,
    columns=classes,
)

In [None]:
# Displays the classification report table as the cell output.
report

In [None]:
# Displays the labeled confusion matrix as the cell output.
confusion

In [None]:
# Renders the report and the confusion matrix as two stacked table panels
# and saves the figure to STATS_PATH.
fig, (report_axis, confusion_axis) = plt.subplots(2, 1)

# (axes, table data, panel title) for each panel, top to bottom.
# The first title fixes the original misspelling "Estastisticas".
panels = (
    (report_axis, report, "Estatísticas do Modelo"),
    (confusion_axis, confusion, "Matriz de Confusão"),
)
for panel_axis, panel_frame, panel_title in panels:
    # Hide the axes frame and ticks: each panel only hosts a table.
    panel_axis.axis("tight")
    panel_axis.axis("off")
    pdplt.table(panel_axis, panel_frame, loc="center")
    panel_axis.set_title(panel_title)

fig.tight_layout()
fig.savefig(STATS_PATH, bbox_inches="tight", dpi=240)

In [None]:
# Manual smoke test, left disabled: uncomment to score a single comment with
# the fitted vectorizer and classifier.
# NOTE(review): prediction[0][1] is presumably the probability of the second
# entry of classifier.classes_ — confirm that entry is OFFENSIVE_LABEL.
# comment = "welcome to reddit"
# features = vectorizer.transform([comment])
# prediction = classifier.predict_proba(features)
# print(f"offensiveness: {prediction[0][1]}")

# Saves model

In [None]:
# Pickles the vectorizer and the classifier to their configured paths.
# Both files are opened before anything is written.
with open(VECTORIZER_PATH, "wb") as vectorizer_out:
    with open(CLASSIFIER_PATH, "wb") as classifier_out:
        pk.dump(vectorizer, vectorizer_out)
        pk.dump(classifier, classifier_out)
