In [None]:
import pickle as pk
import pandas as pd
import matplotlib.pyplot as plt
import pandas.plotting as pdplt
import numpy as np
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix


# Constants

In [None]:
# CSV file loaded with pd.read_csv at the start of the data setup section.
DATASET_PATH = "datasets/ruddit_with_text.csv"

# Columns dropped from the dataset right after loading.
UNUSED_COLUMNS = ["post_id", "comment_id", "url"]
# Column holding the numeric score from which the class labels are derived.
SCORE_COLUMN = "offensiveness_score"
# Column holding the comment text that gets vectorized.
COMMENT_COLUMN = "txt"
# The two class labels assigned to each comment.
OFFENSIVE_LABEL = "offensive"
NOT_OFFENSIVE_LABEL = "not_offensive"

# Passed as `stop_words` and `max_df` to both vectorizers.
STOPWORDS_LANGUAGE = "english"
MAX_DF = 0.75

# Destinations for the pickled vectorizers.
COUNT_VECTORIZER_PATH = "models/count_vectorizer.pickle"
TF_IDF_VECTORIZER_PATH = "models/tf_idf_vectorizer.pickle"

# Destinations for the pickled classifiers (one per vectorizer/model pair).
COUNT_SVM_MODEL_PATH = "models/count_svm_model.pickle"
TF_IDF_SVM_MODEL_PATH = "models/tf_idf_svm_model.pickle"
COUNT_NB_MODEL_PATH = "models/count_nb_model.pickle"
TF_IDF_NB_MODEL_PATH = "models/tf_idf_nb_model.pickle"

# Titles placed above the classification-report table of each results figure.
COUNT_SVM_TITLE = "MODEL RESULTS: COUNT - SVM"
TF_IDF_SVM_TITLE = "MODEL RESULTS: TF IDF - SVM"
COUNT_NB_TITLE = "MODEL RESULTS: COUNT - NB"
TF_IDF_NB_TITLE = "MODEL RESULTS: TF IDF - NB"

# Files the results figures are saved to.
COUNT_SVM_STATS_PATH = "stats/count_svm_stats.png"
TF_IDF_SVM_STATS_PATH = "stats/tf_idf_svm_stats.png"
COUNT_NB_STATS_PATH = "stats/count_nb_stats.png"
TF_IDF_NB_STATS_PATH = "stats/tf_idf_nb_stats.png"


# Resolution (dots per inch) used when saving the results figures.
FIGURE_DPI = 240

# random_state shared by the train/test split and the tuning splitter.
SEED = 1928


# Data setup

In [None]:
# Load the raw CSV and preview its first five rows.
dataset = pd.read_csv(filepath_or_buffer=DATASET_PATH)
dataset.head(n=5)


In [None]:
# drops unused columns
# Reassigning (instead of inplace=True) together with errors="ignore" keeps
# this cell idempotent: running it a second time no longer raises KeyError
# because the columns are already gone.
dataset = dataset.drop(columns=UNUSED_COLUMNS, errors="ignore")


In [None]:
# drops deleted comments
# The replaced column is assigned back explicitly. Calling
# replace(..., inplace=True) on `dataset[COMMENT_COLUMN]` is a chained
# in-place update: with pandas Copy-on-Write it only changes a temporary
# object, so the "[deleted]" rows would survive the dropna below.
dataset[COMMENT_COLUMN] = dataset[COMMENT_COLUMN].replace("[deleted]", np.nan)
dataset = dataset.dropna(subset=[COMMENT_COLUMN])
dataset.head()


In [None]:
# Median (0.5 quantile) of the offensiveness scores; drawn as a vertical
# marker on the histogram in the next cell.
scores = dataset[SCORE_COLUMN]
middle = np.quantile(scores, q=0.5)
print("middle:", middle)


In [None]:
# Histogram of the offensiveness scores with the median marked.
# Uses the explicit axes interface and labels both axes and the marker so
# the figure can be read on its own.
fig, ax = plt.subplots()
ax.hist(dataset[SCORE_COLUMN], bins="auto")
ax.axvline(middle, color="k", label=f"median = {middle:0.3f}")
ax.set_xlabel("Offensiveness score")
ax.set_ylabel("Number of comments")
ax.legend()
_ = ax.set_title("Offensiveness Score Distribution")


# Vectorizing

In [None]:
def _label_for_score(score):
    """Map one offensiveness score to its class label (positive => offensive)."""
    if score > 0:
        return OFFENSIVE_LABEL
    return NOT_OFFENSIVE_LABEL


# Features are the raw comment texts; targets are the labels derived from the
# score column.
# NOTE(review): the cut-off is 0, not the median `middle` drawn on the
# histogram above — confirm this is the intended threshold.
x = dataset[COMMENT_COLUMN]
y = dataset[SCORE_COLUMN].map(_label_for_score)


In [None]:
# Bag-of-words vectorizer configured with the shared stop-word and max_df
# settings. The vocabulary learned here comes from every comment in
# `dataset`, i.e. before any train/test split is made.
count_vectorizer = CountVectorizer(max_df=MAX_DF, stop_words=STOPWORDS_LANGUAGE)
count_vectorizer.fit(raw_documents=dataset[COMMENT_COLUMN])


In [None]:
# TF-IDF vectorizer configured with the shared stop-word and max_df settings.
# The vocabulary and IDF weights learned here come from every comment in
# `dataset`, i.e. before any train/test split is made.
tf_idf_vectorizer = TfidfVectorizer(max_df=MAX_DF, stop_words=STOPWORDS_LANGUAGE)
tf_idf_vectorizer.fit(raw_documents=dataset[COMMENT_COLUMN])


In [None]:
# 80/20 train/test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, random_state=SEED
)
# Fit both vectorizers on the training comments only (fit_transform discards
# any earlier fit). Fitting on the full dataset would let the held-out test
# comments influence the vocabulary, the max_df filtering and the IDF
# weights, which leaks test information into the evaluation.
count_x_train = count_vectorizer.fit_transform(x_train)
count_x_test = count_vectorizer.transform(x_test)
tf_idf_x_train = tf_idf_vectorizer.fit_transform(x_train)
tf_idf_x_test = tf_idf_vectorizer.transform(x_test)


# Hyperparameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit


def tune_params(model, param_grid, x, y):
    """Grid-search `param_grid` for `model` and report the outcome.

    Candidates are scored by accuracy on five stratified shuffle splits
    (20% of the rows held out in each, seeded with SEED).

    Args:
        model: estimator to tune.
        param_grid: parameter grid passed to GridSearchCV.
        x: training features.
        y: training labels.

    Returns:
        A tuple `(results, best_params)` where `results` is a DataFrame built
        from `cv_results_` and `best_params` is the best parameter dict.
    """
    sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=SEED)
    # The splitter has to be handed to the search explicitly; without `cv=`
    # it was silently ignored and the default cross-validation was used.
    search = GridSearchCV(model, param_grid, scoring="accuracy", cv=sss, n_jobs=-1)
    search.fit(x, y)
    return pd.DataFrame(search.cv_results_), search.best_params_


def plot_tuning_results(tuning_results, title):
    """Plot mean test score against C and annotate the best point.

    Args:
        tuning_results: DataFrame with "param_C" and "mean_test_score" columns.
        title: title of the plot.
    """
    c_values = tuning_results["param_C"].to_numpy()
    mean_scores = tuning_results["mean_test_score"].to_numpy()

    # Position of the highest mean score (first one on ties).
    top = np.argmax(mean_scores)
    peak_c, peak_score = c_values[top], mean_scores[top]
    annotation = f"({peak_c:0.4f}, {peak_score:0.4f})"

    plt.title(title)
    plt.plot(c_values, mean_scores)
    plt.scatter(peak_c, peak_score)
    plt.text(
        peak_c,
        peak_score,
        annotation,
        verticalalignment="bottom",
        horizontalalignment="left",
    )


In [None]:
# Search space for the SVM: linear kernel only, with 15 values of C spaced
# logarithmically between 10**-1 and 10**1.
svm_param_grid = [
    dict(
        kernel=["linear"],
        C=np.logspace(start=-1, stop=1, num=15),
    ),
]


In [None]:
# Tune the SVM on the count-vectorized training features.
count_svm_tuning_results, count_svm_params = tune_params(
    model=SVC(class_weight="balanced"),
    param_grid=svm_param_grid,
    x=count_x_train,
    y=y_train,
)


In [None]:
# Tune the SVM on the TF-IDF training features.
tf_idf_svm_tuning_results, tf_idf_svm_params = tune_params(
    model=SVC(class_weight="balanced"),
    param_grid=svm_param_grid,
    x=tf_idf_x_train,
    y=y_train,
)


In [None]:
# Score-vs-C curve for the count-vectorizer SVM search.
plot_tuning_results(
    tuning_results=count_svm_tuning_results,
    title="TUNING RESULTS: COUNT - SVM",
)


In [None]:
# Score-vs-C curve for the TF-IDF SVM search.
plot_tuning_results(
    tuning_results=tf_idf_svm_tuning_results,
    title="TUNING RESULTS: TF IDF - SVM",
)


# Training

In [None]:
# Final count-vectorizer SVM, built from the tuned parameters.
# probability=True enables predict_proba but makes fitting stochastic (the
# probability calibration shuffles the data internally), so the shared SEED
# is passed to keep the trained model reproducible across runs.
count_svm_classifier = SVC(
    **count_svm_params,
    class_weight="balanced",
    probability=True,
    random_state=SEED,
)
count_svm_classifier.fit(count_x_train, y_train)


In [None]:
# Final TF-IDF SVM, built from the tuned parameters.
# probability=True enables predict_proba but makes fitting stochastic (the
# probability calibration shuffles the data internally), so the shared SEED
# is passed to keep the trained model reproducible across runs.
tf_idf_svm_classifier = SVC(
    **tf_idf_svm_params,
    class_weight="balanced",
    probability=True,
    random_state=SEED,
)
tf_idf_svm_classifier.fit(tf_idf_x_train, y_train)


In [None]:
# Multinomial naive Bayes on the raw term counts.
# The sparse matrix is passed as-is: MultinomialNB works on sparse input, so
# densifying with .toarray() only costs memory. This also matches the
# evaluation cell, which already passes the sparse test matrix.
count_nb_classifier = MultinomialNB()
count_nb_classifier.fit(count_x_train, y_train)


In [None]:
# Gaussian naive Bayes on the TF-IDF features. The feature matrix is
# converted to a dense array before fitting (the evaluation cell does the
# same for the test matrix) — presumably because GaussianNB does not accept
# sparse input; TODO confirm.
tf_idf_nb_classifier = GaussianNB()
tf_idf_nb_classifier.fit(tf_idf_x_train.toarray(), y_train)


# Reports

In [None]:
def generate_report_and_confusion(classifier, x_test, y_test):
    """Evaluate a fitted classifier on the test data.

    Args:
        classifier: fitted estimator exposing `classes_` and `predict`.
        x_test: test features in the representation the classifier was
            trained on.
        y_test: true labels for `x_test`.

    Returns:
        A two-element list `[report, confusion]`: `report` is the
        classification report as a DataFrame (one row per class/average),
        `confusion` is the confusion matrix as a DataFrame whose rows (true
        labels) and columns (predicted labels) are named after the classes.
    """
    classes = classifier.classes_
    y_pred = classifier.predict(x_test)
    report = pd.DataFrame(
        classification_report(y_test, y_pred, output_dict=True)
    ).transpose()
    # Labels are set through the constructor: `set_axis(..., inplace=True)`
    # is no longer accepted by pandas 2.x. Passing `labels=classes` pins the
    # matrix order (and shape) to the same class list used for the labels.
    confusion = pd.DataFrame(
        confusion_matrix(y_test, y_pred, labels=classes),
        index=classes,
        columns=classes,
    )
    return [report, confusion]


In [None]:
def plot_report_and_confusion(report, confusion, title, fig_path):
    """Render the report and confusion matrix as two tables and save them.

    Args:
        report: classification report DataFrame (top table).
        confusion: confusion matrix DataFrame (bottom table).
        title: title shown above the report table.
        fig_path: file the figure is saved to.
    """
    # A full-figure axes is created first and hidden; the two table axes are
    # then stacked on the same figure.
    fig, backdrop = plt.subplots(1, 1)
    backdrop.axis("tight")
    backdrop.axis("off")

    panels = zip(
        fig.subplots(2, 1),
        (report, confusion),
        (title, "Matriz de Confusão"),
    )
    for axis, frame, heading in panels:
        axis.axis("tight")
        axis.axis("off")
        pdplt.table(axis, frame, loc="center")
        axis.set_title(heading)

    fig.tight_layout()
    fig.savefig(fig_path, bbox_inches="tight", dpi=FIGURE_DPI)

In [None]:
def generate_stats(classifier, x_test, y_test, title, fig_path):
    """Evaluate `classifier` on the test data and save the results figure.

    Args:
        classifier: fitted estimator to evaluate.
        x_test: test features.
        y_test: true test labels.
        title: title of the report table.
        fig_path: file the figure is saved to.
    """
    report, confusion = generate_report_and_confusion(classifier, x_test, y_test)
    plot_report_and_confusion(
        report=report, confusion=confusion, title=title, fig_path=fig_path
    )


In [None]:
# Results figure for the count-vectorizer SVM.
generate_stats(
    classifier=count_svm_classifier,
    x_test=count_x_test,
    y_test=y_test,
    title=COUNT_SVM_TITLE,
    fig_path=COUNT_SVM_STATS_PATH,
)


In [None]:
# Results figure for the TF-IDF SVM.
generate_stats(
    classifier=tf_idf_svm_classifier,
    x_test=tf_idf_x_test,
    y_test=y_test,
    title=TF_IDF_SVM_TITLE,
    fig_path=TF_IDF_SVM_STATS_PATH,
)


In [None]:
# Results figure for the count-vectorizer naive Bayes model.
generate_stats(
    classifier=count_nb_classifier,
    x_test=count_x_test,
    y_test=y_test,
    title=COUNT_NB_TITLE,
    fig_path=COUNT_NB_STATS_PATH,
)


In [None]:
# Results figure for the TF-IDF naive Bayes model; the test matrix is
# converted to a dense array, as was done when this classifier was fitted.
generate_stats(
    classifier=tf_idf_nb_classifier,
    x_test=tf_idf_x_test.toarray(),
    y_test=y_test,
    title=TF_IDF_NB_TITLE,
    fig_path=TF_IDF_NB_STATS_PATH,
)


# Save models

In [None]:
def pickle_to_file(object, path):
    """Pickle `object` to the file at `path`, overwriting any existing file.

    The parent directory of `path` is created first when it does not exist,
    so saving works on a fresh checkout that has no output folder yet.

    Args:
        object: any picklable object.
        path: destination file path.
    """
    import os

    directory = os.path.dirname(path)
    # A bare file name has no directory component; nothing to create then.
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(path, "wb") as file:
        pk.dump(object, file)


In [None]:
# Persist the two vectorizers first, then the four trained classifiers, each
# to its configured path.
for artifact, destination in (
    (count_vectorizer, COUNT_VECTORIZER_PATH),
    (tf_idf_vectorizer, TF_IDF_VECTORIZER_PATH),
    (count_svm_classifier, COUNT_SVM_MODEL_PATH),
    (tf_idf_svm_classifier, TF_IDF_SVM_MODEL_PATH),
    (count_nb_classifier, COUNT_NB_MODEL_PATH),
    (tf_idf_nb_classifier, TF_IDF_NB_MODEL_PATH),
):
    pickle_to_file(artifact, destination)


# Playground

In [None]:
# Score a single comment with the TF-IDF SVM; edit `comment` to try one.
comment = ""
vectorized = tf_idf_vectorizer.transform([comment])
prediction = tf_idf_svm_classifier.predict_proba(vectorized)
# predict_proba columns follow `classes_`, so the offensive column is looked
# up by label rather than assumed to be at position 1.
offensive_index = list(tf_idf_svm_classifier.classes_).index(OFFENSIVE_LABEL)
print(f"offensiveness: {prediction[0][offensive_index]}")
