## Data exploration

In [None]:
%load_ext autoreload
%autoreload 2

import os
import pandas as pd

from src.preprocessing.hatespeech_dataset_querying import prepare_hatespeech_v2_dataset, load_hatespeech_v2_dataset

# Move the working directory two levels up exactly once per kernel session.
# `run_only_once` is undefined the first time this cell runs, so the lookup
# raises NameError and the handler performs the chdir. Re-running the cell
# afterwards only prints the marker instead of climbing further up the tree.
try:
    print(run_only_once)
except NameError:
    print(os.getcwd())
    os.chdir("./../..")
    print(os.getcwd())
    run_only_once = "Dir has already been changed"

# Preparing and loading the data

In [None]:
# run if you need to create the preprocessed data file again
# prepare_hatespeech_v2_dataset(save=True)

# Load the dataset into `df`; later cells read its "text" and "label" columns.
df = load_hatespeech_v2_dataset()
# Bare expression so the notebook renders the frame as the cell output.
df

In [None]:
# Show only the rows whose "label" equals 2.
# NOTE(review): the meaning of label 2 is not visible here — confirm against the dataset docs.
df[df["label"] == 2]

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Fetch the tokenizer and the sequence-classification model for the
# "unhcr/hatespeech-detection" checkpoint and bind them at notebook level.
tokenizer = AutoTokenizer.from_pretrained("unhcr/hatespeech-detection")
model = AutoModelForSequenceClassification.from_pretrained("unhcr/hatespeech-detection")

In [None]:
%%time
# Use a pipeline as a high-level helper
from transformers import pipeline
import tqdm 
# The pipeline loads the checkpoint by name and is placed on the first CUDA
# device, so this cell needs a GPU to run as written.
pipe = pipeline("text-classification", model="unhcr/hatespeech-detection", device="cuda:0")

# Classify every text in the dataset in one call; the result is kept in
# `y_pred` for the evaluation cell.
y_pred = pipe(list(df["text"].values))

In [None]:
# Evaluation helpers from scikit-learn.
from sklearn.metrics import classification_report, accuracy_score

# Pipeline label name -> integer class id used for evaluation.
# NOTE(review): "Normal" and "Offensive" both map to 1, exactly as in the
# original if-chain — confirm this matches the encoding of df["label"].
_PIPELINE_LABEL_TO_ID = {
    "Normal": 1,
    "Offensive": 1,
    "Hate speech": 2,
}


def map_predicted_to_label(y_pred):
    """Convert text-classification pipeline outputs to integer class ids.

    Args:
        y_pred: Iterable of dicts, each holding the predicted class name under
            the "label" key ("Normal", "Offensive" or "Hate speech").

    Returns:
        A list of ints with exactly one entry per element of ``y_pred``, in
        the same order.

    Raises:
        ValueError: If a prediction carries a label without a known mapping.
            Skipping such entries would shorten the result and misalign it
            with the ground-truth labels it is later compared against.
    """
    y_pred_mapped = []
    for position, json_pair in enumerate(y_pred):
        label = json_pair["label"]
        if label not in _PIPELINE_LABEL_TO_ID:
            raise ValueError(
                f"Unexpected label {label!r} at position {position}; "
                f"expected one of {sorted(_PIPELINE_LABEL_TO_ID)}"
            )
        y_pred_mapped.append(_PIPELINE_LABEL_TO_ID[label])
    return y_pred_mapped


# Ground-truth labels and mapped predictions must cover the same rows, in the
# same order, for the report to be meaningful.
y_truth = df["label"]
mapped_pred = map_predicted_to_label(y_pred)

# y_pred was computed for every row of df, so compare against the full label
# column. Slicing y_truth to its first 100 rows made the two inputs differ in
# length whenever df holds more than 100 rows.
print(classification_report(y_truth, mapped_pred))

In [None]:
# second model
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the "IMSyPP/hate_speech_en" checkpoint. This rebinds `tokenizer` and
# `model`, replacing whatever those names pointed to before this cell ran.
tokenizer = AutoTokenizer.from_pretrained("IMSyPP/hate_speech_en")
model = AutoModelForSequenceClassification.from_pretrained("IMSyPP/hate_speech_en")

# Extracting features

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer


from functools import lru_cache


@lru_cache(maxsize=1)
def _text_resources():
    """Build the English stop-word set and the Porter stemmer once.

    Cached so that applying ``preprocess_text`` row by row does not reload the
    stop-word corpus and re-create the stemmer for every single text.
    """
    return frozenset(stopwords.words('english')), PorterStemmer()


def preprocess_text(text):
    """Normalize a text into a space-separated string of stemmed tokens.

    The text is split with NLTK's ``word_tokenize``, each token is lowercased,
    English stop words are removed, and the remaining tokens are reduced with
    the Porter stemmer.

    Args:
        text: The raw input string.

    Returns:
        The processed tokens joined by single spaces.
    """
    stop_words, porter = _text_resources()
    # Lowercase before the stop-word check so capitalized stop words are dropped too.
    lowered = (token.lower() for token in word_tokenize(text))
    return " ".join(porter.stem(token) for token in lowered if token not in stop_words)

# Run the preprocessing on every text and store the result in a new
# "tokenized_text" column.
raw_texts = df["text"]
df["tokenized_text"] = raw_texts.apply(preprocess_text)
# Bare expression so the notebook renders the updated frame.
df

In [None]:
# count occurrences
# Fit a vocabulary on the preprocessed text and build the count matrix X;
# both `vectorizer` and `X` are used by the following cell.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['tokenized_text'])


In [None]:
# TODO finish implementation
# Sum the count matrix over axis 0 to get one total per vocabulary term.
word_counts = X.sum(axis=0)
# Label each total with its term, taken from the fitted vectorizer.
word_counts_df = pd.DataFrame(word_counts, columns=vectorizer.get_feature_names_out())
# Transpose so the terms become the index, then order by the column labelled 0,
# largest total first.
word_counts_sorted = word_counts_df.transpose().sort_values(by=0, ascending=False)
word_counts_sorted