## Preprocessing
- Tokenization
- Stop Word Removal
- Lemmatization
- Stemming

In [30]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize


# Fetch the NLTK resources needed by the preprocessing step (a no-op when the
# package is already cached locally):
#   - 'punkt' / 'punkt_tab': tokenizer models behind word_tokenize. Recent NLTK
#     releases (3.8.2+) load 'punkt_tab' instead of the pickled 'punkt' models,
#     so both are requested to keep the notebook runnable across versions.
#   - 'stopwords': the English stop-word list.
#   - 'wordnet': the lexical database used by WordNetLemmatizer.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

import warnings
# Hide FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

[nltk_data] Downloading package punkt to /home/saveriofnk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/saveriofnk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/saveriofnk/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [31]:

# Define your preprocessing function
def preprocess_text(text_series):
    """Normalise raw documents into space-joined, stemmed tokens.

    Every document goes through the same steps, in this order:
    tokenization (``word_tokenize``), lowercasing and whitespace stripping,
    English stop-word removal, verb lemmatization (``WordNetLemmatizer`` with
    ``pos="v"``) and Porter stemming. The surviving tokens are joined back
    together with single spaces.

    Parameters
    ----------
    text_series : pandas.Series
        One document per element. Missing values (NaN/None) are treated as
        empty documents and non-string values are converted with ``str``.

    Returns
    -------
    pandas.Series
        The preprocessed strings, indexed like ``text_series``.
    """
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()

    def _clean(document):
        # Process one document in a single pass over its tokens.
        cleaned = []
        for token in word_tokenize(document):
            token = token.lower().strip()
            if token in stop_words:
                continue
            # Lemmatize first (as a verb), then stem the lemma.
            cleaned.append(stemmer.stem(lemmatizer.lemmatize(token, pos="v")))
        return ' '.join(cleaned)

    # word_tokenize only accepts strings; a missing value would make it raise,
    # so blank rows become empty documents instead of aborting the whole run.
    documents = text_series.fillna("").astype(str)
    return documents.apply(_clean)

## Load the Dataset

In [32]:
import pandas as pd

from src.preprocessing.hatespeech_dataset_querying import prepare_hatespeech_v2_dataset, load_hatespeech_v2_dataset, split_hatespeech_v2_dataset


In [33]:
# Only needed when the train/test CSV files are not on disk yet:
# split the prepared dataset into a train part and a test part.
prepared_csv_path = "../data/hatespeech_v2/prepared_hatespeech_v2.csv"
df_train, df_test = split_hatespeech_v2_dataset(prepared_csv_path)

Saved train data to: ../data/hatespeech_v2/train_hatespeech_v2.csv
Saved test data to: ../data/hatespeech_v2/test_hatespeech_v2.csv


In [34]:
# Read the train and test splits back from their comma-separated files.
train_csv_path = '../data/hatespeech_v2/train_hatespeech_v2.csv'
test_csv_path = '../data/hatespeech_v2/test_hatespeech_v2.csv'

train_df = pd.read_csv(train_csv_path, sep=',')
test_df = pd.read_csv(test_csv_path, sep=',')


## Feature Extraction + TF-IDF + Naive Bayes

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.exceptions import UndefinedMetricWarning

In [36]:
# Inputs are the raw 'text' column and targets the 'label' column, per split.
X_train, y_train = train_df['text'], train_df['label']
X_test, y_test = test_df['text'], test_df['label']

# Silence UndefinedMetricWarning for the rest of the session.
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

# Three stages: text cleaning -> TF-IDF features -> Multinomial Naive Bayes.
nb_steps = [
    ('preprocessor', FunctionTransformer(preprocess_text, validate=False)),
    ('vectorizer', TfidfVectorizer()),
    ('classifier', MultinomialNB()),
]
pipeline = Pipeline(nb_steps)

# Fit on the training split, then label the test split (fit returns the pipeline).
nb_y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Score the predictions against the true test labels.
nb_accuracy = accuracy_score(y_test, nb_y_pred)
nb_report = classification_report(y_test, nb_y_pred, digits=4)

# Accuracy as a percentage with two decimals, followed by the per-class report.
print(f"Accuracy: {nb_accuracy * 100:.2f}%")
print("Classification report:\n", nb_report)

Accuracy: 80.50%
Classification report:
               precision    recall  f1-score   support

         0.0     0.8039    0.9994    0.8911     10873
         1.0     0.8762    0.0702    0.1301      2520
         2.0     0.0000    0.0000    0.0000       327

    accuracy                         0.8050     13720
   macro avg     0.5600    0.3566    0.3404     13720
weighted avg     0.7980    0.8050    0.7301     13720



In [37]:
import pickle

# Persist the fitted pipeline to disk.
# NOTE: `pipeline` is whichever model was fitted most recently in this session,
# so run this cell right after the Naive Bayes cell to match the file name.
# The pipeline's FunctionTransformer refers to preprocess_text, and pickle stores
# functions by name only, so preprocess_text must be defined before unpickling.
filename = '../data/model/nb_model_TFIDF_01.sav'
with open(filename, 'wb') as model_file:  # closes the handle even if dump fails
    pickle.dump(pipeline, model_file)

## Feature Extraction + TF-IDF + SVM

In [38]:
from sklearn.svm import SVC

In [39]:
# Three stages: text cleaning -> TF-IDF features -> SVC with default settings.
svm_steps = [
    ('preprocessor', FunctionTransformer(preprocess_text, validate=False)),
    ('vectorizer', TfidfVectorizer()),
    ('classifier', SVC()),
]
pipeline = Pipeline(svm_steps)

# Fit on the training split, then label the test split (fit returns the pipeline).
svm_y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Score the predictions against the true test labels.
svm_accuracy = accuracy_score(y_test, svm_y_pred)
svm_report = classification_report(y_test, svm_y_pred, digits=4)

# Accuracy as a percentage with two decimals, followed by the per-class report.
print(f"Accuracy: {svm_accuracy * 100:.2f}%")
print("Classification report:\n", svm_report)


Accuracy: 91.97%
Classification report:
               precision    recall  f1-score   support

         0.0     0.9392    0.9725    0.9555     10873
         1.0     0.8314    0.8040    0.8174      2520
         2.0     0.7500    0.0550    0.1026       327

    accuracy                         0.9197     13720
   macro avg     0.8402    0.6105    0.6252     13720
weighted avg     0.9148    0.9197    0.9098     13720



In [40]:
# # Save the fitted pipeline to a pickle file
# import pickle
# filename = '../data/model/svm_model_TFIDF_01.sav'
# pickle.dump(pipeline, open(filename, 'wb'))
# 
# # Save only the classifier, not the entire pipeline, to a pickle file
# #filename = 'svm_model.sav'
# #pickle.dump(pipeline.named_steps['classifier'], open(filename, 'wb'))

## Feature Extraction + TF-IDF + Random Forest

In [42]:
from sklearn.ensemble import RandomForestClassifier

In [43]:


# Three stages: text cleaning -> TF-IDF features -> Random Forest.
# random_state is fixed so that re-running the cell on the same data rebuilds
# the same forest; without it the reported metrics change from run to run.
pipeline = Pipeline([
    ('preprocessor', FunctionTransformer(preprocess_text, validate=False)),  # Apply preprocessing function
    ('vectorizer', TfidfVectorizer()),  # Vectorize/extract features using TF-IDF
    ('classifier', RandomForestClassifier(random_state=42))  # Train a Random Forest classifier
])

# Silence UndefinedMetricWarning for the rest of the session.
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

# Train the model on the training split
pipeline.fit(X_train, y_train)

# Predict labels for the test split
rf_y_pred = pipeline.predict(X_test)

# Score the predictions against the true test labels
rf_accuracy = accuracy_score(y_test, rf_y_pred)
rf_report = classification_report(y_test, rf_y_pred, digits=4)

# Accuracy as a percentage with two decimals, followed by the per-class report.
print(f"Accuracy: {rf_accuracy * 100:.2f}%")
print("Classification report:\n", rf_report)


Accuracy: 90.60%
Classification report:
               precision    recall  f1-score   support

         0.0     0.9172    0.9791    0.9472     10873
         1.0     0.8441    0.7067    0.7693      2520
         2.0     1.0000    0.0092    0.0182       327

    accuracy                         0.9060     13720
   macro avg     0.9204    0.5650    0.5782     13720
weighted avg     0.9057    0.9060    0.8924     13720

