## Preprocessing
- Tokenization
- Stop Word Removal
- Lemmatization
- Stemming

In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize


# Fetch the NLTK resources used by the preprocessing below:
# tokenizer models, stop-word lists and WordNet.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

import warnings
# Ignore FutureWarning messages from here on.
warnings.simplefilter('ignore', FutureWarning)

[nltk_data] Downloading package punkt to /home/saveriofnk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/saveriofnk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/saveriofnk/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:

# Define your preprocessing function
def preprocess_text(text_series):
    """Normalize a pandas Series of raw texts into space-joined stemmed tokens.

    Each text is tokenized with NLTK's ``word_tokenize``; tokens are lowercased
    and stripped, English stop words are dropped, and the remaining tokens are
    lemmatized as verbs and then Porter-stemmed.

    Args:
        text_series: pandas Series of texts. Missing values are treated as
            empty strings and other non-string values are converted with
            ``str``, so a single bad cell does not abort the whole run.

    Returns:
        pandas Series with the same index as the input, holding one cleaned
        string per input text.
    """
    # Build the helpers once per call instead of once per row.
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()

    def clean(text):
        """Run every preprocessing step on one text in a single pass."""
        kept = []
        for token in word_tokenize(text):
            # Normalize first so the stop-word lookup sees the lowercased token.
            word = token.lower().strip()
            if word in stop_words:
                continue
            kept.append(stemmer.stem(lemmatizer.lemmatize(word, pos="v")))
        return ' '.join(kept)

    # word_tokenize expects str input, so replace missing values before
    # tokenizing (fillna must come first, otherwise NaN would become "nan").
    texts = text_series.fillna("").astype(str)
    return texts.apply(clean)

## Load the Dataset

In [3]:
import pandas as pd

from src.preprocessing.hatespeech_dataset_querying import prepare_hatespeech_v2_dataset, load_hatespeech_v2_dataset, split_hatespeech_v2_dataset


In [4]:
# Run this cell only when the train and test dataset files do not exist yet:
# it creates them from the prepared dataset.
prepared_csv_path = "../data/hatespeech_v2/prepared_hatespeech_v2.csv"
df_train, df_test = split_hatespeech_v2_dataset(prepared_csv_path)

Saved train data to: ../data/hatespeech_v2/train_hatespeech_v2.csv
Saved test data to: ../data/hatespeech_v2/test_hatespeech_v2.csv


In [5]:
# Read the train and test splits back from disk
# (comma-separated, which is the read_csv default).
train_df = pd.read_csv('../data/hatespeech_v2/train_hatespeech_v2.csv')
test_df = pd.read_csv('../data/hatespeech_v2/test_hatespeech_v2.csv')


## Feature Extraction + TF-IDF + Naive Bayes

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.exceptions import UndefinedMetricWarning

In [7]:
# Features are the raw 'text' column and targets the 'label' column of the training frame
X_train = train_df['text']
y_train = train_df['label']

# Same two columns from the test frame, used for evaluation below
X_test = test_df['text']
y_test = test_df['label']

# Suppress UndefinedMetricWarning
# NOTE: this filter is process-wide, so it stays active for every later cell too.
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

# Define the pipeline
# NOTE: the name `pipeline` is reassigned by the SVM and Random Forest cells
# further down, so it always refers to the most recently run model cell.
pipeline = Pipeline([
    ('preprocessor', FunctionTransformer(preprocess_text, validate=False)),  # Apply preprocessing function
    ('vectorizer', TfidfVectorizer()),  # Vectorize/extract features using TF-IDF
    ('classifier', MultinomialNB())  # Train a Naive Bayes classifier
])

# Train the model
pipeline.fit(X_train, y_train)

# Predict
nb_y_pred = pipeline.predict(X_test)

# Evaluate
nb_accuracy = accuracy_score(y_test, nb_y_pred)
nb_report = classification_report(y_test, nb_y_pred)

print(f"Accuracy: {nb_accuracy * 100:.2f}%")  # accuracy shown as a percentage with two decimals
print("Classification report:\n", nb_report)

Accuracy: 80.50%
Classification report:
               precision    recall  f1-score   support

         0.0       0.80      1.00      0.89     10873
         1.0       0.88      0.07      0.13      2520
         2.0       0.00      0.00      0.00       327

    accuracy                           0.80     13720
   macro avg       0.56      0.36      0.34     13720
weighted avg       0.80      0.80      0.73     13720



In [None]:
import pickle

# Save the current `pipeline` object (the Naive Bayes pipeline when the cells
# are run in order) to disk.
filename = '../data/model/nb_model_TFIDF_01.sav'
# The context manager closes the file handle even if pickling fails;
# a bare open() passed to pickle.dump would leave it open.
with open(filename, 'wb') as model_file:
    pickle.dump(pipeline, model_file)

## Feature Extraction + TF-IDF + SVM

In [None]:
from sklearn.svm import SVC

In [None]:
# Define the pipeline
# NOTE: this rebinds the notebook-wide name `pipeline`; any model assigned to it
# by an earlier cell is no longer reachable through that name afterwards.
pipeline = Pipeline([
    ('preprocessor', FunctionTransformer(preprocess_text, validate=False)),  # Apply preprocessing function
    ('vectorizer', TfidfVectorizer()),  # Vectorize/extract features using TF-IDF
    ('classifier', SVC())  # Train an SVM classifier
])

# Train the model
pipeline.fit(X_train, y_train)

# Predict
svm_y_pred = pipeline.predict(X_test)

# Evaluate
svm_accuracy = accuracy_score(y_test, svm_y_pred)
svm_report = classification_report(y_test, svm_y_pred)

print(f"Accuracy: {svm_accuracy * 100:.2f}%")  # accuracy shown as a percentage with two decimals
print("Classification report:\n", svm_report)


In [None]:
# Save the current `pipeline` object (the SVM pipeline when the cells are run
# in order) to a pickle file.
import pickle

filename = '../data/model/svm_model_TFIDF_01.sav'
# The context manager closes the file handle even if pickling fails;
# a bare open() passed to pickle.dump would leave it open.
with open(filename, 'wb') as model_file:
    pickle.dump(pipeline, model_file)

#save the model only not the entire pipeline in a file pickle
#filename = 'svm_model.sav'
#pickle.dump(pipeline.named_steps['classifier'], open(filename, 'wb'))

In [None]:
# Classify one hand-written sentence with the fitted pipeline, step by step.
# preprocess_text works on a pandas Series, so wrap the sentence in one.
single_series = pd.Series("fuck Italian")

# Same cleaning that the pipeline's 'preprocessor' step applies
preprocessed_sentence = preprocess_text(single_series)

# Vectorize the preprocessed sentence. The Series is already an iterable of
# strings, so it is passed as-is: wrapping it in a list would hand the
# vectorizer a Series object where it expects a string.
vectorized_sentence = pipeline.named_steps['vectorizer'].transform(preprocessed_sentence)

# Predict with the classifier step only. The features are already vectorized,
# and pipeline.predict() would run preprocessing and vectorizing a second time
# on the sparse matrix. (pipeline.predict(single_series) is the one-call
# equivalent of the three steps above.)
predicted_label = pipeline.named_steps['classifier'].predict(vectorized_sentence)

print(predicted_label)

## Feature Extraction + TF-IDF + Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:


# Define the pipeline
# NOTE: this rebinds the notebook-wide name `pipeline` to the Random Forest model.
pipeline = Pipeline([
    ('preprocessor', FunctionTransformer(preprocess_text, validate=False)),  # Apply preprocessing function
    ('vectorizer', TfidfVectorizer()),  # Vectorize/extract features using TF-IDF
    # Fixed seed: the forest is built with randomness, so without it the
    # scores printed below change from one run to the next.
    ('classifier', RandomForestClassifier(random_state=42))  # Train a Random Forest classifier
])

# Suppress UndefinedMetricWarning for Random Forest
# (the filter is process-wide and stays active for later cells as well)
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

# Train the model
pipeline.fit(X_train, y_train)

# Predict
rf_y_pred = pipeline.predict(X_test)

# Evaluate
rf_accuracy = accuracy_score(y_test, rf_y_pred)
rf_report = classification_report(y_test, rf_y_pred)

print(f"Accuracy: {rf_accuracy * 100:.2f}%")  # accuracy shown as a percentage with two decimals
print("Classification report:\n", rf_report)


## Comparison with Dataset Owner model

In [10]:
# # Use a pipeline as a high-level helper
# #from transformers import pipeline
# 
# #pipe = pipeline("text-classification", model="ctoraman/hate-speech-berturk")
# 
# # Load model directly
# from transformers import AutoTokenizer, AutoModelForSequenceClassification
# 
# tokenizer = AutoTokenizer.from_pretrained("ctoraman/hate-speech-berturk")
# model = AutoModelForSequenceClassification.from_pretrained("ctoraman/hate-speech-berturk")

In [12]:
# #use the model on test dataset and get the accuracy
# from sklearn.metrics import accuracy_score
# from transformers import TextClassificationPipeline
# 
# # Create a TextClassificationPipeline
# pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True)
# 
# # Predict
# predictions = pipe(X_test.tolist())
# 
# # Extract the predicted labels
# predictions 