## preprocessing
- Tokenization
- Stop Word Removal
- Lemmatization
- Stemming

In [7]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize


nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/saveriofnk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/saveriofnk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/saveriofnk/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:

# Define your preprocessing function
def preprocess_text(text_series):
    # Tokenization
    tokens = text_series.apply(word_tokenize)

    # Lowercase and strip
    tokens = tokens.apply(lambda x: [word.lower().strip() for word in x])

    # Stop word removal
    stop_words = set(stopwords.words("english"))
    filtered_tokens = tokens.apply(lambda x: [word for word in x if word.lower() not in stop_words])

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = filtered_tokens.apply(lambda x: [lemmatizer.lemmatize(word, pos="v") for word in x])

    # Stemming
    stemmer = PorterStemmer()
    stemmed_tokens = lemmatized_tokens.apply(lambda x: [stemmer.stem(word) for word in x])

    # Join tokens into a single string
    preprocessed_text = stemmed_tokens.apply(lambda x: ' '.join(x))

    return preprocessed_text

## Applying the Preprocessing Function to Text Data

In [9]:
import pandas as pd

In [10]:
# Load the data from prepared_hatespeech_v2.csv
dataset = pd.read_csv('../data/hatespeech_v2/prepared_hatespeech_v2.csv', sep=',')

dataset = dataset[['tweet_id', 'label', 'text']]

dataset.head()

Unnamed: 0,tweet_id,label,text
0,1344794359233998850,0.0,You know maybe doing a “challenge” where I dri...
1,1344794162625916935,0.0,RT @thehill: Black transgender woman found dea...
2,1344794094837637121,0.0,2021 Goals: Playtest and release Rumrunners. R...
3,1344790842117140483,0.0,Guest Co Host: Men Like Us Podcast #StopTheHat...
4,1344788907360190465,0.0,👏 Congratulations @AyodejiOsowobi @StandtoEndR...


## Feature Extraction + TD-IDF + Naive Bayes

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer

In [13]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    dataset["text"],
    dataset["label"],
    test_size=0.25,
    random_state=42
)

# Define the pipeline
pipeline = Pipeline([
    ('preprocessor', FunctionTransformer(preprocess_text, validate=False)),  # Apply preprocessing function
    ('vectorizer', TfidfVectorizer()),  # Vectorize/extract features using TF-IDF
    ('classifier', MultinomialNB())  # Train a Naive Bayes classifier
])

# Train the model
pipeline.fit(X_train, y_train)

# Predict
nb_y_pred = pipeline.predict(X_test)

# Evaluate
nb_accuracy = accuracy_score(y_test, nb_y_pred)
nb_report = classification_report(y_test, nb_y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")  # Improve formatting to two decimal places
print("Classification report:\n", report)  # Remove unnecessary f-string

Accuracy: 79.77%
Classification report:
               precision    recall  f1-score   support

         0.0       0.80      1.00      0.89     13519
         1.0       0.84      0.05      0.10      3229
         2.0       0.00      0.00      0.00       402

    accuracy                           0.80     17150
   macro avg       0.55      0.35      0.33     17150
weighted avg       0.79      0.80      0.72     17150



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Feature Extraction + TD-IDF + SVM

In [14]:
from sklearn.svm import SVC

In [15]:
# Define the pipeline
pipeline = Pipeline([
    ('preprocessor', FunctionTransformer(preprocess_text, validate=False)),  # Apply preprocessing function
    ('vectorizer', TfidfVectorizer()),  # Vectorize/extract features using TF-IDF
    ('classifier', SVC())  # Train an SVM classifier
])

# Train the model
pipeline.fit(X_train, y_train)

# Predict
svm_y_pred = pipeline.predict(X_test)

# Evaluate
svm_accuracy = accuracy_score(y_test, svm_y_pred)
svm_report = classification_report(y_test, svm_y_pred)

print(f"Accuracy: {svm_accuracy * 100:.2f}%")  # Improve formatting to two decimal places
print("Classification report:\n", svm_report)  # Remove unnecessary f-string


Accuracy: 79.77%
Classification report:
               precision    recall  f1-score   support

         0.0       0.80      1.00      0.89     13519
         1.0       0.84      0.05      0.10      3229
         2.0       0.00      0.00      0.00       402

    accuracy                           0.80     17150
   macro avg       0.55      0.35      0.33     17150
weighted avg       0.79      0.80      0.72     17150

