In [195]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import re
import string
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import re
import string

from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Text extraction 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec


# Models
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score

import contractions
from gensim.models import Word2Vec

# Pandas display limits: render at most 50 columns and 30 rows before truncating
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 30)
pd.set_option('display.max_colwidth', None)  # Show full text without truncation

# Download required resources
# ('stopwords' backs stopwords.words(); 'wordnet' backs WordNetLemmatizer;
#  'punkt' and 'omw-1.4' are presumably needed by word_tokenize / WordNet - TODO confirm)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alexg\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alexg\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\alexg\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\alexg\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [53]:
# Resolve the base directory. Note that "__file__" is quoted, i.e. it is an
# ordinary relative file name: abspath() anchors it at the current working
# directory and dirname() strips it off again, so BASE_DIR is the working
# directory the kernel was started in.
_anchor_path = os.path.abspath("__file__")
BASE_DIR = os.path.dirname(_anchor_path)

# Both CSV files sit in the "data" sub-folder of BASE_DIR
_data_dir = os.path.join(BASE_DIR, "data")
train_path = os.path.join(_data_dir, "train.csv")
test_path = os.path.join(_data_dir, "test.csv")

# Read the labelled training set
df_train = pd.read_csv(train_path)

### 3 - Text Pre-Processing for Traditional ML Models

### 3.1 - Stratified Data Splitting


To ensure the model learns from a representative distribution of the target variable, and that class proportions are maintained in every subset, we performed a **stratified split** of the dataset into training (70%), validation (15%), and test (15%) sets. 

In [54]:
# Two-stage stratified split so every subset keeps the label distribution:
#   stage 1 - hold out 30% of the rows,
#   stage 2 - cut that hold-out in half into validation and test.
train_df, val_test_df = train_test_split(
    df_train,
    test_size=0.3,
    stratify=df_train['label'],
    random_state=42,
)
val_df, test_df = train_test_split(
    val_test_df,
    test_size=0.5,
    stratify=val_test_df['label'],
    random_state=42,
)

In [55]:
# Work on independent copies so the original splits stay untouched
train_df_copy, val_df_copy, test_df_copy = (
    split.copy() for split in (train_df, val_df, test_df)
)

### 3.2 - Pre-Processing pipeline


The effectiveness of pre-processing techniques is model-dependent. For classical machine learning approaches, the optimal combination from **Symeonidis, Effrosynidis, and Arampatzis (2018). A comparative evaluation of pre-processing techniques and their interactions for twitter sentiment analysis** includes:

- **URL/User Mention Replacement**: Replace URLs and user mentions with tags, as they do not contain sentiment information (Agarwal et al., 2011).
- **Contraction Handling**: Replacing contractions improves accuracy, as contractions are common in tweets and often exempt from sentiment lexicons (Chalil et al., 2015).
- **Number Removal**: While many researchers remove numbers (He, Lin, & Alani, 2011; Zhao, 2015), some argue that keeping numbers may improve classification effectiveness (Lin & He, 2009).
- **Replace Punctuation Repetition**: Normalizes language and generalizes vocabulary to represent sentiment (Balahur, 2013).
- **Lemmatization**: Surpasses the baseline results on both datasets, especially with classical algorithms. However, it may ignore semantic information in large datasets (Shotaroo, Takamura, & Okumura, 2005).

In [56]:
# English stop-word list from NLTK, held as a set
stop_words = {*stopwords.words('english')}

# Tweet-aware tokenizer.
# Source: https://www.nltk.org/api/nltk.tokenize.casual.html
# Difference between TweetTokenizer and Word_Tokenize: https://stackoverflow.com/questions/61919670/how-nltk-tweettokenizer-different-from-nltk-word-tokenize
tokenizer = TweetTokenizer()

# WordNet-based lemmatizer shared by the preprocessing function
lemmatizer = WordNetLemmatizer()

In [57]:
"""
standard_nltk = word_tokenize(text)
print(standard_nltk)
# output: ['The', 'quick', 'brown', 'fox', 'is', "n't", 'jumping', 'over', 
# 'the', 'lazy', 'dog', ',', 'co-founder', 'multi-word', 'expression', '.', 
# '#', 'yes', '!']

twitter_nltk = tweet_tokenizer.tokenize(text)
print(twitter_nltk)
# output: ['The', 'quick', 'brown', 'fox', "isn't", 'jumping', 'over', 
# 'the', 'lazy', 'dog', ',', 'co-founder', 'multi-word', 'expression', '.', 
# '#yes', '!']

"""

'\nstandard_nltk = word_tokenize(text)\nprint(standard_nltk)\n# output: [\'The\', \'quick\', \'brown\', \'fox\', \'is\', "n\'t", \'jumping\', \'over\', \n# \'the\', \'lazy\', \'dog\', \',\', \'co-founder\', \'multi-word\', \'expression\', \'.\', \n# \'#\', \'yes\', \'!\']\n\ntwitter_nltk = tweet_tokenizer.tokenize(text)\nprint(twitter_nltk)\n# output: [\'The\', \'quick\', \'brown\', \'fox\', "isn\'t", \'jumping\', \'over\', \n# \'the\', \'lazy\', \'dog\', \',\', \'co-founder\', \'multi-word\', \'expression\', \'.\', \n# \'#yes\', \'!\']\n\n'

We can try to remove stopwords and see how the model reacts, from **Symeonidis, Effrosynidis, and Arampatzis (2018). A comparative evaluation of pre-processing techniques and their interactions for twitter sentiment analysis**:

The technique of removing stopwords yielded ambiguous results. For the SS-Twitter dataset, none of the algorithms was over the baseline accuracy but for the SemEval dataset, on three classic algorithms, the results were satisfactory. 

The reasons for this failure are: first, stopwords such as ‘I’, ‘me’, and ‘you’ are frequently present in, and associated with, expressions of sentiment (Thelwall et al., 2012); second, the domain of the tweets in each dataset; and third, the vocabulary and age of the users. According to Haas et al. (2011), young people tend to use increasingly short texts with slang and many stopwords to express their feelings about themselves. 

In [None]:
# Built once when the cell runs instead of on every call.
# A frozenset gives true per-token membership; `token in string.punctuation`
# is a *substring* test and therefore also matched multi-character tokens
# such as "./" or "<=>" by accident.
_PUNCTUATION_TOKENS = frozenset(string.punctuation)
_DETOKENIZER = TreebankWordDetokenizer()


def preprocess_traditional_ml(text):
    """Clean one raw tweet for the classical (bag-of-words style) models.

    Steps, in order: lowercase, replace URLs and @mentions with the
    placeholders ``URL`` / ``USER``, expand contractions, replace cashtags
    with ``[TICKER]``, drop digits, collapse repeated ``!``, ``?`` and ``.``,
    tokenize with the module-level ``tokenizer``, remove stop words and
    single-character punctuation tokens, lemmatize, and join the tokens back
    into one string.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (e.g. ``None`` or a NaN coming
        from a missing CSV cell) is treated as an empty document.

    Returns
    -------
    str
        The cleaned, detokenized text; ``""`` for non-string input.
    """
    # Missing values would otherwise fail on `.lower()`
    if not isinstance(text, str):
        return ""

    text = text.lower()

    # Replace URLs and user mentions
    text = re.sub(r"http\S+|www\.\S+", "URL", text)
    text = re.sub(r"@\w+", "USER", text)

    # Expand contractions (we use contractions library for this)
    # Contractions library Source: https://pypi.org/project/contractions/
    text = contractions.fix(text)

    # Alternative kept for reference: replace numbers with [NUM] instead of
    # removing them
    # text = re.sub(r"\d+(\.\d+)?", "[NUM]", text)

    # Replace cashtags with a placeholder (the text is already lowercase,
    # so "$AAPL" arrives here as "$aapl")
    text = re.sub(r"\$[a-z]{1,5}", "[TICKER]", text)

    # Remove numbers
    text = re.sub(r"\d+", "", text)

    # Normalize punctuation repetitions ("!!!" -> "!")
    text = re.sub(r"([!?\.])\1+", r"\1", text)

    # Tokenize
    tokens = tokenizer.tokenize(text)

    # Drop stop words and punctuation tokens, then lemmatize what is left
    tokens = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words and token not in _PUNCTUATION_TOKENS
    ]

    # Source: https://www.nltk.org/api/nltk.tokenize.treebank.html
    return _DETOKENIZER.detokenize(tokens)

In [132]:
# Example usage of the contractions library:
# contractions.fix() rewrites "how're" -> "how are" and "I'm" -> "I am"
test = "Hello how're you doing? I'm doing!"
test_fix = contractions.fix(test)
print(test_fix)

Hello how are you doing? I am doing!


In [133]:
# Clean the raw tweet text of every split and store it in a split-specific column
for frame, cleaned_column in (
    (train_df_copy, 'X_train'),
    (val_df_copy, 'X_val'),
    (test_df_copy, 'X_test'),
):
    frame[cleaned_column] = frame['text'].apply(preprocess_traditional_ml)

# Target vectors, one per split
y_train, y_val, y_test = (
    frame['label'] for frame in (train_df_copy, val_df_copy, test_df_copy)
)

In [134]:
# Sanity check: show the raw text, label and cleaned text of the first training row
train_df_copy.iloc[0]

text       Today in Brexit: European Union members are ratcheting up their negotiating demands https://t.co/Qnh48BCc2l
label                                                                                                                2
X_train                                           today brexit european union member ratcheting negotiating demand URL
Name: 7384, dtype: object

### 4 - Feature Extraction Methods for Traditional ML Models

#### 4.1. - TF-IDF

In [None]:
# # Load NLTK stopwords
# stop_words = set(stopwords.words('english'))

# keep_words = {
#     'not', 'no', 'never', "n't", 'barely', 'hardly', 'scarcely',  # negators & undercutters
#     'down', 'up', 'loss', 'gain', 'fall', 'rise',                  # directional sentiment
#     'after', 'what', 'over', 'more'                                # context/tone
# }  

# custom_stopwords = list(stop_words - keep_words)

In [228]:
# Uni- and bi-gram TF-IDF, capped at 15,000 features. The vocabulary and IDF
# weights are learned from the training text only, which is vectorised in the
# same pass.
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=15_000)
X_train_tfidf = vectorizer.fit_transform(train_df_copy['X_train'])

# Validation and test are projected onto the vocabulary learned above
X_val_tfidf = vectorizer.transform(val_df_copy['X_val'])
X_test_tfidf = vectorizer.transform(test_df_copy['X_test'])

In [229]:
# Class-weighted logistic regression trained on the TF-IDF matrix
model = LogisticRegression(
    class_weight='balanced',
    max_iter=200,
    random_state=42,
)
model.fit(X_train_tfidf, y_train)

In [230]:
# Predict on the held-out splits with the TF-IDF features
y_val_pred = model.predict(X_val_tfidf)
y_test_pred = model.predict(X_test_tfidf)

# Per-class precision / recall / F1 for each split
for heading, y_true, y_hat in (
    ("  Validation Set:", y_val, y_val_pred),
    ("  Test Set:", y_test, y_test_pred),
):
    print(heading)
    print(classification_report(y_true, y_hat, digits=3))

  Validation Set:
              precision    recall  f1-score   support

           0      0.551     0.576     0.563       217
           1      0.641     0.694     0.667       288
           2      0.878     0.846     0.861       926

    accuracy                          0.774      1431
   macro avg      0.690     0.705     0.697      1431
weighted avg      0.781     0.774     0.777      1431

  Test Set:
              precision    recall  f1-score   support

           0      0.533     0.560     0.546       216
           1      0.652     0.727     0.687       289
           2      0.874     0.833     0.853       927

    accuracy                          0.770      1432
   macro avg      0.687     0.707     0.696      1432
weighted avg      0.778     0.770     0.773      1432



#### 4.2. - Word2Vec

In [225]:
# Train Word2Vec on the preprocessed training text only (whitespace tokens)
tokenized_train = [text.split() for text in train_df_copy['X_train']]

# Same hyperparameters as in "A Study of Feature Extraction techniques for Sentiment Analysis"
# Source: https://link.springer.com/chapter/10.1007/978-981-13-1501-5_41
#
# workers=1 (was 7): according to the gensim documentation, `seed` only gives
# a deterministic run when training uses a single worker thread; with several
# workers the thread scheduling reorders updates and the embeddings change
# between runs. Reproducibility across interpreter launches additionally
# needs PYTHONHASHSEED to be set.
w2v_model = Word2Vec(
    sentences=tokenized_train,
    vector_size=100,
    window=10,
    min_count=1,
    workers=1,
    sample=0.00001,
    negative=5,
    seed=42,
)

# 2. Average Word Vectors
def average_word_vectors(tokens, model, vector_size):
    """Return the mean embedding of the tokens known to ``model``.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one document.
    model : object exposing ``.wv``
        ``model.wv`` must support ``token in model.wv`` and indexing with a
        list of tokens (as the trained Word2Vec model above does).
    vector_size : int
        Length of the zero vector returned when no token is known.

    Returns
    -------
    numpy.ndarray
        Column-wise mean of the vectors of all in-vocabulary tokens, or
        ``np.zeros(vector_size)`` if none of the tokens is in the vocabulary.
    """
    keyed_vectors = model.wv

    # Keep only in-vocabulary tokens (duplicates are kept and weigh in the mean)
    known = []
    for tok in tokens:
        if tok in keyed_vectors:
            known.append(tok)

    if len(known) == 0:
        return np.zeros(vector_size)

    stacked = keyed_vectors[known]
    return np.mean(stacked, axis=0)


# 3. Create features for train, val, and test
def _texts_to_w2v_matrix(texts):
    """Stack the averaged 100-d Word2Vec vector of every text into a 2-D array."""
    return np.array([average_word_vectors(doc.split(), w2v_model, 100) for doc in texts])


X_train_w2v = _texts_to_w2v_matrix(train_df_copy['X_train'])
X_val_w2v = _texts_to_w2v_matrix(val_df_copy['X_val'])
X_test_w2v = _texts_to_w2v_matrix(test_df_copy['X_test'])

In [226]:
# Fit a fresh classifier on the Word2Vec features instead of calling .fit()
# on the estimator object that was trained on TF-IDF: refitting in place
# silently overwrote that model and made this cell depend on the TF-IDF cell
# having been run first. Same hyperparameters as the TF-IDF classifier.
# The name `model` is kept because the evaluation cell below calls
# `model.predict` on the Word2Vec matrices.
model = LogisticRegression(max_iter=200, class_weight='balanced', random_state=42)
model.fit(X_train_w2v, y_train)

In [227]:
# Predict on the held-out splits with the averaged Word2Vec features
y_val_pred_w2v = model.predict(X_val_w2v)
y_test_pred_w2v = model.predict(X_test_w2v)

# Per-class precision / recall / F1 for each split
for heading, y_true, y_hat in (
    ("  Validation Set:", y_val, y_val_pred_w2v),
    ("  Test Set:", y_test, y_test_pred_w2v),
):
    print(heading)
    print(classification_report(y_true, y_hat, digits=3))

  Validation Set:
              precision    recall  f1-score   support

           0      0.265     0.382     0.313       217
           1      0.363     0.552     0.438       288
           2      0.772     0.567     0.654       926

    accuracy                          0.536      1431
   macro avg      0.467     0.501     0.468      1431
weighted avg      0.613     0.536     0.559      1431

  Test Set:
              precision    recall  f1-score   support

           0      0.260     0.343     0.295       216
           1      0.355     0.543     0.430       289
           2      0.777     0.591     0.672       927

    accuracy                          0.544      1432
   macro avg      0.464     0.492     0.466      1432
weighted avg      0.614     0.544     0.566      1432

