# Sentiment Analysis of Twitter Posts
<!-- Notebook name goes here -->
<center><b>Notebook: Neural Network Model, Error Analysis, and Tuning</b></center>
<br>

**By**: Stephen Borja, Justin Ching, Erin Chua, and Zhean Ganituen.

**Dataset**: Hussein, S. (2021). Twitter Sentiments Dataset [Dataset]. Mendeley. https://doi.org/10.17632/Z9ZW7NT5H2.1

**Motivation**: Every minute, social media users generate a large influx of textual data on live events. Performing sentiment analysis on this data provides a real-time view of public perception, enabling quick insights into the general population’s opinions and reactions.

**Goal**: By the end of the project, our goal is to create and compare supervised learning algorithms for sentiment analysis.

# **1. Project Setup**

In [20]:
# PyTorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from scipy.sparse import hstack
import itertools

# sci-kit learn
from sklearn.model_selection import StratifiedKFold

# General Imports
from tqdm import tqdm
import numpy as np
import pandas as pd
import sys, os
sys.path.append(os.path.abspath("../lib"))

# **2. Data Setup**

Run the data processing pipeline

In [21]:
import IPython.core.page
import builtins
import time
from IPython.utils.capture import capture_output

# Keep references to the real pager and help() so they can be restored afterwards.
pager = IPython.core.page.page
helper = builtins.help

# Temporarily replace both with no-ops so that any pager or help() call made
# while the data notebook runs produces nothing in this notebook.
IPython.core.page.page = lambda *args, **kwargs: None
builtins.help = lambda *args, **kwargs: None

try:
    # Run the data-processing notebook with its output captured (and discarded).
    # NOTE(review): presumably this is what defines X, y, X_train, X_test,
    # y_train and y_test used by the rest of this notebook -- confirm in data.ipynb.
    with capture_output():
        %run data.ipynb
finally:
    # Always put the originals back, even if data.ipynb raised.
    IPython.core.page.page = pager
    builtins.help = helper

print("Data Setup is DONE")

# Tests: sanity-check the shapes coming out of the data pipeline before training.
# Each failure message states the shape that the assertion actually checks.
assert X.shape == (162_801, 29318), "Feature matrix shape is wrong; expected (162_801, 29318)"
assert y.shape == (162_801,), "Labels shape is wrong; expected (162_801,)"

assert X_train.shape == (113_960, 29_318), "Train shape is wrong; expected (113_960, 29_318)"
assert X_test.shape == (48_841, 29_318), "Test shape is wrong; expected (48_841, 29_318)"

assert y_train.shape == (113_960,), "Train labels shape is wrong; expected (113_960,)"
assert y_test.shape == (48_841,), "Test labels shape is wrong; expected (48_841,)"
print("All tests passed.")

Data Setup is DONE
All tests passed.


Create a combined TRAIN dataset (combined X and y train)

In [22]:
class SparseDataset(Dataset):
    """
    PyTorch Dataset over a feature matrix and a label vector.

    Rows are converted to dense tensors one at a time in `__getitem__`, so a
    large sparse matrix never has to be converted to dense as a whole.
    """

    def __init__(self, X, y):
        """
        X: scipy sparse matrix or dense 2-D array (features); sparse input is
           converted to CSR for row indexing
        y: pandas Series or numpy array (labels)
        """
        self.X = X.tocsr() if hasattr(X, 'tocsr') else X
        self.y = y.values if hasattr(y, 'values') else y

    def __len__(self):
        """Number of samples (rows of X)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """
        Return sample `idx` as `(x, y)`: a 1-D float32 feature tensor and a
        0-dim int64 label tensor.
        """
        row = self.X[idx]
        # Sparse rows must be converted to dense; dense rows (accepted by __init__)
        # have no .toarray() and are used as they are.
        if hasattr(row, 'toarray'):
            row = row.toarray()
        # torch.tensor copies, so the returned tensor never aliases self.X.
        x = torch.tensor(row, dtype=torch.float32).reshape(-1)
        y = torch.tensor(int(self.y[idx]), dtype=torch.long)
        return x, y

def prepare_dataset(X_train, y_train, mapping=None):
    """
    Transforms X_train and y_train into a PyTorch Dataset.

    Labels are first remapped to class indices, then appended to the features as
    an extra last column to build a combined matrix; that column is checked
    against the mapped labels before anything is returned.

    Parameters:
    -----------
    X_train : scipy sparse matrix
        Feature matrix
    y_train : pandas Series
        Labels
    mapping : dict, optional
        Mapping from original labels to class indices (default: {-1: 0, 0: 1, 1: 2})

    Returns:
    --------
    dataset : SparseDataset
        PyTorch Dataset object ready for DataLoader
    TRAIN_csr : scipy sparse matrix
        Combined X and y matrix in CSR format (labels in the last column)

    Raises:
    -------
    ValueError
        If y_train contains a label that is not a key of `mapping`.
    """
    if mapping is None:
        mapping = {-1: 0, 0: 1, 1: 2}

    y_mapped = y_train.map(mapping)

    # Series.map yields NaN for values missing from the dict; fail here with a
    # clear message instead of carrying NaN labels into training.
    unmapped = y_mapped.isna()
    if unmapped.any():
        raise ValueError(
            f"Labels not covered by mapping: {list(y_train[unmapped].unique())}"
        )

    y_train = y_mapped

    TRAIN = hstack([X_train, y_train.values.reshape(-1, 1)])

    assert TRAIN.shape[0] == X_train.shape[0], "Row count mismatch!"
    assert TRAIN.shape[1] == X_train.shape[1] + 1, "Column count mismatch!"

    TRAIN_csr = TRAIN.tocsr()

    # Read the label column back out of the combined matrix to verify it.
    last_col = TRAIN_csr[:, -1].toarray().flatten()
    flat_y = y_train.values.flatten()

    print("X_train shape:", X_train.shape)
    print("TRAIN shape:  ", TRAIN_csr.shape)
    print(last_col, flat_y)

    assert np.array_equal(last_col, flat_y), "The combined TRAIN set is not the same!"

    dataset = SparseDataset(X_train, y_train)

    return dataset, TRAIN_csr

In [23]:
# Build the PyTorch dataset and the combined (features + label column) CSR matrix
# from the training split.
TRAIN_object, TRAIN_csr = prepare_dataset(X_train, y_train)

X_train shape: (113960, 29318)
TRAIN shape:   (113960, 29319)
[0 1 2 ... 0 2 2] [0 1 2 ... 0 2 2]


# **3. Model Selection**

In [24]:
class MyLittlePony(nn.Module):
    """
    Multi-layer perceptron: a stack of Linear -> ReLU -> Dropout blocks followed
    by a final Linear layer that outputs one raw score (logit) per class.

    Parameters
    ----------
    vocab_size : int
        Number of input features.
    hidden_dim : int
        Number of neurons in every hidden layer.
    n_hiddens : int
        Number of hidden blocks. The first block is always created, so values
        below 1 still give exactly one hidden block.
    dropout : float, default 0.2
        Dropout probability applied after every hidden activation.
    n_classes : int, default 3
        Number of output classes; the default is the three sentiments.
    """

    def __init__(self, vocab_size, hidden_dim, n_hiddens, dropout=0.2, n_classes=3):
        super().__init__()

        # Input block: projects the features onto the hidden width.
        layers = [nn.Linear(vocab_size, hidden_dim), nn.ReLU(), nn.Dropout(dropout)]

        # Remaining hidden blocks keep the same width.
        for _ in range(n_hiddens - 1):
            layers.extend([
                nn.Linear(hidden_dim, hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
            ])

        # Output layer: no activation, so the model returns logits.
        layers.append(nn.Linear(hidden_dim, n_classes))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the logits for a batch `x` whose last dimension is `vocab_size`."""
        return self.model(x)

## **Hyperparameters**

In [25]:
class Hyperparameters:
    """
    Hyperparameters for the multi-layer perceptron (MLP) used for sentiment analysis.

    Remark (Zhean). I defined this as a class so that the hyperparameters live in one
    place and are treated as constants by the rest of the code.

    NOTE(review): plain class attributes do not enforce immutability by themselves;
    the lists below can still be mutated or rebound, so "never changed" is a
    convention rather than a guarantee.

    # Hyperparameters (search space: every listed value is a candidate)
    * N_EPOCHS: The number of training epochs.
    * N_HIDDENS: The number of hidden layers in the MLP.
    * N_SNEAKY_NEURONS: The number of neurons in each hidden layer.
    * N_BATCH_SIZE: The mini-batch size.
    * OPTIMIZER: The optimizer classes to try.

    # Fixed hyperparameters
    * CRITERION: The loss function (cross-entropy).
    * N_LEARNING_RATE: The learning rate.

    # Usage
    ```
    print(Hyperparameters.N_EPOCHS)
    print(Hyperparameters.N_HIDDENS)
    print(Hyperparameters.N_SNEAKY_NEURONS)
    ```
    """
    N_EPOCHS         = [10, 20, 30, 40, 50]
    N_HIDDENS        = [2, 4, 8, 16]
    N_SNEAKY_NEURONS = [128, 256, 512]
    N_BATCH_SIZE     = [64, 128, 256, 512]
    OPTIMIZER        = [torch.optim.Adam]
    
    # Fixed hyperparameters
    CRITERION = nn.CrossEntropyLoss()
    N_LEARNING_RATE = 1e-3

## **Initializing the MLP**

Use CUDA if available.

In [26]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

Using device: cuda


# **4. Training the model**

In [27]:
def train(model, criterion, optimizer, train_loader, epoch, device):
    """
    Train `model` for `epoch` full passes over `train_loader`.

    Parameters
    ----------
    model : nn.Module
        Model to optimise; it is put in training mode.
    criterion : callable
        Loss function called as `criterion(outputs, labels)`.
    optimizer : torch.optim.Optimizer
        Optimizer already bound to `model`'s parameters.
    train_loader : iterable
        Yields `(inputs, labels)` batches.
    epoch : int
        Number of passes over `train_loader`.
    device : torch.device
        Device each batch is moved to before the forward pass.

    Returns
    -------
    float
        Fraction of correct predictions accumulated over ALL batches of ALL
        epochs, so this is a running training accuracy, not the accuracy of
        the final weights. NaN when no sample was seen (`epoch == 0` or an
        empty loader).
    """
    model.train()
    total_correct = 0
    total_samples = 0

    for _ in range(epoch):
        for inputs, labels in train_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Predictions come from the pre-step forward pass; detach() keeps the
            # bookkeeping out of the autograd graph.
            predicted = outputs.detach().argmax(dim=1)
            total_samples += labels.size(0)
            total_correct += (predicted == labels).sum().item()

    # Nothing was processed: report "undefined" instead of dividing by zero.
    if total_samples == 0:
        return float("nan")

    return total_correct / total_samples

def valid(model, criterion, val_loader, device):
    """
    Measure the accuracy of `model` on `val_loader` without updating it.

    Parameters
    ----------
    model : nn.Module
        Model to evaluate; it is put in evaluation mode.
    criterion : callable
        Unused here; kept so the signature mirrors `train`.
    val_loader : iterable
        Yields `(inputs, labels)` batches.
    device : torch.device
        Device each batch is moved to before the forward pass.

    Returns
    -------
    float
        Fraction of correctly classified samples, or NaN when `val_loader`
        yields no samples.
    """
    model.eval()
    total_correct = 0
    total_samples = 0

    with torch.no_grad():
        for inputs, labels in val_loader:
            # Move data to GPU/device
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            predicted = outputs.argmax(dim=1)
            total_samples += labels.size(0)
            total_correct += (predicted == labels).sum().item()

    # Nothing was evaluated: report "undefined" instead of dividing by zero.
    if total_samples == 0:
        return float("nan")

    return total_correct / total_samples

In [28]:
# Source - https://stackoverflow.com/a/64386444
# Posted by Skipper, modified by community. See post 'Timeline' for change history
# Retrieved 2026-02-03, License - CC BY-SA 4.0

# define a cross validation function
def crossvalid(
    model_class, 
    vocab_size,
    hidden_dim,
    n_hiddens, 
    epochs, 
    criterion, 
    optimizer_class, 
    lr,
    dataset, 
    k_fold, 
    device, 
    batch_size
):
    """
    Stratified k-fold cross-validation of one hyperparameter configuration.

    For every fold a fresh model and optimizer are created, the model is trained
    on the training part of the fold and scored on the held-out part.

    Parameters
    ----------
    model_class : type
        Model class, instantiated as `model_class(vocab_size, hidden_dim, n_hiddens)`.
    vocab_size, hidden_dim, n_hiddens : int
        Arguments forwarded to `model_class`.
    epochs : int
        Number of training epochs per fold.
    criterion : callable
        Loss function.
    optimizer_class : type
        Optimizer class, instantiated as `optimizer_class(model.parameters(), lr=lr)`.
    lr : float
        Learning rate.
    dataset : Dataset
        Dataset to split. It must expose its labels as `dataset.y` (non-negative
        integer class indices); they are used for stratification.
    k_fold : int
        Number of folds.
    device : torch.device
        Device used for training and evaluation.
    batch_size : int
        Mini-batch size of both data loaders.

    Returns
    -------
    train_score, val_score : pandas.Series
        Per-fold training and validation accuracy, indexed by fold number.
    """
    # Stratify on the labels of the dataset that was actually passed in, not on
    # a notebook-level global, so the splits always match `dataset`.
    labels = np.asarray(dataset.y)
    stratified_folds = StratifiedKFold(n_splits = k_fold, shuffle = True, random_state = 5)

    # StratifiedKFold only needs X for its length, so a dummy array is enough.
    allocate = np.zeros(len(labels))
    fold_indices = list(stratified_folds.split(allocate, labels))

    # Pinned host memory only helps host-to-GPU copies; skip it on CPU.
    pin_memory = torch.device(device).type == "cuda"

    train_accs = []
    val_accs = []

    print("Original Distribution:", np.bincount(labels) / len(labels)) # to check distribution, can remove
    for i in tqdm(range(k_fold), desc="K-Fold CV"):
        # Fresh weights and optimizer state for every fold.
        model = model_class(vocab_size, hidden_dim, n_hiddens).to(device)
        optimizer = optimizer_class(model.parameters(), lr=lr)

        train_indices, val_indices = fold_indices[i]

        train_set = torch.utils.data.Subset(dataset, train_indices)
        val_set = torch.utils.data.Subset(dataset, val_indices)

        train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=pin_memory)
        val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=pin_memory)

        train_accs.append(train(model, criterion, optimizer, train_loader, epochs, device))
        val_accs.append(valid(model, criterion, val_loader, device))

        fold_labels = labels[val_indices] # to check distribution, can remove
        distribution = np.bincount(fold_labels) / len(fold_labels) # to check distribution, can remove
        print(f"Fold {i} Val Distribution: {distribution}") # to check distribution, can remove

    train_score = pd.Series(train_accs, dtype=float)
    val_score = pd.Series(val_accs, dtype=float)

    return train_score, val_score

In [29]:
# get all possible combinations of the hyperparameters
# get all possible combinations of the hyperparameters
hyperparam_combination = itertools.product(
    Hyperparameters.N_BATCH_SIZE,
    Hyperparameters.N_EPOCHS, 
    Hyperparameters.N_HIDDENS, 
    Hyperparameters.N_SNEAKY_NEURONS,
    Hyperparameters.OPTIMIZER
)

# One record per finished configuration, so results survive an interrupted run
# and can be compared afterwards with pd.DataFrame(cv_results).
cv_results = []

for batch_size_choice, epoch_choice, hidden_choice, neurons_choice, optimizer_choice in hyperparam_combination:
    train_score, val_score = crossvalid(
        # Constants
        model_class     = MyLittlePony,
        k_fold          = 5,
        device          = device,
        # The last column of TRAIN_csr holds the labels, so it is not a feature.
        vocab_size      = TRAIN_csr.shape[1] - 1,
        
        # Variable Hyperparameters
        hidden_dim      = neurons_choice,
        n_hiddens       = hidden_choice,
        epochs          = epoch_choice,
        optimizer_class = optimizer_choice,
        batch_size      = batch_size_choice,
        dataset         = TRAIN_object,

        # Constant Hyperparameter
        criterion       = Hyperparameters.CRITERION, 
        lr              = Hyperparameters.N_LEARNING_RATE,
    )

    cv_results.append({
        "batch_size":     batch_size_choice,
        "epochs":         epoch_choice,
        "n_hiddens":      hidden_choice,
        "hidden_dim":     neurons_choice,
        "optimizer":      optimizer_choice.__name__,
        "train_acc_mean": train_score.mean(),
        "val_acc_mean":   val_score.mean(),
        "val_acc_std":    val_score.std(),
    })

    # Include the batch size: it is the outermost grid dimension, and without it
    # runs that differ only in batch size print identical headers.
    print(batch_size_choice, epoch_choice, hidden_choice, neurons_choice, optimizer_choice)
    print(train_score, val_score)

Original Distribution: [0.21730432 0.33796069 0.44473499]


K-Fold CV:   0%|          | 0/5 [00:00<?, ?it/s]

Fold 0 Val Distribution: [0.21731309 0.33796946 0.44471744]
Fold 1 Val Distribution: [0.21731309 0.33796946 0.44471744]
Fold 2 Val Distribution: [0.21731309 0.33796946 0.44471744]
Fold 3 Val Distribution: [0.21731309 0.33792559 0.44476132]
Fold 4 Val Distribution: [0.21726922 0.33796946 0.44476132]
10 2 128 <class 'torch.optim.adam.Adam'>
0    0.949639
1    0.948639
2    0.948948
3    0.949093
4    0.949409
dtype: float64 0    0.815813
1    0.819235
2    0.814233
3    0.817962
4    0.816339
dtype: float64
Original Distribution: [0.21730432 0.33796069 0.44473499]


K-Fold CV:   0%|          | 0/5 [00:00<?, ?it/s]

Fold 0 Val Distribution: [0.21731309 0.33796946 0.44471744]
Fold 1 Val Distribution: [0.21731309 0.33796946 0.44471744]
Fold 2 Val Distribution: [0.21731309 0.33796946 0.44471744]
Fold 3 Val Distribution: [0.21731309 0.33792559 0.44476132]


KeyboardInterrupt: 