# Sentiment Analysis of Twitter Posts
<!-- Notebook name goes here -->
<center><b>Notebook: Neural Network Model, Error Analysis, and Tuning</b></center>
<br>

**By**: Stephen Borja, Justin Ching, Erin Chua, and Zhean Ganituen.

**Dataset**: Hussein, S. (2021). Twitter Sentiments Dataset [Dataset]. Mendeley. https://doi.org/10.17632/Z9ZW7NT5H2.1

**Motivation**: Every minute, social media users generate a large influx of textual data on live events. Performing sentiment analysis on this data provides a real-time view of public perception, enabling quick insights into the general population's opinions and reactions.

**Goal**: By the end of the project, our goal is to create and compare supervised learning algorithms for sentiment analysis.

# **1. Project Setup**

In [1]:
# PyTorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from scipy.sparse import hstack

# General Imports
from tqdm import tqdm
import numpy as np
import pandas as pd
import sys, os
sys.path.append(os.path.abspath("../lib"))

# **2. Data Setup**

Run the data processing pipeline

In [2]:
import IPython.core.page
import builtins
import time
from IPython.utils.capture import capture_output

# Keep references to the real pager and help() so they can be restored below.
pager = IPython.core.page.page
helper = builtins.help

# Swap both for no-ops while the data notebook executes.
# NOTE(review): presumably data.ipynb triggers help()/pager output that should
# not show up here -- confirm against data.ipynb.
IPython.core.page.page = lambda *args, **kwargs: None
builtins.help = lambda *args, **kwargs: None

try:
    # capture_output() swallows what the executed notebook prints or displays.
    with capture_output():
        # IPython magic (not plain Python). The assertions that follow this
        # cell rely on names such as X, y, X_train and y_train existing
        # afterwards.
        %run data.ipynb
finally:
    # Always put the real pager and help() back, even if the notebook raised.
    IPython.core.page.page = pager
    builtins.help = helper

print("Data Setup is DONE")

# Sanity checks on the objects produced by the data pipeline.
# Every message states the same shape the assertion actually checks, so a
# failure points at the right expectation.
assert X.shape == (162_801, 29_318), "Feature matrix shape is wrong; expected (162_801, 29_318)"
assert y.shape == (162_801,), "Labels shape is wrong; expected (162_801,)"

assert X_train.shape == (113_960, 29_318), "Train shape is wrong; expected (113_960, 29_318)"
assert X_test.shape == (48_841, 29_318), "Test shape is wrong; expected (48_841, 29_318)"

assert y_train.shape == (113_960,), "Train labels shape is wrong; expected (113_960,)"
assert y_test.shape == (48_841,), "Test labels shape is wrong; expected (48_841,)"
print("All tests passed.")

Data Setup is DONE
All tests passed.


Create a combined TRAIN dataset (combined X and y train)

In [3]:
class SparseDataset(Dataset):
    """
    Map-style dataset that yields one (features, label) pair per index.

    Rows are converted to dense tensors one at a time in ``__getitem__``, so
    the whole feature matrix is never densified at once.
    """

    def __init__(self, X, y):
        """
        X: scipy sparse matrix or dense 2-D array (features), one row per sample
        y: pandas Series or numpy array (labels), values convertible to int
        """
        # CSR supports row indexing; inputs without tocsr() are stored as given.
        self.X = X.tocsr() if hasattr(X, 'tocsr') else X
        # Use the raw array so lookups are positional rather than by pandas
        # index label.
        self.y = y.values if hasattr(y, 'values') else y

    def __len__(self):
        """Number of samples (rows of X)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return (float32 feature vector, int64 scalar label) for row idx."""
        row = self.X[idx]
        # Only sparse rows need densifying; dense rows (accepted by __init__)
        # have no toarray() and are used directly.
        if hasattr(row, 'toarray'):
            row = row.toarray()
        # torch.tensor always copies, so the returned tensor never aliases X.
        x = torch.tensor(row, dtype=torch.float32).flatten()
        y = torch.tensor(int(self.y[idx]), dtype=torch.long)
        return x, y

def prepare_dataset(X_train, y_train, mapping=None):
    """
    Transforms X_train and y_train into a PyTorch Dataset.

    Parameters:
    -----------
    X_train : scipy sparse matrix
        Feature matrix
    y_train : pandas Series
        Labels
    mapping : dict, optional
        Mapping from original labels to class indices
        (default: {-1: 0, 0: 1, 1: 2})

    Returns:
    --------
    dataset : SparseDataset
        PyTorch Dataset object ready for DataLoader
    TRAIN_csr : scipy sparse matrix
        Combined X and y matrix in CSR format (labels are the last column)

    Raises:
    -------
    ValueError
        If y_train contains a label that is not a key of ``mapping``.
    """
    if mapping is None:
        mapping = {-1: 0, 0: 1, 1: 2}

    y_mapped = y_train.map(mapping)
    # Series.map turns labels missing from the mapping into NaN; fail loudly
    # instead of letting NaN labels reach the combined matrix and the dataset.
    unmapped = y_mapped.isna()
    if unmapped.any():
        unknown = y_train[unmapped].unique().tolist()
        raise ValueError(f"Labels not covered by mapping: {unknown}")
    y_train = y_mapped

    # Append the labels as one extra column to the right of the features.
    TRAIN = hstack([X_train, y_train.values.reshape(-1, 1)])

    assert TRAIN.shape[0] == X_train.shape[0], "Row count mismatch!"
    assert TRAIN.shape[1] == X_train.shape[1] + 1, "Column count mismatch!"

    TRAIN_csr = TRAIN.tocsr()

    # Read the label column back out to verify it survived the stacking.
    last_col = TRAIN_csr[:, -1].toarray().flatten()
    flat_y = y_train.values.flatten()

    print("X_train shape:", X_train.shape)
    print("TRAIN shape:  ", TRAIN_csr.shape)
    print(last_col, flat_y)

    assert np.array_equal(last_col, flat_y), "The combined TRAIN set is not the same!"

    dataset = SparseDataset(X_train, y_train)

    return dataset, TRAIN_csr

In [4]:
TRAIN_object, TRAIN_csr = prepare_dataset(X_train, y_train)

X_train shape: (113960, 29318)
TRAIN shape:   (113960, 29319)
[0 1 2 ... 0 2 2] [0 1 2 ... 0 2 2]


# **3. Model Selection**

In [5]:
class MyLittlePony(nn.Module):
    """
    Multi-layer perceptron: repeated Linear -> ReLU -> Dropout blocks followed
    by a final Linear layer with 3 outputs (one per sentiment class).
    """

    def __init__(self, vocab_size, hidden_dim, n_hiddens=1, dropout=0.2):
        """
        vocab_size: number of input features per sample
        hidden_dim: width of every hidden layer
        n_hiddens:  number of hidden blocks; the first block is always built,
                    so values below 1 still give one block
        dropout:    dropout probability applied after each hidden ReLU
        """
        super().__init__()

        def hidden_block(in_features):
            # One hidden unit of the network: projection, activation, dropout.
            return [nn.Linear(in_features, hidden_dim), nn.ReLU(), nn.Dropout(dropout)]

        # The first block maps the input width onto the hidden width...
        stack = hidden_block(vocab_size)
        # ...and the remaining (n_hiddens - 1) blocks stay at the hidden width.
        for _ in range(n_hiddens - 1):
            stack.extend(hidden_block(hidden_dim))

        # Output layer: 3 logits for the three sentiments.
        stack.append(nn.Linear(hidden_dim, 3))
        self.model = nn.Sequential(*stack)

    def forward(self, x):
        """Return the raw class logits for a batch of feature vectors."""
        return self.model(x)

## **Hyperparameters**

In [6]:
class _ReadOnlyNamespace(type):
    """Metaclass that rejects rebinding or deleting class attributes after creation."""

    def __setattr__(cls, name, value):
        raise AttributeError(f"{cls.__name__}.{name} is read-only")

    def __delattr__(cls, name):
        raise AttributeError(f"{cls.__name__}.{name} is read-only")


class Hyperparameters(metaclass=_ReadOnlyNamespace):
    """
    Hyperparameters for the multi-layer perceptron (MLP) used for sentiment analysis.

    Remark (Zhean). I defined this as a class to enforce IMMUTABILITY of the hyperparameters.
    Each search grid is a tuple (cannot be modified in place) and the metaclass
    refuses to rebind or delete the class attributes, so these values stay
    fixed for the whole run.

    # Hyperparameters
    * N_EPOCHS: Candidate numbers of training epochs.
    * N_HIDDENS: Candidate numbers of hidden layers in the MLP.
    * N_SNEAKY_NEURONS: Candidate numbers of neurons in each hidden layer.
    * N_LEARNING_RATE: Candidate learning rates handed to the optimizer.
    * CRITERION: Candidate loss function instances.
    * OPTIMIZER: Candidate optimizer classes (not instances); instantiate one
      with the model parameters and a learning rate.

    # Usage
    ```
    print(Hyperparameters.N_EPOCHS)
    print(Hyperparameters.N_HIDDENS)
    print(Hyperparameters.N_SNEAKY_NEURONS)
    optimizer = Hyperparameters.OPTIMIZER[0](
        model.parameters(), lr=Hyperparameters.N_LEARNING_RATE[0]
    )
    ```
    """
    N_EPOCHS = (10, 20, 30, 40, 50)
    N_HIDDENS = (2, 4)
    N_SNEAKY_NEURONS = (128, 256)
    N_LEARNING_RATE = (1e-3,)
    CRITERION = (nn.CrossEntropyLoss(),)
    # Stored as classes because an optimizer can only be built once the
    # model's parameters exist.
    OPTIMIZER = (torch.optim.Adam,)

## **Initializing the MLP**

In [7]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# Build the network from the first entry of each hyperparameter list and
# place it on the chosen device in the same expression (Module.to returns
# the module itself).
model = MyLittlePony(
    # TRAIN_csr carries the labels as one extra trailing column, so the
    # feature count is its width minus one.
    vocab_size=TRAIN_csr.shape[1] - 1,
    hidden_dim=Hyperparameters.N_SNEAKY_NEURONS[0],
    n_hiddens=Hyperparameters.N_HIDDENS[0],
).to(device)
print("MLP initialized")

Using device: cpu
MLP initialized


# **4. Training the model**

In [8]:
def train(model, criterion, optimizer, train_loader, epoch):
    """
    Train ``model`` for ``epoch`` passes over ``train_loader``.

    Parameters
    ----------
    model : nn.Module
        Network to optimise; it is switched to training mode.
    criterion : callable
        Loss applied as ``criterion(outputs, labels)``.
    optimizer : torch.optim.Optimizer
        Optimizer stepping the model's parameters.
    train_loader : iterable of (inputs, labels) batches
    epoch : int
        Number of full passes over ``train_loader``.

    Returns
    -------
    float
        Fraction of correct predictions accumulated over ALL epochs (a
        running training accuracy, not the accuracy of the final epoch).
        0.0 when no sample was seen.
    """
    model.train()
    # Batches come from the loader on the CPU; send them to wherever the
    # model's parameters live so a CUDA model does not fail on CPU inputs.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    total_correct = 0
    total_samples = 0

    for _ in range(epoch):
        for inputs, labels in train_loader:
            if device is not None:
                inputs = inputs.to(device)
                labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Predicted class = index of the largest logit in each row.
            predicted = outputs.argmax(dim=1)
            total_samples += labels.size(0)
            total_correct += (predicted == labels).sum().item()

    # No epochs or an empty loader: report 0.0 instead of dividing by zero.
    if total_samples == 0:
        return 0.0
    return total_correct / total_samples

def valid(model, criterion, optimizer, val_loader):
    """
    Measure the accuracy of ``model`` on ``val_loader`` without updating it.

    Parameters
    ----------
    model : nn.Module
        Network to evaluate; it is switched to evaluation mode.
    criterion, optimizer
        Not used here; kept so the signature mirrors ``train``.
    val_loader : iterable of (inputs, labels) batches

    Returns
    -------
    float
        Fraction of correct predictions, or 0.0 when the loader yields no
        samples.
    """
    model.eval()
    # Send batches to wherever the model's parameters live so a CUDA model
    # does not fail on CPU inputs.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    total_correct = 0
    total_samples = 0

    with torch.no_grad():
        for inputs, labels in val_loader:
            if device is not None:
                inputs = inputs.to(device)
                labels = labels.to(device)

            outputs = model(inputs)
            # Predicted class = index of the largest logit in each row.
            predicted = outputs.argmax(dim=1)
            total_samples += labels.size(0)
            total_correct += (predicted == labels).sum().item()

    # Empty loader: report 0.0 instead of dividing by zero.
    if total_samples == 0:
        return 0.0
    return total_correct / total_samples

In [None]:
# Source - https://stackoverflow.com/a/64386444
# Posted by Skipper, modified by community. See post 'Timeline' for change history
# Retrieved 2026-02-03, License - CC BY-SA 4.0

# define a cross validation function
def crossvalid(model, epochs, criterion, optimizer, dataset, k_fold, batch_size=50, num_workers=4):
    """
    Run k-fold cross-validation with contiguous, equally sized folds.

    The model and optimizer are restored to the state they had on entry
    before every fold, so each fold is trained from the same starting point
    and no validation fold has been seen during an earlier fold's training.
    On return the model holds the weights trained in the LAST fold.

    Parameters
    ----------
    model : nn.Module
    epochs : int
        Training epochs per fold (forwarded to ``train``).
    criterion : callable
        Loss function (forwarded to ``train`` / ``valid``).
    optimizer : torch.optim.Optimizer
        Optimizer already bound to ``model``'s parameters.
    dataset : map-style dataset supporting ``len()`` and integer indexing
    k_fold : int
        Number of folds; must be at least 2 and at most ``len(dataset)``.
    batch_size : int, optional
        Batch size of both loaders (default 50).
    num_workers : int, optional
        DataLoader worker processes (default 4).

    Returns
    -------
    train_score, val_score : pandas.Series
        Accuracy per fold, indexed by fold number.

    Raises
    ------
    ValueError
        If ``k_fold`` is below 2 or larger than the dataset.
    """
    import copy

    total_size = len(dataset)
    if k_fold < 2:
        # With a single fold the training split would be empty.
        raise ValueError("k_fold must be at least 2")
    # Integer division: int(total_size * (1 / k_fold)) can round down by one
    # through floating-point error (e.g. 49 * (1 / 49) < 1).
    seg = total_size // k_fold
    if seg == 0:
        raise ValueError("k_fold must not exceed the number of samples")

    # state_dict() returns references to live tensors, so snapshot by deep copy.
    initial_model_state = copy.deepcopy(model.state_dict())
    initial_optim_state = copy.deepcopy(optimizer.state_dict())

    train_accs = []
    val_accs = []

    for i in range(k_fold):
        # Start every fold from the same untrained state.
        model.load_state_dict(initial_model_state)
        optimizer.load_state_dict(copy.deepcopy(initial_optim_state))

        # Fold i validates on [val_start, val_stop) and trains on the rest.
        # Samples past k_fold * seg (the remainder) are always training data.
        val_start = i * seg
        val_stop = val_start + seg

        val_indices = list(range(val_start, val_stop))
        train_indices = list(range(0, val_start)) + list(range(val_stop, total_size))

        train_set = torch.utils.data.Subset(dataset, train_indices)
        val_set = torch.utils.data.Subset(dataset, val_indices)

        train_loader = torch.utils.data.DataLoader(
            train_set, batch_size=batch_size, shuffle=True, num_workers=num_workers
        )
        # Accuracy does not depend on sample order, so no shuffling is needed.
        val_loader = torch.utils.data.DataLoader(
            val_set, batch_size=batch_size, shuffle=False, num_workers=num_workers
        )

        train_accs.append(train(model, criterion, optimizer, train_loader, epochs))
        val_accs.append(valid(model, criterion, optimizer, val_loader))

    train_score = pd.Series(train_accs, dtype=float)
    val_score = pd.Series(val_accs, dtype=float)
    return train_score, val_score

# Run 5-fold cross-validation using the first entry of every hyperparameter
# list. OPTIMIZER holds optimizer classes, so one is instantiated here over
# the current model's parameters with the first listed learning rate.
train_score, val_score = crossvalid(
    model, 
    epochs = Hyperparameters.N_EPOCHS[0],
    criterion=Hyperparameters.CRITERION[0], 
    optimizer=Hyperparameters.OPTIMIZER[0](model.parameters(), lr=Hyperparameters.N_LEARNING_RATE[0]),
    dataset=TRAIN_object,
    k_fold=5
)
# Show the training and validation scores returned by crossvalid.
print(train_score, val_score)

# **5. Evaluation**

In [None]:
# def evaluate(model, data_loader, device):
#     """
#     Evaluate the model on a validation/test dataset.

#     # Parameters
#     * model: PyTorch model
#     * data_loader: DataLoader for validation/test set
#     * device: torch.device ('cpu' or 'cuda')

#     # Returns
#     * accuracy: float, proportion of correct predictions
#     """
#     model.eval() 
#     correct, total = 0, 0

#     with torch.no_grad():
#         for x_batch, y_batch in tqdm(data_loader, desc="Validation"):
#             x_batch = x_batch.to(device, non_blocking=True)
#             y_batch = y_batch.to(device, non_blocking=True)

#             logits = model(x_batch)
            
#             prediction = logits.argmax(dim=1)

#             correct += (prediction == y_batch).sum().item()
#             total += y_batch.size(0)

#     return correct / total

# val_accuracy = evaluate(model, train_loader, device)
# print(f"Validation Accuracy: {val_accuracy:.4f}")