# Sentiment Analysis
Neural Network with GloVe Embedding

### Check for CUDA

In [14]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


## Data Preperation

### Load dataset

In [15]:
import pandas as pd
from datasets import load_dataset

# Breaking the existing dataset into 3 sectors: Train, Validation, Tests
data = load_dataset('go_emotions')
df_train = pd.DataFrame(data["train"])
df_val = pd.DataFrame(data["validation"])
df_test = pd.DataFrame(data["test"])

In [16]:
from sklearn.preprocessing import MultiLabelBinarizer

# Convert lists into multi-label format
mlb = MultiLabelBinarizer()
df_train['labels_vector'] = mlb.fit_transform(df_train["labels"]).tolist()


print(df_train)

                                                    text labels       id  \
0      My favourite food is anything I didn't have to...   [27]  eebbqej   
1      Now if he does off himself, everyone will thin...   [27]  ed00q6i   
2                         WHY THE FUCK IS BAYLESS ISOING    [2]  eezlygj   
3                            To make her feel threatened   [14]  ed7ypvh   
4                                 Dirty Southern Wankers    [3]  ed0bdzj   
...                                                  ...    ...      ...   
43405  Added you mate well I’ve just got the bow and ...   [18]  edsb738   
43406  Always thought that was funny but is it a refe...    [6]  ee7fdou   
43407  What are you talking about? Anything bad that ...    [3]  efgbhks   
43408            More like a baptism, with sexy results!   [13]  ed1naf8   
43409                                    Enjoy the ride!   [17]  eecwmbq   

                                           labels_vector  
0      [0, 0, 0, 0, 0, 0, 0,

In [17]:
# # Extract label mapping
label_names = data['train'].features['labels'].feature.names
# print(label_names)  # This will give you something like ['admiration', 'amusement', 'anger', ...]
# df_train['labels_text'] = df_train['labels'].apply(lambda x: [label_names[i] for i in x])
# print(df_train.head())


### Text Preprocessing

In [18]:
import os
import re
import string
import gdown
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Define preprocessing function
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'\S*@\S*\s?', '', text)  # Remove emails
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    tokens = word_tokenize(text)  # Tokenization
    return " ".join(tokens)  # Return cleaned text

# Load dataset
df_train['text'] = df_train['text'].fillna("")  # Handle NaN values
df_train['clean_text'] = df_train['text'].apply(preprocess_text)

# Define local path for GloVe file
glove_file_path = "glove.6B.100d.txt"

# Check if the file already exists locally
if not os.path.exists(glove_file_path):
    print("GloVe file not found. Downloading from Google Drive...")
    file_id = "1QsPKoMTyODoqTklndJcbUjVdnGQ804H-"
    try:
        gdown.download(f"https://drive.google.com/uc?id={file_id}", glove_file_path, quiet=False)
        print("Download completed successfully!")
    except Exception as e:
        print(f"An error occurred while downloading: {e}")
        print("Please download the file manually from Google Drive")
else:
    print("GloVe file already exists locally.")

# Load GloVe word embeddings
def load_glove_embeddings(file_path):
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            embedding = np.array(values[1:], dtype='float32')
            embeddings_index[word] = embedding
    return embeddings_index

glove_embeddings = load_glove_embeddings(glove_file_path)

# Convert text to GloVe embeddings
def preprocess_for_glove(text, embedding_dict, dim=100):
    tokens = text.split()  # Tokenize text (already preprocessed)
    embeddings = [embedding_dict[word] for word in tokens if word in embedding_dict]
    return np.mean(embeddings, axis=0) if embeddings else np.zeros(dim)

df_train['embedded_text'] = df_train['clean_text'].apply(lambda text: preprocess_for_glove(text, glove_embeddings))

# Add additional features
df_train['sentence_length'] = df_train['clean_text'].apply(lambda x: len(x.split()))
df_train['uppercase_ratio'] = df_train['text'].apply(lambda x: sum(1 for c in x if c.isupper()) / len(x) if len(x) > 0 else 0)

print(df_train.head())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alber\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alber\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\alber\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


GloVe file already exists locally.
                                                text labels       id  \
0  My favourite food is anything I didn't have to...   [27]  eebbqej   
1  Now if he does off himself, everyone will thin...   [27]  ed00q6i   
2                     WHY THE FUCK IS BAYLESS ISOING    [2]  eezlygj   
3                        To make her feel threatened   [14]  ed7ypvh   
4                             Dirty Southern Wankers    [3]  ed0bdzj   

                                       labels_vector  \
0  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
1  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
2  [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
3  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...   
4  [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   

                                          clean_text  \
0  my favourite food is anything i didnt have to ...   
1  now if he does off himself everyone will think...   
2                     why t

## Model Training

### Training Functions
Neural Network with 5 Fold Cross Validation

In [19]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import KFold


class SimpleNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, num_classes)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()  # Multi-label classification needs Sigmoid activation

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return self.sigmoid(x)  # Sigmoid for multi-label classification


# Training with K-Fold Cross-Validation
def train_evaluate_model(X_tensor, y_tensor, input_size, hidden_size, num_classes, kf, epochs, learning_rate, criterion, optimizer_class, device):
    training_accuracies = []
    validation_accuracies = []

    for train_index, val_index in kf.split(X_tensor):
        X_train, X_val = X_tensor[train_index], X_tensor[val_index]
        y_train, y_val = y_tensor[train_index], y_tensor[val_index]

        model = SimpleNN(input_size, hidden_size, num_classes).to(device)
        optimizer = optimizer_class(model.parameters(), lr=learning_rate)

        for epoch in range(epochs):
            model.train()
            optimizer.zero_grad()
            outputs = model(X_train)
            loss = criterion(outputs, y_train)
            loss.backward()
            optimizer.step()

        # Evaluation
        model.eval()
        with torch.no_grad():
            train_acc = ((model(X_train) > 0.5) == y_train).float().mean().item()  # Threshold at 0.5 for multi-label
            val_acc = ((model(X_val) > 0.5) == y_val).float().mean().item()
            training_accuracies.append(train_acc)
            validation_accuracies.append(val_acc)

    results = {
        'Training Accuracy Mean': np.mean(training_accuracies),
        'Training Accuracy StdDev': np.std(training_accuracies),
        'Validation Accuracy Mean': np.mean(validation_accuracies),
        'Validation Accuracy StdDev': np.std(validation_accuracies)
    }

    return results


### Convert Data to PyTorch Tensors

In [20]:
print(df_train['labels'].head())

0    [27]
1    [27]
2     [2]
3    [14]
4     [3]
Name: labels, dtype: object


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import KFold

# Apply preprocessing with word embeddings
X_glove = np.array(df_train['embedded_text'].tolist())
y_glove = np.array(df_train['labels_vector'].tolist())

# Convert data to PyTorch tensors
X_tensor_glove = torch.tensor(X_glove, dtype=torch.float32)
y_tensor_glove = torch.tensor(y_glove, dtype=torch.float32)

# Move tensors to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_tensor_glove = X_tensor_glove.to(device)
y_tensor_glove = y_tensor_glove.to(device)



### Run Model

In [22]:
# Parameters
input_size = 100  # GloVe embedding size (100D)
hidden_size = 128
num_classes = len(label_names)  # Number of emotion labels in GoEmotions dataset
epochs = 10
learning_rate = 0.001
kf = KFold(n_splits=5)
criterion = nn.BCEWithLogitsLoss()  # Multi-label classification requires BCE loss
optimizer_class = optim.Adam

# Train the model
cv_results = train_evaluate_model(X_tensor_glove, y_tensor_glove, input_size, hidden_size, num_classes, kf, epochs, learning_rate, criterion, optimizer_class, device)

# Print Results
from tabulate import tabulate
print(tabulate([cv_results], headers="keys", tablefmt="pretty"))


+------------------------+--------------------------+--------------------------+----------------------------+
| Training Accuracy Mean | Training Accuracy StdDev | Validation Accuracy Mean | Validation Accuracy StdDev |
+------------------------+--------------------------+--------------------------+----------------------------+
|   0.9533659815788269   |  0.0058698376873344855   |    0.9533369660377502    |    0.005977000497952253    |
+------------------------+--------------------------+--------------------------+----------------------------+
