In [1]:
import torch
import torch.nn as nn

manual_seed = 42
torch.manual_seed(manual_seed)

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
import unidecode

nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ngoso\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
has_cuda = torch.cuda.is_available()
device = 'cuda' if has_cuda else 'cpu'
print(f"Using {device} device")

Using cuda device


In [3]:
# The CSV is read with explicit column names, so its first row is treated as data.
dataset_path = 'all-data.csv'
header = ['sentiment', 'content']
df = pd.read_csv(dataset_path, names=header, encoding='ISO-8859-1')

In [4]:
# Distinct sentiment labels, in order of first appearance in the data.
sentiment_column = df['sentiment']
classes_list = sentiment_column.unique()
print(classes_list.tolist())

['neutral', 'negative', 'positive']


In [5]:
# Map each label string to its position in `classes_list`, then replace the
# string labels in the frame with those integer ids.
# NOTE: this overwrites df['sentiment'] in place, so re-running the cell on the
# already-encoded column raises KeyError (the integer ids are not dictionary keys).
classes = dict(zip(classes_list, range(len(classes_list))))
df['sentiment'] = df['sentiment'].apply(lambda label: classes[label])
df

Unnamed: 0,sentiment,content
0,0,"According to Gran , the company has no plans t..."
1,0,Technopolis plans to develop in stages an area...
2,1,The international electronic industry company ...
3,2,With the new production plant the company woul...
4,2,According to the company 's updated strategy f...
...,...,...
4841,1,LONDON MarketWatch -- Share prices ended lower...
4842,0,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,1,Operating profit fell to EUR 35.4 mn from EUR ...
4844,1,Net sales of the Paper segment decreased to EU...


In [6]:
english_stopwords = set(stopwords.words('english'))
stemmer = PorterStemmer()

def text_normalization(text):
    """Normalize one raw text string into a space-joined string of stemmed tokens.

    Steps, in order: lowercase, pass through ``unidecode.unidecode``, strip
    surrounding whitespace, drop every character that is not a letter, digit or
    whitespace, lowercase again, split on whitespace, remove tokens found in
    ``english_stopwords`` and stem the remaining tokens with ``stemmer``.
    """
    cleaned = unidecode.unidecode(text.lower()).strip()
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', cleaned).lower()
    return ' '.join(
        stemmer.stem(token)
        for token in cleaned.split()
        if token not in english_stopwords
    )

# Build Vocab

In [7]:
# Normalize every document; this overwrites the raw text column in place.
df['content'] = df['content'].apply(text_normalization).astype(str)

In [8]:
# Collect every distinct whitespace-separated token of the normalized corpus.
vocab = set()
for text in df['content']:
    vocab.update(text.split())

# Special tokens get fixed ids ([PAD] = 0, [UNK] = 1) and the remaining words are
# sorted, so the word -> id mapping is identical on every run. Enumerating the
# set directly follows string-hash order, which differs between Python processes
# and would silently change the ids after a kernel restart.
special_tokens = ['[PAD]', '[UNK]']
vocab.update(special_tokens)
ordered_vocab = special_tokens + sorted(vocab.difference(special_tokens))
word_to_idx = {word: idx for idx, word in enumerate(ordered_vocab)}
vocab_size = len(vocab)

def transform(text, word_to_idx, max_seq_len):
    """Encode ``text`` as a list of token ids of length ``max_seq_len``.

    The text is split on whitespace and truncated to ``max_seq_len`` tokens.
    Tokens missing from ``word_to_idx`` map to the id of ``'[UNK]'``, and the
    result is right-padded with the id of ``'[PAD]'``.
    """
    tokens = text.split()[:max_seq_len]
    token_ids = [word_to_idx.get(token, word_to_idx['[UNK]']) for token in tokens]
    padding = [word_to_idx['[PAD]']] * (max_seq_len - len(token_ids))
    return token_ids + padding

In [9]:
val_size = 0.2
# Applied to what remains after the validation split (80% of the data),
# so the test set is 0.125 * 0.8 = 10% of the full dataset.
test_size = 0.125
is_shuffle = True

texts = df['content'].tolist()
labels = df['sentiment'].tolist()

# Both splits share the same seed and shuffle setting.
split_options = dict(random_state=manual_seed, shuffle=is_shuffle)

# First carve out the validation set from the full data ...
X_train, X_val, y_train, y_val = train_test_split(
    texts, labels, test_size=val_size, **split_options
)

# ... then carve the test set out of the remaining training portion.
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, test_size=test_size, **split_options
)

print(f"Total size: {len(texts)}")
print(f"Train size: {len(X_train)}")
print(f"Validation size: {len(X_val)}")
print(f"Test size: {len(X_test)}")

Total size: 4846
Train size: 3391
Validation size: 970
Test size: 485


In [10]:
class FinancialNewsDataset(Dataset):
    """Map-style dataset pairing each text with its label.

    When ``transform`` is given it is called as
    ``transform(text, word_to_idx, max_seq_len)`` and its result is converted
    to a tensor; otherwise the stored item itself is passed to ``torch.tensor``.
    """

    def __init__(self, texts, labels, word_to_idx, max_seq_len, transform=None):
        self.texts, self.labels = texts, labels
        self.word_to_idx = word_to_idx
        self.max_seq_len = max_seq_len
        self.transform = transform

    def __len__(self):
        """Return the number of stored texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(tensor, label)`` for the sample at position ``idx``."""
        sample, target = self.texts[idx], self.labels[idx]

        if self.transform:
            sample = self.transform(sample, self.word_to_idx, self.max_seq_len)

        return (torch.tensor(sample), target)

# Create Dataset

In [11]:
max_seq_len = 32

# Build the three datasets with identical vocabulary, sequence length and
# transform; only the (texts, labels) pair differs between them.
train_dataset, val_dataset, test_dataset = (
    FinancialNewsDataset(
        split_texts, split_labels,
        word_to_idx,
        max_seq_len,
        transform=transform
    )
    for split_texts, split_labels in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
)

# Create DataLoader

In [12]:
train_batch_size = 64
test_batch_size = 8

# Only the training loader shuffles its samples.
train_loader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)

# Validation and test loaders share the same batch size and keep dataset order.
val_loader, test_loader = (
    DataLoader(eval_dataset, batch_size=test_batch_size, shuffle=False)
    for eval_dataset in (val_dataset, test_dataset)
)

In [13]:
class SentimentClassifier(nn.Module):
    """Embedding -> multi-layer RNN -> LayerNorm -> Dropout -> two-layer classifier head.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_size: embedding dimension (RNN input size).
        hidden_size: RNN hidden size.
        num_layers: number of stacked RNN layers.
        num_classes: number of output logits.
        dropout_prob: dropout probability applied before the classifier head.
        padding_idx: optional id of the padding token. When given, the
            embedding treats it as padding and the sequence representation is
            taken from the last non-padding position of each row (inputs are
            assumed to be right-padded). When ``None`` (default) the final
            time step is used for every row.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, num_classes,
                 dropout_prob, padding_idx=None):
        super().__init__()
        self.padding_idx = padding_idx
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=padding_idx)

        self.rnn = nn.RNN(
            embed_size, hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )

        self.norm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(dropout_prob)
        self.fc1 = nn.Linear(hidden_size, 16)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(16, num_classes)

    def forward(self, x):
        """Return class logits for a batch of token-id sequences ``x``."""
        embedded = self.embedding(x)
        out, _ = self.rnn(embedded)

        if self.padding_idx is None:
            # No padding information: read the final time step of every row.
            out = out[:, -1, :]
        else:
            # Reading the final time step of a right-padded row returns the state
            # after the trailing padding tokens; pick the last real token instead.
            lengths = (x != self.padding_idx).sum(dim=1)
            # Rows made only of padding fall back to position 0.
            last_step = (lengths - 1).clamp(min=0)
            batch_index = torch.arange(x.size(0), device=x.device)
            out = out[batch_index, last_step]

        out = self.norm(out)
        out = self.dropout(out)
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        return out

# Initialize Model Parameters

In [14]:
# Model hyperparameters; the embedding and hidden sizes are both 64.
embedding_dim, hidden_dim = 64, 64
num_layers = 2
dropout_prob = 0.2
# One output logit per distinct sentiment label.
num_classes = len(classes_list)

In [15]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Build the classifier from the hyperparameters, then move it to `device`.
model = SentimentClassifier(
    vocab_size,
    embedding_dim,
    hidden_dim,
    num_layers,
    num_classes,
    dropout_prob,
)
model = model.to(device)

In [16]:
def evaluate(model, data_loader, criterion, device=None):
    """Compute the mean batch loss and the accuracy of ``model`` on ``data_loader``.

    Args:
        model: module mapping a batch of inputs to class logits.
        data_loader: iterable yielding ``(inputs, labels)`` tensor pairs.
        criterion: loss called as ``criterion(outputs, labels)``.
        device: device the batches are moved to. When ``None`` (default) the
            device of the model's first parameter is used (CPU if the model has
            no parameters), so the function does not rely on a global variable.

    Returns:
        ``(mean_loss, accuracy)`` where ``mean_loss`` is the mean of the
        per-batch loss values and ``accuracy`` is ``correct / total``.

    The model is switched to eval mode and is left in that mode on return.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    correct = 0
    total = 0
    val_losses = []

    with torch.no_grad():
        for inputs, labels in data_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_losses.append(loss.item())
            # Predicted class = index of the largest logit in each row.
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return np.mean(val_losses), correct / total

In [17]:
def fit(model, train_loader, val_loader, criterion, optimizer, device, epochs):
    """Train ``model`` for ``epochs`` epochs, validating after each epoch.

    Args:
        model: module to optimize.
        train_loader: iterable yielding ``(texts, labels)`` training batches.
        val_loader: loader passed to ``evaluate`` after every epoch.
        criterion: loss called as ``criterion(outputs, labels)``.
        optimizer: optimizer stepping the model's parameters.
        device: device the training batches are moved to.
        epochs: number of passes over ``train_loader``.

    Returns:
        ``(train_losses, val_losses)``: two lists with one entry per epoch,
        holding the mean training batch loss and the validation loss.
    """
    train_losses = []
    val_losses = []

    for epoch in range(epochs):
        batch_train_loss = []

        # evaluate() switches the model to eval mode, so re-enable train mode each epoch.
        model.train()
        for texts, labels in train_loader:
            texts = texts.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(texts)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            batch_train_loss.append(loss.item())

        epoch_train_loss = np.mean(batch_train_loss)
        train_losses.append(epoch_train_loss)

        # Keep the per-epoch value under its own name so the history list is
        # appended to rather than overwritten by a scalar.
        val_loss, val_accuracy = evaluate(model, val_loader, criterion)
        val_losses.append(val_loss)

        print(f"Epoch {epoch + 1}/{epochs} - Train Loss: {epoch_train_loss:.4f} - Val Loss: {val_loss:.4f} - Val Accuracy: {val_accuracy:.4f}")

    return train_losses, val_losses
        

In [22]:
lr = 1e-4
epochs = 500

# Cross-entropy loss over the class logits, optimized with Adam.
criteria = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

train_losses, val_losses = fit(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criteria,
    optimizer=optimizer,
    device=device,
    epochs=epochs,
)

Epoch 1/500 - Train Loss: 0.7213 - Val Loss: 0.8524 - Val Accuracy: 0.6546
Epoch 2/500 - Train Loss: 0.7098 - Val Loss: 0.8484 - Val Accuracy: 0.6536
Epoch 3/500 - Train Loss: 0.7193 - Val Loss: 0.8427 - Val Accuracy: 0.6577
Epoch 4/500 - Train Loss: 0.7083 - Val Loss: 0.8431 - Val Accuracy: 0.6526
Epoch 5/500 - Train Loss: 0.7083 - Val Loss: 0.8425 - Val Accuracy: 0.6577
Epoch 6/500 - Train Loss: 0.7046 - Val Loss: 0.8587 - Val Accuracy: 0.6526
Epoch 7/500 - Train Loss: 0.7023 - Val Loss: 0.8388 - Val Accuracy: 0.6567
Epoch 8/500 - Train Loss: 0.7158 - Val Loss: 0.8768 - Val Accuracy: 0.6464
Epoch 9/500 - Train Loss: 0.6962 - Val Loss: 0.8588 - Val Accuracy: 0.6567
Epoch 10/500 - Train Loss: 0.6987 - Val Loss: 0.8819 - Val Accuracy: 0.6412
Epoch 11/500 - Train Loss: 0.6881 - Val Loss: 0.8558 - Val Accuracy: 0.6546
Epoch 12/500 - Train Loss: 0.6871 - Val Loss: 0.8613 - Val Accuracy: 0.6567
Epoch 13/500 - Train Loss: 0.6874 - Val Loss: 0.8550 - Val Accuracy: 0.6536
Epoch 14/500 - Train 

KeyboardInterrupt: 

In [21]:
# Score the trained model on the validation and held-out test splits.
val_loss, val_accuracy = evaluate(model, val_loader, criteria)
test_loss, test_accuracy = evaluate(model, test_loader, criteria)

print(f"Test Loss: {test_loss:.4f} - Test Accuracy: {test_accuracy:.4f}")
print(f"Val Loss: {val_loss:.4f} - Val Accuracy: {val_accuracy:.4f}")

Test Loss: 0.9099 - Test Accuracy: 0.6268
Val Loss: 0.8484 - Val Accuracy: 0.6505
