In [1]:
import os
import re
import nltk
import unidecode
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

In [2]:
# Single seed shared by every source of randomness used in this notebook.
seed = 1
# Seed NumPy's global RNG as well, so any NumPy-based randomness is
# reproducible alongside torch.
np.random.seed(seed)
# Kept as the last expression so the cell still displays the torch Generator.
torch.manual_seed(seed)

<torch._C.Generator at 0x776784ee70f0>

In [3]:
# Fetch the NLTK "stopwords" corpus; stopwords.words('english') is read from it later.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /home/Sinner/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Path of the CSV file that holds the labelled sentences.
ROOT_PATH = "data/all-data.csv"
# Column names are supplied here, so pandas treats the first row of the
# file as data rather than as a header.
headers = ["sentiment", "content"]
df = pd.read_csv(
    ROOT_PATH,
    names=headers,
    encoding="ISO-8859-1"  # the file is decoded as Latin-1, not UTF-8
)

In [5]:
# Map each sentiment name to an integer id, numbered in order of first
# appearance in the frame, then replace the column with those ids.
class_names = df['sentiment'].unique().tolist()
classes = dict(zip(class_names, range(len(class_names))))
df['sentiment'] = df['sentiment'].apply(lambda name: classes[name])

In [6]:
# English stop-word list; text_normalize filters tokens against it.
english_stop_words = stopwords.words('english')
# NOTE(review): stemmer is created here but not referenced by any cell
# visible in this notebook - confirm whether stemming was intended.
stemmer = PorterStemmer()

In [7]:
def text_normalize(text):
    """Return a cleaned, space-separated version of ``text``.

    Steps, in order: transliterate to ASCII, lower-case, drop every
    character that is neither a word character nor whitespace, split on
    any whitespace, and remove tokens found in ``english_stop_words``.

    Args:
        text: Raw sentence as a ``str``.

    Returns:
        The surviving tokens joined by single spaces (possibly ``''``).
    """
    # Transliterate first and lower-case afterwards: transliteration can emit
    # upper-case ASCII for some symbols, which would otherwise survive.
    text = unidecode.unidecode(text)
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    # split() with no argument collapses runs of whitespace (including tabs
    # and newlines), so removing punctuation never leaves empty tokens or
    # double spaces behind. It also makes a separate strip() unnecessary.
    kept = [word for word in text.split() if word not in english_stop_words]

    return ' '.join(kept)

In [8]:
# Normalise every sentence; the function is passed directly, no wrapper lambda needed.
df['content'] = df['content'].apply(text_normalize)

In [9]:
# Collect every distinct token in first-seen order. dict.fromkeys de-duplicates
# with constant-time lookups while keeping insertion order, replacing the
# quadratic `token not in vocab` scan over a growing list.
# NOTE(review): the vocabulary is built from the whole frame, i.e. before the
# train/val/test split performed later - confirm this is intended.
vocab = list(dict.fromkeys(
    token
    for sentence in df['content'].tolist()
    for token in sentence.split()
))

# Special tokens go last: unknown-word marker, then padding marker.
vocab.append('UNK')
vocab.append('PAD')

In [10]:
# Token -> integer id, numbered by position in `vocab`.
word_to_idx = dict(zip(vocab, range(len(vocab))))
vocab_size = len(vocab)

In [11]:
def transform(text, word_to_idx, max_seq_len):
    """Encode ``text`` as a list of token ids.

    The text is split on whitespace and each word is looked up in
    ``word_to_idx``. Words that are absent map to ``word_to_idx['UNK']``.
    For non-negative ``max_seq_len`` the result has exactly that many ids:
    it is truncated if too long, or right-padded with
    ``word_to_idx['PAD']`` if too short.

    Args:
        text: Whitespace-separated string.
        word_to_idx: Mapping from token to integer id. ``'UNK'`` is only
            read when an unknown word occurs and ``'PAD'`` only when
            padding is needed.
        max_seq_len: Target sequence length.

    Returns:
        list of ids.

    Raises:
        KeyError: if ``'UNK'`` or ``'PAD'`` is needed but missing.
    """
    # Membership test instead of a bare `except:` so that only genuinely
    # missing words fall back to UNK; any other error propagates.
    tokens = [
        word_to_idx[w] if w in word_to_idx else word_to_idx['UNK']
        for w in text.split()
    ]

    tokens = tokens[:max_seq_len]
    n_missing = max_seq_len - len(tokens)
    if n_missing > 0:
        tokens += [word_to_idx['PAD']] * n_missing

    return tokens

In [12]:
# Hold out 20% for validation, then 12.5% of the remaining 80% (i.e. 10% of
# the full data) for testing.
val_size = 0.2
test_size = 0.125
is_shuffle = True
texts = df['content'].tolist()
labels = df['sentiment'].tolist()

# Options shared by both splits so they cannot drift apart.
split_options = {'random_state': seed, 'shuffle': is_shuffle}

X_train, X_val, y_train, y_val = train_test_split(
    texts, labels, test_size=val_size, **split_options
)
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, test_size=test_size, **split_options
)

In [13]:
class FinancialNews(Dataset):
    """Dataset pairing each text with its label.

    Args:
        X: Indexable collection of samples.
        y: Indexable collection of labels, aligned with ``X``.
        word_to_idx: Vocabulary mapping forwarded to ``transform``.
        max_seq_len: Sequence length forwarded to ``transform``.
        transform: Optional callable invoked as
            ``transform(sample, word_to_idx, max_seq_len)``. When falsy,
            the stored sample is passed to ``torch.tensor`` unchanged.
    """

    def __init__(self, X, y, word_to_idx, max_seq_len, transform=None):
        self.texts, self.labels = X, y
        self.word_to_idx = word_to_idx
        self.max_seq_len = max_seq_len
        self.transform = transform

    def __len__(self):
        """Number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(tensor_of_sample, label)`` for position ``idx``."""
        sample = self.texts[idx]
        target = self.labels[idx]

        if self.transform:
            sample = self.transform(sample, self.word_to_idx, self.max_seq_len)

        return torch.tensor(sample), target

In [14]:
max_seq_len = 32
train_batch_size = 32
test_batch_size = 8

# Encoding settings shared by all three datasets.
encoding_options = {
    'word_to_idx': word_to_idx,
    'max_seq_len': max_seq_len,
    'transform': transform,
}

train_dataset = FinancialNews(X_train, y_train, **encoding_options)
val_dataset = FinancialNews(X_val, y_val, **encoding_options)
test_dataset = FinancialNews(X_test, y_test, **encoding_options)

# Only the training loader shuffles; val/test keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=test_batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=test_batch_size, shuffle=False)

In [15]:
class SentimentClassifier(nn.Module):
    """Embedding -> multi-layer RNN -> LayerNorm -> Dropout -> 2-layer MLP head.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_size: Size of the RNN hidden state.
        n_layers: Number of stacked RNN layers.
        n_classes: Number of output logits.
        dropout_prob: Dropout probability applied before the MLP head.
        pad_idx: Optional id of the padding token. When ``None`` (default)
            the model behaves as before: padding is embedded like any other
            token and the RNN output at the last time step is classified.
            When given, the padding embedding is fixed at zero and the RNN
            output at the last non-padding position of each sequence is
            classified instead. This assumes sequences are right-padded.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, n_layers, n_classes, dropout_prob,
                 pad_idx=None):
        super(SentimentClassifier, self).__init__()
        self.pad_idx = pad_idx
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.rnn = nn.RNN(embedding_dim, hidden_size, n_layers, batch_first=True)
        self.norm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(dropout_prob)
        self.fc1 = nn.Linear(hidden_size, 16)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(16, n_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, n_classes)``.

        Args:
            x: Integer token ids of shape ``(batch, seq_len)``.
        """
        token_ids = x
        x = self.embedding(token_ids)
        x, _ = self.rnn(x)

        if self.pad_idx is None:
            # Original behaviour: summarise with the final time step.
            x = x[:, -1, :]
        else:
            # Pick the output at the last real token so trailing padding does
            # not dilute the summary. Rows that are entirely padding fall
            # back to position 0.
            lengths = (token_ids != self.pad_idx).sum(dim=1)
            last_pos = (lengths - 1).clamp(min=0)
            rows = torch.arange(x.size(0), device=x.device)
            x = x[rows, last_pos]

        x = self.norm(x)
        x = self.dropout(x)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)

        return x

In [16]:
# Model hyper-parameters.
n_classes = len(classes)
embedding_dim = 64
hidden_size = 64
n_layers = 2
dropout_prob = 0.2

# Use the GPU when one is available.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

model = SentimentClassifier(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_size=hidden_size,
    n_layers=n_layers,
    n_classes=n_classes,
    dropout_prob=dropout_prob,
)
model = model.to(device)

In [17]:
# Optimisation settings: learning rate and number of passes over the training set.
lr = 1e-4
epochs = 50

# Cross-entropy over the raw logits returned by the model.
criterion = nn.CrossEntropyLoss()
# Adam over all model parameters with the learning rate above.
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [18]:
def fit(model, train_loader, val_loader, criterion, optimizer, device, epochs):
    """Train ``model`` for ``epochs`` epochs and validate after each one.

    Every epoch runs one optimisation pass over ``train_loader``, then
    calls ``evaluate`` on ``val_loader`` and prints both mean losses.
    ``evaluate`` must be defined by the time this function is called.

    Args:
        model: Module to train (updated in place).
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        val_loader: Iterable of ``(inputs, labels)`` validation batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimiser stepping ``model``'s parameters.
        device: Device that each batch is moved to.
        epochs: Number of epochs.

    Returns:
        ``(train_losses, val_losses)``: two lists with one mean loss per epoch.
    """
    train_losses, val_losses = [], []

    for epoch_no in range(1, epochs + 1):
        model.train()
        epoch_batch_losses = []

        for inputs, labels in train_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            loss = criterion(model(inputs), labels)
            loss.backward()
            optimizer.step()

            epoch_batch_losses.append(loss.item())

        # Mean of the per-batch losses for this epoch.
        train_loss = sum(epoch_batch_losses) / len(epoch_batch_losses)
        train_losses.append(train_loss)

        # Validation accuracy is returned by evaluate but not reported here.
        val_loss, _ = evaluate(model, val_loader, criterion, device)
        val_losses.append(val_loss)

        print(f'EPOCH {epoch_no}:\tTrain loss: {train_loss:.4f}\tVal loss: {val_loss:.4f}')

    return train_losses, val_losses

In [19]:
def evaluate(model, dataloader, criterion, device):
    """Compute mean batch loss and accuracy of ``model`` over ``dataloader``.

    The model is switched to eval mode and gradients are disabled.

    Args:
        model: Module producing per-class scores of shape ``(batch, classes)``.
        dataloader: Iterable of ``(inputs, labels)`` batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        device: Device that each batch is moved to.

    Returns:
        ``(loss, acc)``: the unweighted mean of the per-batch losses and
        the fraction of samples whose highest-scoring class equals the label.
    """
    model.eval()
    batch_losses = []
    n_correct, n_seen = 0, 0

    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            logits = model(inputs)
            batch_losses.append(criterion(logits, labels).item())

            # Index of the highest score along the class dimension.
            predicted = torch.max(logits, dim=1).indices
            n_seen += labels.size(0)
            n_correct += (predicted == labels).sum().item()

    return sum(batch_losses) / len(batch_losses), n_correct / n_seen

In [20]:
# Run the full training loop; one mean loss per epoch is kept for each split.
train_losses, val_losses = fit(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
    device=device,
    epochs=epochs,
)

EPOCH 1:	Train loss: 0.9629	Val loss: 0.9317


EPOCH 2:	Train loss: 0.9343	Val loss: 0.9303


EPOCH 3:	Train loss: 0.9278	Val loss: 0.9294


EPOCH 4:	Train loss: 0.9333	Val loss: 0.9293


EPOCH 5:	Train loss: 0.9316	Val loss: 0.9291


EPOCH 6:	Train loss: 0.9304	Val loss: 0.9290


EPOCH 7:	Train loss: 0.9296	Val loss: 0.9290


EPOCH 8:	Train loss: 0.9294	Val loss: 0.9289


EPOCH 9:	Train loss: 0.9291	Val loss: 0.9289


EPOCH 10:	Train loss: 0.9310	Val loss: 0.9287


EPOCH 11:	Train loss: 0.9279	Val loss: 0.9289


EPOCH 12:	Train loss: 0.9278	Val loss: 0.9285


EPOCH 13:	Train loss: 0.9302	Val loss: 0.9289


EPOCH 14:	Train loss: 0.9314	Val loss: 0.9302


EPOCH 15:	Train loss: 0.9283	Val loss: 0.9282


EPOCH 16:	Train loss: 0.9281	Val loss: 0.9279


EPOCH 17:	Train loss: 0.9258	Val loss: 0.9295


EPOCH 18:	Train loss: 0.9284	Val loss: 0.9288


EPOCH 19:	Train loss: 0.9269	Val loss: 0.9275


EPOCH 20:	Train loss: 0.9263	Val loss: 0.9283


EPOCH 21:	Train loss: 0.9252	Val loss: 0.9261


EPOCH 22:	Train loss: 0.9265	Val loss: 0.9257


EPOCH 23:	Train loss: 0.9255	Val loss: 0.9252


EPOCH 24:	Train loss: 0.9215	Val loss: 0.9272


EPOCH 25:	Train loss: 0.9203	Val loss: 0.9214


EPOCH 26:	Train loss: 0.9201	Val loss: 0.9199


EPOCH 27:	Train loss: 0.9137	Val loss: 0.9229


EPOCH 28:	Train loss: 0.9062	Val loss: 0.9246


EPOCH 29:	Train loss: 0.8922	Val loss: 0.9205


EPOCH 30:	Train loss: 0.8784	Val loss: 0.9005


EPOCH 31:	Train loss: 0.8627	Val loss: 0.9184


EPOCH 32:	Train loss: 0.8553	Val loss: 0.9014


EPOCH 33:	Train loss: 0.8372	Val loss: 0.8918


EPOCH 34:	Train loss: 0.8358	Val loss: 0.8893


EPOCH 35:	Train loss: 0.8296	Val loss: 0.8923


EPOCH 36:	Train loss: 0.8125	Val loss: 0.8908


EPOCH 37:	Train loss: 0.8135	Val loss: 0.8886


EPOCH 38:	Train loss: 0.8037	Val loss: 0.9013


EPOCH 39:	Train loss: 0.8045	Val loss: 0.8870


EPOCH 40:	Train loss: 0.7850	Val loss: 0.9156


EPOCH 41:	Train loss: 0.7915	Val loss: 0.8977


EPOCH 42:	Train loss: 0.7761	Val loss: 0.8890


EPOCH 43:	Train loss: 0.7667	Val loss: 0.9790


EPOCH 44:	Train loss: 0.7672	Val loss: 0.8974


EPOCH 45:	Train loss: 0.7560	Val loss: 0.8873


EPOCH 46:	Train loss: 0.7487	Val loss: 0.9061


EPOCH 47:	Train loss: 0.7426	Val loss: 0.8970


EPOCH 48:	Train loss: 0.7339	Val loss: 0.8857


EPOCH 49:	Train loss: 0.7296	Val loss: 0.9076


EPOCH 50:	Train loss: 0.7194	Val loss: 0.8980


In [21]:
# Score the trained model on the validation and test splits.
val_loss, val_acc = evaluate(model, val_loader, criterion, device)
test_loss, test_acc = evaluate(model, test_loader, criterion, device)

print('Evaluation on val/test dataset')
print('Val accuracy: ', val_acc)
print('Test accuracy: ', test_acc)

Evaluation on val/test dataset
Val accuracy:  0.6288659793814433
Test accuracy:  0.6350515463917525
