### Import FastText Embedding

In [42]:
import torch

ckpt = torch.load("../model/embedding.pt", map_location="cpu")

embedding_matrix = ckpt["embedding"]
word2idx = ckpt["word2idx"]

In [43]:
vocab_size, embed_dim = embedding_matrix.shape
vocab_size, embed_dim

(47041, 300)

### Declare Dataset

In [44]:
from torch.utils.data import Dataset
import re

class SentimentDataset(Dataset):
    def __init__(self, dataframe, word2idx, max_len=128):
        self.df = dataframe
        self.word2idx = word2idx
        self.max_len = max_len


    def tokenize(self, text):
        patterns = [
            r"\[[A-Z_]+\]",
            r"<\/?[\w_]+>",
            r"\w+",
            r"[?!]{2,}",
            r"\.{3,}",
            r"[^\w\s]"
        ]

        combined = re.compile("|".join(patterns), re.UNICODE)

        return combined.findall(text)


    def encode_text(self, text):
        tokens = self.tokenize(text)
        ids = [self.word2idx.get(token, self.word2idx['<unk>']) for token in tokens]
        ids = ids[:self.max_len]

        return ids + [self.word2idx['<pad>']] * (self.max_len - len(ids))


    def __len__(self):
        return len(self.df)


    def __getitem__(self, index):
        row = self.df.loc[index]

        text_ids = torch.tensor(self.encode_text(row['text']))
        extra_feats = torch.tensor([row["ex_intensity"],
            row["emoji_score"],
            row["all_uppercase"],
            row["uppercase_ratio"]]
        , dtype=torch.float32)

        return text_ids, extra_feats, torch.tensor(int(row['label']), dtype=torch.long)
    

### Declare Model

In [45]:
from torch import nn

class Model(nn.Module):
    def __init__(self, embedding_matrix, lstm_hidden=128, lstm_layers=1, num_classes=3):
        super().__init__()
        self.embedding_matrix = embedding_matrix
        self.num_classes = num_classes
        
        self.embedding = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix, dtype=torch.float32), padding_idx=0, freeze=False)

        self.lstm = nn.LSTM(
            input_size=embedding_matrix.size(1),
            hidden_size=lstm_hidden,
            num_layers=lstm_layers,
            batch_first=True,
            bidirectional=True
        )

        self.mlp = nn.Sequential(
            nn.Linear(lstm_hidden * 2 + 4, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(16, num_classes)
        )

    def forward(self, text_ids, extra_feats):
        x = self.embedding(text_ids)

        _, (h, _) = self.lstm(x)
        h = torch.cat([h[-2], h[-1]], dim=1) # BiLSTM

        features = torch.cat([h, extra_feats], dim=1)

        logits = self.mlp(features)

        return logits

### Training config

In [46]:
import pandas as pd

train_df = pd.read_csv('../data/preprocessed/train.csv')

In [47]:
from torch.utils.data import DataLoader

MAX_LEN = 128
BATCH_SIZE = 32

train_dataset = SentimentDataset(train_df, word2idx, max_len=MAX_LEN)
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True
)

In [48]:
LSTM_HIDDEN = 128
LSTM_LAYERS = 1

model = Model(embedding_matrix, lstm_hidden=LSTM_HIDDEN, lstm_layers=LSTM_LAYERS).to('cuda' if torch.cuda.is_available() else 'cpu')

  self.embedding = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix, dtype=torch.float32), padding_idx=0, freeze=False)


In [49]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters(), lr=0.005, weight_decay=0.0001)
criterion = nn.CrossEntropyLoss()

In [50]:
from torch.utils.data import DataLoader
from tqdm import tqdm
def train(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0

    total = 0
    correct = 0

    for text_ids, extra_feats, labels in tqdm(dataloader, desc='Epoch training'):
        text_ids = text_ids.to(device)
        extra_feats = extra_feats.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        logits = model(text_ids, extra_feats)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        preds = torch.argmax(logits, dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    acc = correct / total

    return total_loss / len(dataloader), acc


### Evaluate config

In [51]:
val_df = pd.read_csv('../data/preprocessed/val.csv')
val_dataset = SentimentDataset(val_df, word2idx, max_len=MAX_LEN)
val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False
)

In [52]:
def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for text_ids, extra_feats, labels in tqdm(dataloader, desc='Validating'):
            text_ids = text_ids.to(device)
            extra_feats = extra_feats.to(device)
            labels = labels.to(device)

            logits = model(text_ids, extra_feats)
            loss = criterion(logits, labels)

            total_loss += loss.item()

            preds = torch.argmax(logits, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    avg_loss = total_loss / len(dataloader)
    acc = correct / total

    return avg_loss, acc


### Loop

In [53]:
EPOCHS = 15
device = 'cuda' if torch.cuda.is_available() else 'cpu'

for epoch in range(EPOCHS):
    train_loss, train_acc = train(
        model,
        train_loader,
        optimizer,
        criterion,
        device
    )

    val_loss, val_acc = evaluate(
        model,
        val_loader,
        criterion,
        device
    )

    print(
        f"Epoch {epoch+1}/{EPOCHS} | "
        f"Train Loss: {train_loss:.4f} | "
        f"Train Acc: {train_acc:.4f} | "
        f"Val Loss: {val_loss:.4f} | "
        f"Val Acc: {val_acc:.4f}"
        "\n--------------------------------------------\n"
    )


Epoch training: 100%|██████████| 1263/1263 [00:32<00:00, 39.12it/s]
Validating: 100%|██████████| 271/271 [00:02<00:00, 96.94it/s] 


Epoch 1/15 | Train Loss: 0.9523 | Train Acc: 0.5436 | Val Loss: 0.9110 | Val Acc: 0.5686
--------------------------------------------



Epoch training: 100%|██████████| 1263/1263 [00:33<00:00, 37.32it/s]
Validating: 100%|██████████| 271/271 [00:02<00:00, 92.67it/s]


Epoch 2/15 | Train Loss: 0.8624 | Train Acc: 0.6020 | Val Loss: 0.8402 | Val Acc: 0.6416
--------------------------------------------



Epoch training: 100%|██████████| 1263/1263 [00:34<00:00, 36.58it/s]
Validating: 100%|██████████| 271/271 [00:03<00:00, 89.91it/s]


Epoch 3/15 | Train Loss: 0.8249 | Train Acc: 0.6357 | Val Loss: 0.8516 | Val Acc: 0.6178
--------------------------------------------



Epoch training: 100%|██████████| 1263/1263 [00:32<00:00, 39.01it/s]
Validating: 100%|██████████| 271/271 [00:02<00:00, 104.02it/s]


Epoch 4/15 | Train Loss: 0.7967 | Train Acc: 0.6561 | Val Loss: 0.7861 | Val Acc: 0.6576
--------------------------------------------



Epoch training:  16%|█▌        | 205/1263 [00:05<00:27, 38.50it/s]


KeyboardInterrupt: 