# Sentiment Classification with Bi-LSTM

In [1]:
import torch

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

device

device(type='cuda')

## Load Data

In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv('./data/train.csv')
X = df['text']
Y = df['suicide']

In [3]:
sentences = [sen for sen in X]

sentences[: 5]

["It makes me happy to think that I'd rather commit suicide than to live an unhappy lifeAt least that is something that many of the people who are not suicidal cannot do.\n\nI used to worry a lot about my future being worried about if I fuck things up and I create a bad ad unhappy life for me that I'd have to deal for the rest of my life, but now that I know that I can just end it all if things go bad, that makes me feel much better for some reason...",
 "My dad got the Corona Virus... Please pray for hi am ya'll 🥺",
 'the everlasting question why is this art!!!!!!1211111!!!!!',
 'If someone ik finds my reddit acct i am saying someone catfished using my pictures ong Im a loser but i could at least try to hide it 🥴',
 "I lost everything in span of a month.Hey guys, \n\numm...so I'm pretty shook up right now. \n\nLast month the woman who I thought I will marry left me five years into our relationship. \n\nI sort of dealt with it and did my best to move on...got my own place and started t

In [4]:
sen_labels = torch.tensor(Y.values)

sen_labels[: 5]

tensor([1, 0, 0, 0, 1])

## Pre-processing data

In [5]:
from transformers import BertTokenizer

# get pre-trained tokenizer model
tokenizer = BertTokenizer.from_pretrained('./pretrained/bert-base-uncased')

tokenizer

BertTokenizer(name_or_path='./pretrained/bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [6]:
max_length = 512

tokenized = tokenizer(sentences, padding=True, truncation=True, max_length=max_length, return_tensors='pt')

In [7]:
sen_ids = tokenized['input_ids']

print(sen_ids.size())

torch.Size([185659, 512])


## Releasing memory

In [8]:
del tokenizer
del tokenized
del sentences
del X
del Y
del df

import gc
gc.collect()

0

## Build the model

```
LSTMClassifier(
  (embedding_layer): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (lstm): LSTM(768, 64, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (output_layer): Linear(in_features=128, out_features=2, bias=True)
  (sigmoid): Sigmoid()
  (dropout): Dropout(p=0.2, inplace=False)
)
```

In [9]:
import torch.nn as nn
from transformers.models.bert.modeling_bert import BertEmbeddings
from transformers import AutoConfig

class LSTMClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, num_classes, num_layers, device, drop_prob=0.2):
        super(LSTMClassifier, self).__init__()

        self.hidden_dim = hidden_dim
        self.n_layers = num_layers
        self.device = device

        config = AutoConfig.from_pretrained('./pretrained/bert-base-uncased')
        self.embedding_layer = BertEmbeddings(config)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, bidirectional=True, dropout=drop_prob, batch_first=True)
        self.output_layer = nn.Linear(hidden_dim * 2, num_classes)
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout(drop_prob)

    def forward(self, input, hidden):
        embedded = self.embedding_layer(input) # (batch_size, seq_length, embedding_size)
        lstm_out, hidden = self.lstm(embedded, hidden) # (batch_size, seq_length, hidden_size)
        out = self.dropout(lstm_out)
        out = self.output_layer(out) # (batch_size, seq_length, num_classes)
        out = self.sigmoid(out)
        out = out[:, -1, :]
        return out, hidden

    def init_hidden(self, batch_size):
        hidden = (torch.zeros(self.n_layers * 2, batch_size, self.hidden_dim).to(self.device),
                  torch.zeros(self.n_layers * 2, batch_size, self.hidden_dim).to(self.device))
        return hidden

## Cross Validation

A stratified 5-fold cross validation will be applied to the model.

### Training method

In [10]:
# training model
def model_train(model, train_dataloader, criterion, optimizer, epochs):
    for epoch in range(epochs):
        train_losses = []

        model.train()
        for i, batch_data in enumerate(train_dataloader):
            input_ids, input_labels = tuple(data.to(device) for data in batch_data)
            cur_batch = len(input_ids)
            hidden = model.init_hidden(cur_batch)

            optimizer.zero_grad()
            output, hidden =  model(input_ids, hidden)

            loss = criterion(output, input_labels)
            train_losses.append(loss.item())

            loss.backward()
            optimizer.step()

        print("Epoch: {}/{}".format((epoch + 1), epochs),
              "\n\tTraining Loss: {:.4f}".format(np.mean(train_losses)))
        
    return model

### Validation method

In [11]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# validation model
def model_validation(model, val_dataloader, criterion):
    val_losses = []
    accuracy_score_list, recall_score_list, precision_score_list, f1_score_list = [], [], [], []

    model.eval()
    for batch_data in val_dataloader:
        input_ids, input_labels = tuple(data.to(device) for data in batch_data)

        cur_batch = len(input_ids)
        hidden = model.init_hidden(cur_batch)

        with torch.no_grad():
            preds, hidden = model(input_ids, hidden)

        loss = criterion(preds, input_labels)
        val_losses.append(loss.item())
        preds = preds.detach().to('cpu').numpy()
        labels = input_labels.to('cpu').numpy()
        preds = preds.argmax(1) # shape = [1, : ]

        # Evaluate model
        AccuracyScore = accuracy_score(labels, preds)
        RecallScore = recall_score(labels, preds)
        PrecisionScore = precision_score(labels, preds)
        F1Score = f1_score(labels, preds)
        
        # Add to lists
        accuracy_score_list.append(AccuracyScore)
        recall_score_list.append(RecallScore)
        precision_score_list.append(PrecisionScore)
        f1_score_list.append(F1Score)

    print("Validation Loss: {:.4f}".format(np.mean(val_losses)),
          "Validation Accuracy: {:.4f}%".format(np.mean(accuracy_score_list) * 100))
    
    return np.mean(accuracy_score_list), np.mean(recall_score_list), np.mean(precision_score_list), np.mean(f1_score_list)

### Train and validate

In [12]:
# parameters
embedding_dim = 768
hidden_size = 64
num_classes = 2
num_layers = 2

learning_rate = 0.0005
epochs = 4

In [13]:
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import TensorDataset, DataLoader
from torch import optim


skfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=60)
accuracy_score_lists, recall_score_lists, precision_score_lists, f1_score_lists = [], [], [], []


for time, (train_index, val_index) in enumerate(skfolds.split(sen_ids, sen_labels)):
    print('Time: ', time + 1)
    X_train, X_val = sen_ids[train_index], sen_ids[val_index]
    Y_train, Y_val = sen_labels[train_index], sen_labels[val_index]

    # pack dataloaders
    batch_size = 32

    train_dataset = TensorDataset(X_train, Y_train)
    val_dataset = TensorDataset(X_val, Y_val)

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

    model = LSTMClassifier(embedding_dim, hidden_size, num_classes, num_layers, device)
    model = model.cuda()

    # optimizer
    optimizer = optim.Adam(model.parameters(), learning_rate)
    # loss function
    criterion = nn.CrossEntropyLoss()

    # train model
    model = model_train(model, train_dataloader, criterion, optimizer, epochs)

    # validate model
    AccuracyScore, RecallScore, PrecisionScore, F1Score = model_validation(model, val_dataloader, criterion)

    accuracy_score_lists.append(AccuracyScore)
    recall_score_lists.append(RecallScore)
    precision_score_lists.append(PrecisionScore)
    f1_score_lists.append(F1Score)

print("Accuracy: {:.2%}".format(np.average(accuracy_score_lists)))
print("Recall: {:.2%}".format(np.average(recall_score_lists)))
print("Precision: {:.2%}".format(np.average(precision_score_lists)))
print("F1_score: {:.2%}".format(np.average(f1_score_lists)))

Time:  1
Epoch: 1/4 
	Training Loss: 0.6166
Epoch: 2/4 
	Training Loss: 0.4079
Epoch: 3/4 
	Training Loss: 0.3781
Epoch: 4/4 
	Training Loss: 0.3712
Validation Loss: 0.3670 Validation Accuracy: 94.4687%
Time:  2
Epoch: 1/4 
	Training Loss: 0.6468
Epoch: 2/4 
	Training Loss: 0.3947
Epoch: 3/4 
	Training Loss: 0.3733
Epoch: 4/4 
	Training Loss: 0.3657
Validation Loss: 0.3648 Validation Accuracy: 94.6759%
Time:  3
Epoch: 1/4 
	Training Loss: 0.5343
Epoch: 2/4 
	Training Loss: 0.3812
Epoch: 3/4 
	Training Loss: 0.3688
Epoch: 4/4 
	Training Loss: 0.3621
Validation Loss: 0.3597 Validation Accuracy: 95.1613%
Time:  4
Epoch: 1/4 
	Training Loss: 0.5747
Epoch: 2/4 
	Training Loss: 0.4750
Epoch: 3/4 
	Training Loss: 0.4224
Epoch: 4/4 
	Training Loss: 0.3990
Validation Loss: 0.3945 Validation Accuracy: 91.6469%
Time:  5
Epoch: 1/4 
	Training Loss: 0.5471
Epoch: 2/4 
	Training Loss: 0.3797
Epoch: 3/4 
	Training Loss: 0.3683
Epoch: 4/4 
	Training Loss: 0.3626
Validation Loss: 0.3642 Validation Accu

In [15]:
train_dataset = TensorDataset(sen_ids, sen_labels)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

model = LSTMClassifier(embedding_dim, hidden_size, num_classes, num_layers, device)
model = model.cuda()

optimizer = optim.Adam(model.parameters(), learning_rate)

criterion = nn.CrossEntropyLoss()

# train model
model = model_train(model, train_dataloader, criterion, optimizer, epochs)

Epoch: 1/4 
	Training Loss: 0.5142
Epoch: 2/4 
	Training Loss: 0.4064
Epoch: 3/4 
	Training Loss: 0.3852
Epoch: 4/4 
	Training Loss: 0.3825


In [16]:
net_state_dict = model.state_dict()

torch.save(net_state_dict, './model/LSTM_classifier.pt')

to load the model:

```python
m_state_dict = torch.load('./model/LSTM_classifier.pt')
model = LSTMClassifier(768, 64, 2, 2, device)
model.load_state_dict(m_state_dict)
```