In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import nltk
from collections import Counter
import pandas as pd
from torch.utils.data import DataLoader
import spacy
from torchtext.datasets import IMDB, SST
from torchtext.data import Field, LabelField, BucketIterator
import torchtext
from IPython.display import display
pd.options.display.max_columns = 100
pd.options.display.width = 1000


## For sentiment analysis we will be using SST dataset

In [2]:
SRC = Field()
TRG = LabelField(dtype=torch.int64)
sp = spacy.load('en')


train_data, test_data = SST.splits(SRC, TRG, validation=None)
train_data, val_data = train_data.split(0.8, (torch.Generator().manual_seed(10), ))

print(f'Train data size and label count (%), n={len(train_data)}')
display(pd.Series([x.label for x in train_data.examples]).value_counts(normalize=True).rename('').to_frame())
print(f'Val data size and label count (%), n={len(val_data)}')
display(pd.Series([x.label for x in val_data.examples]).value_counts(normalize=True).rename('').to_frame())
print(f'Test data size and label count (%), n={len(test_data)}')
display(pd.Series([x.label for x in test_data.examples]).value_counts(normalize=True).rename('').to_frame())

Train data size and label count (%), n=6835


Unnamed: 0,Unnamed: 1
positive,0.422531
negative,0.387418
neutral,0.190051


Val data size and label count (%), n=1709


Unnamed: 0,Unnamed: 1
positive,0.422469
negative,0.387361
neutral,0.19017


Test data size and label count (%), n=2210


Unnamed: 0,Unnamed: 1
negative,0.41267
positive,0.411312
neutral,0.176018


### Build vocabulary

In [3]:
SRC.build_vocab(train_data, max_size=10000, min_freq=5, vectors="glove.6B.100d")  # using pretrained word embedding
TRG.build_vocab(train_data, min_freq=5)

print(vars(TRG.vocab))
print(f"Unique tokens in source vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in TRG vocabulary: {len(TRG.vocab)}")

{'freqs': Counter({'positive': 2888, 'negative': 2648, 'neutral': 1299}), 'itos': ['positive', 'negative', 'neutral'], 'unk_index': None, 'stoi': defaultdict(None, {'positive': 0, 'negative': 1, 'neutral': 2}), 'vectors': None}
Unique tokens in source vocabulary: 2927
Unique tokens in TRG vocabulary: 3


## Create batches

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BATCH_SIZE = 300
train_iterator, val_iterator, test_iterator = BucketIterator.splits(
      (train_data, val_data, test_data),
      batch_size=BATCH_SIZE,
      device=device
    )


## Create GRU NN (we use GRU now, because it showed better accuracy on this dataset):

In [5]:
import torch.nn as nn

class GRU(nn.Module):
    def __init__(self, input_dim, output_dim, emb_dim, hidden_dim, n_layers, dropout):
        # input_dim <--- vocabulary size
        # output_dim <--- len ([positive, negative]) == 2
        # emb_dim <--- embedding dimension of embedding matrix

        super(GRU, self).__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hidden_dim, n_layers, dropout=dropout)

        self.fc1 = nn.Linear(hidden_dim, hidden_dim // 2)
        self.fc2 = nn.Linear(hidden_dim // 2, output_dim)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        # shape: [source_len, batch_size]
        embedded = self.dropout(self.embedding(src))  # sahpe: [src_len, batch_size, embed_dim]
        output, hidden = self.rnn(embedded)
        # output[batch, hidden_dim]
        # hiddden[n_layers, batch, hidden_dim]
        output = self.fc1(output[-1])
        output = self.fc2(self.relu(output))
        return output


## Init model

In [6]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 100
DEC_EMB_DIM = 100
HID_DIM = 256
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

model = GRU(INPUT_DIM, OUTPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT).to(device)
model.embedding.weight.data.copy_(SRC.vocab.vectors)
optimizer = optim.Adam(model.parameters(), lr=3e-3)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, verbose=True)
criterion = nn.CrossEntropyLoss()


## Training functions

In [7]:
# Model training function
def train(model, iterator, optimizer=optimizer, criterion=criterion, clip=1):
    model.train()
    epoch_loss = 0

    for i, batch in enumerate(iterator):
        src = batch.text.to(device)
        trg = batch.label.to(device)
        optimizer.zero_grad()
        output = model(src)

        loss = criterion(output, trg)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()

    mean_loss = epoch_loss / len(iterator)
    scheduler.step(mean_loss)
    return mean_loss  # mean loss


def check_accuracy(data_iterator, model):
    model.eval()
    total_correct = 0
    total_count = 0

    with torch.no_grad():
        for i, batch in enumerate(data_iterator):
            src = batch.text.to(device)
            trg = batch.label.to(device)
            output = model(src)

            total_correct += torch.sum(torch.eq(output.argmax(1), trg))
            total_count += len(trg)

    return f'{total_correct}/{total_count}', round(float(total_correct/total_count), 3)


def do_prediction(sentence):

    if type(sentence) == str:
        tokanised_sentence = [word.text for word in sp.tokenizer(sentence)]
    else:
        tokanised_sentence = sentence

    input_data = [SRC.vocab.stoi[word.lower()] for word in tokanised_sentence]
    input_data = torch.tensor(input_data, dtype=torch.int64).unsqueeze(1).to(device)

    model.eval()
    output = model(input_data)
    label_mapping = train_data.fields['label'].vocab.stoi
    r = {'text': sentence, **{k: v for k, v in zip(sorted(label_mapping, key=lambda x: label_mapping[x]), output[0].tolist())}}
    return r

## Training model

In [8]:
df = pd.DataFrame()

total_epoch = 40
for epoch in range(total_epoch):
    result = train(model=model, iterator=train_iterator)
    print(f'Epoch {epoch} -->', result)
    train_check = check_accuracy(train_iterator, model)
    val_check = check_accuracy(val_iterator, model)
    test_check = check_accuracy(test_iterator, model)
    s = pd.Series({'train_accuracy_str': train_check[0],
                   'val_accuracy_str': val_check[0],
                   'test_accuracy_str': test_check[0],
                   'train_accuracy': train_check[1],
                   'val_accuracy': val_check[1],
                   'test_accuracy': test_check[1],
                   }, name=epoch)
    df = df.append(s)[s.index.tolist()]
    torch.save(model.state_dict(), os.path.join('model_snapshots', f'{epoch}.pth'))

display(df)
best_epoch = df['val_accuracy'].idxmax()

display(df.loc[best_epoch].rename(f'Best epoch ({best_epoch}) result').to_frame())

print(f'Loading model on epoch={best_epoch}')
model.load_state_dict(torch.load(f'model_snapshots/{best_epoch}.pth'))

for e in range(total_epoch):
    os.remove(f'model_snapshots/{e}.pth')

Epoch 0 --> 1.055275144784347
Epoch 1 --> 1.051476214243018
Epoch 2 --> 1.048110158547111
Epoch 3 --> 1.0495818438737288
Epoch 4 --> 1.0480802421984465
Epoch 5 --> 1.0476576556330142
Epoch 6 --> 1.0477439165115356
Epoch 7 --> 1.031999743503073
Epoch 8 --> 0.9405067874037701
Epoch 9 --> 0.8484045344850292
Epoch 10 --> 0.7865908171819604
Epoch 11 --> 0.7268832325935364
Epoch 12 --> 0.6954181764436804
Epoch 13 --> 0.6643368161242941
Epoch 14 --> 0.6369987985362178
Epoch 15 --> 0.615580255570619
Epoch 16 --> 0.6121622479480245
Epoch 17 --> 0.5638920936895453
Epoch 18 --> 0.5509859880675441
Epoch 19 --> 0.5286451526310133
Epoch 20 --> 0.49738914292791614
Epoch 21 --> 0.477946981139805
Epoch 22 --> 0.4571795554264732
Epoch 23 --> 0.43299377612445666
Epoch 24 --> 0.4277369431827379
Epoch 25 --> 0.40416069263997284
Epoch 26 --> 0.3864903890568277
Epoch 27 --> 0.3590003407519797
Epoch 28 --> 0.3592718930348106
Epoch 29 --> 0.3268903584583946
Epoch 30 --> 0.312598405972771
Epoch 31 --> 0.2910182

Unnamed: 0,train_accuracy_str,val_accuracy_str,test_accuracy_str,train_accuracy,val_accuracy,test_accuracy
0,2888/6835,722/1709,909/2210,0.423,0.422,0.411
1,2888/6835,722/1709,909/2210,0.423,0.422,0.411
2,2888/6835,722/1709,909/2210,0.423,0.422,0.411
3,2888/6835,722/1709,909/2210,0.423,0.422,0.411
4,2888/6835,722/1709,909/2210,0.423,0.422,0.411
5,2888/6835,722/1709,909/2210,0.423,0.422,0.411
6,2888/6835,722/1709,909/2210,0.423,0.422,0.411
7,3856/6835,900/1709,1269/2210,0.564,0.527,0.574
8,4035/6835,998/1709,1279/2210,0.59,0.584,0.579
9,4656/6835,1047/1709,1397/2210,0.681,0.613,0.632


Unnamed: 0,Best epoch (11) result
train_accuracy_str,4950/6835
val_accuracy_str,1099/1709
test_accuracy_str,1457/2210
train_accuracy,0.724
val_accuracy,0.643
test_accuracy,0.659


Loading model on epoch=11


## Doing some predictions:

In [9]:
to_predict = [
    'That was great!',
    'That was very bad!',
    'It is wonderful film. I like it!',
    "Terrible, stupid, tasteless! Worst thing I've ever seen",
    "Don't know, what to say.",
]

display(pd.DataFrame([do_prediction(p) for p in to_predict]))

Unnamed: 0,text,positive,negative,neutral
0,That was great!,1.180104,-0.411761,-0.529966
1,That was very bad!,-1.181618,1.398464,0.069267
2,It is wonderful film. I like it!,3.511635,-3.60253,-1.794998
3,"Terrible, stupid, tasteless! Worst thing I've ...",-1.990963,2.411172,0.200588
4,"Don't know, what to say.",-0.793966,0.99593,-0.002925
