In [2]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import re
from sklearn.model_selection import train_test_split
import torchtext
import torch
import torch.nn as nn
import torch.nn.functional as F
import tqdm
import sys

In [3]:
# Load the labelled reviews from the tab-separated file into a DataFrame.
df = pd.read_csv("data.tsv", delimiter="\t")

In [4]:
# Print a summary of the frame: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         25000 non-null  object
 1   sentiment  25000 non-null  int64 
 2   review     25000 non-null  object
dtypes: int64(1), object(2)
memory usage: 586.1+ KB


In [7]:
stop_words = stopwords.words("english")
# Hashed copy used for membership tests inside clean(); `stop_words` itself is
# left as the original list so any other code that relies on it keeps working.
_stop_word_set = frozenset(stop_words)


def clean(review):
    """Normalise one raw review into a space-separated string of content words.

    Steps: strip HTML markup, replace every non-ASCII-letter character with a
    space, lower-case, split on whitespace and drop English stop words.

    Args:
        review: raw review text, possibly containing HTML tags.

    Returns:
        The remaining words joined by single spaces (may be an empty string).
    """
    # Naming the parser explicitly avoids Beautiful Soup's "no parser was
    # explicitly specified" warning and keeps the result independent of which
    # optional parsers happen to be installed.
    clean_html = BeautifulSoup(review, "html.parser").get_text()
    # NOTE: the character class is ASCII-only, so accented letters are treated
    # as separators (e.g. "Buñuel" becomes "bu uel").
    clean_non_letters = re.sub("[^a-zA-Z]", " ", clean_html)
    words = clean_non_letters.lower().split()
    # Set lookup is O(1) per word instead of scanning the stop-word list.
    return " ".join(w for w in words if w not in _stop_word_set)


df["cleaned_review"] = df["review"].apply(clean)
df

Unnamed: 0,id,sentiment,review,cleaned_review
0,5814_8,1,With all this stuff going down at the moment w...,stuff going moment mj started listening music ...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",classic war worlds timothy hines entertaining ...
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,film starts manager nicholas bell giving welco...
3,3630_4,0,It must be assumed that those who praised this...,must assumed praised film greatest filmed oper...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,superbly trashy wondrously unpretentious explo...
...,...,...,...,...
24995,3453_3,0,It seems like more consideration has gone into...,seems like consideration gone imdb reviews fil...
24996,5064_1,0,I don't believe they made this film. Completel...,believe made film completely unnecessary first...
24997,10905_3,0,"Guy is a loser. Can't get girls, needs to buil...",guy loser get girls needs build picked stronge...
24998,10194_3,0,This 30 minute documentary Buñuel made in the ...,minute documentary bu uel made early one spain...


In [8]:
# Hold out 30% of the cleaned reviews; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df["cleaned_review"],
    df["sentiment"],
    test_size=0.3,
    random_state=10,
)
# Report how many reviews ended up in each split.
print("Training set size:", X_train.shape)
print("Test set size:", X_test.shape)

Training set size: (17500,)
Test set size: (7500,)


In [9]:
# Tokenizer callable obtained from torchtext ('basic_english' variant); it is
# applied to each cleaned review string by tokenize_data below.
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')

In [10]:
def tokenize_data(sample, y, tokenizer, max_length=256):
    """Tokenize one text and bundle the tokens with their label.

    Args:
        sample: the text to tokenize.
        y: the label stored alongside the tokens.
        tokenizer: callable that turns `sample` into a sequence of tokens.
        max_length: keep at most this many leading tokens.

    Returns:
        A dict with keys 'tokens' (the truncated token sequence), 'length'
        (its size) and 'sentiment' (the label `y`, unchanged).
    """
    truncated = tokenizer(sample)[:max_length]
    record = dict(tokens=truncated, length=len(truncated), sentiment=y)
    return record

In [11]:
# Build one token record per (text, label) pair for each split.
train_data = [tokenize_data(*pair, tokenizer) for pair in zip(X_train, y_train)]
test_data = [tokenize_data(*pair, tokenizer) for pair in zip(X_test, y_test)]

In [18]:
# The vocabulary is fitted on the training records only, with two special
# tokens added for unknown words and padding.
vocab = torchtext.vocab.build_vocab_from_iterator(
    (record["tokens"] for record in train_data),
    specials=["<unk>", "<pad>"],
)
# Any token that is not in the vocabulary is looked up as "<unk>".
vocab.set_default_index(vocab["<unk>"])
len(vocab)

62237

In [19]:
# Attach the vocabulary index of every token to each training record.
for record in train_data:
    record["ids"] = [vocab[token] for token in record["tokens"]]

In [20]:
# Attach the vocabulary index of every token to each test record.
for record in test_data:
    record["ids"] = [vocab[token] for token in record["tokens"]]

In [21]:
class LSTM(nn.Module):
    """(Bi)LSTM sequence classifier: embedding -> LSTM -> dropout -> linear.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: hidden size of each LSTM direction.
        output_dim: number of output values (logits) per sample.
        n_layers: number of stacked LSTM layers.
        bidirectional: run the LSTM in both directions when true.
        dropout_rate: dropout probability applied to the embeddings, between
            stacked LSTM layers, and to the final hidden state.
        pad_index: row of the embedding table treated as padding (it receives
            no gradient). Defaults to 0, the value that was previously
            hard-coded. NOTE(review): the notebook builds its vocabulary with
            specials ['<unk>', '<pad>'], which presumably puts '<unk>' at
            index 0 and '<pad>' at index 1 -- confirm, and if so pass
            pad_index=vocab['<pad>'] so the '<unk>' row is not frozen.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional,
                 dropout_rate, pad_index=0):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_index)
        # Inter-layer dropout only exists when layers are stacked; requesting
        # it for a single layer has no effect and makes PyTorch emit a warning.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, bidirectional=bidirectional,
                            dropout=dropout_rate if n_layers > 1 else 0.0, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, ids, length):
        """Return one prediction row per sequence in the batch.

        Args:
            ids: token indices, [batch size, seq len].
            length: true (unpadded) length of every sequence, [batch size];
                a list or a tensor.

        Returns:
            Tensor of shape [batch size, output dim].
        """
        if isinstance(length, torch.Tensor):
            # pack_padded_sequence requires the lengths to live on the CPU.
            length = length.cpu()
        embedded = self.dropout(self.embedding(ids))
        # embedded = [batch size, seq len, embedding dim]
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, length, batch_first=True,
                                                            enforce_sorted=False)
        # Only the final hidden state feeds the classifier, so the packed
        # per-step output is not unpadded (the previous unpadding call also
        # omitted batch_first=True and its result was never used).
        _, (hidden, _) = self.lstm(packed_embedded)
        # hidden = [n layers * n directions, batch size, hidden dim]
        if self.lstm.bidirectional:
            # The last two entries are the top layer's two directions; the
            # concatenation order is kept as-is so trained weights stay valid.
            hidden = self.dropout(torch.cat([hidden[-1], hidden[-2]], dim=-1))
            # hidden = [batch size, hidden dim * 2]
        else:
            hidden = self.dropout(hidden[-1])
            # hidden = [batch size, hidden dim]
        prediction = self.fc(hidden)
        # prediction = [batch size, output dim]
        return prediction

In [25]:
# Hyperparameters of the classifier; the embedding table has one row per vocabulary entry.
model_config = dict(
    vocab_size=len(vocab),
    embedding_dim=300,
    hidden_dim=300,
    output_dim=2,
    n_layers=2,
    bidirectional=True,
    dropout_rate=0.5,
)
model = LSTM(**model_config)

In [26]:
def initialize_weights(m):
    """Re-initialise the parameters of one module in place.

    Linear layers get Xavier-normal weights and zero biases. LSTM layers get
    orthogonal weights and zero biases. Every other module type is left
    untouched. Intended to be passed to `Module.apply`.
    """
    if isinstance(m, nn.Linear):
        nn.init.xavier_normal_(m.weight)
        nn.init.zeros_(m.bias)
        return
    if not isinstance(m, nn.LSTM):
        return
    # LSTM parameters are told apart by their names.
    for param_name, tensor in m.named_parameters():
        if 'bias' in param_name:
            nn.init.zeros_(tensor)
        elif 'weight' in param_name:
            nn.init.orthogonal_(tensor)
                
# Run the initializer over the model's modules; being the last expression in
# the cell, the value returned by apply() is what gets displayed.
model.apply(initialize_weights)

LSTM(
  (embedding): Embedding(62237, 300, padding_idx=0)
  (lstm): LSTM(300, 300, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=600, out_features=2, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [27]:
# Fetch one pretrained FastText vector per vocabulary entry, in vocabulary
# index order, so row i lines up with token id i.
vectors = torchtext.vocab.FastText()
pretrained_embedding = vectors.get_vecs_by_tokens(vocab.get_itos())
# Copy the values into the existing parameter instead of rebinding `.data`:
# copy_ raises on a shape mismatch (e.g. wrong embedding_dim), works whichever
# device the model is on, and does not alias `pretrained_embedding`'s storage.
with torch.no_grad():
    model.embedding.weight.copy_(pretrained_embedding)

In [28]:
# Use the first GPU when one is available, otherwise fall back to the CPU so
# the notebook still runs on machines without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [29]:
# Cross-entropy loss on the model's outputs, optimised with Adam over all model parameters.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)

In [30]:
def collate(batch, pad_index=0):
    """Merge a list of sample dicts into one padded batch.

    Args:
        batch: list of dicts, each with keys "ids" (list of token indices),
            "length" (number of tokens) and "sentiment" (label).
        pad_index: value used to right-pad shorter id sequences. Defaults to
            0, the value that was previously hard-coded. NOTE(review): the
            notebook's vocabulary is built with specials ['<unk>', '<pad>'],
            so index 0 is presumably '<unk>' -- confirm; to pad with the real
            padding token use functools.partial(collate,
            pad_index=vocab['<pad>']) as the DataLoader's collate_fn.

    Returns:
        Dict with "ids" (long tensor, [batch size, longest sequence]),
        "length" (list of the unpadded lengths) and "label" (long tensor,
        [batch size]).
    """
    # Explicit long dtype: an empty id list would otherwise become a float tensor.
    id_tensors = [torch.tensor(sample["ids"], dtype=torch.long) for sample in batch]
    batch_ids = nn.utils.rnn.pad_sequence(id_tensors, padding_value=pad_index, batch_first=True)
    batch_length = [sample["length"] for sample in batch]
    batch_label = torch.tensor([sample["sentiment"] for sample in batch], dtype=torch.long)
    return {"ids": batch_ids, "length": batch_length, "label": batch_label}

In [31]:
# Both loaders share the batch size and the padding collate function; only the
# training loader reshuffles its samples.
loader_options = dict(batch_size=64, collate_fn=collate)
train_dataloader = torch.utils.data.DataLoader(train_data, shuffle=True, **loader_options)
test_dataloader = torch.utils.data.DataLoader(test_data, **loader_options)

In [32]:
def get_accuracy(prediction, label):
    """Fraction of rows whose highest-scoring class equals the label.

    Args:
        prediction: 2-D tensor of per-class scores, one row per sample.
        label: tensor of class indices, one per sample.

    Returns:
        A scalar tensor: number of correct rows divided by the number of rows.
    """
    n_samples, _ = prediction.shape
    hits = (prediction.argmax(dim=-1) == label).sum()
    return hits / n_samples

In [33]:
def train(dataloader, model, criterion, optimizer, device):
    """Run one optimisation pass over `dataloader`.

    Args:
        dataloader: yields dicts with 'ids', 'length' and 'label' entries.
        model: called as model(ids, length); put into training mode here.
        criterion: loss called as criterion(prediction, label).
        optimizer: optimizer stepped once per batch.
        device: device the ids and labels are moved to.

    Returns:
        (losses, accuracies): two lists holding one float per batch.
    """
    model.train()
    batch_losses, batch_accs = [], []

    progress = tqdm.tqdm(dataloader, desc='training...', file=sys.stdout)
    for batch in progress:
        ids, label = batch['ids'].to(device), batch['label'].to(device)
        # Lengths are passed through as they come out of the loader.
        prediction = model(ids, batch['length'])
        loss = criterion(prediction, label)
        accuracy = get_accuracy(prediction, label)

        # Standard update: clear old gradients, backpropagate, step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())
        batch_accs.append(accuracy.item())

    return batch_losses, batch_accs

In [34]:
def evaluate(dataloader, model, criterion, device):
    """Score the model on `dataloader` without updating it.

    Args:
        dataloader: yields dicts with 'ids', 'length' and 'label' entries.
        model: called as model(ids, length); put into evaluation mode here.
        criterion: loss called as criterion(prediction, label).
        device: device the ids and labels are moved to.

    Returns:
        (losses, accuracies): two lists holding one float per batch.
    """
    model.eval()
    batch_losses, batch_accs = [], []

    progress = tqdm.tqdm(dataloader, desc='evaluating...', file=sys.stdout)
    # Gradient tracking is switched off for the whole pass.
    with torch.no_grad():
        for batch in progress:
            ids, label = batch['ids'].to(device), batch['label'].to(device)
            prediction = model(ids, batch['length'])
            batch_losses.append(criterion(prediction, label).item())
            batch_accs.append(get_accuracy(prediction, label).item())

    return batch_losses, batch_accs

In [35]:
n_epochs = 30
# Lowest mean held-out loss seen so far, the (1-based) epoch it occurred in,
# and a copy of the model weights from that epoch.
best_valid_loss = float('inf')
best_epoch = None
best_state = None

# Per-batch histories accumulated across all epochs.
train_losses = []
train_accs = []
test_losses = []
test_accs = []

for epoch in range(n_epochs):

    train_loss, train_acc = train(train_dataloader, model, criterion, optimizer, device)
    test_loss, test_acc = evaluate(test_dataloader, model, criterion, device)

    train_losses.extend(train_loss)
    train_accs.extend(train_acc)
    test_losses.extend(test_loss)
    test_accs.extend(test_acc)

    # Epoch-level figures are the unweighted mean of the per-batch values.
    epoch_train_loss = np.mean(train_loss)
    epoch_train_acc = np.mean(train_acc)
    epoch_test_loss = np.mean(test_loss)
    epoch_test_acc = np.mean(test_acc)

    # Keep the weights of the best epoch so they can be restored afterwards
    # with model.load_state_dict(best_state); later epochs may overfit.
    # NOTE(review): the held-out split used here is the test split, so
    # selecting on it makes the reported test figures optimistic -- a
    # separate validation split would be cleaner.
    if epoch_test_loss < best_valid_loss:
        best_valid_loss = epoch_test_loss
        best_epoch = epoch + 1
        best_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}

    print(f'epoch: {epoch+1}')
    print(f'train_loss: {epoch_train_loss:.3f}, train_acc: {epoch_train_acc:.3f}')
    print(f'test_loss: {epoch_test_loss:.3f}, test_acc: {epoch_test_acc:.3f}')

training...: 100%|██████████| 274/274 [00:25<00:00, 10.79it/s]
evaluating...: 100%|██████████| 118/118 [00:02<00:00, 44.52it/s]
epoch: 1
train_loss: 0.478, train_acc: 0.765
test_loss: 0.385, test_acc: 0.830
training...: 100%|██████████| 274/274 [00:24<00:00, 10.96it/s]
evaluating...: 100%|██████████| 118/118 [00:02<00:00, 47.81it/s]
epoch: 2
train_loss: 0.318, train_acc: 0.867
test_loss: 0.297, test_acc: 0.880
training...: 100%|██████████| 274/274 [00:25<00:00, 10.69it/s]
evaluating...: 100%|██████████| 118/118 [00:02<00:00, 39.45it/s]
epoch: 3
train_loss: 0.231, train_acc: 0.911
test_loss: 0.282, test_acc: 0.882
training...: 100%|██████████| 274/274 [00:25<00:00, 10.67it/s]
evaluating...: 100%|██████████| 118/118 [00:02<00:00, 42.18it/s]
epoch: 4
train_loss: 0.170, train_acc: 0.935
test_loss: 0.308, test_acc: 0.888
training...: 100%|██████████| 274/274 [00:25<00:00, 10.67it/s]
evaluating...: 100%|██████████| 118/118 [00:02<00:00, 41.01it/s]
epoch: 5
train_loss: 0.127, train_acc: 0.954