In [50]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

import kaggle
from torch.utils.data import DataLoader, Dataset

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab

In [43]:
# donwload the data
!kaggle datasets download -p ../datasets -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Downloading imdb-dataset-of-50k-movie-reviews.zip to ../datasets
100%|██████████████████████████████████████| 25.7M/25.7M [00:14<00:00, 2.37MB/s]
100%|██████████████████████████████████████| 25.7M/25.7M [00:14<00:00, 1.92MB/s]


In [47]:
# unzip the data
!unzip -d ../datasets/imdb ../datasets/imdb-dataset-of-50k-movie-reviews.zip

Archive:  ../datasets/imdb-dataset-of-50k-movie-reviews.zip
  inflating: ../datasets/imdb/IMDB Dataset.csv  


In [48]:
!ls -la ../datasets/imdb/

total 64672
drwxrwxr-x 2 petruschka petruschka     4096 Sep  7 19:49  .
drwxrwxr-x 5 petruschka petruschka     4096 Sep  7 19:49  ..
-rw-rw-r-- 1 petruschka petruschka 66212309 Oct 19  2019 'IMDB Dataset.csv'


In [53]:
df = pd.read_csv('../datasets/imdb/IMDB Dataset.csv')

In [61]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [58]:
train_df = df[:30000]
val_df = df[30000:40000]
test_df = df[40000:]

In [62]:
train_df["sentiment"].value_counts()

positive    15015
negative    14985
Name: sentiment, dtype: int64

In [63]:
val_df["sentiment"].value_counts()

negative    5022
positive    4978
Name: sentiment, dtype: int64

In [64]:
test_df["sentiment"].value_counts()

positive    5007
negative    4993
Name: sentiment, dtype: int64

In [80]:
class IMDBDataset(Dataset):
    def __init__(self, df):
        self.length = len(df)
        self.reviews = df["review"].to_numpy()
        self.sentiments = df["sentiment"].to_numpy()
    
    def __len__(self):
        return self.length
    
    def __getitem__(self, idx):
        return self.reviews[idx], self.sentiments[idx]

In [87]:
train_dataset = IMDBDataset(train_df)
val_dataset = IMDBDataset(val_df)
test_dataset = IMDBDataset(test_df)

In [84]:
# create a tokenizer
tokenizer = get_tokenizer("basic_english")

In [85]:
tokenizer("How are you doing today?")

['how', 'are', 'you', 'doing', 'today', '?']

In [88]:
# create a vocabulary
# vocab expects with word
from collections import Counter, OrderedDict

In [91]:
counter = Counter()
for review, _ in train_dataset:
    counter.update(tokenizer(review))

In [93]:
print(len(counter))

112119


In [95]:
sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[1], reverse=True)
ordered_dict = OrderedDict(sorted_by_freq_tuples)

In [285]:
imdb_vocab = vocab(ordered_dict, min_freq=1, specials=["<pad>", "<unk>"], special_first=True)
imdb_vocab.set_default_index(1)

In [286]:
# reduced number of tokens
len(imdb_vocab)

112121

In [287]:
imdb_vocab(["what"])

[54]

In [344]:
def collate_fn(batch):
    token_ls, sentiment_ls, len_ls = [], [], []
    for review, sentiment in batch:
        tokens = [imdb_vocab[token] for token in tokenizer(review)]
        sentiment_idx = 1 if sentiment == 'positive' else 0
        token_ls.append(torch.tensor(tokens, dtype=torch.int64))
        len_ls.append(len(tokens))
        sentiment_ls.append(sentiment_idx)
    return nn.utils.rnn.pad_sequence(token_ls, batch_first=True), torch.tensor(sentiment_ls), torch.tensor(len_ls)

In [345]:
dl = DataLoader(train_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)

In [346]:
itr = iter(dl)

In [347]:
BATCH_SIZE=32
NUM_EMBEDDINGS=len(imdb_vocab)
EMBEDDING_DIM=20
LSTM_HIDDEN_SIZE=64
FC_HIDDEN_SIZE=64

In [348]:
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

In [349]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=NUM_EMBEDDINGS, embedding_dim=EMBEDDING_DIM, padding_idx=0)
        self.lstm = nn.LSTM(input_size=EMBEDDING_DIM, hidden_size=LSTM_HIDDEN_SIZE, num_layers=1, batch_first=True)
        self.fc1 = nn.Linear(in_features=LSTM_HIDDEN_SIZE, out_features=FC_HIDDEN_SIZE)
        self.fc2 = nn.Linear(in_features=FC_HIDDEN_SIZE, out_features=1)
    
    def forward(self, x, sizes):
        x = self.embedding(x)
        x = nn.utils.rnn.pack_padded_sequence(
            x, sizes.cpu().numpy(), enforce_sorted=False, batch_first=True
        )
        _, (h_n, _) = self.lstm(x)
        x = h_n[-1, ...]
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        
        return x

In [350]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [351]:
def track_performance(dataloader, model, criterion):
    # switch to evaluation mode
    model.eval()
    num_samples = 0
    num_correct = 0
    loss_sum = 0
    
    # no need to calculate gradients
    with torch.inference_mode():
        for batch_idx, (features, labels, sizes) in enumerate(dataloader):
            features = features.to(DEVICE)
            labels = labels.to(DEVICE).view(-1, 1).float()
            logits = model(features, sizes)
            probs = torch.sigmoid(logits)
                        
            predictions = (probs > 0.5).float()
            num_correct += (predictions == labels).sum().item()
            
            loss = criterion(logits, labels)
            loss_sum += loss.cpu().item()
            num_samples += len(features)
    
    # we return the average loss and the accuracy
    return loss_sum/num_samples, num_correct/num_samples


In [355]:
def train(num_epochs, train_dataloader, val_dataloader, model, criterion, optimizer, scheduler=None):
    history = {"train_loss": [], "val_loss": [], "train_acc": [], "val_acc": []}
    
    model.to(DEVICE)
    
    for epoch in range(num_epochs):
        for batch_idx, (features, labels, sizes) in enumerate(train_dataloader):
            model.train()
            features = features.to(DEVICE)
            labels = labels.to(DEVICE).view(-1, 1).float()
            
            # Empty the gradients
            optimizer.zero_grad()
            
            # Forward Pass
            logits = model(features, sizes)
            
            # Calculate Loss
            loss = criterion(logits, labels)
            
            # Backward Pass
            loss.backward()
            
            # Gradient Descent
            optimizer.step()
            
        train_loss, train_acc = track_performance(train_dataloader, model, criterion)
        val_loss, val_acc = track_performance(val_dataloader, model, criterion)

        if scheduler:
          scheduler.step(val_acc)

        history["train_loss"].append(train_loss)
        history["val_loss"].append(val_loss)
        history["train_acc"].append(train_acc)
        history["val_acc"].append(val_acc)

        print(f'Epoch: {epoch+1:>2}/{num_epochs} | Train Loss: {train_loss:.5f} | Val Loss: {val_loss:.5f} | Train Acc: {train_acc:.3f} | Val Acc: {val_acc:.3f}')
    return history            


In [356]:
model = Model()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                       factor=0.1,
                                                       mode='max',
                                                       patience=2,
                                                       verbose=True)

criterion = nn.BCEWithLogitsLoss()

In [357]:
history = train(15, train_dataloader, val_dataloader, model, criterion, optimizer, scheduler)

Epoch:  1/15 | Train Loss: 0.01862 | Val Loss: 0.01864 | Train Acc: 0.696 | Val Acc: 0.700
Epoch:  2/15 | Train Loss: 0.01583 | Val Loss: 0.01656 | Train Acc: 0.752 | Val Acc: 0.735
Epoch:  3/15 | Train Loss: 0.01139 | Val Loss: 0.01301 | Train Acc: 0.850 | Val Acc: 0.823
Epoch:  4/15 | Train Loss: 0.01244 | Val Loss: 0.01455 | Train Acc: 0.822 | Val Acc: 0.785
Epoch:  5/15 | Train Loss: 0.01405 | Val Loss: 0.01540 | Train Acc: 0.810 | Val Acc: 0.768
Epoch:  6/15 | Train Loss: 0.00737 | Val Loss: 0.01065 | Train Acc: 0.910 | Val Acc: 0.859
Epoch:  7/15 | Train Loss: 0.00614 | Val Loss: 0.00999 | Train Acc: 0.925 | Val Acc: 0.871
Epoch:  8/15 | Train Loss: 0.00468 | Val Loss: 0.00964 | Train Acc: 0.948 | Val Acc: 0.878
Epoch:  9/15 | Train Loss: 0.00349 | Val Loss: 0.00988 | Train Acc: 0.963 | Val Acc: 0.884
Epoch: 10/15 | Train Loss: 0.00304 | Val Loss: 0.00997 | Train Acc: 0.968 | Val Acc: 0.881
Epoch: 11/15 | Train Loss: 0.00219 | Val Loss: 0.01063 | Train Acc: 0.979 | Val Acc: 0.885