In [283]:
import numpy as np
import torch
import torch.nn as nn

In [284]:
# use GPU in PyTorch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [285]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('GPU is available')
else:
    device = torch.device("cpu")
    print("No GPU found, using CPU")

GPU is available


In [286]:
#!pip install 'portalocker>=2.0.0'

In [287]:
!pip install datasets




In [288]:
from datasets import load_dataset

# Load the IMDB dataset
imdb_dataset = load_dataset("imdb")

# Access the train and test splits
train_set = imdb_dataset["train"]
test_set = imdb_dataset["test"]

# Print a sample from the train set
print(train_set[0])


{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [289]:
#!pip install --upgrade torchtext

In [290]:
print(tuple(train_set[0].values()))

('I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, e

In [291]:
train_set1 = []
for i in train_set:
  train_set1.append(tuple(i.values()))


In [292]:
train_set2 = [(t[1], t[0]) for t in train_set1]


In [293]:
print(train_set2[0])

(0, 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between

In [294]:
from torch.utils.data.dataset import random_split
torch.manual_seed(1)
train_dataset, valid_dataset = random_split(list(train_set2), [20000, 5000])


In [295]:
import re
from collections import Counter, OrderedDict

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emotions = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emotions).replace('-', '')
    return text.split()

token_counts = Counter()
for label,line in train_dataset:
    tokens = tokenizer(line)
    token_counts.update(tokens)

print("Total number of tokens in the training dataset: ", len(token_counts))

Total number of tokens in the training dataset:  69023


In [296]:
#pip install torchtext
from torchtext.vocab import vocab
sorted_by_freq_tuples = sorted(token_counts.items(), key=lambda x: x[1], reverse=True)

ordered_dict = OrderedDict(sorted_by_freq_tuples)
vocab = vocab(ordered_dict)
vocab.insert_token('<pad>', 0)
vocab.insert_token('<unk>', 1)
vocab.set_default_index(1)


In [297]:
print([vocab[token] for token in ['this', 'is', 'a', 'test']])

[11, 7, 4, 2297]


In [298]:
text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]
label_pipeline = lambda x: 0 if x == 'neg' else 1

In [299]:
def collate_batch(batch):
    label_list, text_list, lengths = [], [], []
    for _label, _text in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        lengths.append(processed_text.size(0))
    label_list = torch.tensor(label_list)
    lengths = torch.tensor(lengths)
    padded_text_list = nn.utils.rnn.pad_sequence(text_list, batch_first=True)
    return  padded_text_list,label_list, lengths


In [300]:
from torch.utils.data import DataLoader
dataloader = DataLoader(train_dataset, batch_size=8, shuffle=False, collate_fn = collate_batch )


In [301]:
text_batch, label_batch, length_batch = next(iter(dataloader))
print("Text batch size: ", length_batch.size())

Text batch size:  torch.Size([8])


In [302]:
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn = collate_batch)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, collate_fn = collate_batch)
val_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn = collate_batch)


In [303]:
print(len(next(iter(train_loader))))

3


In [304]:
embedding = nn.Embedding(num_embeddings = 20, embedding_dim = 5, padding_idx = 0)
input = torch.LongTensor([[1,2,4,5],[4,3,2,9]])
output = embedding(input)
print(output)

tensor([[[-0.3446,  1.0529,  0.2161, -0.4643,  0.3046],
         [-0.0251, -0.5973, -1.0290, -0.3284,  0.8326],
         [ 1.5575, -1.6080, -0.5036,  1.1374,  0.5263],
         [ 1.4874,  1.7950,  0.3312,  0.5808,  1.2903]],

        [[ 1.5575, -1.6080, -0.5036,  1.1374,  0.5263],
         [-0.5173, -0.9281,  1.6486, -0.2286,  0.0294],
         [-0.0251, -0.5973, -1.0290, -0.3284,  0.8326],
         [ 1.7383, -0.5254,  1.0608, -0.5329, -0.7423]]],
       grad_fn=<EmbeddingBackward0>)


In [305]:
class RNN(nn.Module):
    def __init__ (self, input_size, hidden_size):
        super().__init__()
        self.rnn = nn.RNN(input_size, hidden_size, num_layers = 2, batch_first = True)
        self.fc = nn.Linear(hidden_size, 1)
    def forward(self, x):
        _, hidden = self.rnn(x)
        s, b, h = x.shape
        out = hidden[-1,:,:]
        out = self.fc(out)
        return out

model = RNN(64,32)
print(model)

RNN(
  (rnn): RNN(64, 32, num_layers=2, batch_first=True)
  (fc): Linear(in_features=32, out_features=1, bias=True)
)


In [306]:
model(torch.randn(5,3,64))

tensor([[-0.1283],
        [-0.0111],
        [-0.2183],
        [ 0.2983],
        [ 0.0711]], grad_fn=<AddmmBackward0>)

In [307]:
class RNN(nn.Module):
    def __init__ (self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding( vocab_size, embed_dim, padding_idx=0)
        self.rnn = nn.LSTM( embed_dim, rnn_hidden_size, batch_first=True)
        #self.fc = nn.LSTM(rnn_hidden_size, fc_hidden_size, batch_first=True)
        self.linear = nn.Linear(rnn_hidden_size,fc_hidden_size)
        self.leakyrelu = nn.ReLU()
        self.linear2 = nn.Linear(fc_hidden_size,1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, lengths):
        out = self.embedding(x)
        out = nn.utils.rnn.pack_padded_sequence(out, lengths.cpu().numpy(), batch_first=True, enforce_sorted=False)
        out, (hidden,cell) = self.rnn(out)
        out = hidden[-1,:,:]
        out = self.linear(out)
        out = self.leakyrelu(out)
        out = self.linear2(out)
        out = self.sigmoid(out)
        return out

In [308]:
vocab_size = len(vocab)
embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64
torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size).to(device)
print(model)

RNN(
  (embedding): Embedding(69025, 20, padding_idx=0)
  (rnn): LSTM(20, 64, batch_first=True)
  (linear): Linear(in_features=64, out_features=64, bias=True)
  (leakyrelu): ReLU()
  (linear2): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [309]:
def train(dataloader):
    model.train()
    total_loss = 0
    total_acc= 0
    for text_batch, label_batch, lengths in dataloader:
        optimizer.zero_grad()
        text_batch = text_batch.to(device)
        label_batch = label_batch.to(device)
        lengths = lengths.to(device)
        output = model(text_batch, lengths)[:,0]

        loss = loss_fn(output.float(), label_batch.float())

        loss.backward()
        optimizer.step()
        total_loss += loss.item()*label_batch.size(0)
        total_acc += ((output >= 0.5).float() == label_batch).float().sum().item()
        #print(total_acc,total_loss)
    return total_loss / len(dataloader), total_acc / len(dataloader.dataset)

In [310]:
def evaluate(dataloader):
    model.eval()
    total_loss = 0
    total_acc = 0
    with torch.no_grad():
        for text_batch, label_batch, lengths in dataloader:
            text_batch, label_batch = text_batch.to(device), label_batch.to(device)
            output = model(text_batch, lengths)[:,0]
            loss = loss_fn(output.float(), label_batch.float())
            total_loss += loss.item()*label_batch.size(0)
            total_acc += ((output >= 0.5).float() == label_batch).float().sum().item()
    return total_loss / len(dataloader), total_acc / len(dataloader.dataset)

In [311]:
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [312]:
num_epoch = 1
torch.manual_seed(1)
for epoch in range(num_epoch):
    loss_train, acc_train = train(train_loader)
    loss_valid, acc_valid = evaluate(val_loader)
    print(f'Epoch {epoch} accuracy: {acc_train:.4f}'f' val_accuracy: {acc_valid:.4f}')

Epoch 0 accuracy: 0.9984 val_accuracy: 1.0000


In [313]:
 loss_test,acc_test = evaluate(test_loader)

In [314]:
print(f'test_accuracy: {acc_test:.4f}')

test_accuracy: 1.0000


In [None]:
path = 'imdb_RNN.pt'
torch.save(model, path)