# 1. Preparing data

In [1]:
import torch
from torch import nn
import torch.optim as optim
from model import RNN
from functools import partial
from torch.utils.data import DataLoader
from torchtext.datasets import IMDB
from preprocessing import get_total_tokens_from_files, create_vocab, get_total_tokens, save_total_tokens

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
SEED = 1234
torch.manual_seed(1234)

<torch._C.Generator at 0x7f10209e7c30>

In [3]:
def apply_prefix(task, x):
    return f"{task}: " + x[0], x[1]

def process_labels(labels, x):
    return x[1], labels[str(x[0])]

imdb_batch_size = 3
task = "sst2 sentence"
labels = {"1": "negative", "2": "positive"}

# Test set
imdb_datapipe_test = IMDB(split="test")
imdb_datapipe_test = imdb_datapipe_test.map(partial(process_labels, labels))
imdb_datapipe_test = imdb_datapipe_test.map(partial(apply_prefix, task))
imdb_datapipe_test = imdb_datapipe_test.batch(imdb_batch_size)
imdb_datapipe_test = imdb_datapipe_test.rows2columnar(["text", "label"])
imdb_dataloader_test = DataLoader(imdb_datapipe_test, batch_size=None)

# Train set
imdb_datapipe_train = IMDB(split="train")
imdb_datapipe_train = imdb_datapipe_train.map(partial(process_labels, labels))
imdb_datapipe_train = imdb_datapipe_train.map(partial(apply_prefix, task))
imdb_datapipe_train = imdb_datapipe_train.batch(imdb_batch_size)
imdb_datapipe_train = imdb_datapipe_train.rows2columnar(["text", "label"])
imdb_dataloader_train = DataLoader(imdb_datapipe_train, batch_size=None)

print(imdb_dataloader_train)
# for batch in imdb_dataloader:
#     print(batch["label"])

<torch.utils.data.dataloader.DataLoader object at 0x7f0f31819250>


In [4]:
# ! Do not delete !

# total_tokens = get_total_tokens(imdb_dataloader_train)
# save_total_tokens(total_tokens)

total_tokens = get_total_tokens_from_files()

In [5]:
# ! One hot encode instead ?
vocab = create_vocab(total_tokens)

# 2. Model

In [6]:
INPUT_DIM = len(vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

In [7]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,592,005 trainable parameters


# 3. Train the model 

In [8]:
optimizer = optim.SGD(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss().to(device)

def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [9]:
from preprocessing import tokenize


def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        print("batch => ", batch)
        
        optimizer.zero_grad()
                
        # predictions = model(batch.text).squeeze(1)
        tokens = tokenize(batch["text"][0][15:])
        final_list = []
        for token in tokens:
            final_list.append(vocab[token])
        
        print("final list ==============> ",final_list)
        
        # predictions = model(batch["text"][0][15:])
        predictions = model(batch["text"][0][15:])


        # ! bidouillage
        posCount = 0
        negCount = 0
        for label in batch["label"]:
            if label == 'negative':
                negCount += 1
            elif label == 'positive':
                posCount +=1

        if posCount > negCount:
            label = 1
        else:
            label = 0
        # loss = criterion(predictions, batch.label)
        loss = criterion(predictions, label)

        
        # acc = binary_accuracy(predictions, batch.label)
        acc = binary_accuracy(predictions, label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [10]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [11]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [12]:
N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, imdb_dataloader_train, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, imdb_dataloader_test, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

batch =>  {'text': ['sst2 sentence: I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity

TypeError: embedding(): argument 'indices' (position 2) must be Tensor, not str