In [1]:
from src.model import get_dataset, Trainer, RNN, LSTM
import torch.optim as optim
import torch.nn as nn

# Preparing Data

In [3]:
# Load the dataset bundle. Later cells index it by "train_data", "TEXT" and
# "LABEL"; the call itself prints the split and vocabulary sizes shown below.
dataset = get_dataset()

Number of training examples: 17500
Number of validation examples: 7500
Number of testing examples: 25000
Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABEL vocabulary: 2


In [7]:
# Dump the attributes of the first training example (its token list, etc.).
first_example = dataset["train_data"].examples[0]
print(vars(first_example))

{'text': ['How', 'can', 'a', 'movie', 'with', 'Amy', ',', 'Posey', 'and', 'Raechel', 'have', 'NOTHING', 'funny', 'in', 'it', '?', 'Believe', 'it', 'or', 'not', "'", 'House', 'Bunny', "'", 'did', 'this', 'better', 'and', 'funnier', '.', 'Hopefully', 'the', 'principals', 'had', 'a', 'good', 'holiday', 'and', 'got', 'some', 'money', '-', 'this', 'movie', 'is', 'an', 'embarrassment', 'to', 'all', 'of', 'them', '.', 'It', 'is', 'a', 'cliché', 'from', 'beginning', 'to', 'end', '.', 'Clichés', 'can', 'work', 'well', 'with', 'a', 'script', ',', 'or', 'at', 'least', 'an', 'idea', '.', 'This', 'movie', 'does', 'nothing', 'but', 'use', 'cliché', 'after', 'cliché', 'rather', 'than', 'ideas', 'or', 'script', '.', 'It', 'uses', 'the', 'preexisting', 'persona', "'s", 'of', 'the', 'actresses', 'rather', 'than', 'develop', 'characters', '.', 'Bad', ',', 'sad', ',', 'and', 'rubbish', '.', 'Now', 'I', 'apparently', 'have', 'to', 'have', 'ten', 'lines', 'of', 'text', 'for', 'a', 'comment', '.', 'Really', 

In [8]:
# The 20 most frequent tokens in the text vocabulary, with their counts.
token_counts = dataset["TEXT"].vocab.freqs
print(token_counts.most_common(20))

[('the', 202820), (',', 192918), ('.', 164754), ('and', 109435), ('a', 109381), ('of', 101102), ('to', 93825), ('is', 76552), ('in', 61403), ('I', 54228), ('it', 53519), ('that', 49356), ('"', 44224), ("'s", 43374), ('this', 42438), ('-', 37284), ('/><br', 35417), ('was', 34920), ('as', 30245), ('with', 29869)]


In [9]:
# First ten entries of the index -> token table of the text vocabulary.
index_to_token = dataset["TEXT"].vocab.itos
print(index_to_token[:10])

['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is']


In [10]:
# Label -> index mapping used for the targets.
label_to_index = dataset["LABEL"].vocab.stoi
print(label_to_index)

defaultdict(None, {'neg': 0, 'pos': 1})


# RNN

In [11]:
# Hyper-parameters for the plain RNN baseline.
INPUT_DIM = len(dataset["TEXT"].vocab)  # size of the text vocabulary
EMBEDDING_DIM, HIDDEN_DIM = 100, 256
OUTPUT_DIM = 1  # one output value per example; paired with BCEWithLogitsLoss below
model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

In [12]:
# Binary classification on raw logits.
criterion = nn.BCEWithLogitsLoss()
# Adam over the parameters of the model defined just above. The optimizer keeps
# references to exactly these parameters, so it must be rebuilt whenever `model`
# is replaced by a different network.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [16]:
# Output directory handed to the Trainer.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
dump_dir = "/home/jupyter/wl_research/wl_research_challenge/dump_path"
trainer = Trainer(model, dataset, optimizer, criterion, dump_path=dump_dir)

The model has 2,592,105 trainable parameters


In [18]:
# Train the RNN for five epochs; per-epoch train/validation metrics are printed.
trainer.train(n_epoch=5)

Epoch: 01 | Epoch Time: 0m 19s
	Train Loss: 0.708 | Train Acc: 49.45%
	 Val. Loss: 0.700 |  Val. Acc: 50.36%
Epoch: 02 | Epoch Time: 0m 20s
	Train Loss: 0.696 | Train Acc: 49.69%
	 Val. Loss: 0.696 |  Val. Acc: 50.60%
Epoch: 03 | Epoch Time: 0m 19s
	Train Loss: 0.696 | Train Acc: 49.42%
	 Val. Loss: 0.697 |  Val. Acc: 50.44%
Epoch: 04 | Epoch Time: 0m 20s
	Train Loss: 0.696 | Train Acc: 49.90%
	 Val. Loss: 0.695 |  Val. Acc: 51.12%
Epoch: 05 | Epoch Time: 0m 20s
	Train Loss: 0.696 | Train Acc: 50.05%
	 Val. Loss: 0.701 |  Val. Acc: 49.74%


In [19]:
# Report loss and accuracy on the test data (see the recorded output below).
# NOTE(review): the recorded 56.00% test accuracy sits well above the ~50%
# train/validation accuracy — confirm test() evaluates the full test split.
trainer.test()

Test Loss: 0.688 | Test Acc: 56.00%


In [20]:
# Obtain a callable that scores a raw sentence; the cells below call it with a
# string and print the returned float.
predict = trainer.get_predict_sentiment()

In [21]:
# Score a neutral-sounding question.
fire_score = predict("What can you do with the fire ?")
print(fire_score)

0.46443313360214233


In [22]:
# Score a clearly positive statement.
nlp_score = predict("NLP is awesome !")
print(nlp_score)

0.572141706943512


In [23]:
# Two sentences that differ only in their first word, scored one after the other.
for review in (
    "Corona-virus has killed the whole world, the economy is suffering, empires have fallen, what a disaster !",
    "Gone has killed the whole world, the economy is suffering, empires have fallen, what a disaster !",
):
    print(predict(review))

0.6057085394859314
0.45355963706970215


# LSTM

In [24]:
# Hyper-parameters for the two-layer bidirectional LSTM.
text_field = dataset["TEXT"]
INPUT_DIM = len(text_field.vocab)
EMBEDDING_DIM, HIDDEN_DIM = 100, 256
OUTPUT_DIM = 1
N_LAYERS, BIDIRECTIONAL = 2, True
DROPOUT = 0.5
# Vocabulary index of the padding token.
PAD_IDX = text_field.vocab.stoi[text_field.pad_token]

model = LSTM(
    INPUT_DIM,
    EMBEDDING_DIM,
    HIDDEN_DIM,
    OUTPUT_DIM,
    N_LAYERS,
    BIDIRECTIONAL,
    DROPOUT,
    PAD_IDX,
)

In [25]:
# Build a fresh optimizer for the LSTM. The `optimizer` created earlier was
# constructed from the RNN's parameters, so reusing it here would keep stepping
# the old network and leave the LSTM's weights untouched.
optimizer = optim.Adam(model.parameters(), lr=1e-3)
# NOTE(review): same dump_path as the RNN run — confirm the Trainer does not
# overwrite files written for the previous model.
trainer = Trainer(model, dataset, optimizer, criterion, dump_path="/home/jupyter/wl_research/wl_research_challenge/dump_path")

The model has 4,810,857 trainable parameters


In [None]:
# Train the LSTM for five epochs (this cell has no recorded output).
trainer.train(n_epoch=5)

# CNN

In [29]:
from src.model import CNN, CNN1d

In [30]:
# Hyper-parameters for the convolutional classifier.
cnn_text_field = dataset["TEXT"]
INPUT_DIM = len(cnn_text_field.vocab)
EMBEDDING_DIM = 100
N_FILTERS, FILTER_SIZES = 100, [3, 4, 5]
OUTPUT_DIM = 1
DROPOUT = 0.5
# Vocabulary index of the padding token.
PAD_IDX = cnn_text_field.vocab.stoi[cnn_text_field.pad_token]
# CNN1d is called with the same argument list and can be swapped in here.
model = CNN(
    INPUT_DIM,
    EMBEDDING_DIM,
    N_FILTERS,
    FILTER_SIZES,
    OUTPUT_DIM,
    DROPOUT,
    PAD_IDX,
)

In [31]:
# Build a fresh optimizer for the CNN. An optimizer only updates the parameters
# it was constructed with; the existing `optimizer` was built for an earlier
# model, so passing it here would never update the CNN's weights.
optimizer = optim.Adam(model.parameters(), lr=1e-3)
# NOTE(review): same dump_path as the previous runs — confirm the Trainer does
# not overwrite files written for the earlier models.
trainer = Trainer(model, dataset, optimizer, criterion, dump_path="/home/jupyter/wl_research/wl_research_challenge/dump_path")

The model has 2,620,801 trainable parameters


In [34]:
# NOTE(review): this import only binds `F` in the notebook's own namespace. The
# NameError recorded below was raised anyway, so `F` is presumably looked up
# inside another module (likely src.model, where CNN and Trainer come from).
# Add `import torch.nn.functional as F` there and re-run — TODO confirm.
import torch.nn.functional as F
# Train the CNN for five epochs.
trainer.train(n_epoch = 5)

NameError: name 'F' is not defined

In [36]:
from transformers import BertTokenizer, BertModel
from src.model import BERTGRUSentiment

In [37]:
# Load the pretrained tokenizer and encoder from the same checkpoint
# (downloaded on first use, as the progress bars below show).
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
bert = BertModel.from_pretrained(bert_checkpoint)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




In [38]:
# Hyper-parameters for the classifier built on top of the pretrained BERT encoder.
HIDDEN_DIM, OUTPUT_DIM = 256, 1
N_LAYERS, BIDIRECTIONAL = 2, True
DROPOUT = 0.25
model = BERTGRUSentiment(
    bert,
    HIDDEN_DIM,
    OUTPUT_DIM,
    N_LAYERS,
    BIDIRECTIONAL,
    DROPOUT,
)

In [39]:
# Build a fresh optimizer for the BERT-based model. The existing `optimizer` was
# constructed from an earlier model's parameters and would never update this one.
# NOTE(review): lr is kept at the notebook's 1e-3 for consistency; with ~112M
# trainable parameters reported, consider freezing BERT or lowering the rate.
optimizer = optim.Adam(model.parameters(), lr=1e-3)
# NOTE(review): `dataset` is still the one built at the top of the notebook and
# `tokenizer` is never passed anywhere in the visible cells. The ValueError
# recorded for the training cell (target size 64 vs. input size 621) suggests
# the batches are not in the layout or tokenisation this model expects — the
# dataset presumably has to be rebuilt with the BERT tokenizer; confirm against
# get_dataset and BERTGRUSentiment.
trainer = Trainer(model, dataset, optimizer, criterion, dump_path="/home/jupyter/wl_research/wl_research_challenge/dump_path")

The model has 112,241,409 trainable parameters


In [41]:
# Train the BERT-based model for five epochs.
trainer.train(n_epoch=5)

ValueError: Target size (torch.Size([64])) must be the same as input size (torch.Size([621]))