In [1]:
from src.model import get_dataset, Trainer, RNN, LSTM
import torch.optim as optim
import torch.nn as nn

## Preparing Data

In [2]:
# Build the dataset bundle via the project helper. Later cells index the result
# by the keys "train_data", "TEXT" and "LABEL"; the recorded output below lists
# the split sizes and vocabulary sizes printed during this call.
dataset = get_dataset()

Number of training examples: 17500
Number of validation examples: 7500
Number of testing examples: 25000
Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABEL vocabulary: 2


In [4]:
# Dump the attributes of the first training example. The recorded output shows a
# dict whose 'text' entry is a list of already-tokenized words and punctuation.
print(vars(dataset["train_data"].examples[0]))

{'text': ['How', 'can', 'a', 'movie', 'with', 'Amy', ',', 'Posey', 'and', 'Raechel', 'have', 'NOTHING', 'funny', 'in', 'it', '?', 'Believe', 'it', 'or', 'not', "'", 'House', 'Bunny', "'", 'did', 'this', 'better', 'and', 'funnier', '.', 'Hopefully', 'the', 'principals', 'had', 'a', 'good', 'holiday', 'and', 'got', 'some', 'money', '-', 'this', 'movie', 'is', 'an', 'embarrassment', 'to', 'all', 'of', 'them', '.', 'It', 'is', 'a', 'cliché', 'from', 'beginning', 'to', 'end', '.', 'Clichés', 'can', 'work', 'well', 'with', 'a', 'script', ',', 'or', 'at', 'least', 'an', 'idea', '.', 'This', 'movie', 'does', 'nothing', 'but', 'use', 'cliché', 'after', 'cliché', 'rather', 'than', 'ideas', 'or', 'script', '.', 'It', 'uses', 'the', 'preexisting', 'persona', "'s", 'of', 'the', 'actresses', 'rather', 'than', 'develop', 'characters', '.', 'Bad', ',', 'sad', ',', 'and', 'rubbish', '.', 'Now', 'I', 'apparently', 'have', 'to', 'have', 'ten', 'lines', 'of', 'text', 'for', 'a', 'comment', '.', 'Really', 

In [5]:
# Show the 20 most frequent tokens in the TEXT vocabulary with their counts.
# NOTE(review): the recorded output contains the token '/><br', which looks like
# an unstripped HTML line-break fragment -- confirm whether the tokenizer or a
# preprocessing step should remove it.
print(dataset["TEXT"].vocab.freqs.most_common(20))

[('the', 202820), (',', 192918), ('.', 164754), ('and', 109435), ('a', 109381), ('of', 101102), ('to', 93825), ('is', 76552), ('in', 61403), ('I', 54228), ('it', 53519), ('that', 49356), ('"', 44224), ("'s", 43374), ('this', 42438), ('-', 37284), ('/><br', 35417), ('was', 34920), ('as', 30245), ('with', 29869)]


In [6]:
# First ten entries of the index-to-token table. Per the recorded output,
# indices 0 and 1 hold the special tokens '<unk>' and '<pad>'.
print(dataset["TEXT"].vocab.itos[:10])

['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is']


In [7]:
# Label-to-index mapping. The recorded output shows 'neg' -> 0 and 'pos' -> 1.
print(dataset["LABEL"].vocab.stoi)

defaultdict(None, {'neg': 0, 'pos': 1})


In [3]:
# Model hyperparameters. INPUT_DIM is the size of the TEXT vocabulary, so it
# tracks whatever vocabulary get_dataset() built.
INPUT_DIM = len(dataset["TEXT"].vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
# A single output value per input, paired with the binary loss defined below.
OUTPUT_DIM = 1
model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

# Adam over every model parameter with a learning rate of 1e-3.
optimizer = optim.Adam(model.parameters(), lr=1e-3)
# BCEWithLogitsLoss applies the sigmoid itself, so it expects raw logits.
# NOTE(review): presumably RNN returns unnormalized scores -- confirm in src.model.
criterion = nn.BCEWithLogitsLoss()

# Project-level Trainer wired with the model, dataset, optimizer and loss.
trainer = Trainer(model, dataset, optimizer, criterion)

The model has 2,592,105 trainable parameters


In [4]:
# Train for a single epoch.
# NOTE(review): the recorded output shows train/validation accuracy of roughly
# 50% after this run, i.e. about chance level for two classes -- one epoch looks
# insufficient for this model; consider raising N_EPOCHS.
trainer.train(N_EPOCHS = 1)

Epoch: 01 | Epoch Time: 0m 20s
	Train Loss: 0.708 | Train Acc: 49.45%
	 Val. Loss: 0.700 |  Val. Acc: 50.36%


In [5]:
# Evaluate the trained model; the recorded output reports a test loss and a
# test accuracy (43.73% in the saved run, below chance for two classes).
trainer.test()

Test Loss: 0.716 | Test Acc: 43.73%


In [6]:
# Obtain the prediction callable from the trainer; the cells below call it with
# a raw sentence string and print the returned number.
# NOTE(review): recorded results lie between 0 and 1, so this is presumably a
# sigmoid score where higher means 'pos' -- confirm in src.model.
predict = trainer.get_predict_sentiment()

In [7]:
# Score one sample sentence (recorded result: about 0.45).
print(predict("What can you do with the fire ?"))

0.453779399394989


In [8]:
# Score a second sample sentence (recorded result: about 0.58).
print(predict("NLP is awesome !"))

0.5793689489364624


In [11]:
# Score two sentences that differ only in their first word ("Corona-virus" vs
# "Gone"). In the recorded run the scores differ (about 0.43 vs 0.56), so a
# single leading token noticeably shifts the prediction.
for sentence in (
    "Corona-virus has killed the whole world, the economy is suffering, empires have fallen, what a disaster !",
    "Gone has killed the whole world, the economy is suffering, empires have fallen, what a disaster !",
):
    print(predict(sentence))

0.4317341446876526
0.5609487891197205
