In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
def load_data():
    reviewsFile = open('../data/reviews.txt','r')
    reviews = list(map(lambda x:x[:-1],reviewsFile.readlines()))
    reviewsFile.close()

    labelsFile = open('../data/labels.txt','r')
    labels = list(map(lambda x:x[:-1],labelsFile.readlines()))
    labelsFile.close()
    
    return reviews,labels

In [3]:
reviews,labels = load_data()

In [4]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer("\w+\'?\w+|\w+")

In [5]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [6]:
from spacy.lang.en.stop_words import STOP_WORDS

In [7]:
exceptionStopWords = {
    'again',
    'against',
    'ain',
    'almost',
    'among',
    'amongst',
    'amount',
    'anyhow',
    'anyway',
    'aren',
    "aren't",
    'below',
    'bottom',
    'but',
    'cannot',
    'couldn',
    "couldn't",
    'didn',
    "didn't",
    'doesn',
    "doesn't",
    'don',
    "don't",
    'done',
    'down',
    'except',
    'few',
    'hadn',
    "hadn't",
    'hasn',
    "hasn't",
    'haven',
    "haven't",
    'however',
    'isn',
    "isn't",
    'least',
    'mightn',
    "mightn't",
    'move',
    'much',
    'must',
    'mustn',
    "mustn't",
    'needn',
    "needn't",
    'neither',
    'never',
    'nevertheless',
    'no',
    'nobody',
    'none',
    'noone',
    'nor',
    'not',
    'nothing',
    'should',
    "should've",
    'shouldn',
    "shouldn't",
    'too',
    'top',
    'up',
    'very'
    'wasn',
    "wasn't",
    'well',
    'weren',
    "weren't",
    'won',
    "won't",
    'wouldn',
    "wouldn't",
}

In [8]:
stop_words = set(stop_words).union(STOP_WORDS)

In [9]:
final_stop_words = stop_words-exceptionStopWords

In [10]:
import spacy
nlp = spacy.load("en",disable=['parser', 'tagger', 'ner'])

In [11]:
def make_token(review):
    return tokenizer.tokenize(str(review))

In [12]:
def remove_stopwords(review):
    return [token for token in review if token not in final_stop_words]

In [13]:
def lemmatization(review):
    lemma_result = []
    
    for words in review:
        doc = nlp(words)
        for token in doc:
            lemma_result.append(token.lemma_)
    return lemma_result

In [14]:
def pipeline(review):
    review = make_token(review)
    review = remove_stopwords(review)
    return lemmatization(review)

In [15]:
%%time
reviews = list(map(lambda review: pipeline(review),reviews))

CPU times: user 38.4 s, sys: 103 ms, total: 38.5 s
Wall time: 38.6 s


In [16]:
reviews[:2]

[['bromwell',
  'high',
  'cartoon',
  'comedy',
  'run',
  'time',
  'program',
  'school',
  'life',
  'teacher',
  'year',
  'teach',
  'profession',
  'lead',
  'believe',
  'bromwell',
  'high',
  'satire',
  'much',
  'close',
  'reality',
  'teacher',
  'scramble',
  'survive',
  'financially',
  'insightful',
  'student',
  'right',
  'pathetic',
  'teacher',
  'pomp',
  'pettiness',
  'situation',
  'remind',
  'school',
  'know',
  'student',
  'see',
  'episode',
  'student',
  'repeatedly',
  'try',
  'burn',
  'down',
  'school',
  'immediately',
  'recall',
  'high',
  'classic',
  'line',
  'inspector',
  'sack',
  'teacher',
  'student',
  'welcome',
  'bromwell',
  'high',
  'expect',
  'adult',
  'age',
  'think',
  'bromwell',
  'high',
  'far',
  'fetch',
  'pity',
  'isn'],
 ['story',
  'man',
  'unnatural',
  'feeling',
  'pig',
  'start',
  'open',
  'scene',
  'terrific',
  'example',
  'absurd',
  'comedy',
  'formal',
  'orchestra',
  'audience',
  'turn',
  '

In [17]:
from gensim.models import Word2Vec

In [18]:
embedding_dimension = 100

In [19]:
model = Word2Vec(reviews,size=embedding_dimension, window=3, min_count=3, workers=4)

In [20]:
model.sg

0

In [21]:
word_vectors = model.wv

In [22]:
del model

In [23]:
len(word_vectors.vocab)

28165

In [24]:
word_vectors.similar_by_word(word="good", topn=5)

[('decent', 0.7306010723114014),
 ('alright', 0.6821662187576294),
 ('darn', 0.658607542514801),
 ('okay', 0.6319257616996765),
 ('fair', 0.6208689212799072)]

In [25]:
word_vectors.similar_by_word(word="bad", topn=5)

[('horrible', 0.7087627053260803),
 ('terrible', 0.7078632116317749),
 ('lame', 0.6842957735061646),
 ('ritchie', 0.6836880445480347),
 ('awful', 0.6716997027397156)]

In [26]:
word_vectors.most_similar(positive="bad",topn=4)

[('horrible', 0.7087627053260803),
 ('terrible', 0.7078632116317749),
 ('lame', 0.6842957735061646),
 ('ritchie', 0.6836880445480347)]

In [27]:
word_vectors.similarity("good","bad")

0.57048476

In [28]:
word_vectors.similarity("good","be")

0.27348474

In [29]:
word_vectors.similar_by_word(word="school", topn=5)

[('college', 0.782921552658081),
 ('class', 0.7638683319091797),
 ('schooler', 0.7603098750114441),
 ('schoolers', 0.7321504950523376),
 ('doddsville', 0.7250348329544067)]

In [30]:
word_vectors.similar_by_word(word="comedy", topn=5)

[('humor', 0.6835606098175049),
 ('slapstick', 0.6748438477516174),
 ('satire', 0.6725203990936279),
 ('farce', 0.668818473815918),
 ('drama', 0.6469231843948364)]

In [31]:
word_vectors.similar_by_word(word="action", topn=5)

[('suspense', 0.635033905506134),
 ('thrill', 0.5868341326713562),
 ('gory', 0.5625475645065308),
 ('courtroom', 0.5608855485916138),
 ('overlong', 0.5559324622154236)]

In [32]:
word_vectors.similar_by_word(word="sad", topn=5)

[('depress', 0.7547096014022827),
 ('heartwarming', 0.7460227012634277),
 ('cry', 0.7299706935882568),
 ('genuinely', 0.6788302659988403),
 ('honestly', 0.6724828481674194)]

In [33]:
word_vectors.most_similar(negative=["bad"],positive=["decent"],topn=5)

[('fine', 0.4173683524131775),
 ('solid', 0.4102835953235626),
 ('support', 0.40012240409851074),
 ('ensemble', 0.39356565475463867),
 ('large', 0.386192262172699)]

In [34]:
import torch
import torch.nn as nn
import torch.optim as optim
SEED = 2222

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [35]:
def word2idx(embedding_model,review):
    index_review = []
    for word in review:
        try:
            index_review.append(embedding_model.vocab[word].index)
        except: 
             pass
    return torch.tensor(index_review)

In [36]:
padding_value = len(word_vectors.index2word)

In [37]:
padding_value

28165

In [38]:
index_review = list(map(lambda review: word2idx(word_vectors,review),reviews))

In [39]:
embedding_weights = torch.Tensor(word_vectors.vectors)

In [40]:
class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout,embedding_weights):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_weights)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout)
        self.fc = nn.Linear(hidden_dim*2, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, text_lengths):
        #x [sent length , batch size]
        embedded = self.embedding(x) #[sentect len,batch size,embedding dim]
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)
        packed_output, (hidden, cell) = self.rnn(packed_embedded)#output[sent length,batch size,hiddendin*num of directions],[numberlayers*num of dir,batch size,hid dim]
        #[f0,b0,f1,b1,.......fn,bn]
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
        return self.fc(hidden.squeeze(0))

In [41]:
INPUT_DIM = padding_value
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

In [42]:
model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT, embedding_weights)

In [43]:
model

RNN(
  (embedding): Embedding(28165, 100)
  (rnn): LSTM(100, 256, num_layers=2, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (dropout): Dropout(p=0.5)
)

In [44]:
optimizer = optim.Adam(model.parameters())

In [45]:
criterion = nn.BCEWithLogitsLoss()

In [46]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [47]:
device.type

'cuda'

In [48]:
from sklearn.model_selection import train_test_split

In [49]:
labels[:5]

['positive', 'negative', 'positive', 'negative', 'positive']

In [50]:
labels = [0 if label == 'negative' else 1 for label in labels ]

In [51]:
labels[:5]

[1, 0, 1, 0, 1]

In [52]:
X_train, X_test, y_train, y_test = train_test_split(index_review, labels, test_size=0.2)

In [53]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)

In [54]:
print(len(X_train),len(X_test),len(X_val))

16000 5000 4000


In [55]:
print(len(y_train),len(y_test),len(y_val))

16000 5000 4000


In [56]:
batch_size = 128 
import numpy as np

In [57]:
def iterator_func(X,y):
    size = len(X)
    permutation = np.random.permutation(size)
    iterator = []
    for i in range(0,size, batch_size):
        indices = permutation[i:i+batch_size]
        batch = {}
        batch["text"] = [X[i] for i in indices]
        batch["label"] = [y[i] for i in indices]
        
        batch["text"],batch["label"] = zip(*sorted(zip(batch["text"],batch["label"]),key=lambda x: len(x[0]),reverse=True))
        batch["length"] = [len(review) for review in batch["text"]]
        batch["length"] = torch.IntTensor(batch["length"])
        batch["text"] = torch.nn.utils.rnn.pad_sequence(batch["text"],batch_first=True).t()
        batch["label"] = torch.Tensor(batch["label"])
        
        batch["label"]  = batch["label"].to(device)
        batch["length"] = batch["length"].to(device) 
        batch["text"]   = batch["text"].to(device) 
        
        iterator.append(batch)
        
    return iterator

In [58]:
train_iterator = iterator_func(X_train,y_train)
valid_iterator = iterator_func(X_val,y_val)
test_iterator = iterator_func(X_test,y_test)

In [59]:
print(len(train_iterator),len(test_iterator),len(valid_iterator))

125 40 32


In [60]:
model = model.to(device)
criterion = criterion.to(device)

In [61]:
def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum()/len(correct)
    return acc

In [62]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    for batch in iterator:
        optimizer.zero_grad()
        predictions = model(batch["text"],batch["length"]).squeeze(1)
        loss = criterion(predictions, batch["label"])
        acc = binary_accuracy(predictions, batch["label"])
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [63]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch["text"],batch["length"]).squeeze(1)
            loss = criterion(predictions, batch["label"])
            acc = binary_accuracy(predictions, batch["label"])

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [64]:
N_EPOCHS = 5

In [65]:
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

| Epoch: 01 | Train Loss: 0.513 | Train Acc: 74.94% | Val. Loss: 0.423 | Val. Acc: 80.91% |
| Epoch: 02 | Train Loss: 0.411 | Train Acc: 81.57% | Val. Loss: 0.381 | Val. Acc: 83.96% |
| Epoch: 03 | Train Loss: 0.385 | Train Acc: 83.10% | Val. Loss: 0.350 | Val. Acc: 84.86% |
| Epoch: 04 | Train Loss: 0.361 | Train Acc: 84.71% | Val. Loss: 0.355 | Val. Acc: 83.98% |
| Epoch: 05 | Train Loss: 0.429 | Train Acc: 80.15% | Val. Loss: 0.384 | Val. Acc: 81.71% |


In [66]:
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% |')

| Test Loss: 0.399 | Test Acc: 82.21% |


In [67]:
def predict_sentiment(sentence):
    tokenized = pipeline(sentence)
    indexed = word2idx(word_vectors,tokenized)
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    prediction = torch.sigmoid(model(tensor,torch.LongTensor([len(indexed)]).to(device)))
    return prediction.item()

In [68]:
predict_sentiment("this is a awesome movie")

0.8104231953620911

In [69]:
predict_sentiment("this is not an action movie, is a  very good movie")

0.6436399221420288

In [70]:
predict_sentiment("this is comedy movie")

0.6326010227203369

In [71]:
predict_sentiment("this is an awful movie")

0.04732838273048401

In [73]:
predict_sentiment("this is a bad movie")

0.11437103152275085