In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
def load_data():
    reviewsFile = open('../data/reviews.txt','r')
    reviews = list(map(lambda x:x[:-1],reviewsFile.readlines()))
    reviewsFile.close()

    labelsFile = open('../data/labels.txt','r')
    labels = list(map(lambda x:x[:-1],labelsFile.readlines()))
    labelsFile.close()
    
    return reviews,labels

In [3]:
reviews,labels = load_data()

In [4]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer("\w+\'?\w+|\w+")

In [5]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [6]:
from spacy.lang.en.stop_words import STOP_WORDS

In [7]:
exceptionStopWords = {
    'again',
    'against',
    'ain',
    'almost',
    'among',
    'amongst',
    'amount',
    'anyhow',
    'anyway',
    'aren',
    "aren't",
    'below',
    'bottom',
    'but',
    'cannot',
    'couldn',
    "couldn't",
    'didn',
    "didn't",
    'doesn',
    "doesn't",
    'don',
    "don't",
    'done',
    'down',
    'except',
    'few',
    'hadn',
    "hadn't",
    'hasn',
    "hasn't",
    'haven',
    "haven't",
    'however',
    'isn',
    "isn't",
    'least',
    'mightn',
    "mightn't",
    'move',
    'much',
    'must',
    'mustn',
    "mustn't",
    'needn',
    "needn't",
    'neither',
    'never',
    'nevertheless',
    'no',
    'nobody',
    'none',
    'noone',
    'nor',
    'not',
    'nothing',
    'should',
    "should've",
    'shouldn',
    "shouldn't",
    'too',
    'top',
    'up',
    'very'
    'wasn',
    "wasn't",
    'well',
    'weren',
    "weren't",
    'won',
    "won't",
    'wouldn',
    "wouldn't",
}

In [8]:
stop_words = set(stop_words).union(STOP_WORDS)

In [9]:
final_stop_words = stop_words-exceptionStopWords

In [10]:
import spacy
nlp = spacy.load("en",disable=['parser', 'tagger', 'ner'])

In [11]:
def make_token(review):
    return tokenizer.tokenize(str(review))

In [12]:
def remove_stopwords(review):
    return [token for token in review if token not in final_stop_words]

In [13]:
def lemmatization(review):
    lemma_result = []
    
    for words in review:
        doc = nlp(words)
        for token in doc:
            lemma_result.append(token.lemma_)
    return lemma_result

In [14]:
def pipeline(review):
    review = make_token(review)
    review = remove_stopwords(review)
    return lemmatization(review)

In [15]:
%%time
reviews = list(map(lambda review: pipeline(review),reviews))

CPU times: user 37.4 s, sys: 8.84 ms, total: 37.5 s
Wall time: 37.5 s


In [16]:
reviews[:2]

[['bromwell',
  'high',
  'cartoon',
  'comedy',
  'run',
  'time',
  'program',
  'school',
  'life',
  'teacher',
  'year',
  'teach',
  'profession',
  'lead',
  'believe',
  'bromwell',
  'high',
  'satire',
  'much',
  'close',
  'reality',
  'teacher',
  'scramble',
  'survive',
  'financially',
  'insightful',
  'student',
  'right',
  'pathetic',
  'teacher',
  'pomp',
  'pettiness',
  'situation',
  'remind',
  'school',
  'know',
  'student',
  'see',
  'episode',
  'student',
  'repeatedly',
  'try',
  'burn',
  'down',
  'school',
  'immediately',
  'recall',
  'high',
  'classic',
  'line',
  'inspector',
  'sack',
  'teacher',
  'student',
  'welcome',
  'bromwell',
  'high',
  'expect',
  'adult',
  'age',
  'think',
  'bromwell',
  'high',
  'far',
  'fetch',
  'pity',
  'isn'],
 ['story',
  'man',
  'unnatural',
  'feeling',
  'pig',
  'start',
  'open',
  'scene',
  'terrific',
  'example',
  'absurd',
  'comedy',
  'formal',
  'orchestra',
  'audience',
  'turn',
  '

In [17]:
from gensim.models import Word2Vec

In [18]:
embedding_dimension = 100

In [19]:
model = Word2Vec(reviews,size=embedding_dimension, window=3, min_count=3, workers=4)

In [20]:
model.sg

0

In [21]:
word_vectors = model.wv

In [22]:
del model

In [23]:
len(word_vectors.vocab)

28165

In [24]:
word_vectors.similar_by_word(word="good", topn=5)

[('decent', 0.7149477005004883),
 ('alright', 0.6797058582305908),
 ('darn', 0.6612451672554016),
 ('nice', 0.635985255241394),
 ('okay', 0.6258107423782349)]

In [25]:
word_vectors.similar_by_word(word="bad", topn=5)

[('terrible', 0.7072927355766296),
 ('horrible', 0.6914027333259583),
 ('suck', 0.6833032369613647),
 ('lousy', 0.6696577668190002),
 ('lame', 0.6681134700775146)]

In [26]:
word_vectors.most_similar(positive="bad",topn=4)

[('terrible', 0.7072927355766296),
 ('horrible', 0.6914027333259583),
 ('suck', 0.6833032369613647),
 ('lousy', 0.6696577668190002)]

In [27]:
word_vectors.similarity("good","bad")

0.56167686

In [28]:
word_vectors.similarity("good","be")

0.31550848

In [29]:
word_vectors.similar_by_word(word="school", topn=5)

[('schooler', 0.779565691947937),
 ('college', 0.7704999446868896),
 ('class', 0.7457226514816284),
 ('bromwell', 0.7275199890136719),
 ('student', 0.719336986541748)]

In [30]:
word_vectors.similar_by_word(word="comedy", topn=5)

[('slapstick', 0.701862633228302),
 ('satire', 0.6545994281768799),
 ('parody', 0.6525259017944336),
 ('drama', 0.6442978382110596),
 ('spoof', 0.6419641971588135)]

In [31]:
word_vectors.similar_by_word(word="action", topn=5)

[('suspense', 0.6146787405014038),
 ('thrill', 0.5885767340660095),
 ('cliffhanging', 0.5708556771278381),
 ('gory', 0.5699746608734131),
 ('cq', 0.5642054677009583)]

In [32]:
word_vectors.similar_by_word(word="sad", topn=5)

[('depress', 0.7681834697723389),
 ('heartwarming', 0.7197287678718567),
 ('cry', 0.6938855051994324),
 ('happy', 0.6869232058525085),
 ('honest', 0.6752169132232666)]

In [33]:
word_vectors.most_similar(negative=["bad"],positive=["decent"],topn=5)

[('solid', 0.4306844472885132),
 ('fine', 0.3992552161216736),
 ('splendid', 0.3872664272785187),
 ('fantastic', 0.38341987133026123),
 ('support', 0.37757763266563416)]

In [34]:
import torch
import torch.nn as nn
import torch.optim as optim
SEED = 2222

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [35]:
def word2idx(embedding_model,review):
    index_review = []
    for word in review:
        try:
            index_review.append(embedding_model.vocab[word].index)
        except: 
             pass
    return torch.tensor(index_review)

In [36]:
padding_value = len(word_vectors.index2word)

In [37]:
padding_value

28165

In [38]:
index_review = list(map(lambda review: word2idx(word_vectors,review),reviews))

In [39]:
embedding_weights = torch.Tensor(word_vectors.vectors)

In [40]:
class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim,embedding_weights):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_weights)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x, text_lengths):
        #x [sent length , batch size]
        embedded = self.embedding(x) #[sentect len,batch size,embedding dim]
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)
        packed_output, hidden = self.rnn(packed_embedded)#[sentence length,batch size, hidden dim],[1,batch size,hidden dim]
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)
        return self.fc(hidden.squeeze(0))

In [41]:
INPUT_DIM = padding_value
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1

In [42]:
model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM,embedding_weights)

In [43]:
model

RNN(
  (embedding): Embedding(28165, 100)
  (rnn): RNN(100, 256)
  (fc): Linear(in_features=256, out_features=1, bias=True)
)

In [44]:
optimizer = optim.SGD(model.parameters(), lr=1e-3)

In [45]:
criterion = nn.BCEWithLogitsLoss()

In [46]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [48]:
device.type

'cuda'

In [49]:
from sklearn.model_selection import train_test_split

In [50]:
labels[:5]

['positive', 'negative', 'positive', 'negative', 'positive']

In [51]:
labels = [0 if label == 'negative' else 1 for label in labels ]

In [52]:
labels[:5]

[1, 0, 1, 0, 1]

In [53]:
X_train, X_test, y_train, y_test = train_test_split(index_review, labels, test_size=0.2)

In [54]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)

In [55]:
print(len(X_train),len(X_test),len(X_val))

16000 5000 4000


In [56]:
print(len(y_train),len(y_test),len(y_val))

16000 5000 4000


In [57]:
batch_size = 128 
import numpy as np

In [58]:
def iterator_func(X,y):
    size = len(X)
    permutation = np.random.permutation(size)
    iterator = []
    for i in range(0,size, batch_size):
        indices = permutation[i:i+batch_size]
        batch = {}
        batch["text"] = [X[i] for i in indices]
        batch["label"] = [y[i] for i in indices]
        
        batch["text"],batch["label"] = zip(*sorted(zip(batch["text"],batch["label"]),key=lambda x: len(x[0]),reverse=True))
        batch["length"] = [len(review) for review in batch["text"]]
        batch["length"] = torch.IntTensor(batch["length"])
        batch["text"] = torch.nn.utils.rnn.pad_sequence(batch["text"],batch_first=True).t()
        batch["label"] = torch.Tensor(batch["label"])
        
        batch["label"]  = batch["label"].to(device)
        batch["length"] = batch["length"].to(device) 
        batch["text"]   = batch["text"].to(device) 
        
        iterator.append(batch)
        
    return iterator

In [59]:
train_iterator = iterator_func(X_train,y_train)
valid_iterator = iterator_func(X_val,y_val)
test_iterator = iterator_func(X_test,y_test)

In [60]:
print(len(train_iterator),len(test_iterator),len(valid_iterator))

125 40 32


In [61]:
model = model.to(device)
criterion = criterion.to(device)

In [62]:
def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum()/len(correct)
    return acc

In [63]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    for batch in iterator:
        optimizer.zero_grad()
        predictions = model(batch["text"],batch["length"]).squeeze(1)
        loss = criterion(predictions, batch["label"])
        acc = binary_accuracy(predictions, batch["label"])
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [64]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch["text"],batch["length"]).squeeze(1)
            loss = criterion(predictions, batch["label"])
            acc = binary_accuracy(predictions, batch["label"])

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [65]:
N_EPOCHS = 5

In [66]:
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

| Epoch: 01 | Train Loss: 0.694 | Train Acc: 50.51% | Val. Loss: 0.696 | Val. Acc: 50.22% |
| Epoch: 02 | Train Loss: 0.691 | Train Acc: 52.19% | Val. Loss: 0.693 | Val. Acc: 51.46% |
| Epoch: 03 | Train Loss: 0.687 | Train Acc: 53.73% | Val. Loss: 0.690 | Val. Acc: 53.32% |
| Epoch: 04 | Train Loss: 0.684 | Train Acc: 55.58% | Val. Loss: 0.687 | Val. Acc: 55.30% |
| Epoch: 05 | Train Loss: 0.682 | Train Acc: 57.12% | Val. Loss: 0.685 | Val. Acc: 57.15% |


In [67]:
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% |')

| Test Loss: 0.683 | Test Acc: 57.21% |
