In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
def load_data():
    reviewsFile = open('../data/reviews.txt','r')
    reviews = list(map(lambda x:x[:-1],reviewsFile.readlines()))
    reviewsFile.close()

    labelsFile = open('../data/labels.txt','r')
    labels = list(map(lambda x:x[:-1],labelsFile.readlines()))
    labelsFile.close()
    
    return reviews,labels

In [4]:
reviews,labels = load_data()

In [5]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer("\w+\'?\w+|\w+")

In [6]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [7]:
from spacy.lang.en.stop_words import STOP_WORDS

In [8]:
exceptionStopWords = {
    'again',
    'against',
    'ain',
    'almost',
    'among',
    'amongst',
    'amount',
    'anyhow',
    'anyway',
    'aren',
    "aren't",
    'below',
    'bottom',
    'but',
    'cannot',
    'couldn',
    "couldn't",
    'didn',
    "didn't",
    'doesn',
    "doesn't",
    'don',
    "don't",
    'done',
    'down',
    'except',
    'few',
    'hadn',
    "hadn't",
    'hasn',
    "hasn't",
    'haven',
    "haven't",
    'however',
    'isn',
    "isn't",
    'least',
    'mightn',
    "mightn't",
    'move',
    'much',
    'must',
    'mustn',
    "mustn't",
    'needn',
    "needn't",
    'neither',
    'never',
    'nevertheless',
    'no',
    'nobody',
    'none',
    'noone',
    'nor',
    'not',
    'nothing',
    'should',
    "should've",
    'shouldn',
    "shouldn't",
    'too',
    'top',
    'up',
    'very'
    'wasn',
    "wasn't",
    'well',
    'weren',
    "weren't",
    'won',
    "won't",
    'wouldn',
    "wouldn't",
}

In [9]:
stop_words = set(stop_words).union(STOP_WORDS)

In [10]:
final_stop_words = stop_words-exceptionStopWords

In [11]:
import spacy
nlp = spacy.load("en",disable=['parser', 'tagger', 'ner'])

In [12]:
def make_token(review):
    return tokenizer.tokenize(str(review))

In [13]:
def remove_stopwords(review):
    return [token for token in review if token not in final_stop_words]

In [14]:
def lemmatization(review):
    lemma_result = []
    
    for words in review:
        doc = nlp(words)
        for token in doc:
            lemma_result.append(token.lemma_)
    return lemma_result

In [15]:
def pipeline(review):
    review = make_token(review)
    review = remove_stopwords(review)
    return lemmatization(review)

In [16]:
%%time
reviews = list(map(lambda review: pipeline(review),reviews))

CPU times: user 35 s, sys: 70.8 ms, total: 35.1 s
Wall time: 35.1 s


In [17]:
reviews[:2]

[['bromwell',
  'high',
  'cartoon',
  'comedy',
  'run',
  'time',
  'program',
  'school',
  'life',
  'teacher',
  'year',
  'teach',
  'profession',
  'lead',
  'believe',
  'bromwell',
  'high',
  'satire',
  'much',
  'close',
  'reality',
  'teacher',
  'scramble',
  'survive',
  'financially',
  'insightful',
  'student',
  'right',
  'pathetic',
  'teacher',
  'pomp',
  'pettiness',
  'situation',
  'remind',
  'school',
  'know',
  'student',
  'see',
  'episode',
  'student',
  'repeatedly',
  'try',
  'burn',
  'down',
  'school',
  'immediately',
  'recall',
  'high',
  'classic',
  'line',
  'inspector',
  'sack',
  'teacher',
  'student',
  'welcome',
  'bromwell',
  'high',
  'expect',
  'adult',
  'age',
  'think',
  'bromwell',
  'high',
  'far',
  'fetch',
  'pity',
  'isn'],
 ['story',
  'man',
  'unnatural',
  'feeling',
  'pig',
  'start',
  'open',
  'scene',
  'terrific',
  'example',
  'absurd',
  'comedy',
  'formal',
  'orchestra',
  'audience',
  'turn',
  '

In [18]:
from gensim.models import Word2Vec

In [19]:
embedding_dimension = 100

In [20]:
model = Word2Vec(reviews,size=embedding_dimension, window=3, min_count=3, workers=4)

In [21]:
model.sg

0

In [22]:
word_vectors = model.wv

In [23]:
del model

In [24]:
len(word_vectors.vocab)

28165

In [25]:
word_vectors.similar_by_word(word="good", topn=5)

[('decent', 0.7116297483444214),
 ('alright', 0.68076491355896),
 ('okay', 0.645799994468689),
 ('darn', 0.6437720060348511),
 ('excellent', 0.6200926303863525)]

In [26]:
word_vectors.similar_by_word(word="bad", topn=5)

[('horrible', 0.7325153946876526),
 ('awful', 0.7025212645530701),
 ('terrible', 0.6949663758277893),
 ('lousy', 0.6866083145141602),
 ('suck', 0.6829736828804016)]

In [27]:
word_vectors.most_similar(positive="bad",topn=4)

[('horrible', 0.7325153946876526),
 ('awful', 0.7025212645530701),
 ('terrible', 0.6949663758277893),
 ('lousy', 0.6866083145141602)]

In [28]:
word_vectors.similarity("good","bad")

0.57665604

In [29]:
word_vectors.similarity("good","be")

0.33806083

In [30]:
word_vectors.similar_by_word(word="school", topn=5)

[('class', 0.7562885284423828),
 ('college', 0.7530918121337891),
 ('schooler', 0.7420896291732788),
 ('schoolers', 0.7284963726997375),
 ('bidder', 0.7034502029418945)]

In [31]:
word_vectors.similar_by_word(word="comedy", topn=5)

[('satire', 0.6727117300033569),
 ('farce', 0.6670421957969666),
 ('slapstick', 0.6608940958976746),
 ('parody', 0.6515933275222778),
 ('humor', 0.6184934377670288)]

In [32]:
word_vectors.similar_by_word(word="action", topn=5)

[('thrill', 0.640326738357544),
 ('suspense', 0.6267827749252319),
 ('gory', 0.5814146995544434),
 ('excitement', 0.5696775913238525),
 ('pace', 0.5687879920005798)]

In [33]:
word_vectors.similar_by_word(word="sad", topn=5)

[('depress', 0.7650548219680786),
 ('heartwarming', 0.713385820388794),
 ('cry', 0.7070283889770508),
 ('happy', 0.6953425407409668),
 ('honest', 0.6870818138122559)]

In [34]:
word_vectors.most_similar(negative=["bad"],positive=["decent"],topn=5)

[('solid', 0.4205891489982605),
 ('fine', 0.37321388721466064),
 ('splendid', 0.3685866892337799),
 ('headliner', 0.3574002981185913),
 ('gibney', 0.3493482172489166)]

In [35]:
import torch
import torch.nn as nn
import torch.optim as optim
SEED = 2222

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [36]:
def word2idx(embedding_model,review):
    index_review = []
    for word in review:
        try:
            index_review.append(embedding_model.vocab[word].index)
        except: 
             pass
    return torch.tensor(index_review)

In [37]:
padding_value = len(word_vectors.index2word)

In [38]:
padding_value

28165

In [39]:
index_review = list(map(lambda review: word2idx(word_vectors,review),reviews))

In [40]:
embedding_weights = torch.Tensor(word_vectors.vectors)

In [44]:
class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim,embedding_weights):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_weights)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x, text_lengths):
        #x [sent length , batch size]
        embedded = self.embedding(x) #[sentect len,batch size,embedding dim]
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)
        packed_output, hidden = self.rnn(packed_embedded)#[sentence length,batch size, hidden dim],[1,batch size,hidden dim]
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)
        return self.fc(hidden.squeeze(0))

In [45]:
INPUT_DIM = padding_value
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1

In [46]:
model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM,embedding_weights)

In [47]:
model

RNN(
  (embedding): Embedding(28165, 100)
  (rnn): RNN(100, 256)
  (fc): Linear(in_features=256, out_features=1, bias=True)
)

In [48]:
optimizer = optim.SGD(model.parameters(), lr=1e-3)

In [49]:
criterion = nn.BCEWithLogitsLoss()

In [50]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [51]:
device.type

'cuda'