In [1]:
import numpy as np
import pandas as pd

In [2]:
# All three files are tab-separated with a header row.  quoting=3 is
# csv.QUOTE_NONE, so double quotes in the text are kept as ordinary characters.
labeledData, unlabeledData, testData = (
    pd.read_csv(path, header=0, sep="\t", quoting=3)
    for path in ("labeledTrainData.tsv", "unlabeledTrainData.tsv", "testData.tsv")
)

In [3]:
# (rows, columns) of each frame, in the order labeled / unlabeled / test
tuple(frame.shape for frame in (labeledData, unlabeledData, testData))

((25000, 3), (50000, 2), (25000, 2))

In [4]:
# transform raw review
from bs4 import BeautifulSoup as bs
import re
from nltk.corpus import stopwords

In [5]:
# Unlike the bag-of-words approach, Word2Vec expects each sentence as a list of
# words, so this returns a list of tokens instead of a single joined string.
def transformSentence(rawReview, remove_stopwords = False):
    """Convert one raw sentence (or a whole review) into a list of lower-case tokens.

    The text is stripped of HTML markup with BeautifulSoup, every character
    that is not an ASCII letter or digit is replaced by a space, and the
    result is lower-cased and split on whitespace.

    Parameters
    ----------
    rawReview : str
        Raw text, possibly containing HTML markup.
    remove_stopwords : bool, default False
        When True, NLTK's English stop words are dropped from the result.

    Returns
    -------
    list of str
        Tokens in their original order; empty when nothing alphanumeric remains.
    """
    # strip HTML markup, keeping only the text content
    text = bs(rawReview, "lxml").get_text()

    # replace everything except ASCII letters and digits with a space
    alphanumeric = re.sub("[^a-zA-Z0-9]", " ", text)

    # convert to lower case and split on whitespace
    words = alphanumeric.lower().split()

    # optionally remove stop words
    if remove_stopwords:
        # This function is called once per sentence, so the stop-word list is
        # loaded and turned into a set only on first use, then reused.
        sw = getattr(transformSentence, "_stopword_set", None)
        if sw is None:
            sw = frozenset(stopwords.words("english"))
            transformSentence._stopword_set = sw
        words = [w for w in words if w not in sw]

    # return list of words in the sentence
    return words

In [6]:
# Smoke-test the tokenizer on the first unlabeled review.
clean_review = transformSentence(unlabeledData.review[0])
# clean_review  (uncomment to display the resulting token list)

In [7]:
# Raw text of the first unlabeled review. The surrounding double quotes are
# still present because the files were read with quoting=3 (no quote handling).
unlabeledData.review[0]

'"Watching Time Chasers, it obvious that it was made by a bunch of friends. Maybe they were sitting around one day in film school and said, \\"Hey, let\'s pool our money together and make a really bad movie!\\" Or something like that. What ever they said, they still ended up making a really bad movie--dull story, bad script, lame acting, poor cinematography, bottom of the barrel stock music, etc. All corners were cut, except the one that would have prevented this film\'s release. Life\'s like that."'

In [8]:
import nltk.data

# Load NLTK's English Punkt tokenizer; it is used below to split a review
# into sentences via tokenizer.tokenize(text).
# NOTE(review): presumably requires the 'punkt' resource to be installed
# beforehand (e.g. nltk.download('punkt')) -- confirm for the NLTK version in use.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [9]:
# Split the first unlabeled review into sentence strings (punctuation and
# casing are still intact at this stage).
raw_sentence=  tokenizer.tokenize(unlabeledData.review[0].strip())
raw_sentence

['"Watching Time Chasers, it obvious that it was made by a bunch of friends.',
 'Maybe they were sitting around one day in film school and said, \\"Hey, let\'s pool our money together and make a really bad movie!\\" Or something like that.',
 'What ever they said, they still ended up making a really bad movie--dull story, bad script, lame acting, poor cinematography, bottom of the barrel stock music, etc.',
 "All corners were cut, except the one that would have prevented this film's release.",
 'Life\'s like that."']

In [10]:
def transformReview(rawReview, tokenizer, remove_stopwords = False):
    """Break a review into sentences and tokenize each sentence.

    Parameters
    ----------
    rawReview : str
        Full review text; leading/trailing whitespace is stripped first.
    tokenizer : object
        Sentence splitter exposing ``tokenize(text)`` that yields sentence strings.
    remove_stopwords : bool, default False
        Forwarded unchanged to ``transformSentence``.

    Returns
    -------
    list of list of str
        One token list per non-empty sentence, in document order.
    """
    return [
        transformSentence(chunk, remove_stopwords)
        for chunk in tokenizer.tokenize(rawReview.strip())
        if chunk  # skip empty sentence strings
    ]

In [11]:
# Tokenize the first unlabeled review sentence by sentence.
s = transformReview(unlabeledData.review[0], tokenizer)
print(len(s))      # number of sentences found in the review
print(len(s[0]))   # number of tokens in the first sentence
s

5
14


[['watching',
  'time',
  'chasers',
  'it',
  'obvious',
  'that',
  'it',
  'was',
  'made',
  'by',
  'a',
  'bunch',
  'of',
  'friends'],
 ['maybe',
  'they',
  'were',
  'sitting',
  'around',
  'one',
  'day',
  'in',
  'film',
  'school',
  'and',
  'said',
  'hey',
  'let',
  's',
  'pool',
  'our',
  'money',
  'together',
  'and',
  'make',
  'a',
  'really',
  'bad',
  'movie',
  'or',
  'something',
  'like',
  'that'],
 ['what',
  'ever',
  'they',
  'said',
  'they',
  'still',
  'ended',
  'up',
  'making',
  'a',
  'really',
  'bad',
  'movie',
  'dull',
  'story',
  'bad',
  'script',
  'lame',
  'acting',
  'poor',
  'cinematography',
  'bottom',
  'of',
  'the',
  'barrel',
  'stock',
  'music',
  'etc'],
 ['all',
  'corners',
  'were',
  'cut',
  'except',
  'the',
  'one',
  'that',
  'would',
  'have',
  'prevented',
  'this',
  'film',
  's',
  'release'],
 ['life', 's', 'like', 'that']]

In [12]:
# Gather every tokenized sentence from the unlabeled and then the labeled
# reviews into one flat list of token lists (each element is one sentence).
sentences = []

for frame in (unlabeledData, labeledData):
    for review in frame.review:
        sentences.extend(transformReview(review, tokenizer))

  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


In [13]:
# Token count of the first collected sentence (should equal len(s[0]) printed earlier).
len(sentences[0])

14

In [14]:
# Inspect the first collected sentence: a plain list of lower-case tokens.
sentences[0]

['watching',
 'time',
 'chasers',
 'it',
 'obvious',
 'that',
 'it',
 'was',
 'made',
 'by',
 'a',
 'bunch',
 'of',
 'friends']

## Model Training

In [15]:
import logging

In [16]:
# Show INFO-level log records with a timestamp, so that library progress
# messages emitted through `logging` appear in the cell output.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

In [17]:
# Word2Vec hyper-parameters; each is passed to the constructor in the training cell.
nFeatures = 300         # passed as `size`
contextWindow = 10      # passed as `window`
minWordCount = 40       # passed as `min_count`
downsampling = 0.001    # passed as `sample`
nWorkers = 4            # passed as `workers`

In [18]:
from gensim.models import word2vec

# Train the embedding model on the tokenized sentences using the
# hyper-parameters defined in the configuration cell.
model = word2vec.Word2Vec(
    sentences,
    size=nFeatures,
    window=contextWindow,
    min_count=minWordCount,
    sample=downsampling,
    workers=nWorkers,
)

2019-01-07 11:54:47,142 : INFO : collecting all words and their counts
2019-01-07 11:54:47,143 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-01-07 11:54:47,225 : INFO : PROGRESS: at sentence #10000, processed 229703 words, keeping 17646 word types
2019-01-07 11:54:47,295 : INFO : PROGRESS: at sentence #20000, processed 453167 words, keeping 25105 word types
2019-01-07 11:54:47,380 : INFO : PROGRESS: at sentence #30000, processed 680392 words, keeping 30426 word types
2019-01-07 11:54:47,453 : INFO : PROGRESS: at sentence #40000, processed 906246 words, keeping 34664 word types
2019-01-07 11:54:47,527 : INFO : PROGRESS: at sentence #50000, processed 1129037 words, keeping 38303 word types
2019-01-07 11:54:47,589 : INFO : PROGRESS: at sentence #60000, processed 1357442 words, keeping 41636 word types
2019-01-07 11:54:47,651 : INFO : PROGRESS: at sentence #70000, processed 1585964 words, keeping 44634 word types
2019-01-07 11:54:47,714 : INFO : PROGRESS: 

2019-01-07 11:54:53,444 : INFO : PROGRESS: at sentence #720000, processed 16218387 words, keeping 120845 word types
2019-01-07 11:54:53,515 : INFO : PROGRESS: at sentence #730000, processed 16443252 words, keeping 121505 word types
2019-01-07 11:54:53,591 : INFO : PROGRESS: at sentence #740000, processed 16668236 words, keeping 122283 word types
2019-01-07 11:54:53,657 : INFO : PROGRESS: at sentence #750000, processed 16893906 words, keeping 123008 word types
2019-01-07 11:54:53,731 : INFO : PROGRESS: at sentence #760000, processed 17117338 words, keeping 123702 word types
2019-01-07 11:54:53,810 : INFO : PROGRESS: at sentence #770000, processed 17343232 words, keeping 124376 word types
2019-01-07 11:54:53,868 : INFO : PROGRESS: at sentence #780000, processed 17557571 words, keeping 125099 word types
2019-01-07 11:54:53,935 : INFO : PROGRESS: at sentence #790000, processed 17781366 words, keeping 125806 word types
2019-01-07 11:54:53,966 : INFO : collected 126186 word types from a corp

2019-01-07 11:55:44,296 : INFO : EPOCH 3 - PROGRESS: at 8.59% examples, 545520 words/s, in_qsize 7, out_qsize 0
2019-01-07 11:55:45,306 : INFO : EPOCH 3 - PROGRESS: at 12.93% examples, 548269 words/s, in_qsize 7, out_qsize 0
2019-01-07 11:55:46,310 : INFO : EPOCH 3 - PROGRESS: at 16.80% examples, 536508 words/s, in_qsize 7, out_qsize 1
2019-01-07 11:55:47,315 : INFO : EPOCH 3 - PROGRESS: at 21.03% examples, 537510 words/s, in_qsize 7, out_qsize 0
2019-01-07 11:55:48,320 : INFO : EPOCH 3 - PROGRESS: at 24.76% examples, 528767 words/s, in_qsize 7, out_qsize 0
2019-01-07 11:55:49,322 : INFO : EPOCH 3 - PROGRESS: at 28.75% examples, 526930 words/s, in_qsize 7, out_qsize 0
2019-01-07 11:55:50,346 : INFO : EPOCH 3 - PROGRESS: at 33.10% examples, 529377 words/s, in_qsize 6, out_qsize 1
2019-01-07 11:55:51,352 : INFO : EPOCH 3 - PROGRESS: at 37.38% examples, 531540 words/s, in_qsize 7, out_qsize 0
2019-01-07 11:55:52,365 : INFO : EPOCH 3 - PROGRESS: at 41.61% examples, 532195 words/s, in_qsize

2019-01-07 11:56:50,045 : INFO : EPOCH 5 - PROGRESS: at 71.45% examples, 506202 words/s, in_qsize 7, out_qsize 0
2019-01-07 11:56:51,050 : INFO : EPOCH 5 - PROGRESS: at 75.57% examples, 506929 words/s, in_qsize 7, out_qsize 0
2019-01-07 11:56:52,067 : INFO : EPOCH 5 - PROGRESS: at 79.62% examples, 506998 words/s, in_qsize 7, out_qsize 0
2019-01-07 11:56:53,069 : INFO : EPOCH 5 - PROGRESS: at 83.81% examples, 508095 words/s, in_qsize 7, out_qsize 0
2019-01-07 11:56:54,079 : INFO : EPOCH 5 - PROGRESS: at 87.61% examples, 506931 words/s, in_qsize 7, out_qsize 0
2019-01-07 11:56:55,082 : INFO : EPOCH 5 - PROGRESS: at 91.02% examples, 503864 words/s, in_qsize 7, out_qsize 0
2019-01-07 11:56:56,083 : INFO : EPOCH 5 - PROGRESS: at 94.08% examples, 499317 words/s, in_qsize 7, out_qsize 0
2019-01-07 11:56:57,094 : INFO : EPOCH 5 - PROGRESS: at 97.57% examples, 496923 words/s, in_qsize 7, out_qsize 0
2019-01-07 11:56:57,798 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-

In [19]:
# Precompute L2-normalised word vectors (see the log line below).
# NOTE(review): per the gensim docs, replace=True overwrites the raw vectors
# to save memory, after which the model should not be trained further -- confirm
# this is intended before saving.
model.init_sims(replace = True)

2019-01-07 11:56:57,868 : INFO : precomputing L2-norms of word weight vectors


In [21]:
# Derive the file name from the training configuration so the saved model's
# name cannot drift out of sync with the hyper-parameters actually used.
# With the current settings this yields "300features_40minwords_10context".
model_name = "{}features_{}minwords_{}context".format(
    nFeatures, minWordCount, contextWindow
)
model.save(model_name)

2019-01-07 11:57:06,875 : INFO : saving Word2Vec object under 300features_40minwords_10context, separately None
2019-01-07 11:57:06,877 : INFO : not storing attribute vectors_norm
2019-01-07 11:57:06,880 : INFO : not storing attribute cum_table
2019-01-07 11:57:10,045 : INFO : saved 300features_40minwords_10context


In [28]:
# For each group of words, print the one the model considers the odd one out
for word_group in ("man woman shot kitchen", "man woman child kitten"):
    print(model.wv.doesnt_match(word_group.split()))

shot
kitten


  if np.issubdtype(vec.dtype, np.int):


In [36]:
# Identifying differences in meaning: among these place names the model
# singles out the city rather than a country or the continent.
print(model.wv.doesnt_match("france england europe berlin".split()))

berlin


  if np.issubdtype(vec.dtype, np.int):


In [40]:
# Imperfections: with 'america' in place of 'europe' the model no longer
# picks the city 'berlin' as the odd one out (see the output below).
print(model.wv.doesnt_match("france england america berlin ".split()))

america


  if np.issubdtype(vec.dtype, np.int):


In [45]:
# Find the words whose vectors are most similar to "man"; the result is a
# list of (word, similarity score) pairs, highest score first.
model.wv.most_similar("man")

  if np.issubdtype(vec.dtype, np.int):


[('woman', 0.6073155403137207),
 ('lady', 0.577538013458252),
 ('lad', 0.5705820322036743),
 ('monk', 0.5522242784500122),
 ('guy', 0.5355662107467651),
 ('soldier', 0.5269594192504883),
 ('farmer', 0.526430606842041),
 ('men', 0.5187892317771912),
 ('person', 0.5143908858299255),
 ('doctor', 0.501837968826294)]

In [47]:
# A surprising result: several of the nearest neighbours of "woman" are
# derogatory or stereotyped terms.
# NOTE(review): presumably this reflects how the word is used in the review
# corpus (bias in the training data) -- worth calling out explicitly.
model.wv.most_similar("woman")

  if np.issubdtype(vec.dtype, np.int):


[('prostitute', 0.687353253364563),
 ('lady', 0.6743903756141663),
 ('widow', 0.6555472612380981),
 ('girl', 0.6531828045845032),
 ('man', 0.6073155403137207),
 ('nun', 0.6046781539916992),
 ('housewife', 0.5961974859237671),
 ('waitress', 0.5844756364822388),
 ('whore', 0.5788934230804443),
 ('nurse', 0.567044734954834)]

In [48]:
# Nearest neighbours of "queen" (includes names such as 'latifah', see output).
model.wv.most_similar("queen")

  if np.issubdtype(vec.dtype, np.int):


[('princess', 0.6527175307273865),
 ('latifah', 0.6238234043121338),
 ('eva', 0.6182420253753662),
 ('maid', 0.6065552830696106),
 ('regina', 0.6056563854217529),
 ('bride', 0.6001290082931519),
 ('nun', 0.5974670052528381),
 ('goddess', 0.5938385725021362),
 ('belle', 0.5892914533615112),
 ('mistress', 0.586790919303894)]

In [49]:
# Nearest neighbours of a sentiment word: the output lists other strongly negative adjectives.
model.wv.most_similar("awful")

  if np.issubdtype(vec.dtype, np.int):


[('terrible', 0.7815027832984924),
 ('atrocious', 0.7411219477653503),
 ('horrible', 0.7312250137329102),
 ('dreadful', 0.7201114892959595),
 ('abysmal', 0.7110464572906494),
 ('appalling', 0.6754916906356812),
 ('horrendous', 0.6701131463050842),
 ('horrid', 0.6584107875823975),
 ('lousy', 0.6349300146102905),
 ('amateurish', 0.6325801610946655)]