In [1]:
import pandas as pd

In [2]:
train = pd.read_csv('data/labeledTrainData.tsv', header=0, delimiter='\t', quoting=3)

In [3]:
test = pd.read_csv('data/testData.tsv', header=0, delimiter='\t', quoting=3)

In [4]:
train.shape

(25000, 3)

In [5]:
test.shape

(25000, 2)

In [7]:
unlabeled_train = pd.read_csv('data/unlabeledTrainData.tsv', header=0, delimiter='\t',
                             quoting=3)

In [8]:
unlabeled_train.shape

(50000, 2)

## Data cleaning

In [9]:
from bs4 import BeautifulSoup

In [10]:
import re
from nltk.corpus import stopwords

In [23]:
def review_to_wordlist(review, remove_stopwords=False):
    review_text = BeautifulSoup(review).get_text()
    review_text = re.sub("[^a-zA-Z]", ' ', review_text)
    words = review_text.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words('english'))
        words = [w for w in words if not w in stops]
    return words

In [13]:
import nltk.data

In [17]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [18]:
def review_to_sentences(review, tokenizer, remove_stopwords=False):
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(review_to_wordlist(raw_sentence, remove_stopwords))
    return sentences

In [19]:
sentences = []

In [24]:
for review in train.review:
    sentences += review_to_sentences(review, tokenizer)

  ' Beautiful Soup.' % self._decode_markup(markup)
  markup


In [25]:
print(len(sentences))

266551


In [27]:
for review in unlabeled_train.review:
    sentences += review_to_sentences(review, tokenizer)

  markup
  markup
  markup
  ' Beautiful Soup.' % self._decode_markup(markup)
  markup
  markup


In [29]:
print(len(sentences))

795538


## MODEL

In [32]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [39]:
num_features = 300
min_word_count = 40
num_workers=8
context = 10
downsampling = 1e-3

In [40]:
from gensim.models import word2vec

In [41]:
model = word2vec.Word2Vec(sentences, workers=num_workers, size=num_features, 
                         min_count=min_word_count, window=context, sample=downsampling)

2020-04-18 19:22:46,730 : INFO : collecting all words and their counts
2020-04-18 19:22:46,732 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-04-18 19:22:46,797 : INFO : PROGRESS: at sentence #10000, processed 225803 words, keeping 17776 word types
2020-04-18 19:22:46,842 : INFO : PROGRESS: at sentence #20000, processed 451892 words, keeping 24948 word types
2020-04-18 19:22:46,882 : INFO : PROGRESS: at sentence #30000, processed 671314 words, keeping 30034 word types
2020-04-18 19:22:46,931 : INFO : PROGRESS: at sentence #40000, processed 897814 words, keeping 34348 word types
2020-04-18 19:22:46,972 : INFO : PROGRESS: at sentence #50000, processed 1116962 words, keeping 37761 word types
2020-04-18 19:22:47,015 : INFO : PROGRESS: at sentence #60000, processed 1338403 words, keeping 40723 word types
2020-04-18 19:22:47,056 : INFO : PROGRESS: at sentence #70000, processed 1561579 words, keeping 43333 word types
2020-04-18 19:22:47,096 : INFO : PROGRESS: 

In [42]:
model.init_sims(replace=True)

2020-04-18 19:25:50,173 : INFO : precomputing L2-norms of word weight vectors


In [44]:
model.save("./models/300feature_490minwords_10context")

2020-04-18 19:26:57,166 : INFO : saving Word2Vec object under ./models/300feature_490minwords_10context, separately None
2020-04-18 19:26:57,170 : INFO : not storing attribute vectors_norm
2020-04-18 19:26:57,173 : INFO : not storing attribute cum_table
2020-04-18 19:26:57,492 : INFO : saved ./models/300feature_490minwords_10context


In [45]:
model.doesnt_match('man woman child kitchen'.split())

  """Entry point for launching an IPython kernel.
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'kitchen'

In [46]:
model.doesnt_match("france england germany berlin".split())

  """Entry point for launching an IPython kernel.


'berlin'

In [47]:
model.most_similar("man")

  """Entry point for launching an IPython kernel.


[('woman', 0.6107006072998047),
 ('lady', 0.6060937643051147),
 ('lad', 0.5677387714385986),
 ('guy', 0.5258022546768188),
 ('monk', 0.516036868095398),
 ('chap', 0.5148513317108154),
 ('businessman', 0.5131686925888062),
 ('men', 0.5094049572944641),
 ('boy', 0.5078904628753662),
 ('person', 0.5072914958000183)]

In [48]:
model.most_similar('queen')

  """Entry point for launching an IPython kernel.


[('bride', 0.6487500667572021),
 ('princess', 0.6360000371932983),
 ('stepmother', 0.597882866859436),
 ('victoria', 0.5947198271751404),
 ('mistress', 0.59171462059021),
 ('maid', 0.590878963470459),
 ('eva', 0.5819459557533264),
 ('widow', 0.5743154287338257),
 ('nun', 0.5731497406959534),
 ('maria', 0.5728486180305481)]

In [52]:
model.doesnt_match('man woman king queen bottle'.split())

  """Entry point for launching an IPython kernel.


'king'

In [54]:
model.most_similar("awful")

  """Entry point for launching an IPython kernel.


[('terrible', 0.7655409574508667),
 ('horrible', 0.7218703031539917),
 ('atrocious', 0.719491720199585),
 ('dreadful', 0.7194738388061523),
 ('abysmal', 0.7138656377792358),
 ('horrid', 0.682663083076477),
 ('horrendous', 0.6818600296974182),
 ('appalling', 0.6623293161392212),
 ('lousy', 0.6436792612075806),
 ('embarrassing', 0.6166320443153381)]