In [23]:
import pandas as pd

train = pd.read_csv("labeledTrainData.tsv", header=0, delimiter="\t", quoting=3, encoding='utf8')
test = pd.read_csv("testData.tsv", header=0, delimiter="\t", quoting=3, encoding='utf8')
unlabeled_train = pd.read_csv("unlabeledTrainData.tsv", header=0, delimiter="\t", quoting=3, encoding='utf8')

print "Read %d labeled train reviews, %d labeled test reviews, and %d unlabeled reviews\n" % \
(train["review"].size, test["review"].size, unlabeled_train["review"].size )

Read 25000 labeled train reviews, 25000 labeled test reviews, and 50000 unlabeled reviews



In [24]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

def review_to_wordlist (review, remove_stopwords = False) :
    #remove html
    review_text = BeautifulSoup(review).get_text()
    #remove non-letters
    review_text = re.sub("[^a-zA-Z]"," ", review_text)
    #lowercase and split
    words = review_text.lower().split()
    #remove stopwords
    if remove_stopwords :
            stops = set(stopwords.words("english"))
            words = [w for w in words if not w in stops]
    #return a list
    return (words)

In [25]:
example1 = review_to_wordlist(train["review"][0])
print example1

[u'with', u'all', u'this', u'stuff', u'going', u'down', u'at', u'the', u'moment', u'with', u'mj', u'i', u've', u'started', u'listening', u'to', u'his', u'music', u'watching', u'the', u'odd', u'documentary', u'here', u'and', u'there', u'watched', u'the', u'wiz', u'and', u'watched', u'moonwalker', u'again', u'maybe', u'i', u'just', u'want', u'to', u'get', u'a', u'certain', u'insight', u'into', u'this', u'guy', u'who', u'i', u'thought', u'was', u'really', u'cool', u'in', u'the', u'eighties', u'just', u'to', u'maybe', u'make', u'up', u'my', u'mind', u'whether', u'he', u'is', u'guilty', u'or', u'innocent', u'moonwalker', u'is', u'part', u'biography', u'part', u'feature', u'film', u'which', u'i', u'remember', u'going', u'to', u'see', u'at', u'the', u'cinema', u'when', u'it', u'was', u'originally', u'released', u'some', u'of', u'it', u'has', u'subtle', u'messages', u'about', u'mj', u's', u'feeling', u'towards', u'the', u'press', u'and', u'also', u'the', u'obvious', u'message', u'of', u'drugs'

First, to train Word2Vec it is better not to remove stop words because the algorithm relies on the broader context of the sentence in order to produce high-quality word vectors. For this reason, we will make stop word removal optional in the functions below. It also might be better not to remove numbers.

Word2Vec expects single sentences, each one as a list of words. In other words, the input format is a list of lists.
It is not at all straightforward how to split a paragraph into sentences. There are all kinds of gotchas in natural language. English sentences can end with "?", "!", """, or ".", among other things, and spacing and capitalization are not reliable guides either. For this reason, we'll use NLTK's punkt tokenizer for sentence splitting.

In [6]:
import nltk.data
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [26]:
# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Define a function to split a review into parsed sentences
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    # Function to split a review into parsed sentences. Returns a list of sentences, where 
    # each sentence is a list of words
    #
    # Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.strip())
    #
    sentences = []
    #
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            sentences.append(review_to_wordlist(raw_sentence, remove_stopwords))
    #
    return sentences

In [27]:
example2 = review_to_sentences(train["review"][88], tokenizer)
print example2

[[u'i', u'totally', u'agree', u'that', u'nothing', u'is', u'a', u'fantastic', u'film'], [u'i', u've', u'not', u'laughed', u'so', u'much', u'when', u'watching', u'a', u'film', u'for', u'ages'], [u'and', u'david', u'hewlett', u'and', u'andrew', u'miller', u'are', u'fantastic', u'in', u'this'], [u'they', u'really', u'work', u'well', u'together'], [u'this', u'film', u'may', u'not', u'appeal', u'to', u'some', u'people', u'i', u'can', u't', u'really', u'say', u'why', u'without', u'spoiling', u'it'], [u'but', u'each', u'to', u'their', u'own'], [u'i', u'loved', u'it', u'and', u'highly', u'recommend', u'it', u'the', u'directing', u'is', u'great', u'and', u'some', u'of', u'the', u'shots', u'are', u'very', u'clever'], [u'it', u'looks', u'as', u'though', u'they', u'may', u'have', u'had', u'a', u'lot', u'of', u'fun', u'when', u'filming', u'it', u'although', u'there', u'are', u'really', u'only', u'main', u'characters', u'in', u'the', u'film', u'and', u'not', u'an', u'awful', u'lot', u'of', u'props',

In [28]:
sentences = []

num_train = train["review"].size
num_unlabeled_train = unlabeled_train["review"].size

for i in xrange (0, num_train) :
    review = train["review"][i]
    sentences += review_to_sentences(review, tokenizer)

for i in xrange (0, num_unlabeled_train) :
    review = unlabeled_train["review"][i]
    sentences += review_to_sentences(review, tokenizer)

  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


In [29]:
print len(sentences)

795538


In [30]:
print sentences[0]

[u'with', u'all', u'this', u'stuff', u'going', u'down', u'at', u'the', u'moment', u'with', u'mj', u'i', u've', u'started', u'listening', u'to', u'his', u'music', u'watching', u'the', u'odd', u'documentary', u'here', u'and', u'there', u'watched', u'the', u'wiz', u'and', u'watched', u'moonwalker', u'again']


In [31]:
print sentences[1]

[u'maybe', u'i', u'just', u'want', u'to', u'get', u'a', u'certain', u'insight', u'into', u'this', u'guy', u'who', u'i', u'thought', u'was', u'really', u'cool', u'in', u'the', u'eighties', u'just', u'to', u'maybe', u'make', u'up', u'my', u'mind', u'whether', u'he', u'is', u'guilty', u'or', u'innocent']


A minor detail to note is the difference between the "+=" and "append" when it comes to Python lists. In many applications the two are interchangeable, but here they are not. If you are appending a list of lists to another list of lists, "append" will only append the first list; you need to use "+=" in order to join all of the lists at once.

In [32]:
#Import the built-in logging module and configure it so that Word2Vec creates nice output messages
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)

# Set values for various parameters
num_features = 300    # Word vector dimensionality                      
min_word_count = 40   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model (this will take some time)
from gensim.models import word2vec

model = word2vec.Word2Vec(sentences, workers=num_workers, \
            size=num_features, min_count = min_word_count, window = context, sample = downsampling)

# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and 
# save the model for later use. You can load it later using Word2Vec.load()
model_name = "300features_40minwords_10context"
model.save(model_name)

2017-05-16 16:56:09,395 : INFO : 'pattern' package not found; tag filters are not available for English
2017-05-16 16:56:09,397 : INFO : collecting all words and their counts
2017-05-16 16:56:09,399 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-05-16 16:56:09,489 : INFO : PROGRESS: at sentence #10000, processed 225803 words, keeping 17776 word types
2017-05-16 16:56:09,579 : INFO : PROGRESS: at sentence #20000, processed 451892 words, keeping 24948 word types


Training model...


2017-05-16 16:56:09,664 : INFO : PROGRESS: at sentence #30000, processed 671315 words, keeping 30034 word types
2017-05-16 16:56:09,733 : INFO : PROGRESS: at sentence #40000, processed 897815 words, keeping 34348 word types
2017-05-16 16:56:09,804 : INFO : PROGRESS: at sentence #50000, processed 1116963 words, keeping 37761 word types
2017-05-16 16:56:09,877 : INFO : PROGRESS: at sentence #60000, processed 1338404 words, keeping 40723 word types
2017-05-16 16:56:09,953 : INFO : PROGRESS: at sentence #70000, processed 1561580 words, keeping 43333 word types
2017-05-16 16:56:10,024 : INFO : PROGRESS: at sentence #80000, processed 1780887 words, keeping 45714 word types
2017-05-16 16:56:10,078 : INFO : PROGRESS: at sentence #90000, processed 2004996 words, keeping 48135 word types
2017-05-16 16:56:10,154 : INFO : PROGRESS: at sentence #100000, processed 2226966 words, keeping 50207 word types
2017-05-16 16:56:10,217 : INFO : PROGRESS: at sentence #110000, processed 2446580 words, keeping 

In [33]:
model.doesnt_match("man woman child kitchen".split())

'kitchen'

In [34]:
model.doesnt_match("france england germany berlin".split())

'berlin'

In [35]:
model.doesnt_match("france england germany india".split())

'india'

In [36]:
model.doesnt_match("france england germany usa".split())

'usa'

In [37]:
model.doesnt_match("france england germany africa".split())

'africa'

In [38]:
model.doesnt_match("song music singer book".split())

'book'

In [39]:
model.most_similar("music")

[(u'soundtrack', 0.787306547164917),
 (u'score', 0.637504518032074),
 (u'morricone', 0.6287072896957397),
 (u'songs', 0.6276789307594299),
 (u'lyrics', 0.6240704655647278),
 (u'orchestral', 0.6238418221473694),
 (u'synthesizer', 0.6187015175819397),
 (u'jazz', 0.6116681098937988),
 (u'ennio', 0.6111737489700317),
 (u'song', 0.5923397541046143)]

In [40]:
model.most_similar("horse")

[(u'bike', 0.7106708288192749),
 (u'motorcycle', 0.6618256568908691),
 (u'horses', 0.6596984267234802),
 (u'riding', 0.6471805572509766),
 (u'tree', 0.6452011466026306),
 (u'bird', 0.6373133659362793),
 (u'truck', 0.6297295093536377),
 (u'balloon', 0.6255552768707275),
 (u'river', 0.6116886734962463),
 (u'water', 0.6093629002571106)]

In [41]:
model.most_similar("actor")

[(u'actress', 0.6310328841209412),
 (u'performer', 0.6063651442527771),
 (u'role', 0.5565111041069031),
 (u'comedian', 0.5470607280731201),
 (u'actors', 0.5256090760231018),
 (u'performance', 0.49989089369773865),
 (u'thespian', 0.48680776357650757),
 (u'villain', 0.4738970696926117),
 (u'artist', 0.4663490355014801),
 (u'talent', 0.45593005418777466)]

In [42]:
model.most_similar("amazing")

[(u'incredible', 0.7965898513793945),
 (u'awesome', 0.7597486972808838),
 (u'outstanding', 0.6929436922073364),
 (u'exceptional', 0.6924594044685364),
 (u'astonishing', 0.666450023651123),
 (u'fantastic', 0.6628136038780212),
 (u'excellent', 0.6491071581840515),
 (u'extraordinary', 0.6314650177955627),
 (u'astounding', 0.6270268559455872),
 (u'impressive', 0.6246229410171509)]