In [33]:
import pandas as pd

# The three TSV files share one layout: a header row, tab-separated columns,
# and quoting=3 (csv.QUOTE_NONE) so quote characters are read as plain text.
tsv_options = dict(header=0, delimiter='\t', quoting=3)

train = pd.read_csv("../Data/labeledTrainData.tsv", **tsv_options)
test = pd.read_csv("../Data/testData.tsv", **tsv_options)
unlabeled_train = pd.read_csv("../Data/unlabeledTrainData.tsv", **tsv_options)

In [48]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

# Combine the review-cleaning steps into a single reusable function
def review_to_words(raw_review, stop_words=False):
    '''
    Convert a raw movie review (or one sentence of it) into a list of
    lower-case word tokens.

    Parameters:
        raw_review: review text, possibly containing HTML markup.
        stop_words: when truthy, English stop words (NLTK's list) are removed
            from the result. Defaults to False.

    Returns:
        A list of lower-case tokens (a list, not a joined string).

    Steps:
    1. Remove HTML by BeautifulSoup
    2. Replace every character that is not an ASCII letter or digit by a space
    3. Convert to lower case and split on whitespace
    4. Remove stop words (optional)
    5. Return the list of words
    '''
    # Name the parser explicitly; otherwise BeautifulSoup picks whichever
    # parser happens to be installed, so results can differ between machines.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()
    letters_only = re.sub("[^a-zA-Z0-9]", " ", review_text)
    word_list = letters_only.lower().split()
    if stop_words:
        # Use a separate name so the boolean flag is not overwritten by the set.
        english_stop_words = set(stopwords.words('english'))
        word_list = [word for word in word_list if word not in english_stop_words]
    return word_list
    

In [5]:
import nltk.data
# Fetch only the resources this notebook uses; a bare nltk.download() opens
# the interactive downloader and stalls an unattended "Run All".
nltk.download('punkt')      # sentence tokenizer model loaded further down
nltk.download('stopwords')  # stop-word list used by review_to_words

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [49]:
# Load NLTK's English Punkt model; it is passed to review_to_sentences
# to split each review into sentences.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')


In [50]:
def review_to_sentences(review, tokenizer, stop_words=False):
    '''
    Split one review into sentences and tokenize each sentence into words.

    Parameters:
        review: the review text, either a UTF-8 encoded byte string or
            already-decoded text.
        tokenizer: object whose tokenize(text) method returns a list of
            sentence strings (e.g. the NLTK Punkt tokenizer).
        stop_words: forwarded to review_to_words; when truthy, English stop
            words are dropped from every sentence.

    Returns:
        A list of sentences, each of which is a list of words.
    '''
    # Decode raw bytes only. Already-decoded text must be left alone: on
    # Python 3 str has no .decode(), and on Python 2 calling .decode() on a
    # unicode object first re-encodes it as ASCII and fails on non-ASCII text.
    if isinstance(review, bytes):
        review = review.decode("utf8")
    raw_sentences = tokenizer.tokenize(review.strip())
    # Turn every non-empty sentence into its list of words.
    return [review_to_words(sentence, stop_words)
            for sentence in raw_sentences if len(sentence) > 0]

In [51]:
# Collect every tokenized sentence from the labeled and the unlabeled
# training reviews into one flat list (a list of word lists).
sentences = []
for review_column in (train["review"], unlabeled_train['review']):
    for review in review_column:
        # extend() adds each sentence on its own; append() would nest the
        # whole per-review list as a single element.
        sentences.extend(review_to_sentences(review, tokenizer))



In [52]:
# Sanity check: the total sentence count, then the first two tokenized sentences.
# print() with a single parenthesized argument runs on both Python 2 and 3.
print(len(sentences))
print(sentences[0])
print(sentences[1])

795538
[u'with', u'all', u'this', u'stuff', u'going', u'down', u'at', u'the', u'moment', u'with', u'mj', u'i', u've', u'started', u'listening', u'to', u'his', u'music', u'watching', u'the', u'odd', u'documentary', u'here', u'and', u'there', u'watched', u'the', u'wiz', u'and', u'watched', u'moonwalker', u'again']
[u'maybe', u'i', u'just', u'want', u'to', u'get', u'a', u'certain', u'insight', u'into', u'this', u'guy', u'who', u'i', u'thought', u'was', u'really', u'cool', u'in', u'the', u'eighties', u'just', u'to', u'maybe', u'make', u'up', u'my', u'mind', u'whether', u'he', u'is', u'guilty', u'or', u'innocent']


In [54]:
import logging
# Emit INFO-level log messages with a timestamp so training progress is visible.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# Word2Vec hyper-parameters
num_features = 300     # dimensionality of the word vectors
min_word_count = 40    # words occurring fewer times than this are ignored
num_workers = 4        # number of training threads
context = 10           # context window size
downsampling = 1e-3    # down-sampling threshold for frequent words

from gensim.models import word2vec
# workers is the thread count, so it takes num_workers. Passing num_features
# here (as before) requested 300 training threads.
model = word2vec.Word2Vec(sentences, workers=num_workers, size=num_features,
                          min_count=min_word_count, window=context, sample=downsampling)

# if you don't want to train the model any further, calling init_sims will
# make the model more memory-efficient
model.init_sims(replace=True)

# save the model for later use.
model_name = "300features_40minwords_10context"
model.save(model_name)




In [55]:
# Ask the model which word in the list is least like the others.
model.doesnt_match("man woman child kitchen".split())

'kitchen'

In [56]:
# Odd one out among three countries and one city.
model.doesnt_match("france england germany berlin".split())

'berlin'

In [57]:
# Three cities and one country: 'austria' is the intended odd one out, but the
# recorded output of this run is 'paris', so the model misses this one.
model.doesnt_match("paris berlin london austria".split())

'paris'

In [58]:
# List the words the model rates most similar to "man", each with its score.
model.most_similar("man")

[(u'woman', 0.624958872795105),
 (u'lad', 0.6053160429000854),
 (u'lady', 0.5940924882888794),
 (u'chap', 0.5568241477012634),
 (u'monk', 0.5337693691253662),
 (u'men', 0.5267441272735596),
 (u'guy', 0.5256942510604858),
 (u'millionaire', 0.5067365169525146),
 (u'person', 0.5048238635063171),
 (u'priest', 0.502406895160675)]

In [59]:
# List the words the model rates most similar to "queen", each with its score.
model.most_similar("queen")

[(u'princess', 0.6811302304267883),
 (u'bride', 0.6305873394012451),
 (u'latifah', 0.6072506904602051),
 (u'stepmother', 0.590834379196167),
 (u'mistress', 0.5827232003211975),
 (u'countess', 0.5789092779159546),
 (u'maid', 0.5775595903396606),
 (u'showgirl', 0.5737118721008301),
 (u'belle', 0.5697005987167358),
 (u'elizabeth', 0.5634185075759888)]

In [60]:
# List the words the model rates most similar to "awful", each with its score.
model.most_similar("awful")

[(u'terrible', 0.7785122394561768),
 (u'atrocious', 0.74686598777771),
 (u'horrible', 0.7442623972892761),
 (u'abysmal', 0.718186616897583),
 (u'dreadful', 0.7085167169570923),
 (u'horrendous', 0.6890318989753723),
 (u'horrid', 0.6825686097145081),
 (u'appalling', 0.6796859502792358),
 (u'lousy', 0.6333217620849609),
 (u'amateurish', 0.6245442628860474)]