In [1]:
from __future__ import division
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from Word2VecUtility import Word2VecUtility
import pickle
import pandas as pd
import numpy as np
from gensim.models import word2vec
import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
#     level=logging.INFO)



In [2]:
import csv
data = pd.read_csv('review.csv', header=0, delimiter=",", quoting=csv.QUOTE_NONNUMERIC)
print '\nThe first review is:\n'
print data["text"][0], '\n'
print data.shape
print data.columns


The first review is:

Mr Hoagie is an institution. Walking in, it does seem like a throwback to 30 years ago, old fashioned menu board, booths out of the 70s, and a large selection of food. Their speciality is the Italian Hoagie, and it is voted the best in the area year after year. I usually order the burger, while the patties are obviously cooked from frozen, all of the other ingredients are very fresh. Overall, its a good alternative to Subway, which is down the road. 

(2685066, 2)
Index([u'stars', u'text'], dtype='object')


In [3]:
print data['stars'][:3]
print
print data.ix[:2]['text']

0    4.0
1    5.0
2    5.0
Name: stars, dtype: float64

0    Mr Hoagie is an institution. Walking in, it do...
1    Excellent food. Superb customer service. I mis...
2    Yes this place is a little out dated and not o...
Name: text, dtype: object


In [4]:

size = 1000000 #1000000
subdata = data.sample(n = size, random_state=520)
subdata = subdata[pd.notnull(subdata['text'])]
print subdata.index
subdata.to_csv('review_sub_1000000.csv', index=False, quoting=csv.QUOTE_NONNUMERIC, sep=',', encoding='utf-8')

Int64Index([1707472, 1676267, 2283435,  560368, 1569011,  842893, 2156635,
            2299255,  444989, 2228475,
            ...
             625320,  591418,  332788, 1959180, 1609764,  188124, 1047915,
            1090222, 1039024,  451996],
           dtype='int64', length=1000000)


In [5]:
# Drop the reference to the full frame, then make the sample the working
# `data` and remove the now-redundant `subdata` name.
del data
data = subdata
del subdata

In [6]:
# Reload the sampled reviews from the CSV written earlier in this notebook.
data = pd.read_csv(
    'review_sub_1000000.csv',
    header=0,
    delimiter=",",
    quoting=csv.QUOTE_NONNUMERIC,
    encoding='utf-8',
)

In [7]:
print data.shape
print data.columns
print data.index

print data.iloc[:5]

(1000000, 2)
Index([u'stars', u'text'], dtype='object')
RangeIndex(start=0, stop=1000000, step=1)
   stars                                               text
0    5.0  Beautiful interior, attentive waitstaff, and o...
1    4.0  I am amazed at the bad reviews for this place....
2    5.0  Support family/local businesses. This place ha...
3    4.0  Love this place. Although in my opinion, it's ...
4    5.0  This photographer is great - he has a good eye...


In [8]:
# Load NLTK's pickled English Punkt tokenizer. It is handed to
# Word2VecUtility.review_to_sentences when the reviews are parsed.
import nltk.data
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [9]:
print data.iloc[:10]['text']

0    Beautiful interior, attentive waitstaff, and o...
1    I am amazed at the bad reviews for this place....
2    Support family/local businesses. This place ha...
3    Love this place. Although in my opinion, it's ...
4    This photographer is great - he has a good eye...
5    I love Tasty Joes! This place is a wonderful l...
6    Great park to celebrate Easter Sunday! Lots of...
7    The only reason I can't give 5 stars is the pa...
8    I got there before the person opening the stor...
9    I just arrived at this KFC to order some wings...
Name: text, dtype: object


In [10]:
review_sents = []
print "Cleaning and parsing the reviews...\n"
for i in xrange( 0, len(data["text"])):
    review_sents += Word2VecUtility.review_to_sentences(data.iloc[i]["text"], tokenizer)
    

Cleaning and parsing the reviews...



  'Beautiful Soup.' % markup)

http://www.phxart.org/slideshow/index.html#/COL/72157606315913654/2677477643/" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
  'Beautiful Soup.' % markup)
http://www.yelp.com/biz_photos/d10IxZPirVJlOSpdRZJczA?select=_fOtfCw4UONRwxFaj68TXw
http://www.yelp.com/biz_photos/d10IxZPirVJlOSpdRZJczA?select=DfQI2P_ONdYqtNV1435YhQ" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % de

In [None]:
# Persist the parsed sentences so the slow cleaning step need not be repeated.
# The `with` block closes the file even if pickling raises part-way through.
# HIGHEST_PROTOCOL selects a binary pickle format, which is far more compact
# and faster than the default for a list this large; pickle.load detects the
# protocol automatically, so readers are unaffected.
with open('review_sents_8352244.pkl', 'wb') as out:
    pickle.dump(review_sents, out, pickle.HIGHEST_PROTOCOL)

In [11]:
print len(review_sents)
print review_sents[:5]

8352244
[[u'beautiful', u'interior', u'attentive', u'waitstaff', u'and', u'oodles', u'of', u'well', u'kept', u'well', u'stocked', u'food', u'stations'], [u'i', u'wasn', u't', u'prepared', u'to', u'be', u'as', u'impressed', u'as', u'i', u'was', u'but', u'the', u'pretty', u'food', u'yes', u'pretty'], [u'kept', u'on', u'calling', u'my', u'name', u'and', u'encouraging', u'me', u'to', u'have', u'just', u'a', u'little', u'more'], [u'i', u'enjoyed', u'everything', u'i', u'sampled', u'prime', u'rib', u'kalbi', u'sushi', u'veggies', u'pasta', u'i', u'didn', u't', u'find', u'anything', u'overcooked', u'or', u'undercooked'], [u'the', u'desserts', u'were', u'also', u'good', u'which', u'was', u'actually', u'a', u'relief']]


In [12]:
# Set values for various parameters
num_features = 300    # Word vector dimensionality                      
min_word_count = 40   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model
print "Training model..."
model = word2vec.Word2Vec(review_sents, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)


2016-12-09 10:25:43,802 : INFO : collecting all words and their counts
2016-12-09 10:25:43,804 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2016-12-09 10:25:43,882 : INFO : PROGRESS: at sentence #10000, processed 141252 words, keeping 9813 word types
2016-12-09 10:25:43,947 : INFO : PROGRESS: at sentence #20000, processed 283932 words, keeping 14068 word types


Training model...


2016-12-09 10:25:44,025 : INFO : PROGRESS: at sentence #30000, processed 424254 words, keeping 17116 word types
2016-12-09 10:25:44,129 : INFO : PROGRESS: at sentence #40000, processed 563486 words, keeping 19662 word types
2016-12-09 10:25:44,202 : INFO : PROGRESS: at sentence #50000, processed 701562 words, keeping 21803 word types
2016-12-09 10:25:44,284 : INFO : PROGRESS: at sentence #60000, processed 839969 words, keeping 23789 word types
2016-12-09 10:25:44,366 : INFO : PROGRESS: at sentence #70000, processed 979018 words, keeping 25595 word types
2016-12-09 10:25:44,430 : INFO : PROGRESS: at sentence #80000, processed 1119562 words, keeping 27317 word types
2016-12-09 10:25:44,501 : INFO : PROGRESS: at sentence #90000, processed 1256451 words, keeping 28697 word types
2016-12-09 10:25:44,567 : INFO : PROGRESS: at sentence #100000, processed 1390466 words, keeping 30349 word types
2016-12-09 10:25:44,631 : INFO : PROGRESS: at sentence #110000, processed 1528738 words, keeping 318

In [13]:
# The file name records the hyper-parameters the model was trained with.
model_name = "300features_40minwords_10context"

# Precompute the L2-normalised word vectors (see the log line below), keeping
# them in place of the originals via replace=True, then write the model to disk.
model.init_sims(replace=True)
model.save(model_name)

2016-12-09 10:33:15,501 : INFO : precomputing L2-norms of word weight vectors
2016-12-09 10:33:15,753 : INFO : saving Word2Vec object under 300features_40minwords_10context, separately None
2016-12-09 10:33:15,754 : INFO : not storing attribute syn0norm
2016-12-09 10:33:15,755 : INFO : not storing attribute cum_table


In [14]:
# Reload the model from the file it was saved under, so the cells that follow
# can be run without retraining.
model = word2vec.Word2Vec.load("300features_40minwords_10context")

2016-12-09 10:33:43,200 : INFO : loading Word2Vec object from 300features_40minwords_10context
2016-12-09 10:33:43,443 : INFO : setting ignored attribute syn0norm to None
2016-12-09 10:33:43,443 : INFO : setting ignored attribute cum_table to None


In [15]:
# Qualitative check: ask the model which of these words is the odd one out.
model.doesnt_match(["man", "woman", "child", "kitchen"])

2016-12-09 10:33:49,276 : INFO : precomputing L2-norms of word weight vectors


'kitchen'

In [16]:
# Same odd-one-out check on domain vocabulary: three drinks and a venue.
model.doesnt_match(["coffee", "tea", "juice", "restaurant"])

'restaurant'

In [17]:
# List the words the model places closest to "delicious", as (word, score)
# pairs; the output below shows near-synonyms and common misspellings.
model.most_similar("delicious")

[(u'delish', 0.8901777863502502),
 (u'yummy', 0.8638631105422974),
 (u'tasty', 0.8494348526000977),
 (u'scrumptious', 0.820398211479187),
 (u'divine', 0.7846555709838867),
 (u'delectable', 0.7669495940208435),
 (u'delicous', 0.7377652525901794),
 (u'fantastic', 0.7245033979415894),
 (u'flavorful', 0.722203254699707),
 (u'devine', 0.7023187279701233)]

In [18]:
# Same nearest-neighbour query for "chinese"; the output below lists other
# cuisines and nationalities.
model.most_similar("chinese")

[(u'vietnamese', 0.7223291397094727),
 (u'filipino', 0.716808021068573),
 (u'mexican', 0.7135070562362671),
 (u'cantonese', 0.7013521194458008),
 (u'taiwanese', 0.6980413198471069),
 (u'asian', 0.6958228349685669),
 (u'thai', 0.6908755302429199),
 (u'japanese', 0.6811544895172119),
 (u'korean', 0.6750606298446655),
 (u'indian', 0.6693476438522339)]

In [19]:
# Print the learned vector for a single word, then the shape of the model's
# weight matrix `syn0` (presumably vocabulary size x num_features — the
# printed shape is cut off in the saved output, so confirm on a re-run).
print model["chinese"]
print model.syn0.shape

[ -1.40944328e-02  -7.18209669e-02  -9.84909907e-02   1.02585241e-01
   1.09098796e-02  -7.33855218e-02  -4.50062379e-02   1.15435444e-01
  -4.68889177e-02  -3.32103763e-03   4.70846407e-02  -1.86671875e-02
   5.46189351e-03   5.75973280e-02   9.12197158e-02  -8.42572469e-03
   7.62225408e-03  -2.49636341e-02  -8.11872482e-02  -3.92800383e-02
   8.26988593e-02  -1.93529911e-02   5.03941625e-02   5.79052381e-02
   4.75062169e-02  -4.44137305e-02   1.28967417e-02   4.94462140e-02
  -1.33739598e-02  -5.83798662e-02  -2.05667168e-02  -4.61525358e-02
   4.78268676e-02  -3.11217774e-02   2.46705674e-03  -6.19226024e-02
   3.82021181e-02  -4.29342082e-03   7.05193058e-02   5.44916615e-02
  -1.21664125e-02   1.22396775e-01   1.70394685e-02   4.67705727e-02
   2.35813279e-02  -1.04783677e-01   8.23185500e-03  -4.63259444e-02
  -2.38509662e-03   2.46781968e-02  -1.49675384e-01   2.46371254e-02
   5.63692078e-02  -3.91914956e-02  -1.62780937e-02  -8.65676776e-02
   1.50088025e-02   5.29509522e-02

In [20]:
review_words = []
print type(model.index2word)
print len(model.index2word)
print model.index2word[:100]
index2word_set = set(model.index2word)
print len(index2word_set)

<type 'list'>
28002
[u'the', u'and', u'i', u'a', u'to', u'was', u'it', u'of', u'is', u'for', u'in', u'my', u'that', u'you', u'we', u'with', u'this', u'they', u'but', u'on', u't', u'have', u's', u'not', u'had', u'so', u'at', u'were', u'are', u'good', u'place', u'food', u'be', u'as', u'there', u'great', u'me', u'very', u'all', u'if', u'out', u'here', u'like', u'just', u'our', u'service', u'get', u'one', u'time', u'from', u'when', u'their', u'can', u'or', u'would', u'up', u'back', u'go', u'about', u'an', u'really', u'he', u'what', u'will', u'which', u'some', u'she', u'been', u'no', u'your', u'only', u'more', u'also', u'by', u'us', u've', u'because', u'them', u'got', u'nice', u'even', u'don', u'other', u'm', u'best', u'do', u'well', u'too', u'after', u'love', u'has', u'always', u'than', u'did', u'little', u'first', u'ordered', u'didn', u'staff', u'came']
28002


In [21]:
words = Word2VecUtility.review_to_wordlist(data.iloc[0]['text'])
print words
for word in words:
    print word in index2word_set

[u'beautiful', u'interior', u'attentive', u'waitstaff', u'and', u'oodles', u'of', u'well', u'kept', u'well', u'stocked', u'food', u'stations', u'i', u'wasn', u't', u'prepared', u'to', u'be', u'as', u'impressed', u'as', u'i', u'was', u'but', u'the', u'pretty', u'food', u'yes', u'pretty', u'kept', u'on', u'calling', u'my', u'name', u'and', u'encouraging', u'me', u'to', u'have', u'just', u'a', u'little', u'more', u'i', u'enjoyed', u'everything', u'i', u'sampled', u'prime', u'rib', u'kalbi', u'sushi', u'veggies', u'pasta', u'i', u'didn', u't', u'find', u'anything', u'overcooked', u'or', u'undercooked', u'the', u'desserts', u'were', u'also', u'good', u'which', u'was', u'actually', u'a', u'relief', u'too', u'often', u'i', u'find', u'that', u'desserts', u'at', u'a', u'buffet', u'look', u'amazing', u'but', u'fall', u'short', u'on', u'taste', u'not', u'here', u'at', u'least', u'to', u'my', u'tastebuds', u'weekday', u'lunch', u'to', u'dinner', u'turnover', u'starts', u'at', u'pm', u'so', u'don',

In [22]:
clean_labels = np.array(data["stars"])
print clean_labels[:10], clean_labels.shape
clean_labels[clean_labels <= 3] = 0
clean_labels[clean_labels > 3] = 1
print clean_labels[:10]
# num of positive reviews
print (clean_labels == 1).sum()

[ 5.  4.  5.  4.  5.  5.  5.  4.  1.  1.] (1000000,)
[ 1.  1.  1.  1.  1.  1.  1.  1.  0.  0.]
672703
