In [41]:
import gensim
from gensim.models import word2vec
import pandas as pd
from bs4 import BeautifulSoup
import re
import nltk.data
from nltk.corpus import stopwords
import numpy as np
from keras.models import Sequential
from keras.layers import Dropout, Dense, Activation
from keras.utils import np_utils

In [2]:
# All three review files are tab-separated with a header row. quoting=3 is
# csv.QUOTE_NONE, so double quotes inside review text are kept as literal characters.
_tsv_read_options = dict(header=0, delimiter="\t", quoting=3, encoding='utf-8')

# Labelled reviews (used for supervised training), the held-out test reviews,
# and the extra unlabelled reviews (used only to train the word embeddings).
train = pd.read_csv("data/labeledTrainData.tsv", **_tsv_read_options)
test = pd.read_csv("data/testData.tsv", **_tsv_read_options)
unlabeled_train = pd.read_csv("data/unlabeledTrainData.tsv", **_tsv_read_options)

In [3]:
def review2list(review):
    """Turn a piece of raw review text into a list of lowercase alphabetic tokens.

    The text is first passed through BeautifulSoup to drop markup, then every
    character that is not an ASCII letter is replaced by a space, and the
    result is lower-cased and split on whitespace.

    Note: a later cell redefines ``review2list`` with an optional
    ``remove_stopwords`` flag; this version is the one in effect while the
    Word2Vec training sentences are being built.

    Args:
        review: Raw text (may contain HTML).

    Returns:
        list of str: the tokens, in their original order.
    """
    plain_text = BeautifulSoup(review).get_text()
    letters_only = re.sub("[^a-zA-Z]", " ", plain_text)
    return letters_only.lower().split()

In [4]:
# Load NLTK's English Punkt sentence tokenizer; it is used below to split each
# review into sentences before word-level tokenisation.
# NOTE(review): the resource is a pickle file, so it should only be loaded from
# a trusted NLTK data directory.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [5]:
def review2sentences(review, tokenizer):
    """Split one review into sentences and tokenise each sentence.

    Args:
        review: Raw review text.
        tokenizer: Object with a ``tokenize(text)`` method returning the
            sentences of ``text`` (here, the NLTK Punkt tokenizer).

    Returns:
        list of list of str: one token list per non-empty sentence, as
        produced by ``review2list``.
    """
    return [
        review2list(sentence_text)
        for sentence_text in tokenizer.tokenize(review.strip())
        if len(sentence_text) > 0
    ]

In [14]:
# Start the Word2Vec training corpus with the sentences of the labelled reviews.
# `sentences` is a flat list of token lists (one entry per sentence).
sentences = []
for review_number, review in enumerate(train["review"], start=1):
    # Progress heartbeat every 1000 reviews. The parenthesised form prints the
    # same text on Python 2 and is also valid on Python 3.
    if review_number % 1000 == 0:
        print(review_number)
    sentences.extend(review2sentences(review, tokenizer))

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000


In [15]:
# Extend the corpus with the sentences of the unlabelled reviews.
# NOTE: this appends to the existing `sentences` list, so running this cell
# twice without re-running the labelled-data cell duplicates these sentences.
for review_number, review in enumerate(unlabeled_train["review"], start=1):
    # Progress heartbeat every 1000 reviews. The parenthesised form prints the
    # same text on Python 2 and is also valid on Python 3.
    if review_number % 1000 == 0:
        print(review_number)
    sentences.extend(review2sentences(review, tokenizer))

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000


In [16]:
# Total number of tokenised sentences collected from the labelled and unlabelled reviews.
len(sentences)

795538

In [19]:
# Word2Vec hyper-parameters.
num_features = 300  # passed as `size`; also the width of the averaged review vectors built later
min_word_count = 40  # passed as `min_count` (presumably the minimum corpus frequency for a word to be kept)
num_workers = 4  # passed as `workers`
context = 10  # passed as `window`

# Train the embedding on every sentence gathered above.
# NOTE(review): later cells rebind the name `model` to a Keras Sequential
# network; after that point the feature-vector cells cannot be re-run without
# re-running (or re-loading) this one.
model = word2vec.Word2Vec(sentences, 
                          workers=num_workers, 
                          size = num_features,
                          min_count = min_word_count,
                          window = context)

# NOTE(review): per the gensim docs, init_sims(replace=True) keeps only the
# normalised vectors, so the model cannot be trained further — confirm for the
# installed gensim version.
model.init_sims(replace=True)

# Persist the trained model; the file name encodes the hyper-parameters used.
model.save("300features_40minwords_10context")

In [20]:
# Quick sanity check of the embedding: ask the model which of the four words
# does not fit with the rest.
model.doesnt_match("man woman child kitchen".split())

'kitchen'

In [31]:
def make_feature_vector(words, model, num_features):
    """Average the word vectors of all in-vocabulary words of one review.

    Args:
        words: Iterable of tokens for a single review.
        model: Trained gensim Word2Vec model; only ``model.wv`` is used
            (membership test and vector lookup).
        num_features: Dimensionality of the word vectors.

    Returns:
        numpy.ndarray: float32 vector of shape ``(num_features,)``. If none of
        the words is in the vocabulary (or ``words`` is empty) the zero vector
        is returned instead of NaNs from a division by zero.
    """
    keyed_vectors = model.wv
    featurevec = np.zeros((num_features,), dtype="float32")
    nwords = 0
    for word in words:
        # Membership on the keyed vectors is a dict lookup, so there is no
        # need to rebuild a set of the whole vocabulary on every call.
        if word in keyed_vectors:
            nwords += 1
            featurevec = np.add(featurevec, keyed_vectors[word])

    # Guard the empty case: 0/0 would otherwise fill the row with NaN and
    # silently poison any model trained on the resulting feature matrix.
    if nwords == 0:
        return featurevec
    return np.divide(featurevec, nwords)

In [22]:
def get_avg_vector(reviews, model, num_features):
    """Build the feature matrix of averaged word vectors for a list of reviews.

    Args:
        reviews: Sequence of token lists, one per review.
        model: Trained Word2Vec model, forwarded to ``make_feature_vector``.
        num_features: Dimensionality of the word vectors.

    Returns:
        numpy.ndarray: float32 array of shape ``(len(reviews), num_features)``
        whose row ``k`` is the averaged vector of ``reviews[k]``.
    """
    total = len(reviews)
    reviewfeaturevec = np.zeros((total, num_features), dtype="float32")
    for counter, review in enumerate(reviews):
        # Progress message every 1000 reviews. Formatting the string before
        # printing gives the same output on Python 2 and is valid on Python 3.
        if counter % 1000 == 0:
            print("Review %d of %d" % (counter, total))
        reviewfeaturevec[counter] = make_feature_vector(review, model, num_features)
    return reviewfeaturevec

In [25]:
# Stop-word sets keyed by language, filled on first use so the NLTK corpus is
# loaded once instead of once per review.
_STOPWORDS_BY_LANGUAGE = {}


def _cached_stopwords(language):
    """Return the NLTK stop-word list for ``language`` as a frozenset, loading it on first use."""
    if language not in _STOPWORDS_BY_LANGUAGE:
        _STOPWORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOPWORDS_BY_LANGUAGE[language]


def review2list(review, remove_stopwords=False):
    """Turn a piece of raw review text into a list of lowercase alphabetic tokens.

    This redefines the earlier one-argument ``review2list``; with the default
    ``remove_stopwords=False`` it behaves the same way.

    Args:
        review: Raw text (may contain HTML, which is removed via BeautifulSoup).
        remove_stopwords: When True, drop tokens that appear in NLTK's English
            stop-word list.

    Returns:
        list of str: the tokens, in their original order.
    """
    review_text = BeautifulSoup(review).get_text()
    # Replace everything that is not an ASCII letter by a space.
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    words = review_text.lower().split()
    if remove_stopwords:
        stop = _cached_stopwords("english")
        words = [w for w in words if w not in stop]
    return words

In [32]:
# Tokenise every labelled review with stop words removed, then collapse each
# review into the average of its word vectors.
clean_train_reviews = [
    review2list(review, remove_stopwords=True) for review in train["review"]
]
traindatavecs = get_avg_vector(clean_train_reviews, model, num_features)

# Same pipeline for the test reviews.
clean_test_reviews = [
    review2list(review, remove_stopwords=True) for review in test["review"]
]
testdatavecs = get_avg_vector(clean_test_reviews, model, num_features)

Review 0 of 25000
Review 1000 of 25000
Review 2000 of 25000
Review 3000 of 25000
Review 4000 of 25000
Review 5000 of 25000
Review 6000 of 25000
Review 7000 of 25000
Review 8000 of 25000
Review 9000 of 25000
Review 10000 of 25000
Review 11000 of 25000
Review 12000 of 25000
Review 13000 of 25000
Review 14000 of 25000
Review 15000 of 25000
Review 16000 of 25000
Review 17000 of 25000
Review 18000 of 25000
Review 19000 of 25000
Review 20000 of 25000
Review 21000 of 25000
Review 22000 of 25000
Review 23000 of 25000
Review 24000 of 25000
Review 0 of 25000
Review 1000 of 25000
Review 2000 of 25000
Review 3000 of 25000
Review 4000 of 25000
Review 5000 of 25000
Review 6000 of 25000
Review 7000 of 25000
Review 8000 of 25000
Review 9000 of 25000
Review 10000 of 25000
Review 11000 of 25000
Review 12000 of 25000
Review 13000 of 25000
Review 14000 of 25000
Review 15000 of 25000
Review 16000 of 25000
Review 17000 of 25000
Review 18000 of 25000
Review 19000 of 25000
Review 20000 of 25000
Review 21000 o

In [35]:
# Check the feature matrix: one row per training review, one column per embedding dimension.
traindatavecs.shape

(25000, 300)

In [42]:
# Convert the `sentiment` column into a 2-class categorical target for the
# Keras models below (presumably one-hot rows — confirm against np_utils docs).
train_labels = np_utils.to_categorical(train['sentiment'], 2)

In [48]:
# Baseline classifier: a single softmax layer on top of the averaged word vectors.
#
# NOTE(review): this rebinds `model`, which previously held the Word2Vec model.
# After this cell runs, the feature-vector cells cannot be re-run until the
# Word2Vec model is rebuilt or re-loaded.
# NOTE(review): mean squared error on a softmax output does train, but
# 'categorical_crossentropy' is the conventional loss for one-hot targets. The
# loss is left unchanged here so the recorded metrics stay comparable.

# Derive the layer sizes from the data instead of hard-coding 300 and 2, so the
# network keeps matching the features if the embedding width is changed.
input_dim = traindatavecs.shape[1]
num_classes = train_labels.shape[1]

model = Sequential()
model.add(Dense(input_shape=(input_dim,), units=num_classes))
model.add(Activation('softmax'))

model.compile(loss='mse',
              optimizer='adam',
              metrics=['accuracy'])

# validation_split holds out 20% of the training rows for validation.
history = model.fit(traindatavecs, train_labels, validation_split=0.20, verbose=2, epochs=25)

Train on 20000 samples, validate on 5000 samples
Epoch 1/25
0s - loss: 0.2142 - acc: 0.7436 - val_loss: 0.1925 - val_acc: 0.7744
Epoch 2/25
0s - loss: 0.1771 - acc: 0.7913 - val_loss: 0.1687 - val_acc: 0.8010
Epoch 3/25
0s - loss: 0.1579 - acc: 0.8144 - val_loss: 0.1544 - val_acc: 0.8142
Epoch 4/25
0s - loss: 0.1455 - acc: 0.8282 - val_loss: 0.1450 - val_acc: 0.8224
Epoch 5/25
0s - loss: 0.1368 - acc: 0.8369 - val_loss: 0.1375 - val_acc: 0.8330
Epoch 6/25
0s - loss: 0.1304 - acc: 0.8429 - val_loss: 0.1324 - val_acc: 0.8362
Epoch 7/25
0s - loss: 0.1256 - acc: 0.8470 - val_loss: 0.1280 - val_acc: 0.8402
Epoch 8/25
0s - loss: 0.1218 - acc: 0.8497 - val_loss: 0.1250 - val_acc: 0.8432
Epoch 9/25
0s - loss: 0.1188 - acc: 0.8522 - val_loss: 0.1220 - val_acc: 0.8464
Epoch 10/25
0s - loss: 0.1162 - acc: 0.8547 - val_loss: 0.1198 - val_acc: 0.8488
Epoch 11/25
0s - loss: 0.1141 - acc: 0.8549 - val_loss: 0.1178 - val_acc: 0.8504
Epoch 12/25
0s - loss: 0.1124 - acc: 0.8557 - val_loss: 0.1162 - val_

In [50]:
# Second classifier: one 150-unit ReLU hidden layer followed by a softmax output.
#
# NOTE(review): this rebinds `model` again, replacing the previous network (and,
# before that, the Word2Vec model).
# NOTE(review): mean squared error on a softmax output does train, but
# 'categorical_crossentropy' is the conventional loss for one-hot targets. The
# loss is left unchanged here so the recorded metrics stay comparable.

# Derive the input and output sizes from the data instead of hard-coding 300
# and 2, so the network keeps matching the features if the embedding width is
# changed.
input_dim = traindatavecs.shape[1]
num_classes = train_labels.shape[1]

model = Sequential()
model.add(Dense(input_shape=(input_dim,), units=150))
model.add(Activation('relu'))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

model.compile(loss='mse',
              optimizer='adam',
              metrics=['accuracy'])

# validation_split holds out 20% of the training rows for validation.
history = model.fit(traindatavecs, train_labels, validation_split=0.20, verbose=2, epochs=25)

Train on 20000 samples, validate on 5000 samples
Epoch 1/25
1s - loss: 0.1233 - acc: 0.8377 - val_loss: 0.1035 - val_acc: 0.8598
Epoch 2/25
1s - loss: 0.0977 - acc: 0.8650 - val_loss: 0.1000 - val_acc: 0.8654
Epoch 3/25
1s - loss: 0.0953 - acc: 0.8700 - val_loss: 0.0981 - val_acc: 0.8652
Epoch 4/25
1s - loss: 0.0938 - acc: 0.8715 - val_loss: 0.0972 - val_acc: 0.8644
Epoch 5/25
1s - loss: 0.0928 - acc: 0.8729 - val_loss: 0.1020 - val_acc: 0.8588
Epoch 6/25
1s - loss: 0.0916 - acc: 0.8746 - val_loss: 0.0966 - val_acc: 0.8696
Epoch 7/25
1s - loss: 0.0910 - acc: 0.8758 - val_loss: 0.0957 - val_acc: 0.8666
Epoch 8/25
1s - loss: 0.0902 - acc: 0.8765 - val_loss: 0.0978 - val_acc: 0.8692
Epoch 9/25
1s - loss: 0.0896 - acc: 0.8781 - val_loss: 0.0957 - val_acc: 0.8680
Epoch 10/25
1s - loss: 0.0886 - acc: 0.8805 - val_loss: 0.0970 - val_acc: 0.8664
Epoch 11/25
1s - loss: 0.0880 - acc: 0.8814 - val_loss: 0.0946 - val_acc: 0.8694
Epoch 12/25
1s - loss: 0.0877 - acc: 0.8819 - val_loss: 0.0960 - val_