In [52]:
%matplotlib inline
import re
import logging
import os
import time

import numpy as np
import pandas as pd
import nltk.data

from bs4 import BeautifulSoup as bs
from nltk.corpus import stopwords
from gensim.models import word2vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans

# Emit INFO-level log records prefixed with a timestamp and the level name
# (this is the format of the Word2Vec loading messages in the outputs below).
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# Reading data

In [3]:
# Load the three tab-separated review files. quoting=3 is csv.QUOTE_NONE, so
# double quotes inside the review text are kept as ordinary characters.
train = pd.read_csv('data/labeledTrainData.tsv', sep='\t', quoting=3)
test = pd.read_csv('data/testData.tsv', sep='\t', quoting=3)
unlabeled_train = pd.read_csv('data/unlabeledTrainData.tsv', sep='\t',
                              quoting=3)

# One-line summary of how many reviews each file contributed.
print(
    'Read {} labeled train reviews, {} labeled test reviews,'
    ' and {} unlabeled reviews'.format(
        *(frame.review.size for frame in (train, test, unlabeled_train))
    )
)
# Last expression of the cell: display the first rows of the training frame.
train.head(5)

Read 25000 labeled train reviews, 25000 labeled test reviews, and 50000 unlabeled reviews


Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


# Preprocessing

In [4]:
# Matches every character that is NOT an ASCII letter or digit, so that it can
# be replaced by a space during tokenization. The upper-case range must be
# `A-Z`: writing `A-z` would also cover the punctuation that sits between 'Z'
# and 'a' in ASCII ([ \ ] ^ _ `) and leave those characters inside tokens.
pat = re.compile(r'[^a-zA-Z0-9]')
# English stop words from NLTK, held in a set for fast membership tests in
# review_to_wordlist.
stops = set(stopwords.words('english'))


def review_to_wordlist(review, remove_stopwords=False):
    """Turn one raw review (or sentence) into a list of lower-case tokens.

    The markup is stripped with BeautifulSoup, every character matched by
    the module-level ``pat`` is replaced by a space, and the result is
    lower-cased and split on whitespace.

    Args:
        review: Raw text, possibly containing HTML markup.
        remove_stopwords: When true, tokens found in the module-level
            ``stops`` set are dropped.

    Returns:
        list of str: The tokens in their original order.
    """
    plain_text = bs(review, 'lxml').get_text()
    tokens = pat.sub(' ', plain_text).lower().split()
    if not remove_stopwords:
        return tokens
    return [token for token in tokens if token not in stops]

In [5]:
# Load NLTK's pickled English punkt tokenizer; it is passed to
# review_to_sentences to split reviews into sentences.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')


def review_to_sentences(review, tokenizer, remove_stopwords=False):
    """Split a review into sentences and tokenize each of them.

    Args:
        review: Raw review text.
        tokenizer: Object whose ``tokenize(text)`` method returns the
            sentences of ``text`` as strings.
        remove_stopwords: Forwarded to ``review_to_wordlist``.

    Returns:
        list of list of str: One token list per non-empty sentence.
    """
    return [
        review_to_wordlist(sentence_text, remove_stopwords)
        for sentence_text in tokenizer.tokenize(review.strip())
        if len(sentence_text) > 0  # skip empty sentences
    ]

In [6]:
# Collect tokenized sentences from both the labeled and the unlabeled reviews;
# this list is the training corpus handed to Word2Vec further down.
sentences = []

print('Parsing sentences from training set')
for review in train.review:
    sentences.extend(review_to_sentences(review, tokenizer))

print('Parsing sentences from unlabeled set')
for review in unlabeled_train.review:
    sentences.extend(review_to_sentences(review, tokenizer))

Parsing sentences from training set


  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


Parsing setntences from unlabeled set


  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


In [7]:
# Sanity check: total number of tokenized sentences and the first one.
print(len(sentences))
print(sentences[0])

795538
['with', 'all', 'this', 'stuff', 'going', 'down', 'at', 'the', 'moment', 'with', 'mj', 'i', 've', 'started', 'listening', 'to', 'his', 'music', 'watching', 'the', 'odd', 'documentary', 'here', 'and', 'there', 'watched', 'the', 'wiz', 'and', 'watched', 'moonwalker', 'again']


In [9]:
num_features = 300  # Word vector dimensionality
min_word_count = 40  # Minimum word count
num_workers = 6  # Number of threads to run in parallel
context = 10  # Context window size
downsampling = 1e-3  # Downsample setting for frequent words
# The cache path is derived from the hyperparameters so that changing one of
# them cannot silently load a model trained with the old settings.
model_name = 'data/{}features_{}minwords_{}context'.format(
    num_features, min_word_count, context)

# Reuse a previously saved model when one exists; otherwise train and save it.
if os.path.exists(model_name):
    print('Loading Word2Vec model...')
    model = word2vec.Word2Vec.load(model_name)
else:
    print('Training Word2Vec model...')
    model = word2vec.Word2Vec(sentences, workers=num_workers, size=num_features,
                              min_count=min_word_count, window=context, sample=downsampling)
    # If you don't plan to train the model any further, calling
    # init_sims will make the model much more memory-efficient.
    model.init_sims(replace=True)
    model.save(model_name)

2017-03-12 11:18:18,874 : INFO : loading Word2Vec object from data/300features_40minwords_10context


Loading Word2Vec model...


2017-03-12 11:18:19,231 : INFO : loading wv recursively from data/300features_40minwords_10context.wv.* with mmap=None
2017-03-12 11:18:19,232 : INFO : setting ignored attribute syn0norm to None
2017-03-12 11:18:19,232 : INFO : setting ignored attribute cum_table to None
2017-03-12 11:18:19,233 : INFO : loaded data/300features_40minwords_10context


In [10]:
# Qualitative check of the embeddings: ask the model which of the four words
# does not belong with the others.
model.doesnt_match("france england germany berlin".split())

2017-03-12 11:18:21,882 : INFO : precomputing L2-norms of word weight vectors


'berlin'

In [11]:
# Words the model ranks as most similar to 'queen', with their scores.
model.most_similar('queen')

[('princess', 0.6955322027206421),
 ('bride', 0.6511538028717041),
 ('maid', 0.6167219877243042),
 ('victoria', 0.5992250442504883),
 ('belle', 0.5990269184112549),
 ('mistress', 0.597449779510498),
 ('prince', 0.5903932452201843),
 ('stepmother', 0.5897886753082275),
 ('latifah', 0.5751689672470093),
 ('regina', 0.5739089250564575)]

In [12]:
# Same check for a sentiment-bearing word.
model.most_similar('awful')

[('terrible', 0.7704759836196899),
 ('atrocious', 0.7451006174087524),
 ('horrible', 0.7289239168167114),
 ('horrendous', 0.7174592018127441),
 ('abysmal', 0.711361289024353),
 ('dreadful', 0.7002981901168823),
 ('horrid', 0.689757227897644),
 ('appalling', 0.6655340194702148),
 ('lousy', 0.6393383741378784),
 ('amateurish', 0.6360909938812256)]

# Attempt 1: Vector Averaging

In [47]:
def make_feature_vec(words, model, num_features):
    """Average the word vectors of all in-vocabulary words of one review.

    Args:
        words: Iterable of tokens.
        model: Trained Word2Vec model; its ``wv`` attribute is used both for
            the vocabulary membership test and for the vector lookup.
        num_features: Dimensionality of the word vectors.

    Returns:
        numpy.ndarray: Vector of length ``num_features``. It is all zeros
        when none of the words is in the model's vocabulary.
    """
    feature_vec = np.zeros((num_features,), dtype='float32')

    # Test membership directly against the keyed vectors instead of building
    # a vocabulary-sized set on every call (this runs once per review).
    vectors = model.wv

    nwords = 0
    for word in words:
        if word in vectors:
            nwords += 1
            feature_vec = np.add(feature_vec, vectors[word])
    # Guard against division by zero for reviews without any known word.
    if nwords:
        feature_vec = np.divide(feature_vec, nwords)
    return feature_vec


def get_avg_feature_vecs(reviews, model, num_features):
    """Build a matrix with one averaged word vector per review.

    Args:
        reviews: Sequence of token lists, one per review.
        model: Trained Word2Vec model, forwarded to ``make_feature_vec``.
        num_features: Dimensionality of the word vectors.

    Returns:
        numpy.ndarray: float32 array of shape
        ``(len(reviews), num_features)``.
    """
    total = len(reviews)
    averaged = np.zeros((total, num_features), dtype='float32')

    for row, tokens in enumerate(reviews):
        # Progress report every 1000 reviews.
        if row % 1000 == 0:
            print('Review {} of {}'.format(row, total))
        averaged[row] = make_feature_vec(tokens, model, num_features)

    return averaged

In [48]:
# Tokenize every labeled review (stop words kept) and average its word vectors.
print('Creating average feature vecs for train reviews')
clean_train_reviews = [review_to_wordlist(text, remove_stopwords=False)
                       for text in train.review]
train_data_vecs = get_avg_feature_vecs(clean_train_reviews, model,
                                       num_features)

# Same treatment for the test reviews.
print('Creating average feature vecs for test reviews')
clean_test_reviews = [review_to_wordlist(text, remove_stopwords=False)
                      for text in test.review]
test_data_vecs = get_avg_feature_vecs(clean_test_reviews, model,
                                      num_features)

Creating average feature vecs for train reviews
Review 0 of 25000
Review 1000 of 25000
Review 2000 of 25000
Review 3000 of 25000
Review 4000 of 25000
Review 5000 of 25000
Review 6000 of 25000
Review 7000 of 25000
Review 8000 of 25000
Review 9000 of 25000
Review 10000 of 25000
Review 11000 of 25000
Review 12000 of 25000
Review 13000 of 25000
Review 14000 of 25000
Review 15000 of 25000
Review 16000 of 25000
Review 17000 of 25000
Review 18000 of 25000
Review 19000 of 25000
Review 20000 of 25000
Review 21000 of 25000
Review 22000 of 25000
Review 23000 of 25000
Review 24000 of 25000
Creating average feature vecs for test reviews
Review 0 of 25000
Review 1000 of 25000
Review 2000 of 25000
Review 3000 of 25000
Review 4000 of 25000
Review 5000 of 25000
Review 6000 of 25000
Review 7000 of 25000
Review 8000 of 25000
Review 9000 of 25000
Review 10000 of 25000
Review 11000 of 25000
Review 12000 of 25000
Review 13000 of 25000
Review 14000 of 25000
Review 15000 of 25000
Review 16000 of 25000
Review 

# Training model

In [49]:
# Fit a 100-tree random forest on the averaged word vectors and predict the
# sentiment of the test reviews. random_state is fixed so that re-running the
# notebook reproduces the same forest and therefore the same predictions.
clf = RandomForestClassifier(n_estimators=100, random_state=0)

clf.fit(train_data_vecs, train.sentiment)
h = clf.predict(test_data_vecs)

In [51]:
# Write the predictions as a two-column CSV (review id, predicted sentiment).
# quoting=3 is csv.QUOTE_NONE. NOTE: the clustering section further down
# writes to the same path, so running it overwrites this file.
ans = pd.DataFrame({'id': test.id, 'sentiment': h})
ans.to_csv('data/submit.csv', index=False, quoting=3)

# Attempt 2: Clustering

In [56]:
start = time.time()

# Cluster the word vectors themselves; the number of clusters is one fifth of
# the vocabulary size (integer division).
word_vectors = model.wv.syn0
num_clusters = word_vectors.shape[0] // 5

# random_state is fixed so that cluster labels are reproducible across runs;
# the bag-of-centroids features below depend on them.
kmeans_clustering = KMeans(n_clusters=num_clusters, random_state=0)
idx = kmeans_clustering.fit_predict(word_vectors)

end = time.time()
elapsed = end - start
print('Time taken for K Means clustering: {:.3f}'.format(elapsed))

Time taken for K Means clustering: 415.641


In [58]:
# Map each vocabulary word to the cluster label K-means assigned to it.
# Assumes model.wv.index2word is ordered like the rows of model.wv.syn0 that
# were clustered — TODO confirm.
word_centroid_map = dict(zip(model.wv.index2word, idx))

In [62]:
# Print the words assigned to each of the first 10 clusters.
for cluster in range(10):
    print('Cluster {}'.format(cluster))

    # The comprehension variable is deliberately not named `idx`: that global
    # holds the K-means label array and should not be shadowed.
    words = [word for word, centroid in word_centroid_map.items()
             if centroid == cluster]
    print(words)

Cluster 0
['roberta', 'malibu', 'dell']
Cluster 1
['population', 'outsiders', 'nukie', 'populace', 'ownership']
Cluster 2
['delivered']
Cluster 3
['wheelchair', 'photograph', 'brothel', 'convent', 'villa', 'workplace', 'detention']
Cluster 4
['perfect', 'flawless', 'commendable']
Cluster 5
['francis', 'ross', 'shelley', 'minnelli', 'irving', 'crosby', 'judd', 'bing', 'burnett', 'jeanette', 'lanza']
Cluster 6
['pictures', 'classics', 'productions', 'westerns', 'musicals', 'masterpieces', 'remakes', 'epics', 'serials', 'noirs', 'melodramas']
Cluster 7
['recorded', 'taped', 'switched']
Cluster 8
['psycho', 'freak', 'maniac', 'madman', 'biker', 'redneck', 'scarecrow', 'leatherface', 'lunatic', 'bloke', 'weirdo', 'grizzly']
Cluster 9
['joyous', 'giddy']


In [64]:
def create_bag_of_centroids(word_list, word_centroid_map, num_centroids=None):
    """Count how many words of a review fall into each word cluster.

    Args:
        word_list: Iterable of tokens.
        word_centroid_map: Mapping from word to integer cluster index.
        num_centroids: Length of the returned vector. When omitted it is
            derived as the largest cluster index in ``word_centroid_map``
            plus one (zero for an empty map). Passing it explicitly avoids
            rescanning the whole map on every call and keeps the length
            fixed even if the highest-numbered clusters contain no word.

    Returns:
        numpy.ndarray: float32 vector of length ``num_centroids`` whose
        entry ``i`` is the number of tokens mapped to cluster ``i``. Tokens
        missing from ``word_centroid_map`` are ignored.
    """
    if num_centroids is None:
        num_centroids = max(word_centroid_map.values(), default=-1) + 1
    bag_of_centroids = np.zeros(num_centroids, dtype='float32')

    for word in word_list:
        # Single dictionary lookup; None marks an out-of-vocabulary token.
        index = word_centroid_map.get(word)
        if index is not None:
            bag_of_centroids[index] += 1

    return bag_of_centroids

In [70]:
# Bag-of-centroids features: one row per review, one column per cluster.
# The loop variable is named `row`, not `idx`: `idx` is the global array of
# K-means labels, and overwriting it with an int would break any re-run of
# the cell that builds word_centroid_map from it.
train_centroids = np.zeros((train.review.size, num_clusters),
                           dtype='float32')

for row, review in enumerate(clean_train_reviews):
    if row % 1000 == 0:
        print('Create bag of centroids for train set: {} / {}'.format(
            row, len(clean_train_reviews)))
    train_centroids[row] = create_bag_of_centroids(review,
                                                   word_centroid_map)

test_centroids = np.zeros((test.review.size, num_clusters),
                          dtype='float32')

for row, review in enumerate(clean_test_reviews):
    if row % 1000 == 0:
        print('Create bag of centroids for test set: {} / {}'.format(
            row, len(clean_test_reviews)))
    test_centroids[row] = create_bag_of_centroids(review,
                                                  word_centroid_map)

Create bag of centroids for train set: 0 / 25000
Create bag of centroids for train set: 1000 / 25000
Create bag of centroids for train set: 2000 / 25000
Create bag of centroids for train set: 3000 / 25000
Create bag of centroids for train set: 4000 / 25000
Create bag of centroids for train set: 5000 / 25000
Create bag of centroids for train set: 6000 / 25000
Create bag of centroids for train set: 7000 / 25000
Create bag of centroids for train set: 8000 / 25000
Create bag of centroids for train set: 9000 / 25000
Create bag of centroids for train set: 10000 / 25000
Create bag of centroids for train set: 11000 / 25000
Create bag of centroids for train set: 12000 / 25000
Create bag of centroids for train set: 13000 / 25000
Create bag of centroids for train set: 14000 / 25000
Create bag of centroids for train set: 15000 / 25000
Create bag of centroids for train set: 16000 / 25000
Create bag of centroids for train set: 17000 / 25000
Create bag of centroids for train set: 18000 / 25000
Create

In [71]:
# Fit a 100-tree random forest on the bag-of-centroids features and predict
# the sentiment of the test reviews. random_state is fixed so that re-running
# the notebook reproduces the same predictions.
clf = RandomForestClassifier(n_estimators=100, random_state=0)
print('Fitting a random forest to labeled training data...')
clf.fit(train_centroids, train.sentiment)
h = clf.predict(test_centroids)

Fitting a random forest to labeled training data...


In [72]:
# Write the bag-of-centroids predictions as a two-column CSV (review id,
# predicted sentiment). quoting=3 is csv.QUOTE_NONE. NOTE: this is the same
# path the vector-averaging section writes to, so that file is overwritten.
ans = pd.DataFrame({'id': test.id, 'sentiment': h})
ans.to_csv('data/submit.csv', index=False, quoting=3)