In [1]:
# pre-requisites from part 2

import pandas as pd

train = pd.read_csv("labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
test = pd.read_csv("testData.tsv", header=0, delimiter="\t", quoting=3, encoding='utf8')
unlabeled_train = pd.read_csv("unlabeledTrainData.tsv", header=0, delimiter="\t", quoting=3)

from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

def review_to_wordlist (review, remove_stopwords = False) :
    #remove html
    review_text = BeautifulSoup(review).get_text()
    #remove non-letters
    review_text = re.sub("[^a-zA-Z]"," ", review_text)
    #lowercase and split
    words = review_text.lower().split()
    #remove stopwords
    if remove_stopwords :
            stops = set(stopwords.words("english"))
            words = [w for w in words if not w in stops]
    #return a list
    return (words)

import nltk.data

# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Define a function to split a review into parsed sentences
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    # Function to split a review into parsed sentences. Returns a list of sentences, where 
    # each sentence is a list of words
    #
    # Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.strip())
    #
    sentences = []
    #
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            sentences.append(review_to_wordlist(raw_sentence, remove_stopwords))
    #
    return sentences

In [2]:
import numpy as np  

def makeFeatureVec(words, model, num_features):
    # Function to average all of the word vectors in a given paragraph
    
    # Pre-initialize an empty numpy array (for speed)
    featureVec = np.zeros((num_features,),dtype="float32")
    
    nwords = 0.
     
    # Index2word is a list that contains the names of the words in the model's vocabulary.
    # Convert it to a set, for speed 
    index2word_set = set(model.wv.vocab)
    
    # Loop over each word in the review and, if it is in the model's vocaublary, add its feature vector to the total
    for word in words:
        if word in index2word_set: 
            nwords = nwords + 1.
            featureVec = np.add(featureVec, model[word])
            
            
    
    # Divide the result by the number of words to get the average
    featureVec = np.divide(featureVec,nwords)
    return featureVec

In [3]:
def getAvgFeatureVecs(reviews, model, num_features):
    # Given a set of reviews (each one a list of words), calculate the average feature vector for each one 
    # and return a 2D numpy array 
    
    count = 0
     
    # Preallocate a 2D numpy array, for speed
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
     
    # Loop through the reviews
    for review in reviews:
       
       # Print a status message every 1000th review
       if count%1000 == 0 :
           print "Review %d of %d" % (count, len(reviews))
        
       # Call the function (defined above) that makes average feature vectors
       reviewFeatureVecs[count] = makeFeatureVec(review, model, num_features)
        
       # Increment the counter
       count = count + 1
    
    return reviewFeatureVecs

In [4]:
# Load the model created in Part 2
from gensim.models import Word2Vec
model = Word2Vec.load("300features_40minwords_10context")

num_features = 300

In [5]:
# Cleansing reviews using stopword removal

clean_train_reviews = []
for review in train["review"]:
    clean_train_reviews.append(review_to_wordlist(review, remove_stopwords=True))

clean_test_reviews = []
for review in test["review"]:
    clean_test_reviews.append(review_to_wordlist(review, remove_stopwords=True))



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


In [6]:
print clean_train_reviews[0]

[u'stuff', u'going', u'moment', u'mj', u'started', u'listening', u'music', u'watching', u'odd', u'documentary', u'watched', u'wiz', u'watched', u'moonwalker', u'maybe', u'want', u'get', u'certain', u'insight', u'guy', u'thought', u'really', u'cool', u'eighties', u'maybe', u'make', u'mind', u'whether', u'guilty', u'innocent', u'moonwalker', u'part', u'biography', u'part', u'feature', u'film', u'remember', u'going', u'see', u'cinema', u'originally', u'released', u'subtle', u'messages', u'mj', u'feeling', u'towards', u'press', u'also', u'obvious', u'message', u'drugs', u'bad', u'kay', u'visually', u'impressive', u'course', u'michael', u'jackson', u'unless', u'remotely', u'like', u'mj', u'anyway', u'going', u'hate', u'find', u'boring', u'may', u'call', u'mj', u'egotist', u'consenting', u'making', u'movie', u'mj', u'fans', u'would', u'say', u'made', u'fans', u'true', u'really', u'nice', u'actual', u'feature', u'film', u'bit', u'finally', u'starts', u'minutes', u'excluding', u'smooth', u'cri

In [19]:
trainDataVecs = getAvgFeatureVecs( clean_train_reviews, model, num_features )

Review 0 of 25000
Review 1000 of 25000
Review 2000 of 25000
Review 3000 of 25000
Review 4000 of 25000
Review 5000 of 25000
Review 6000 of 25000
Review 7000 of 25000
Review 8000 of 25000
Review 9000 of 25000
Review 10000 of 25000
Review 11000 of 25000
Review 12000 of 25000
Review 13000 of 25000
Review 14000 of 25000
Review 15000 of 25000
Review 16000 of 25000
Review 17000 of 25000
Review 18000 of 25000
Review 19000 of 25000
Review 20000 of 25000
Review 21000 of 25000
Review 22000 of 25000
Review 23000 of 25000
Review 24000 of 25000


In [20]:
testDataVecs = getAvgFeatureVecs( clean_test_reviews, model, num_features )

Review 0 of 25000
Review 1000 of 25000
Review 2000 of 25000
Review 3000 of 25000
Review 4000 of 25000
Review 5000 of 25000
Review 6000 of 25000
Review 7000 of 25000
Review 8000 of 25000
Review 9000 of 25000
Review 10000 of 25000
Review 11000 of 25000
Review 12000 of 25000
Review 13000 of 25000
Review 14000 of 25000
Review 15000 of 25000
Review 16000 of 25000
Review 17000 of 25000
Review 18000 of 25000
Review 19000 of 25000
Review 20000 of 25000
Review 21000 of 25000
Review 22000 of 25000
Review 23000 of 25000
Review 24000 of 25000


Next, use the average paragraph vectors to train a random forest.

In [21]:
# Fit a random forest to the training data, using 100 trees
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier( n_estimators = 100 )

forest = forest.fit( trainDataVecs, train["sentiment"] )

result = forest.predict( testDataVecs )

# Write the test results 
output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )
output.to_csv( "Word2Vec_AverageVectors.csv", index=False, quoting=3 )

A standard way of weighting word vectors is to apply "tf-idf" weights, which measure how important a given word is within a given set of documents. One way to extract tf-idf weights in Python is by using scikit-learn's TfidfVectorizer.

In [8]:
reconstructed_train_reviews = []

for review in clean_train_reviews :
    reconstructed_train_reviews.append(" ".join(review))

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features=5000)

frequency_matrix = vectorizer.fit_transform(reconstructed_train_reviews)
frequency_matrix = frequency_matrix.toarray()
word_dict = vectorizer.vocabulary_
word_freq = np.sum(frequency_matrix, 0)

from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
transformer.fit(frequency_matrix)
word_idf = transformer.idf_

word_weights = np.multiply(word_freq, word_idf)

In [11]:
def makeFeatureVec(words, model, num_features):
    featureVec = np.zeros((num_features,),dtype="float32")
    
    nwords = 0.
    index2word_set = set(model.wv.vocab)
    
    for word in words:
        if word in index2word_set: 
            nwords = nwords + 1.
            if word in word_dict :
                vectorizer_idx = word_dict[word]
                featureVec = np.add(featureVec, model[word]*word_weights[vectorizer_idx])
            else :
                featureVec = np.add(featureVec, model[word])
            
    featureVec = np.divide(featureVec,nwords)
    return featureVec

def getAvgFeatureVecs(reviews, model, num_features): 
    count = 0
     
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
     
    for review in reviews:
       reviewFeatureVecs[count] = makeFeatureVec(review, model, num_features)
       count = count + 1
    
    return reviewFeatureVecs

In [12]:
trainDataVecs = getAvgFeatureVecs( clean_train_reviews, model, num_features)
testDataVecs = getAvgFeatureVecs( clean_test_reviews, model, num_features)

In [13]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier( n_estimators = 100 )

forest = forest.fit( trainDataVecs, train["sentiment"] )

result = forest.predict( testDataVecs )

output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )
output.to_csv( "Word2Vec_AverageWeightedVectors.csv", index=False, quoting=3 )

However, upon implementation, no substantial improvement in performance will be observed.

Word2Vec creates clusters of semantically related words, so another possible approach is to exploit the similarity of words within a cluster. Grouping vectors in this way is known as "vector quantization." To accomplish this, we first need to find the centers of the word clusters, which we can do by using a clustering algorithm such as K-Means.

In K-Means, the one parameter we need to set is "K," or the number of clusters. How should we decide how many clusters to create? Trial and error suggested that small clusters, with an average of only 5 words or so per cluster, gave better results than large clusters with many words.

In [26]:
from sklearn.cluster import KMeans
import time

start = time.time() # Start time

# Set "k" (num_clusters) to be 1/5th of the vocabulary size, or an average of 5 words per cluster
word_vectors = model.wv.syn0
num_clusters = word_vectors.shape[0] / 5

# Initalize a k-means object and use it to extract centroids
kmeans_clustering = KMeans( n_clusters = num_clusters )
idx = kmeans_clustering.fit_predict( word_vectors )

# Get the end time and print how long the process took
end = time.time()
elapsed = end - start
print "Time taken for K Means clustering: ", elapsed, "seconds."

Time taken for K Means clustering:  776.938811064 seconds.


In [28]:
# Create a Word / Index dictionary, mapping each vocabulary word to a cluster number
word_centroid_map = dict(zip( model.wv.index2word, idx ))

In [29]:
# For the first 10 clusters
for cluster in xrange(0,10):
    #
    # Print the cluster number  
    print "\nCluster %d" % cluster
    #
    # Find all of the words for that cluster number, and print them out
    words = []
    for i in xrange(0,len(word_centroid_map.values())):
        if( word_centroid_map.values()[i] == cluster ):
            words.append(word_centroid_map.keys()[i])
    print words


Cluster 0
[u'ed', u'wood', u'glen']

Cluster 1
[u'signals']

Cluster 2
[u'demands', u'gaining', u'increases', u'gains']

Cluster 3
[u'northwest', u'mounted']

Cluster 4
[u'bolt', u'spear', u'workout', u'battery', u'dagger']

Cluster 5
[u'listless', u'uninterested']

Cluster 6
[u'autistic', u'obese', u'abusive', u'unemployed', u'innocent', u'elderly', u'unmarried', u'alcoholic', u'overweight', u'adulterous', u'aged']

Cluster 7
[u'sofia', u'cecilia']

Cluster 8
[u'chambers', u'stealth', u'missiles', u'weaponry', u'compound']

Cluster 9
[u'amusing', u'addictive', u'enjoyable', u'watchable', u'fun', u'entertaining']


Bag-of-Centroids

In [30]:
def create_bag_of_centroids( wordlist, word_centroid_map ):
    
    # The number of clusters is equal to the highest cluster index in the word / centroid map
    num_centroids = max( word_centroid_map.values() ) + 1
    
    # Pre-allocate the bag of centroids vector (for speed)
    bag_of_centroids = np.zeros( num_centroids, dtype="float32" )
    
    # Loop over the words in the review. If the word is in the vocabulary, find which cluster it belongs to, 
    # and increment that cluster count by one
    for word in wordlist:
        if word in word_centroid_map:
            index = word_centroid_map[word] # word_centroid_map[word] returns the cluster to which the word belongs
            bag_of_centroids[index] += 1
    
    # Return the "bag of centroids"
    return bag_of_centroids

In [31]:
# Pre-allocate an array for the training set bags of centroids (for speed)
train_centroids = np.zeros( (train["review"].size, num_clusters), dtype="float32" )

# Transform the training set reviews into bags of centroids
counter = 0
for review in clean_train_reviews:
    train_centroids[counter] = create_bag_of_centroids( review, word_centroid_map )
    counter += 1

# Repeat for test reviews 
test_centroids = np.zeros(( test["review"].size, num_clusters), dtype="float32" )

counter = 0
for review in clean_test_reviews:
    test_centroids[counter] = create_bag_of_centroids( review, word_centroid_map )
    counter += 1

In [32]:
# Fit a random forest and extract predictions 
forest = RandomForestClassifier(n_estimators = 100)

forest = forest.fit(train_centroids,train["sentiment"])
result = forest.predict(test_centroids)

# Write the test results 
output = pd.DataFrame(data={"id":test["id"], "sentiment":result})
output.to_csv( "BagOfCentroids.csv", index=False, quoting=3 )