In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# All three files are read the same way: tab-separated, first row is the
# header, and quoting=3 (csv.QUOTE_NONE) so quote characters are kept as-is.
tsv_options = dict(header=0, delimiter='\t', quoting=3)
train = pd.read_csv('labeledTrainData.tsv', **tsv_options)
test = pd.read_csv('testData.tsv', **tsv_options)
unlabeled_train = pd.read_csv('unlabeledTrainData.tsv', **tsv_options)

In [3]:
# Number of reviews in the labeled, test and unlabeled sets, in that order
print(*(len(frame['review']) for frame in (train, test, unlabeled_train)))

25000 25000 50000


In [4]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

In [5]:
def review_to_wordlist(review, remove_stopwords = False):
    """Convert a raw review (or a single sentence of one) into a list of words.

    The text is stripped of HTML markup, every character that is not an
    ASCII letter is replaced by a space, and the result is lower-cased and
    split on whitespace.

    Parameters
    ----------
    review : str
        Raw text, possibly containing HTML tags.
    remove_stopwords : bool, optional
        When True, words found in NLTK's English stop word list are dropped.

    Returns
    -------
    list of str
        The lower-case words in their original order (may be empty).
    """
    # Remove HTML tags. The parser is named explicitly: without it
    # BeautifulSoup picks whichever parser happens to be installed (so the
    # result can differ between machines) and emits a warning on every call.
    # "html.parser" ships with Python, so no extra package is needed.
    review_text = BeautifulSoup(review, "html.parser").get_text()
    # Replace everything that is not a letter with a space
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    # Convert to lower case and split into words
    words = letters_only.lower().split()

    # Remove stop words if specified
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    return words

In [6]:
# Reviews are paragraphs; a Punkt sentence tokenizer is used to split them
# into sentences before they are turned into word lists.
# Load the English punkt tokenizer from NLTK's data files.
# NOTE(review): presumably this needs the punkt data to have been fetched
# beforehand (e.g. via nltk.download('punkt')) - confirm.
import nltk
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
def review_to_sentences(review, tokenizer, remove_stopwords = False):
    """Split a review into sentences and return each one as a list of words.

    Parameters
    ----------
    review : str
        Raw review text; surrounding whitespace is stripped before splitting.
    tokenizer : object
        Sentence tokenizer exposing ``tokenize(text)``.
    remove_stopwords : bool, optional
        Passed through to ``review_to_wordlist``.

    Returns
    -------
    list of list of str
        One word list per non-empty sentence, in sentence order.
    """
    stripped = review.strip()
    # Zero-length sentences are skipped; every other sentence is cleaned
    # and split into words by review_to_wordlist.
    return [
        review_to_wordlist(chunk, remove_stopwords)
        for chunk in tokenizer.tokenize(stripped)
        if len(chunk) > 0
    ]

In [7]:
# Collect the sentences of the labeled and the unlabeled reviews into one
# flat list; each sentence is a list of words.
sentences = []
corpora = (
    ('Parsing sentences from training set', train),
    ('Parsing sentences from unlabeled set', unlabeled_train),
)
for banner, frame in corpora:
    print(banner)
    for review in frame['review']:
        sentences.extend(review_to_sentences(review, tokenizer))

# Quick sanity check: total sentence count and the first parsed sentence
print(len(sentences))
print(sentences[0])

Parsing sentences from training set




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


Parsing sentences from unlabeled set


  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


795538
['with', 'all', 'this', 'stuff', 'going', 'down', 'at', 'the', 'moment', 'with', 'mj', 'i', 've', 'started', 'listening', 'to', 'his', 'music', 'watching', 'the', 'odd', 'documentary', 'here', 'and', 'there', 'watched', 'the', 'wiz', 'and', 'watched', 'moonwalker', 'again']


In [19]:
# Display the second parsed sentence (a list of words)
sentences[1]

['maybe',
 'i',
 'just',
 'want',
 'to',
 'get',
 'a',
 'certain',
 'insight',
 'into',
 'this',
 'guy',
 'who',
 'i',
 'thought',
 'was',
 'really',
 'cool',
 'in',
 'the',
 'eighties',
 'just',
 'to',
 'maybe',
 'make',
 'up',
 'my',
 'mind',
 'whether',
 'he',
 'is',
 'guilty',
 'or',
 'innocent']

# Training and saving the model

In [20]:
import logging
from gensim.models import word2vec

# Configure the built-in logging module so that Word2Vec reports its
# progress with timestamped messages
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

# Model parameters
num_features = 300       # Word vector dimensionality
min_word_count = 40      # Passed to Word2Vec as min_count
number_of_threads = 4    # Number of worker threads for parallel work
context = 10             # Context window size
downsampling = 1e-3      # Downsampling for frequent words

print('Training model')
model = word2vec.Word2Vec(
    sentences,
    workers=number_of_threads,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)
# NOTE(review): per the gensim docs, init_sims(replace=True) keeps only the
# normalized vectors to save memory, after which the model presumably cannot
# be trained further - confirm for the installed gensim version.
model.init_sims(replace=True)

# Persist the model; it can be restored with Word2Vec.load(model_name)
model_name = "300features_40minwords_10context"
model.save(model_name)

2018-03-07 19:03:08,381 : INFO : collecting all words and their counts
2018-03-07 19:03:08,382 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-03-07 19:03:08,432 : INFO : PROGRESS: at sentence #10000, processed 225803 words, keeping 17776 word types
2018-03-07 19:03:08,495 : INFO : PROGRESS: at sentence #20000, processed 451892 words, keeping 24948 word types
2018-03-07 19:03:08,542 : INFO : PROGRESS: at sentence #30000, processed 671315 words, keeping 30034 word types


Training model


2018-03-07 19:03:08,591 : INFO : PROGRESS: at sentence #40000, processed 897815 words, keeping 34348 word types
2018-03-07 19:03:08,648 : INFO : PROGRESS: at sentence #50000, processed 1116963 words, keeping 37761 word types
2018-03-07 19:03:08,699 : INFO : PROGRESS: at sentence #60000, processed 1338404 words, keeping 40723 word types
2018-03-07 19:03:08,752 : INFO : PROGRESS: at sentence #70000, processed 1561580 words, keeping 43333 word types
2018-03-07 19:03:08,798 : INFO : PROGRESS: at sentence #80000, processed 1780887 words, keeping 45714 word types
2018-03-07 19:03:08,853 : INFO : PROGRESS: at sentence #90000, processed 2004996 words, keeping 48135 word types
2018-03-07 19:03:08,899 : INFO : PROGRESS: at sentence #100000, processed 2226966 words, keeping 50207 word types
2018-03-07 19:03:08,951 : INFO : PROGRESS: at sentence #110000, processed 2446580 words, keeping 52081 word types
2018-03-07 19:03:09,008 : INFO : PROGRESS: at sentence #120000, processed 2668775 words, keepin

2018-03-07 19:03:12,353 : INFO : PROGRESS: at sentence #760000, processed 16990622 words, keeping 120930 word types
2018-03-07 19:03:12,401 : INFO : PROGRESS: at sentence #770000, processed 17217759 words, keeping 121703 word types
2018-03-07 19:03:12,449 : INFO : PROGRESS: at sentence #780000, processed 17447905 words, keeping 122402 word types
2018-03-07 19:03:12,490 : INFO : PROGRESS: at sentence #790000, processed 17674981 words, keeping 123066 word types
2018-03-07 19:03:12,516 : INFO : collected 123504 word types from a corpus of 17798082 raw words and 795538 sentences
2018-03-07 19:03:12,517 : INFO : Loading a fresh vocabulary
2018-03-07 19:03:12,605 : INFO : min_count=40 retains 16490 unique words (13% of original 123504, drops 107014)
2018-03-07 19:03:12,606 : INFO : min_count=40 leaves 17238940 word corpus (96% of original 17798082, drops 559142)
2018-03-07 19:03:12,655 : INFO : deleting the raw counts dictionary of 123504 items
2018-03-07 19:03:12,660 : INFO : sample=0.001 d

2018-03-07 19:04:11,342 : INFO : EPOCH 2 - PROGRESS: at 92.56% examples, 413602 words/s, in_qsize 7, out_qsize 0
2018-03-07 19:04:12,362 : INFO : EPOCH 2 - PROGRESS: at 94.21% examples, 406341 words/s, in_qsize 7, out_qsize 0
2018-03-07 19:04:13,368 : INFO : EPOCH 2 - PROGRESS: at 98.17% examples, 409583 words/s, in_qsize 8, out_qsize 1
2018-03-07 19:04:13,699 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-03-07 19:04:13,717 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-03-07 19:04:13,725 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-03-07 19:04:13,730 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-03-07 19:04:13,731 : INFO : EPOCH - 2 : training on 17798082 raw words (12750757 effective words) took 30.9s, 412393 effective words/s
2018-03-07 19:04:14,757 : INFO : EPOCH 3 - PROGRESS: at 3.08% examples, 386527 words/s, in_qsize 7, out_qsize 0
2018-03-07 19:04:15,787 : INFO : EPOCH 3 - PR

2018-03-07 19:05:15,147 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-03-07 19:05:15,149 : INFO : EPOCH - 4 : training on 17798082 raw words (12748492 effective words) took 30.0s, 425291 effective words/s
2018-03-07 19:05:16,158 : INFO : EPOCH 5 - PROGRESS: at 3.42% examples, 435860 words/s, in_qsize 7, out_qsize 0
2018-03-07 19:05:17,165 : INFO : EPOCH 5 - PROGRESS: at 7.02% examples, 444599 words/s, in_qsize 7, out_qsize 0
2018-03-07 19:05:18,176 : INFO : EPOCH 5 - PROGRESS: at 10.70% examples, 449859 words/s, in_qsize 7, out_qsize 0
2018-03-07 19:05:19,208 : INFO : EPOCH 5 - PROGRESS: at 14.39% examples, 449969 words/s, in_qsize 7, out_qsize 0
2018-03-07 19:05:20,217 : INFO : EPOCH 5 - PROGRESS: at 18.08% examples, 452028 words/s, in_qsize 7, out_qsize 0
2018-03-07 19:05:21,227 : INFO : EPOCH 5 - PROGRESS: at 21.69% examples, 452375 words/s, in_qsize 7, out_qsize 0
2018-03-07 19:05:22,233 : INFO : EPOCH 5 - PROGRESS: at 25.28% examples, 452752 words/s, in_q

# Exploring model results

In [21]:
# Query the word vectors through model.wv; calling doesnt_match on the
# model itself is deprecated and triggers a warning.
model.wv.doesnt_match('man woman child kitchen'.split())

  """Entry point for launching an IPython kernel.


'kitchen'

In [22]:
# Use model.wv: doesnt_match on the model itself is deprecated.
model.wv.doesnt_match("france england germany berlin".split())

  """Entry point for launching an IPython kernel.


'berlin'

In [23]:
# Use model.wv: doesnt_match on the model itself is deprecated.
model.wv.doesnt_match("paris berlin london austria".split())

  """Entry point for launching an IPython kernel.


'paris'

In [24]:
# Use model.wv: most_similar on the model itself is deprecated.
model.wv.most_similar("man")

  """Entry point for launching an IPython kernel.


[('woman', 0.6253342032432556),
 ('lady', 0.5842984914779663),
 ('lad', 0.5700719952583313),
 ('monk', 0.5288840532302856),
 ('person', 0.5181305408477783),
 ('millionaire', 0.5127111077308655),
 ('guy', 0.5041940212249756),
 ('soldier', 0.5015482902526855),
 ('men', 0.4970259368419647),
 ('boy', 0.49194416403770447)]

In [25]:
# Use model.wv: most_similar on the model itself is deprecated.
model.wv.most_similar("queen")

  """Entry point for launching an IPython kernel.


[('princess', 0.6662442684173584),
 ('bride', 0.6353572010993958),
 ('latifah', 0.6039217710494995),
 ('belle', 0.588229775428772),
 ('victoria', 0.5867623090744019),
 ('prince', 0.5833441019058228),
 ('eva', 0.5710582733154297),
 ('stepmother', 0.568495512008667),
 ('goddess', 0.5684189200401306),
 ('maria', 0.553290605545044)]

In [29]:
# Reload the saved model and confirm that the word vectors are stored in a
# numpy array. wv.vectors is the current name of the matrix; the older
# wv.syn0 alias is deprecated and triggers a warning.
from gensim.models import Word2Vec
model = Word2Vec.load("300features_40minwords_10context")
type(model.wv.vectors)

2018-03-07 19:09:04,027 : INFO : loading Word2Vec object from 300features_40minwords_10context
2018-03-07 19:09:04,325 : INFO : loading wv recursively from 300features_40minwords_10context.wv.* with mmap=None
2018-03-07 19:09:04,325 : INFO : setting ignored attribute vectors_norm to None
2018-03-07 19:09:04,326 : INFO : loading vocabulary recursively from 300features_40minwords_10context.vocabulary.* with mmap=None
2018-03-07 19:09:04,327 : INFO : loading trainables recursively from 300features_40minwords_10context.trainables.* with mmap=None
2018-03-07 19:09:04,328 : INFO : setting ignored attribute cum_table to None
2018-03-07 19:09:04,329 : INFO : loaded 300features_40minwords_10context
  after removing the cwd from sys.path.


numpy.ndarray

In [30]:
# Shape of the word-vector matrix (wv.vectors replaces the deprecated wv.syn0)
model.wv.vectors.shape

  """Entry point for launching an IPython kernel.


(16490, 300)

In [31]:
# Look the word up through model.wv; indexing the model directly is deprecated.
model.wv['flower']

  """Entry point for launching an IPython kernel.


array([ -7.31073618e-02,   7.67937154e-02,   2.00495906e-02,
         2.85043381e-02,   2.04815227e-03,   8.77374262e-02,
         5.22175319e-02,  -5.93607202e-02,  -2.38773376e-02,
        -5.80423400e-02,  -2.59536970e-02,  -4.80402857e-02,
        -1.90654583e-02,  -7.91392848e-02,   5.60260043e-02,
         1.20487504e-01,  -2.16697250e-03,   1.47714736e-02,
         3.77215398e-03,  -3.41691859e-02,   2.87018679e-02,
         2.88788462e-03,  -3.78345661e-02,  -5.32338284e-02,
        -4.44131158e-02,  -4.43508141e-02,   5.40554821e-02,
         2.67236158e-02,   2.93935593e-02,   6.18390851e-02,
        -1.16750300e-02,   6.74403384e-02,  -5.53049706e-02,
        -1.76223684e-02,   6.57723695e-02,  -6.39973581e-02,
        -5.62456697e-02,  -1.91803172e-03,   6.42104447e-02,
         1.35658104e-02,  -1.29507743e-02,   1.64797475e-05,
         1.85930654e-02,   1.71453804e-01,   4.88942415e-02,
        -6.65188059e-02,  -1.05797805e-01,  -1.10848323e-01,
         7.26404041e-02,

In [49]:
# Use model.wv: most_similar on the model itself is deprecated.
model.wv.most_similar('religion')

  """Entry point for launching an IPython kernel.


[('religious', 0.7462047338485718),
 ('islam', 0.743807315826416),
 ('politics', 0.729131817817688),
 ('homosexuality', 0.72416090965271),
 ('christianity', 0.7199966311454773),
 ('racism', 0.7102028727531433),
 ('beliefs', 0.7069824934005737),
 ('ideology', 0.6960127353668213),
 ('bigotry', 0.6931477785110474),
 ('islamic', 0.6911873817443848)]

# Sentiment analysis using clustering

In [50]:
from sklearn.cluster import KMeans
import time

In [55]:
start = time.time()
# Set "k" (num_clusters) to be 1/5th of the vocabulary size, or an
# average of 5 words per cluster.
# wv.vectors replaces the deprecated wv.syn0 alias for the same matrix.
word_vectors = model.wv.vectors
num_clusters = word_vectors.shape[0] // 5

# Initialize a k-means object and use it to assign every word vector to a
# cluster. The seed is fixed so that the cluster ids, and every feature
# derived from them further down, are the same on each run.
kmeans_clustering = KMeans( n_clusters = num_clusters, random_state = 0 )
idx = kmeans_clustering.fit_predict( word_vectors )

# Get the end time and print how long the process took
end = time.time()
elapsed = end - start
print("Time taken for K Means clustering: ", elapsed, "seconds.")

  after removing the cwd from sys.path.


Time taken for K Means clustering:  500.81960558891296 seconds.


In [57]:
# idx holds the cluster number assigned to each word vector, and
# model.wv.index2word holds the vocabulary words (presumably in the same
# order as the rows that were clustered - verify).
# Pair them up into a word -> cluster number dictionary.
word_centroid_map = {
    word: cluster_id
    for word, cluster_id in zip(model.wv.index2word, idx)
}

In [64]:
# For the first 10 clusters
for cluster in range(0,10):
    #
    # Print the cluster number
    print("\nCluster %d" % cluster)
    #
    # Find all of the words for that cluster number, and print them out.
    # A single pass over the (word, cluster) pairs keeps the dictionary
    # order and avoids rebuilding the key and value lists on every step.
    words = [word for word, assigned in word_centroid_map.items()
             if assigned == cluster]
    print(words)


Cluster 0
['hush', 'blanche', 'foxy']

Cluster 1
['baggage']

Cluster 2
['beverly', 'wills', 'yikes', 'det', 'mat', 'shand']

Cluster 3
['see', 'follow', 'hold', 'catch', 'pass']

Cluster 4
['mission', 'shield', 'controls', 'vessel', 'bases', 'debris', 'properties']

Cluster 5
['required']

Cluster 6
['creative', 'stylish', 'imaginative', 'elaborate', 'innovative', 'inventive', 'ingenious', 'offbeat']

Cluster 7
['dead', 'grave', 'buried', 'paradise', 'tomb', 'burial', 'raider']

Cluster 8
['co', 'shaw', 'sherman', 'kaufman', 'columbus', 'gamble', 'matheson', 'mankiewicz', 'olen', 'flaherty']

Cluster 9
['surfing', 'flipping', 'browsing']


In [65]:
# we define a function to convert reviews into bags-of-centroids.
# This works just like Bag of Words but uses semantically related clusters instead of individual words:
def create_bag_of_centroids( wordlist, word_centroid_map, num_centroids = None ):
    """Count how many words of a review fall into each cluster.

    Parameters
    ----------
    wordlist : iterable of str
        The words of one review.
    word_centroid_map : dict
        Maps a vocabulary word to its integer cluster index.
    num_centroids : int, optional
        Length of the returned vector. When omitted it is derived from the
        map as ``highest cluster index + 1`` (0 for an empty map). Passing
        it explicitly avoids rescanning the whole map on every call and
        guarantees a fixed vector width.

    Returns
    -------
    numpy.ndarray
        float32 vector of length ``num_centroids``; entry ``k`` is the
        number of words in ``wordlist`` assigned to cluster ``k``. Words
        missing from the map are ignored.

    Raises
    ------
    IndexError
        If an explicit ``num_centroids`` is not larger than the cluster
        index of a word that occurs in ``wordlist``.
    """
    if num_centroids is None:
        # The number of clusters is equal to the highest cluster index
        # in the word / centroid map, plus one. default=-1 makes an empty
        # map yield zero clusters instead of raising ValueError.
        num_centroids = max( word_centroid_map.values(), default=-1 ) + 1
    #
    # Pre-allocate the bag of centroids vector (for speed)
    bag_of_centroids = np.zeros( num_centroids, dtype="float32" )
    #
    # Loop over the words in the review. If the word is in the vocabulary,
    # find which cluster it belongs to, and increment that cluster count
    # by one
    for word in wordlist:
        if word in word_centroid_map:
            bag_of_centroids[word_centroid_map[word]] += 1
    #
    # Return the "bag of centroids"
    return bag_of_centroids
'''
The function above will give us a numpy array for each review,
each with a number of features equal to the number of clusters.
Finally, we create bags of centroids for our training and test set, then train a random forest and extract results:
'''

'\nThe function above will give us a numpy array for each review,\neach with a number of features equal to the number of clusters.\nFinally, we create bags of centroids for our training and test set, then train a random forest and extract results:\n'

In [80]:
# Turn every labeled and test review into a list of words with the
# stop words removed.
clean_train_reviews = [
    review_to_wordlist(raw_review, remove_stopwords = True)
    for raw_review in train['review']
]

clean_test_reviews = [
    review_to_wordlist(raw_review, remove_stopwords = True)
    for raw_review in test['review']
]



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [81]:
# Pre-allocate one row per training review and one column per cluster
train_centroids = np.zeros(
    (train["review"].size, num_clusters), dtype="float32"
)

# Fill each row with the bag of centroids of the matching training review
for row, wordlist in enumerate(clean_train_reviews):
    train_centroids[row] = create_bag_of_centroids(wordlist, word_centroid_map)

# Same layout and procedure for the test reviews
test_centroids = np.zeros(
    (test["review"].size, num_clusters), dtype="float32"
)

for row, wordlist in enumerate(clean_test_reviews):
    test_centroids[row] = create_bag_of_centroids(wordlist, word_centroid_map)

In [82]:
# Fit a random forest and extract predictions
from sklearn.ensemble import RandomForestClassifier
# The seed is fixed so that the same features always give the same
# predictions; an unseeded forest changes from run to run.
forest = RandomForestClassifier(n_estimators = 100, random_state = 0)

# Fitting the forest may take a few minutes
print("Fitting a random forest to labeled training data...")
# fit() trains the estimator in place, so there is no need to rebind it
forest.fit(train_centroids, train["sentiment"])
result = forest.predict(test_centroids)

# Write the test results (quoting=3 is csv.QUOTE_NONE: no quotes are added)
output = pd.DataFrame(data={"id":test["id"], "sentiment":result})
output.to_csv( "BagOfCentroids.csv", index=False, quoting=3 )

Fitting a random forest to labeled training data...


In [None]:
# This submission got a score of 83.82 on Kaggle