# [Part 1](https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-2-word-vectors)

In [85]:
import pandas as pd
import os
from nltk.corpus import stopwords
import nltk.data
import logging
import numpy as np  # Make sure that numpy is imported
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from bs4 import BeautifulSoup
import re

#from KaggleWord2VecUtility import KaggleWord2VecUtility

In [86]:
from nltk.corpus import stopwords
import time

Maximally efficient neurons would be active 1/2 the time (but usually there are other considerations as well).

Course coding to get finer resolution is a cool concept.

$$saving = \dfrac{fineneurons}{coarseneurons} = r^{k-1}$$

Where r = radius(increase?), and k = dimensions

Each neuron defines a boundary (the range of vals in which it is activated)




In [8]:
train = pd.read_csv("data/labeledTrainData.tsv", header=0, delimiter="\t",\
                   quoting=3)

addtrain = pd.read_csv("data/unlabeledTrainData.tsv", header=0, delimiter="\t",\
                   quoting=3)
test = pd.read_csv("data/testData.tsv", header=0, delimiter="\t",\
                   quoting=3)
print("Train {}".format(train["review"].size))
print ("AddTrain {}".format(addtrain["review"].size))
print("Test {}".format(test["review"].size))

Train 25000
AddTrain 50000
Test 25000


In [24]:
def review_to_wordslist(review, rm_stopwords = False):
    rev_txt = BeautifulSoup(review, "lxml").get_text()
    rev_txt = re.sub("[^a-zA-Z]", " ", rev_txt)
    words = rev_txt.lower().split()
    if rm_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    return(words)


In [16]:
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
#To be used to interpret end-sentence punctuation

In [22]:
#Input format expected for Word2Vec: list of words in each sentence
#(list of lists)

def review_to_sentences(review, tokenizer, rm_stopwords = False):
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence)>0:
            sentences.append(review_to_wordslist(raw_sentence, rm_stopwords))
    return(sentences)


In [25]:
sentences = []
for review in train["review"]:
    sentences += review_to_sentences(review, tokenizer)
    #Note: In this unusual context, append will not behave while += would.
for review in addtrain["review"]:
    sentences += review_to_sentences(review, tokenizer)

  '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a filename, not markup. You should probably open this file and pass the filehan

In [27]:
print (len(sentences))

795538


## Parameters

**Architecture:** skip-gram(default), continuous bag of words (In this context, former is slightly slower but produces better results.)

**Training Algorithm:** Heirarchical softmax (default), negative sampling

**Downsampling of Frequent Words:** Google suggests .00001 and .001

**Word Vector Dimensionality:** More to longer run but better results, 10-100s.

**Context/Window Size:** How many words of context taken into consideration

**Worker Threads:** Number of parallel processes to run (computer specific, 4-6 works on most computers)

**Minimum Word Count:** Limits size of vocabulary to meaningful words; ignores words rarer than number given. 10-100.

In [30]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)

In [32]:
from gensim.models import word2vec

model = word2vec.Word2Vec(sentences, workers=4, size=300, min_count = 40, \
                         window = 10, sample = (1e-3))

model.init_sims(replace=True)
#This will improve memory efficiency if you aren't going to add more to the model.

model_name = "300features_40minwords_10context"
model.save(model_name)

In [33]:
model.doesnt_match("man woman child kitchen".split())
#Does "Which of these things is not like the others?"

'kitchen'

In [34]:
model.most_similar("awesome")
#A quick reminder on source of data: 2x50k IMDB movie reviews

[('amazing', 0.7490564584732056),
 ('incredible', 0.7045366168022156),
 ('fantastic', 0.6868585348129272),
 ('excellent', 0.6560066938400269),
 ('exceptional', 0.6110841035842896),
 ('outstanding', 0.6005874276161194),
 ('great', 0.5825968384742737),
 ('cool', 0.5789231657981873),
 ('terrific', 0.5760659575462341),
 ('fabulous', 0.5533530712127686)]

In [48]:
model.most_similar("foot")

[('feet', 0.7450201511383057),
 ('leg', 0.6914915442466736),
 ('arm', 0.6386045217514038),
 ('fingers', 0.6355388164520264),
 ('legs', 0.6199488043785095),
 ('shoulder', 0.6194525361061096),
 ('neck', 0.616032600402832),
 ('rope', 0.6142753958702087),
 ('ankle', 0.6115679740905762),
 ('chest', 0.6012041568756104)]

In [51]:
model.most_similar_cosmul(positive=["king","queen"], negative=["good","nice"], topn=10)
#Prisoner of Zenda is a novel where a king gets drugged on his coronation day.

[('throne', 4.117465972900391),
 ('kingdom', 3.589508056640625),
 ('armies', 3.5479326248168945),
 ('monarch', 3.4877002239227295),
 ('hunchback', 3.4299702644348145),
 ('abbey', 3.4065046310424805),
 ('coronation', 3.273898124694824),
 ('conquest', 3.252065658569336),
 ('emperor', 3.246689558029175),
 ('zenda', 3.2345950603485107)]

In [61]:
model.most_similar("star")

[('stars', 0.6081321239471436),
 ('clone', 0.49090951681137085),
 ('superstar', 0.4442404508590698),
 ('starring', 0.43132728338241577),
 ('stardom', 0.416018545627594),
 ('bride', 0.4143727421760559),
 ('starred', 0.4119265377521515),
 ('singer', 0.3935401141643524),
 ('fame', 0.38502663373947144),
 ('tyrone', 0.3843412399291992)]

In [73]:
model.most_similar_cosmul("star", "fame")

[('coscarelli', 1.8433613777160645),
 ('cheadle', 1.6986799240112305),
 ('fitting', 1.6894136667251587),
 ('bluth', 1.672176480293274),
 ('frightening', 1.6596710681915283),
 ('surprising', 1.6522626876831055),
 ('neglected', 1.6461251974105835),
 ('remains', 1.6389657258987427),
 ('sure', 1.6355094909667969),
 ('tragic', 1.614006519317627)]

# [Part 3](https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-3-more-fun-with-word-vectors)

In [55]:
#model = Word2Vec.load("300features_40minwords_10context")

print (type(model.syn0))
print (model.syn0.shape)

<class 'numpy.ndarray'>
(16490, 300)


In [56]:
model["flower"].shape #300 features for each word

(300,)

In [79]:
def makeFeatureVec(words, model, num_features):
    #Averages all word vectors in a paragraph
    nwords=0.
    featureVec = np.zeros((num_features,),dtype="float32")
    #index2word contains the names of words in model vocab
    index2word_set = set(model.index2word)
    for word in words:
        if word in index2word_set:
            nwords=nwords+1
            featureVec = np.add(featureVec, model[word])
    featureVec = np.divide(featureVec, nwords)
    return(featureVec)

In [72]:
def getAvgFeatureVecs(reviews, model, num_features):
    # Given a set of reviews (each one a list of words), calculate 
    # the average feature vector for each one and return a 2D numpy array 
    counter = 0.
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    for review in reviews:
       if counter%1000. == 0.:
           print ("Review {} of {}".format(counter, len(reviews)))
       reviewFeatureVecs[counter] = makeFeatureVec(review, model, \
           num_features)
       counter = counter + 1.
    return reviewFeatureVecs

In [80]:
num_features = 300

clean_train_reviews = []

for review in train["review"]:
    clean_train_reviews.append(review_to_wordslist(review, rm_stopwords=True))

trainDataVecs = getAvgFeatureVecs(clean_train_reviews, model, num_features)

clean_test_reviews = []
for review in test["review"]:
    clean_test_reviews.append(review_to_wordslist(review, rm_stopwords=True))

testDataVecs = getAvgFeatureVecs(clean_test_reviews, model, num_features)

Review 0.0 of 25000
Review 1000.0 of 25000
Review 2000.0 of 25000
Review 3000.0 of 25000
Review 4000.0 of 25000
Review 5000.0 of 25000
Review 6000.0 of 25000
Review 7000.0 of 25000
Review 8000.0 of 25000
Review 9000.0 of 25000
Review 10000.0 of 25000
Review 11000.0 of 25000
Review 12000.0 of 25000
Review 13000.0 of 25000
Review 14000.0 of 25000
Review 15000.0 of 25000
Review 16000.0 of 25000
Review 17000.0 of 25000
Review 18000.0 of 25000
Review 19000.0 of 25000
Review 20000.0 of 25000
Review 21000.0 of 25000
Review 22000.0 of 25000
Review 23000.0 of 25000
Review 24000.0 of 25000
Review 0.0 of 25000
Review 1000.0 of 25000
Review 2000.0 of 25000
Review 3000.0 of 25000
Review 4000.0 of 25000
Review 5000.0 of 25000
Review 6000.0 of 25000
Review 7000.0 of 25000
Review 8000.0 of 25000
Review 9000.0 of 25000
Review 10000.0 of 25000
Review 11000.0 of 25000
Review 12000.0 of 25000
Review 13000.0 of 25000
Review 14000.0 of 25000
Review 15000.0 of 25000
Review 16000.0 of 25000
Review 17000.0 of 

In [84]:
#Random Forest
forest = RandomForestClassifier(n_estimators=100)
forest = forest.fit(trainDataVecs, train["sentiment"])
result = forest.predict(testDataVecs)

output = pd.DataFrame(data={"id":test["id"], "sentiment":result})
output.to_csv("Word2Vec_AverageVectors.csv", index=False, quoting=3)

#tf-idf is a measure of prevalence of word in particular context relative to overall frequency of word,
#as an approximation of importance of word to context.

In [89]:
#K-means clustering

##WARNING: Will run for 40mins or more! (this is not a bug)

start = time.time()

#k = 1/5 vocab size; avg of ~5 words per cluster

word_vectors = model.syn0 #syn0 store the feature vectors
num_clusters = round(word_vectors.shape[0]/5)

kmeans_clustering = KMeans(n_clusters = num_clusters)
idx = kmeans_clustering.fit_predict(word_vectors)

end = time.time()
elapsed = end-start
print("Time taken: {} seconds".format(elapsed))


Time taken: 1970.5447051525116 seconds


In [90]:
word_centroid_map = dict(zip(model.index2word, idx))

In [106]:
word_centroid_map

SyntaxError: invalid syntax (<ipython-input-106-235fa47b9ab3>, line 1)

In [125]:
for cluster in range(0,10):
    print ("\n Cluster {}".format(cluster))
    words=[]
    TFindex = (idx==cluster)
    for i in range(0,len(idx)):
        if (TFindex[i]):
            words.append( model.index2word[i] )
    print(words)


 Cluster 0
['vegas', 'las']

 Cluster 1
['adorable', 'sassy', 'bitchy', 'feisty', 'perky', 'spunky', 'hush', 'virginal', 'vixen', 'foxy', 'bubbly', 'ditsy', 'tomboy', 'plucky']

 Cluster 2
['foe', 'volunteer', 'ceremonies', 'pakistani']

 Cluster 3
['flair', 'precision', 'panache', 'finesse', 'verve', 'flourishes']

 Cluster 4
['jungle', 'tunnel', 'steam', 'traps', 'corridors', 'sewer', 'tunnels', 'cellar', 'maze', 'portal']

 Cluster 5
['spiritual', 'philosophy', 'morality', 'angst', 'wisdom', 'realities', 'communication', 'deception', 'existential', 'infidelity', 'alienation', 'complexities', 'bourgeois', 'emptiness', 'spirituality', 'idealism', 'uncertainty', 'unspoken', 'conflicting', 'adversity', 'individuality', 'familial']

 Cluster 6
['victor', 'sid', 'homer', 'shepherd', 'reverend', 'jacob', 'bailey', 'emil', 'erik', 'vargas', 'theo', 'sasha', 'calamity', 'fagin', 'blossom', 'trump', 'burrows', 'trent', 'johns', 'celine', 'xavier', 'calvin', 'hickock']

 Cluster 7
['satellite

In [128]:
def create_bag_of_centroids(wordlist, word_centroid_map):
    #number of clusters = highest cluster index on map
    num_centroids = max(word_centroid_map.values() ) + 1
    bag_of_centroids = np.zeros(num_centroids, dtype="float32")
    
    for word in wordlist:
        if word in word_centroid_map:
            index=  word_centroid_map[word]
            bag_of_centroids[index] += 1
    return (bag_of_centroids)

In [129]:
train_centroids = np.zeros((train["review"].size, num_clusters), dtype="float32")
counter=0
for review in clean_train_reviews:
    train_centroids[counter] = create_bag_of_centroids(review, word_centroid_map)
    counter += 1
test_centroids = np.zeros((test["review"].size, num_clusters), dtype="float32")

counter=0
for review in clean_test_reviews:
    test_centroids[counter]=create_bag_of_centroids(review, word_centroid_map)
    counter+=1

In [130]:
forest = RandomForestClassifier(n_estimators=100)
forest = forest.fit(train_centroids, train["sentiment"])
result = forest.predict(test_centroids)
output = pd.DataFrame(data={"id":test["id"], "sentiment":result})
output.to_csv("BagOfCentroids.csv", index=False, quoting=3)

In [None]:
#from gensim.models import Phrases

#Results in phrases and bigrams