In [3]:
import pandas as pd
import re
import numpy as np
import logging
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier
path = "C://Users//weili//Projects//Bagofwords//Data//"

In [4]:
train = pd.read_csv(path+"labeledTrainData.tsv", header=0, delimiter='\t', quoting=3)
test = pd.read_csv(path+"testData.tsv", header=0, delimiter='\t', quoting=3)
unlabeled_train = pd.read_csv(path+"unlabeledTrainData.tsv", header=0, delimiter='\t', quoting=3)

In [11]:
def review_to_wordlist(raw_review, remove_stopwords=False):
    # Function to convert a raw review to a string of words
    # The input is a single string (a raw movie review), and 
    # the output is a single string (a preprocessed movie review)
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review, 'lxml').get_text()
    #
    # 2. Remove non-letters and non-numbers, then turn into lower case and split them
    words = re.sub("[^0-9a-zA-Z]", " ", review_text).lower().split()
    # 3. If choose to remove stopwords, remove them
    if remove_stopwords:
        stops = set(stopwords.words('english'))
        words = [w for w in words if not w in stops]
   
    # Return a list of words
    return words

In [12]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
def review_to_sentences(review, tokenizer, remove_stopwords=False):
    # Function to split a review into parsed sentences. Returns a 
    # list of sentences, where each sentence is a list of words
    #
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.strip())
    # 2. Loop over each sentence
    sentence_list = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence)>0:
            sentence_list.append(review_to_wordlist(raw_sentence, remove_stopwords=False))
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists
    return sentence_list


In [13]:
sentences = []
print('Parsing sentences from training set')
for review in train['review']:
    sentences += review_to_sentences(review, tokenizer)
print('Parsing sentences from unlabel training set')
for review in unlabeled_train['review']:
    sentences += review_to_sentences(review, tokenizer)

Parsing sentences from training set


  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


Parsing sentences from unlabel training set


  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


In [15]:
len(sentences)
print(sentences[0])

['with', 'all', 'this', 'stuff', 'going', 'down', 'at', 'the', 'moment', 'with', 'mj', 'i', 've', 'started', 'listening', 'to', 'his', 'music', 'watching', 'the', 'odd', 'documentary', 'here', 'and', 'there', 'watched', 'the', 'wiz', 'and', 'watched', 'moonwalker', 'again']


In [17]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

num_features = 300   # Word vector dimensionality                      
min_wordcount = 40   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

model = Word2Vec(sentences, workers=num_workers, size=num_features, min_count=min_wordcount, window=context, sample=downsampling, seed=1)
model.init_sims(replace=True)
model_name = "300features_40minwords_10context"
model.save(model_name)

2018-09-29 23:56:43,918 : INFO : collecting all words and their counts
2018-09-29 23:56:43,920 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-09-29 23:56:43,951 : INFO : PROGRESS: at sentence #10000, processed 227240 words, keeping 18038 word types
2018-09-29 23:56:43,980 : INFO : PROGRESS: at sentence #20000, processed 454577 words, keeping 25324 word types
2018-09-29 23:56:44,008 : INFO : PROGRESS: at sentence #30000, processed 675275 words, keeping 30478 word types
2018-09-29 23:56:44,039 : INFO : PROGRESS: at sentence #40000, processed 903015 words, keeping 34863 word types
2018-09-29 23:56:44,068 : INFO : PROGRESS: at sentence #50000, processed 1123504 words, keeping 38329 word types
2018-09-29 23:56:44,098 : INFO : PROGRESS: at sentence #60000, processed 1346265 words, keeping 41338 word types
2018-09-29 23:56:44,129 : INFO : PROGRESS: at sentence #70000, processed 1570739 words, keeping 43986 word types
2018-09-29 23:56:44,160 : INFO : PROGRESS: 

2018-09-29 23:56:46,137 : INFO : PROGRESS: at sentence #720000, processed 16199229 words, keeping 120765 word types
2018-09-29 23:56:46,168 : INFO : PROGRESS: at sentence #730000, processed 16426860 words, keeping 121513 word types
2018-09-29 23:56:46,199 : INFO : PROGRESS: at sentence #740000, processed 16649236 words, keeping 122242 word types
2018-09-29 23:56:46,230 : INFO : PROGRESS: at sentence #750000, processed 16868896 words, keeping 122892 word types
2018-09-29 23:56:46,260 : INFO : PROGRESS: at sentence #760000, processed 17089573 words, keeping 123538 word types
2018-09-29 23:56:46,292 : INFO : PROGRESS: at sentence #770000, processed 17318060 words, keeping 124325 word types
2018-09-29 23:56:46,326 : INFO : PROGRESS: at sentence #780000, processed 17549563 words, keeping 125051 word types
2018-09-29 23:56:46,356 : INFO : PROGRESS: at sentence #790000, processed 17777883 words, keeping 125739 word types
2018-09-29 23:56:46,374 : INFO : collected 126186 word types from a corp

2018-09-29 23:57:27,135 : INFO : EPOCH 5 - PROGRESS: at 83.52% examples, 1529264 words/s, in_qsize 7, out_qsize 0
2018-09-29 23:57:28,140 : INFO : EPOCH 5 - PROGRESS: at 95.85% examples, 1535413 words/s, in_qsize 7, out_qsize 0
2018-09-29 23:57:28,472 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-09-29 23:57:28,476 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-09-29 23:57:28,479 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-09-29 23:57:28,480 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-09-29 23:57:28,481 : INFO : EPOCH - 5 : training on 17901685 raw words (12861148 effective words) took 8.4s, 1537479 effective words/s
2018-09-29 23:57:28,481 : INFO : training on a 89508425 raw words (64310866 effective words) took 41.5s, 1550661 effective words/s
2018-09-29 23:57:28,482 : INFO : precomputing L2-norms of word weight vectors
2018-09-29 23:57:28,547 : INFO : saving Word2Vec object und

In [21]:
model.wv.index2word




['the',
 'and',
 'a',
 'of',
 'to',
 'is',
 'it',
 'in',
 'i',
 'this',
 'that',
 's',
 'was',
 'as',
 'with',
 'for',
 'movie',
 'but',
 'film',
 'you',
 't',
 'on',
 'not',
 'he',
 'are',
 'his',
 'have',
 'be',
 'one',
 'all',
 'at',
 'they',
 'by',
 'who',
 'an',
 'from',
 'so',
 'like',
 'there',
 'her',
 'or',
 'just',
 'about',
 'out',
 'has',
 'if',
 'what',
 'some',
 'good',
 'can',
 'more',
 'when',
 'very',
 'she',
 'up',
 'no',
 'time',
 'even',
 'would',
 'my',
 'which',
 'their',
 'story',
 'only',
 'really',
 'see',
 'had',
 'were',
 'well',
 'we',
 'me',
 'than',
 'much',
 'bad',
 'get',
 'been',
 'people',
 'also',
 'into',
 'do',
 'great',
 'other',
 'will',
 'first',
 'because',
 'him',
 'how',
 'most',
 'don',
 'them',
 'made',
 'its',
 'make',
 'then',
 'way',
 'could',
 'too',
 'movies',
 'after',
 'any',
 'characters',
 'character',
 'think',
 'films',
 'two',
 'watch',
 'being',
 'many',
 'plot',
 'seen',
 'never',
 'where',
 'love',
 'life',
 'little',
 'acting

In [22]:
def makeFeatureVec(words, model, num_features):
    # Function to average all of the word vectors in a given
    # paragraph
    #
    # Pre-initialize an empty numpy array (for speed)
    featureVec = np.zeros((num_features,),dtype="float32")
    #
    nwords = 0.
    #
    # Index2word is a list that contains the names of the words in
    # the model's vocabulary. Convert it to a set, for speed
    index2word_set = set(model.wv.index2word)
    #
    # Loop over each word in the review and, if it is in the model's
    # vocaublary, add its feature vector to the total
    for word in words:
        if word in index2word_set:
            nwords = nwords + 1.
            featureVec = np.add(featureVec,model[word])
    #
    # Divide the result by the number of words to get the average
    featureVec = np.divide(featureVec,nwords)
    return featureVec

In [27]:
def getAvgFeatureVecs(reviews, model, num_features):
    # Given a set of reviews (each one a list of words), calculate
    # the average feature vector for each one and return a 2D numpy array
    #
    # Initialize a counter
    counter = 0.
    #
    # Preallocate a 2D numpy array, for speed
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    #
    # Loop through the reviews
    for review in reviews:
       #
       # Print a status message every 1000th review
        if counter%1000. == 0. :
            print('Review {0} of {1}'.format(counter, len(reviews)))
       #
       # Call the function (defined above) that makes average feature vectors
        reviewFeatureVecs[int(counter)] = makeFeatureVec(review, model, \
           num_features)
       #
       # Increment the counter
        counter = counter + 1.
    return reviewFeatureVecs

In [29]:
def getCleanReviews(reviews):
    clean_reviews = []
    for review in reviews["review"]:
        clean_reviews.append(review_to_wordlist( review, remove_stopwords=True ))
    return clean_reviews

In [30]:
# ****** Create average vectors for the training and test sets
#
print("Creating average feature vecs for training reviews")

trainDataVecs = getAvgFeatureVecs( getCleanReviews(train), model, num_features )

print("Creating average feature vecs for test reviews")

testDataVecs = getAvgFeatureVecs( getCleanReviews(test), model, num_features )


    # ****** Fit a random forest to the training set, then make predictions
    #
    # Fit a random forest to the training data, using 100 trees
forest = RandomForestClassifier( n_estimators = 100 )

print("Fitting a random forest to labeled training data...")
forest = forest.fit( trainDataVecs, train["sentiment"] )

    # Test & extract results
result = forest.predict( testDataVecs )

    # Write the test results
output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )
output.to_csv( "Word2Vec_AverageVectors.csv", index=False, quoting=3 )
print("Wrote Word2Vec_AverageVectors.csv")


Creating average feature vecs for training reviews
Review 0.0 of 25000




Review 1000.0 of 25000
Review 2000.0 of 25000
Review 3000.0 of 25000
Review 4000.0 of 25000
Review 5000.0 of 25000
Review 6000.0 of 25000
Review 7000.0 of 25000
Review 8000.0 of 25000
Review 9000.0 of 25000
Review 10000.0 of 25000
Review 11000.0 of 25000
Review 12000.0 of 25000
Review 13000.0 of 25000
Review 14000.0 of 25000
Review 15000.0 of 25000
Review 16000.0 of 25000
Review 17000.0 of 25000
Review 18000.0 of 25000
Review 19000.0 of 25000
Review 20000.0 of 25000
Review 21000.0 of 25000
Review 22000.0 of 25000
Review 23000.0 of 25000
Review 24000.0 of 25000
Creating average feature vecs for test reviews
Review 0.0 of 25000
Review 1000.0 of 25000
Review 2000.0 of 25000
Review 3000.0 of 25000
Review 4000.0 of 25000
Review 5000.0 of 25000
Review 6000.0 of 25000
Review 7000.0 of 25000
Review 8000.0 of 25000
Review 9000.0 of 25000
Review 10000.0 of 25000
Review 11000.0 of 25000
Review 12000.0 of 25000
Review 13000.0 of 25000
Review 14000.0 of 25000
Review 15000.0 of 25000
Review 16000.0 