# Notebook for Part 2 of Kaggle NLP Tutorial

*https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-2-word-vectors*

In [1]:
import functools
import os
import re

import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

import nltk.data
#nltk.download()
from nltk.corpus import stopwords

from gensim.models.word2vec import Word2Vec

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

In [2]:
def load_dataset(name, nrows=None):
    """Read one of the tutorial's TSV files from ``../data`` into a DataFrame.

    Parameters
    ----------
    name : str
        Dataset key: ``'unlabeled_train'``, ``'labeled_train'`` or ``'test'``.
    nrows : int, optional
        Forwarded unchanged to ``pandas.read_csv`` (default ``None``).

    Returns
    -------
    pandas.DataFrame
        The parsed file; its row count is printed as a side effect.

    Raises
    ------
    ValueError
        If ``name`` is not one of the known dataset keys.
    """
    filenames = {
        'unlabeled_train': 'unlabeledTrainData.tsv',
        'labeled_train': 'labeledTrainData.tsv',
        'test': 'testData.tsv'
    }
    filename = filenames.get(name)
    if filename is None:
        raise ValueError(name)
    path = os.path.join('..', 'data', filename)
    # Tab-separated input in which a backslash escapes the following character.
    reviews = pd.read_csv(path, sep='\t', escapechar='\\', nrows=nrows)
    print('Number of reviews: {}'.format(len(reviews)))
    return reviews

### First load in the unlabeled dataset.
This will be used for training the word vectors.

In [22]:
# Load the unlabeled reviews; the trailing expression previews the first rows.
df = load_dataset('unlabeled_train')
df.head()

Number of reviews: 50000


Unnamed: 0,id,review
0,9999_0,"Watching Time Chasers, it obvious that it was ..."
1,45057_0,I saw this film about 20 years ago and remembe...
2,15561_0,"Minor Spoilers<br /><br />In New York, Joan Ba..."
3,7161_0,I went to see this film with a great deal of e...
4,43971_0,"Yes, I agree with everyone on this site this m..."


### Next, clean the text similarly to part 1 and divide into sentences
However, this time do not remove stopwords. Sentences are split using the nltk punkt tokenizer. The result is a list of sentences obtained from all the reviews combined, and each sentence is a list of cleaned words (still including stopwords).

In [3]:
# NLTK's English stopwords, held in a set for the membership test in clean_text.
eng_stopwords = set(stopwords.words('english'))

def clean_text(text, remove_stopwords=False):
    """Convert raw review markup into a list of lowercase alphabetic words.

    HTML is stripped with BeautifulSoup, every character outside ``a-zA-Z``
    is replaced by a space, and the lowercased result is split on whitespace.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML.
    remove_stopwords : bool, optional
        When true, words found in ``eng_stopwords`` are dropped.

    Returns
    -------
    list of str
        The cleaned words, in their original order.
    """
    plain = BeautifulSoup(text, 'html.parser').get_text()
    letters_only = re.sub(r'[^a-zA-Z]', ' ', plain)
    tokens = letters_only.lower().split()
    if not remove_stopwords:
        return tokens
    return [token for token in tokens if token not in eng_stopwords]

# Load NLTK's English punkt sentence tokenizer; split_sentences uses it below.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def print_call_counts(f):
    """Decorate ``f`` so that a progress line is printed every 1000 calls.

    The wrapper counts its invocations and prints
    ``'method <name> called <n> times'`` on the 1st, 1001st, 2001st, ... call
    (whenever ``n % 1000 == 1``) before delegating to ``f``.

    Parameters
    ----------
    f : callable
        Function to wrap.

    Returns
    -------
    callable
        Wrapper that forwards all arguments to ``f`` and returns its result.
        ``functools.wraps`` copies ``f``'s ``__name__`` and ``__doc__`` onto
        the wrapper so the decorated function stays identifiable.
    """
    n = 0

    @functools.wraps(f)
    def wrapped(*args, **kwargs):
        # The counter lives in the enclosing scope: one count per decorated function.
        nonlocal n
        n += 1
        if n % 1000 == 1:
            print('method {} called {} times'.format(f.__name__, n))
        return f(*args, **kwargs)

    return wrapped

@print_call_counts
def split_sentences(review):
    """Split one review into sentences and clean each sentence into words.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    list of list of str
        One list of cleaned words (stopwords kept) per non-empty sentence
        produced by the punkt ``tokenizer``.
    """
    cleaned = []
    for raw_sentence in tokenizer.tokenize(review.strip()):
        if raw_sentence:
            cleaned.append(clean_text(raw_sentence))
    return cleaned

In [None]:
%time sentences = sum(df.review.apply(split_sentences), [])
print('{} reviews -> {} sentences'.format(len(df), len(sentences)))

### Here we will train the word vector model
Default logging setup and parameters taken from the tutorial.

In [12]:
# Configure the root logger: INFO level, messages prefixed with time and level.
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [4]:
# Word2Vec training parameters (passed to the Word2Vec constructor below).

# Word vector dimensionality
num_features = 300
# Minimum word count
min_word_count = 40
# Number of threads to run in parallel
num_workers = 4
# Context window size
context = 10
# Downsample setting for frequent words
downsampling = 1e-3

# File name that encodes the key parameters, used when saving/loading the model.
model_name = '{}features_{}minwords_{}context.model'.format(
    num_features,
    min_word_count,
    context,
)

In [14]:
print('Training model...')
# The notebook imports the class directly
# (`from gensim.models.word2vec import Word2Vec`); the module name `word2vec`
# is never bound, so the class must be referenced as `Word2Vec`.
model = Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

# If you don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and
# save the model for later use. You can load it later using Word2Vec.load()
model_dir = os.path.join('..', 'models')
# Make sure the target directory exists so the trained model is not lost at save time.
os.makedirs(model_dir, exist_ok=True)
model.save(os.path.join(model_dir, model_name))

Training model...


### Alternatively, we can load a previously saved model

In [19]:
# Reload the model from ../models under the file name built in the parameters cell.
model = Word2Vec.load(os.path.join('..', 'models', model_name))

### Some examples of how the model works

In [6]:
# For each word list, print the word the model picks as the odd one out.
print(model.doesnt_match("man woman child kitchen".split()))
print(model.doesnt_match('france england germany berlin'.split()))

kitchen
berlin


In [10]:
# Words the model ranks as most similar to "man", with their similarity scores.
model.most_similar("man")

[('woman', 0.6256189346313477),
 ('lady', 0.5953349471092224),
 ('lad', 0.576863169670105),
 ('person', 0.5407935380935669),
 ('farmer', 0.5382746458053589),
 ('chap', 0.536788821220398),
 ('soldier', 0.5292650461196899),
 ('men', 0.5261573791503906),
 ('monk', 0.5237958431243896),
 ('guy', 0.5213091373443604)]

In [11]:
# Words the model ranks as most similar to "queen", with their similarity scores.
model.most_similar("queen")

[('princess', 0.6749982833862305),
 ('maid', 0.6223365068435669),
 ('bride', 0.6201028227806091),
 ('belle', 0.6200867891311646),
 ('temple', 0.6171057224273682),
 ('stripper', 0.608874499797821),
 ('catherine', 0.6072724461555481),
 ('eva', 0.6019693613052368),
 ('dancer', 0.594109833240509),
 ('sylvia', 0.5933606624603271)]

In [12]:
# Words the model ranks as most similar to "awful", with their similarity scores.
model.most_similar("awful")

[('terrible', 0.7551683187484741),
 ('atrocious', 0.7340768575668335),
 ('horrible', 0.7315883040428162),
 ('dreadful', 0.7080680131912231),
 ('abysmal', 0.7010548114776611),
 ('horrendous', 0.6951696872711182),
 ('appalling', 0.691646933555603),
 ('horrid', 0.6708598136901855),
 ('amateurish', 0.6481891870498657),
 ('embarrassing', 0.6306308507919312)]

### With the Word2Vec model, we can now create vector representations of reviews

Take all the words representing the review (cleaned and with stopwords removed, as in part 1), keep those that are in the model's vocabulary, and average their word vectors.

In [7]:
# Load the labeled training reviews (rebinding `df`) and preview the first rows.
df = load_dataset('labeled_train')
df.head()

Number of reviews: 25000


Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"""The Classic War of the Worlds"" by Timothy Hin..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [8]:
def to_review_vector(review):
    """Represent a review as the mean of its words' Word2Vec vectors.

    The review is cleaned with ``clean_text`` (stopwords removed) and only
    words present in the global ``model`` contribute to the average.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    pandas.Series
        One value per vector dimension. If no word of the review is in the
        model, an all-zero vector of length ``model.vector_size`` is returned.
    """
    words = clean_text(review, remove_stopwords=True)
    vectors = [model[w] for w in words if w in model]
    if not vectors:
        # Averaging an empty array gives a single NaN (and a RuntimeWarning),
        # which would leave this review without a usable feature row.
        return pd.Series(np.zeros(model.vector_size))
    return pd.Series(np.array(vectors).mean(axis=0))

In [9]:
# One averaged vector per training review; each vector component becomes a column.
train_data_features = df.review.apply(to_review_vector)
train_data_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.005454,-0.006304,0.019711,0.002696,-0.009633,-0.007165,-9.7e-05,0.0121,0.023512,0.005115,...,-0.003228,-0.000991,-0.002044,0.005908,0.005332,0.005427,0.004184,-0.007288,0.027719,0.011447
1,-0.011847,-0.002713,0.041218,-0.018987,-0.018241,-0.021992,-0.027039,0.023703,0.054601,0.004215,...,-0.024287,0.01015,-0.012596,-0.016019,0.000792,-0.002985,-0.009311,-0.01183,0.014108,0.022902
2,-0.028175,0.001474,0.008125,-0.01934,-0.038524,-0.017802,-0.031166,0.000145,0.038809,0.003583,...,0.002452,0.004443,-0.015119,0.010723,-0.011887,0.021536,0.013621,-0.013268,0.019888,0.003641
3,-0.024626,-0.006715,0.032918,-0.02056,-0.037079,-0.021495,-0.022226,-0.006984,0.047868,0.006594,...,-0.002942,0.017494,-0.016277,-0.006731,0.000734,0.011033,-0.004642,0.004115,0.013974,0.013784
4,-0.019951,-0.002109,0.01021,-0.016458,-0.034194,-0.019208,-0.000223,-0.006509,0.024472,0.006015,...,0.002908,0.004384,-0.006123,0.007581,-0.00692,0.019001,0.009619,-0.007976,0.020669,-0.004658


### Now we can train a RandomForest model
Just as in part 1

In [10]:
# Build a 100-tree random forest and fit it on the averaged review vectors
# (fit returns the estimator itself, so construction and fitting are chained).
forest = RandomForestClassifier(n_estimators=100).fit(train_data_features, df.sentiment)

##### The confusion matrix below should be more or less perfect. Evaluating on the training data is not a valid measure of accuracy; it only verifies that the model is consistent with the data it was fitted on.

In [11]:
# Compare the true training labels with the forest's predictions on the same training features.
confusion_matrix(df.sentiment, forest.predict(train_data_features))

array([[12500,     0],
       [    0, 12500]])

### Clean up some resources

In [12]:
# Unbind the labeled training frame and its feature matrix so they can be freed.
del df, train_data_features

### Predict the unlabeled test data and save the file for upload to Kaggle

In [15]:
# Load the test reviews into `df` and preview the first rows.
df = load_dataset('test')
df.head()

Number of reviews: 25000


Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...


In [16]:
# Build the averaged-vector features for the test reviews, same as for training.
test_data_features = df.review.apply(to_review_vector)
test_data_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.019753,-0.005689,0.015961,-0.038633,-0.041745,-0.04468,-0.01279,0.004908,0.053838,0.00849,...,-0.00552,0.034378,-0.02725,0.010244,-0.008976,0.010181,-0.027196,0.010429,0.021153,0.015764
1,0.000497,-0.00414,0.019237,0.011341,-0.02086,-0.013085,-0.005469,0.015154,0.022737,0.009717,...,0.005757,0.018115,-0.010495,-0.00765,0.000969,0.018796,-0.003173,0.001657,0.014491,0.026732
2,-0.015999,-0.012097,0.022069,-0.014368,-0.020226,-0.015809,-0.000826,0.01013,0.033976,0.0057,...,0.001799,0.012403,-0.022812,0.011651,0.001775,0.009241,0.003241,-0.002865,0.027701,0.028418
3,-0.015196,-0.013445,0.010499,-0.035669,-0.040131,-0.018273,-0.020452,-0.003197,0.026555,0.008284,...,0.01172,0.010397,-0.029256,0.007422,-0.000662,0.020593,0.001274,-0.014059,0.024905,0.024326
4,-0.01614,-0.015608,0.010962,-0.008424,-0.022619,-0.022396,-0.018043,0.012519,0.032103,0.009743,...,-0.00182,0.004578,-0.008875,0.009702,-0.012013,0.010689,-0.003468,-0.003109,0.026661,0.005735


In [17]:
# Predict a sentiment label per test review and write the id/sentiment pairs
# to a CSV file (without the index column), then preview the first rows.
submission_path = os.path.join('..', 'data', 'Word2Vec_model.csv')
result = forest.predict(test_data_features)
output = pd.DataFrame({'id': df.id, 'sentiment': result})
output.to_csv(submission_path, index=False)
output.head()

Unnamed: 0,id,sentiment
0,12311_10,1
1,8348_2,0
2,5828_4,0
3,7186_2,0
4,12128_7,1


In [18]:
# Count how many test reviews were predicted for each sentiment class.
output.sentiment.value_counts()

1    12508
0    12492
Name: sentiment, dtype: int64