## Word2vec for movie reviews

Build a word2vec model using TensorFlow on movie-review data (5331 positive and 5331 negative snippets) from http://www.cs.cornell.edu/people/pabo/movie-review-data

### Clean data

In [89]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
import collections
import random

# Read both polarity files line by line. The `with` blocks close each handle
# as soon as its lines are loaded instead of leaving it open for the rest of
# the kernel session.
with open("rt-polarity-pos.txt", "r", encoding="ISO-8859-1") as file_pos:
    data_pos = file_pos.readlines()
with open("rt-polarity-neg.txt", "r", encoding="ISO-8859-1") as file_neg:
    data_neg = file_neg.readlines()
# Only the last expression of a cell is displayed, so preview the negative snippets.
data_neg[:10]

['simplistic , silly and tedious . \n',
 "it's so laddish and juvenile , only teenage boys could possibly find it funny . \n",
 'exploitative and largely devoid of the depth or sophistication that would make watching such a graphic treatment of the crimes bearable . \n',
 '[garbus] discards the potential for pathological study , exhuming instead , the skewed melodrama of the circumstantial situation . \n',
 'a visually flashy but narratively opaque and emotionally vapid exercise in style and mystification . \n',
 "the story is also as unoriginal as they come , already having been recycled more times than i'd care to count . \n",
 "about the only thing to give the movie points for is bravado -- to take an entirely stale concept and push it through the audience's meat grinder one more time . \n",
 'not so much farcical as sour . \n',
 'unfortunately the story and the actors are served with a hack script . \n',
 'all the more disquieting for its relatively gore-free allusions to the seria

In [69]:
# Stack the two snippet lists as rows, then transpose so that column 0 holds
# the negative snippets and column 1 the positive ones.
df = pd.DataFrame([data_neg, data_pos]).transpose()
df.head(10)

Unnamed: 0,0,1
0,"simplistic , silly and tedious . \n",the rock is destined to be the 21st century's ...
1,"it's so laddish and juvenile , only teenage bo...","the gorgeously elaborate continuation of "" the..."
2,exploitative and largely devoid of the depth o...,effective but too-tepid biopic\n
3,[garbus] discards the potential for pathologic...,if you sometimes like to go to the movies to h...
4,a visually flashy but narratively opaque and e...,"emerges as something rare , an issue movie tha..."
5,"the story is also as unoriginal as they come ,...",the film provides some great insight into the ...
6,about the only thing to give the movie points ...,offers that rare combination of entertainment ...
7,not so much farcical as sour . \n,perhaps no picture ever made has more literall...
8,unfortunately the story and the actors are ser...,steers turns in a snappy screenplay that curls...
9,all the more disquieting for its relatively go...,take care of my cat offers a refreshingly diff...


In [79]:
# Requires the NLTK stopwords corpus; run nltk.download('stopwords') once if it is missing.
# The language is passed explicitly: words() with no fileid concatenates the
# stop-word lists of every language in the corpus, which is not what an
# English-only review corpus needs.
STOP_WORDS = nltk.corpus.stopwords.words('english')

def clean_sentence(text):
    """Lower-case one raw review line and split it into a list of word tokens.

    Cleaning steps, in order:
      1. drop whole ``@mention`` and ``#hashtag`` tokens;
      2. replace the listed punctuation marks and ASCII digits with a space;
      3. delete any remaining digit characters;
      4. delete periods, backticks, the substring ``'s`` and backslashes, and
         replace slashes and double quotes with a space;
      5. collapse runs of whitespace and trim both ends.

    Args:
        text: Raw sentence as a ``str``.

    Returns:
        The non-empty tokens in their original order, or ``[]`` when nothing
        is left after cleaning.
    """
    text = text.lower()

    # Raw strings keep the regex escapes intact without relying on Python's
    # handling of unknown string escapes (a SyntaxWarning on recent versions).
    text = re.sub(r'@[^\s]+', '', text)
    text = re.sub(r'#([^\s]+)', '', text)
    # The backslash itself is not part of this class; it is deleted further down.
    text = re.sub(r'[:;>?<=*+()&,\-#!$%\{˜|\}\[^_\@\]1234567890’‘]', ' ', text)
    # ASCII digits are already spaces at this point; this removes any other
    # characters the regex engine classifies as digits.
    text = re.sub(r'[\d]', '', text)

    # Literal replacements, applied in this exact order.
    # NOTE: "'s" is a plain substring match, so it is removed wherever it
    # occurs, not only at the end of a word.
    literal_replacements = (
        (".", ''),
        ("`", ''),
        ("'s", ''),
        ("/", ' '),
        ("\"", ' '),
        ("\\", ''),
    )
    for old, new in literal_replacements:
        text = text.replace(old, new)

    text = re.sub(r'\s+', ' ', text).strip()

    # Splitting an empty string yields [''], so filter out empty tokens.
    return [word for word in text.split(" ") if word]

# Tokenise every snippet in column 0 of df, keeping one token list per snippet.
# NOTE(review): column 1 of df is never read here — confirm that leaving it
# out of the corpus is intended.
corpus = [clean_sentence(sentence) for sentence in df[0]]

print(corpus[:5])

[['simplistic', 'silly', 'and', 'tedious'], ['it', 'so', 'laddish', 'and', 'juvenile', 'only', 'teenage', 'boys', 'could', 'possibly', 'find', 'it', 'funny'], ['exploitative', 'and', 'largely', 'devoid', 'of', 'the', 'depth', 'or', 'sophistication', 'that', 'would', 'make', 'watching', 'such', 'a', 'graphic', 'treatment', 'of', 'the', 'crimes', 'bearable'], ['garbus', 'discards', 'the', 'potential', 'for', 'pathological', 'study', 'exhuming', 'instead', 'the', 'skewed', 'melodrama', 'of', 'the', 'circumstantial', 'situation'], ['a', 'visually', 'flashy', 'but', 'narratively', 'opaque', 'and', 'emotionally', 'vapid', 'exercise', 'in', 'style', 'and', 'mystification']]


In [102]:
vocabulary_size = 1000

# Flatten the first 100 tokenised sentences into one continuous token stream.
sample = [word for sentence in corpus[:100] for word in sentence]
def build_dataset(words, n_words):
    """Encode a token stream as integer ids over a frequency-capped vocabulary.

    The ``n_words - 1`` most frequent tokens receive ids in order of
    frequency; id 0 is reserved for the placeholder ``'UNK'``, which stands
    in for every token that did not make the cut.

    Args:
        words: Sequence of tokens (iterated twice).
        n_words: Vocabulary size including the ``'UNK'`` slot.

    Returns:
        A 4-tuple ``(data, count, dictionary, reversed_dictionary)``:
        ``data`` is the id of each token in ``words``; ``count`` is
        ``[['UNK', n_unknown], (word, frequency), ...]``; ``dictionary`` maps
        word -> id; ``reversed_dictionary`` maps id -> word.
    """
    frequent = collections.Counter(words).most_common(n_words - 1)
    # The first entry is a mutable list so its tally can be filled in below.
    count = [['UNK', -1]] + frequent

    # Ids are handed out in order of appearance in `count`.
    dictionary = {}
    for entry in count:
        dictionary[entry[0]] = len(dictionary)

    # Tokens outside the vocabulary fall back to id 0.
    data = [dictionary.get(token, 0) for token in words]
    count[0][1] = data.count(0)

    reversed_dictionary = {index: token for token, index in dictionary.items()}
    return data, count, dictionary, reversed_dictionary

# Encode the sampled token stream; `data` holds one id per token and the two
# dictionaries translate between words and ids.
data, count, dictionary, reverse_dictionary = build_dataset(
    words=sample, n_words=vocabulary_size
)

In [111]:
# Read cursor into the id sequence, shared by successive generate_batch calls
# so that each call continues where the previous one stopped. Re-running this
# cell rewinds it to the start.
data_index = 0


def generate_batch(batch_size, num_skips, skip_window, sequence=None):
    """Produce one batch of (target id, context id) training pairs.

    A window of ``2 * skip_window + 1`` ids slides over the sequence. For each
    window position the middle id is the target, and ``num_skips`` of the
    surrounding ids are drawn at random (without replacement) as its context
    labels. The module-level ``data_index`` cursor is advanced, wrapping
    around at the end of the sequence, so consecutive calls walk through the
    whole sequence instead of returning the same window every time.

    Args:
        batch_size: Number of pairs to emit; must be a multiple of ``num_skips``.
        num_skips: Context ids sampled per target; at most ``2 * skip_window``.
        skip_window: Number of ids considered on each side of the target.
        sequence: Optional list of ids to read from. Defaults to the
            module-level ``data``.

    Returns:
        ``(batch, labels)``: int32 arrays of shape ``(batch_size,)`` and
        ``(batch_size, 1)`` holding target ids and their context ids.

    Raises:
        AssertionError: If the batch_size / num_skips / skip_window
            constraints above are violated.
        ValueError: If the sequence is shorter than one full window.
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    if sequence is None:
        sequence = data

    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    if len(sequence) < span:
        raise ValueError(
            "sequence has %d ids but a window needs %d" % (len(sequence), span))

    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)

    # Fill the first window, rewinding if the cursor is too close to the end.
    buffer = collections.deque(maxlen=span)
    if data_index + span > len(sequence):
        data_index = 0
    buffer.extend(sequence[data_index:data_index + span])
    data_index += span

    # Window offsets of every position except the target; constant per call.
    context_words = [w for w in range(span) if w != skip_window]
    for i in range(batch_size // num_skips):
        words_to_use = random.sample(context_words, num_skips)
        for j, context_word in enumerate(words_to_use):
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[context_word]
        # Slide the window by one id, restarting from the front at the end.
        if data_index == len(sequence):
            buffer.extend(sequence[0:span])
            data_index = span
        else:
            buffer.append(sequence[data_index])
            data_index += 1

    # Step back by one window so the ids that were loaded into the buffer but
    # not yet used as targets are revisited by the next call.
    data_index = (data_index + len(sequence) - span) % len(sequence)
    return batch, labels

# Draw one small demo batch and print it next to the encoded token stream.
batch, labels = generate_batch(batch_size=16, num_skips=4, skip_window=2)

for position, (target_id, context_id) in enumerate(zip(batch, labels[:, 0])):
    # First line: the word found at this position of the encoded stream.
    print(reverse_dictionary[data[position]])
    # Second line: the (target -> context) pair stored in this batch slot.
    print(target_id, reverse_dictionary[target_id], '->', context_id,
          reverse_dictionary[context_id])

4
simplistic
5 and -> 699 silly
silly
5 and -> 154 simplistic
and
5 and -> 112 tedious
tedious
5 and -> 9 it
it
112 tedious -> 699 silly
so
112 tedious -> 5 and
laddish
112 tedious -> 9 it
and
112 tedious -> 25 so
juvenile
9 it -> 112 tedious
only
9 it -> 25 so
teenage
9 it -> 5 and
boys
9 it -> 480 laddish
could
25 so -> 5 and
possibly
25 so -> 480 laddish
find
25 so -> 112 tedious
it
25 so -> 9 it
