In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stpwords = stopwords.words('english')
import re
from gensim.models import Word2Vec

In [3]:
all_data = pd.read_csv('alldata.tsv', sep='\t')

In [4]:
all_data

Unnamed: 0,id,sentiment,score,review
0,1,1,10,Naturally in a film who's main themes are of m...
1,2,0,2,This movie is a disaster within a disaster fil...
2,3,0,4,"All in all, this is a movie for kids. We saw i..."
3,4,0,2,Afraid of the Dark left me with the impression...
4,5,1,7,A very accurate depiction of small time mob li...
...,...,...,...,...
49995,49996,0,3,It seems like more consideration has gone into...
49996,49997,0,1,I don't believe they made this film. Completel...
49997,49998,0,3,"Guy is a loser. Can't get girls, needs to buil..."
49998,49999,0,3,This 30 minute documentary Buñuel made in the ...


In [22]:
def clean_corpus(text):
    '''
    INPUT
    text - string
    OUTPUT
    clean text
    This function processes the input using the following steps :
    1. Remove punctuation characters
    2. Remove stop words
    '''
    # Remove punctuation characters and numbers
    text = re.sub(r"[^a-zA-Z]", " ", text)
    
    # Tokenize text
    tokens = word_tokenize(text)
        
    clean_text = ''
    for word in tokens:
        clean_tok = word.lower().strip()
        if clean_tok not in stpwords:
            clean_text += f'{clean_tok} '

    return clean_text

In [24]:
clean_corpus(all_data['review'][0])

'naturally film main themes mortality nostalgia loss innocence perhaps surprising rated highly older viewers younger ones however craftsmanship completeness film anyone enjoy pace steady constant characters full engaging relationships interactions natural showing need floods tears show emotion screams show fear shouting show dispute violence show anger naturally joyce short story lends film ready made structure perfect polished diamond small changes huston makes inclusion poem fit neatly truly masterpiece tact subtlety overwhelming beauty '

In [27]:
cleaned_review = [clean_corpus(x) for x in all_data.review]

In [28]:
cleaned_review[0]

'naturally film main themes mortality nostalgia loss innocence perhaps surprising rated highly older viewers younger ones however craftsmanship completeness film anyone enjoy pace steady constant characters full engaging relationships interactions natural showing need floods tears show emotion screams show fear shouting show dispute violence show anger naturally joyce short story lends film ready made structure perfect polished diamond small changes huston makes inclusion poem fit neatly truly masterpiece tact subtlety overwhelming beauty '

In [29]:
all_data['cleaned_review'] = cleaned_review

In [36]:
all_words = [nltk.word_tokenize(x) for x in cleaned_review]

In [63]:
word2vec = Word2Vec(all_words, min_count=5, max_final_vocab=4000)

In [39]:
word2vec

<gensim.models.word2vec.Word2Vec at 0x19daed34dc0>

In [68]:
vocabulary = word2vec.wv.key_to_index

In [64]:
v1 = word2vec.wv['movie']
v1

array([ 2.58842558e-01, -8.44656348e-01, -4.87949133e-01, -9.40350354e-01,
        2.66864657e-01,  2.94531137e-02, -1.73610330e+00, -5.67825079e-01,
        8.08339417e-01, -1.69808459e+00,  1.03057063e+00,  9.62835923e-03,
       -2.43424967e-01, -4.38597918e-01, -6.84791803e-01,  1.76150572e+00,
        8.65396798e-01,  1.86191440e+00, -1.70171249e+00,  2.79220670e-01,
       -3.38357627e-01, -2.32526157e-02,  2.17913091e-01, -4.84722517e-02,
        1.33072913e+00, -9.22531426e-01,  9.30391371e-01,  1.24785292e+00,
        2.34947711e-01,  1.45575678e+00,  6.19320333e-01, -7.44338453e-01,
       -2.53127009e-01, -1.18179643e+00, -2.42393002e-01,  4.00296271e-01,
        1.19838119e+00,  7.86305845e-01,  1.48067081e+00,  2.49931979e+00,
        1.02869177e+00, -1.41337092e-04, -1.42204297e+00, -5.92005849e-01,
       -1.19590126e-01,  3.18989754e-01,  1.60031188e+00, -2.38928959e-01,
        4.66436893e-01,  4.89191979e-01,  7.68740416e-01,  6.45394504e-01,
        1.87487364e-01,  

In [65]:
v1.shape

(100,)

In [69]:
len(vocabulary)

3980

In [70]:
vocabulary

{'br': 0,
 'movie': 1,
 'film': 2,
 'one': 3,
 'like': 4,
 'good': 5,
 'time': 6,
 'even': 7,
 'would': 8,
 'story': 9,
 'really': 10,
 'see': 11,
 'well': 12,
 'much': 13,
 'bad': 14,
 'get': 15,
 'people': 16,
 'great': 17,
 'also': 18,
 'first': 19,
 'made': 20,
 'make': 21,
 'way': 22,
 'could': 23,
 'movies': 24,
 'characters': 25,
 'think': 26,
 'watch': 27,
 'character': 28,
 'films': 29,
 'two': 30,
 'many': 31,
 'seen': 32,
 'love': 33,
 'plot': 34,
 'never': 35,
 'life': 36,
 'acting': 37,
 'show': 38,
 'best': 39,
 'know': 40,
 'little': 41,
 'ever': 42,
 'man': 43,
 'better': 44,
 'end': 45,
 'scene': 46,
 'still': 47,
 'say': 48,
 'scenes': 49,
 'something': 50,
 'go': 51,
 'back': 52,
 'real': 53,
 'thing': 54,
 'watching': 55,
 'actors': 56,
 'director': 57,
 'years': 58,
 'funny': 59,
 'though': 60,
 'old': 61,
 'another': 62,
 'work': 63,
 'actually': 64,
 'nothing': 65,
 'makes': 66,
 'look': 67,
 'find': 68,
 'going': 69,
 'new': 70,
 'lot': 71,
 'every': 72,
 'part'