In [109]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import ngrams
stpwords = stopwords.words('english')
import re
from gensim.models import phrases, word2vec

In [3]:
all_data = pd.read_csv('alldata.tsv', sep='\t')

In [4]:
all_data

Unnamed: 0,id,sentiment,score,review
0,1,1,10,Naturally in a film who's main themes are of m...
1,2,0,2,This movie is a disaster within a disaster fil...
2,3,0,4,"All in all, this is a movie for kids. We saw i..."
3,4,0,2,Afraid of the Dark left me with the impression...
4,5,1,7,A very accurate depiction of small time mob li...
...,...,...,...,...
49995,49996,0,3,It seems like more consideration has gone into...
49996,49997,0,1,I don't believe they made this film. Completel...
49997,49998,0,3,"Guy is a loser. Can't get girls, needs to buil..."
49998,49999,0,3,This 30 minute documentary Buñuel made in the ...


In [114]:
def clean_corpus(text):
    '''
    INPUT
    text - string
    OUTPUT
    clean text
    This function processes the input using the following steps :
    1. Remove punctuation characters
    2. Remove stop words
    '''
    # Remove punctuation characters and numbers
    text = re.sub(r"[^a-zA-Z]", " ", text)
    
    # Tokenize text
    tokens = word_tokenize(text)
        
    clean_text = []
    for word in tokens:
        clean_tok = word.lower().strip()
        if clean_tok not in stpwords:
            clean_text.append(clean_tok)

    return clean_text

In [115]:
clean_corpus(all_data['review'][0])

['naturally',
 'film',
 'main',
 'themes',
 'mortality',
 'nostalgia',
 'loss',
 'innocence',
 'perhaps',
 'surprising',
 'rated',
 'highly',
 'older',
 'viewers',
 'younger',
 'ones',
 'however',
 'craftsmanship',
 'completeness',
 'film',
 'anyone',
 'enjoy',
 'pace',
 'steady',
 'constant',
 'characters',
 'full',
 'engaging',
 'relationships',
 'interactions',
 'natural',
 'showing',
 'need',
 'floods',
 'tears',
 'show',
 'emotion',
 'screams',
 'show',
 'fear',
 'shouting',
 'show',
 'dispute',
 'violence',
 'show',
 'anger',
 'naturally',
 'joyce',
 'short',
 'story',
 'lends',
 'film',
 'ready',
 'made',
 'structure',
 'perfect',
 'polished',
 'diamond',
 'small',
 'changes',
 'huston',
 'makes',
 'inclusion',
 'poem',
 'fit',
 'neatly',
 'truly',
 'masterpiece',
 'tact',
 'subtlety',
 'overwhelming',
 'beauty']

In [116]:
cleaned_review = [clean_corpus(x) for x in all_data.review]

In [117]:
cleaned_review[0]

['naturally',
 'film',
 'main',
 'themes',
 'mortality',
 'nostalgia',
 'loss',
 'innocence',
 'perhaps',
 'surprising',
 'rated',
 'highly',
 'older',
 'viewers',
 'younger',
 'ones',
 'however',
 'craftsmanship',
 'completeness',
 'film',
 'anyone',
 'enjoy',
 'pace',
 'steady',
 'constant',
 'characters',
 'full',
 'engaging',
 'relationships',
 'interactions',
 'natural',
 'showing',
 'need',
 'floods',
 'tears',
 'show',
 'emotion',
 'screams',
 'show',
 'fear',
 'shouting',
 'show',
 'dispute',
 'violence',
 'show',
 'anger',
 'naturally',
 'joyce',
 'short',
 'story',
 'lends',
 'film',
 'ready',
 'made',
 'structure',
 'perfect',
 'polished',
 'diamond',
 'small',
 'changes',
 'huston',
 'makes',
 'inclusion',
 'poem',
 'fit',
 'neatly',
 'truly',
 'masterpiece',
 'tact',
 'subtlety',
 'overwhelming',
 'beauty']

In [118]:
all_data['cleaned_review'] = cleaned_review

In [36]:
all_words = [nltk.word_tokenize(x) for x in cleaned_review]

In [98]:
sentence = 'this is a foo bar sentences and i want to ngramize it'

n = 6
sixgrams = ngrams(sentence.split(), n)

for grams in sixgrams:
    print(grams)
sixgrams

('this', 'is', 'a', 'foo', 'bar', 'sentences')
('is', 'a', 'foo', 'bar', 'sentences', 'and')
('a', 'foo', 'bar', 'sentences', 'and', 'i')
('foo', 'bar', 'sentences', 'and', 'i', 'want')
('bar', 'sentences', 'and', 'i', 'want', 'to')
('sentences', 'and', 'i', 'want', 'to', 'ngramize')
('and', 'i', 'want', 'to', 'ngramize', 'it')


<generator object ngrams at 0x0000019DCEE4E270>

In [105]:
for grm in bigrm:
    print(list(grm))

['naturally', 'film']
['film', 'main']
['main', 'themes']
['themes', 'mortality']
['mortality', 'nostalgia']
['nostalgia', 'loss']
['loss', 'innocence']
['innocence', 'perhaps']
['perhaps', 'surprising']


In [104]:
bigrm = nltk.bigrams(all_words[0][:10])
bigrm

<generator object bigrams at 0x0000019DCEE4EDD0>

In [86]:
print(*map(' '.join, bigrm), sep=', ')

naturally film, film main, main themes, themes mortality, mortality nostalgia, nostalgia loss, loss innocence, innocence perhaps, perhaps surprising


In [71]:
word2vec = Word2Vec(all_words, min_count=5)

In [39]:
word2vec

<gensim.models.word2vec.Word2Vec at 0x19daed34dc0>

In [68]:
vocabulary = word2vec.wv.key_to_index

In [64]:
v1 = word2vec.wv['movie']
v1

array([ 2.58842558e-01, -8.44656348e-01, -4.87949133e-01, -9.40350354e-01,
        2.66864657e-01,  2.94531137e-02, -1.73610330e+00, -5.67825079e-01,
        8.08339417e-01, -1.69808459e+00,  1.03057063e+00,  9.62835923e-03,
       -2.43424967e-01, -4.38597918e-01, -6.84791803e-01,  1.76150572e+00,
        8.65396798e-01,  1.86191440e+00, -1.70171249e+00,  2.79220670e-01,
       -3.38357627e-01, -2.32526157e-02,  2.17913091e-01, -4.84722517e-02,
        1.33072913e+00, -9.22531426e-01,  9.30391371e-01,  1.24785292e+00,
        2.34947711e-01,  1.45575678e+00,  6.19320333e-01, -7.44338453e-01,
       -2.53127009e-01, -1.18179643e+00, -2.42393002e-01,  4.00296271e-01,
        1.19838119e+00,  7.86305845e-01,  1.48067081e+00,  2.49931979e+00,
        1.02869177e+00, -1.41337092e-04, -1.42204297e+00, -5.92005849e-01,
       -1.19590126e-01,  3.18989754e-01,  1.60031188e+00, -2.38928959e-01,
        4.66436893e-01,  4.89191979e-01,  7.68740416e-01,  6.45394504e-01,
        1.87487364e-01,  

In [65]:
v1.shape

(100,)

In [69]:
len(vocabulary)

3980

## Word2Vec with bigrams

In [119]:
sentences = all_data.cleaned_review.values

In [120]:
bigrams = phrases.Phrases(sentences)
bigrams

<gensim.models.phrases.Phrases at 0x19daed0bfa0>

In [129]:
bigrams[['naturally', 'film', 'main', 'themes', 'mortality', 'nostalgia', 'loss', 'innocence', 'perhaps', 'surprising', 'rated', 'highly', 'older', 'viewers', 'younger', 'ones', 'however', 'craftsmanship', 'completeness', 'film', 'anyone', 'enjoy', 'pace', 'steady',]]

['naturally',
 'film',
 'main',
 'themes',
 'mortality',
 'nostalgia',
 'loss_innocence',
 'perhaps',
 'surprising',
 'rated',
 'highly',
 'older',
 'viewers',
 'younger',
 'ones',
 'however',
 'craftsmanship',
 'completeness',
 'film',
 'anyone',
 'enjoy',
 'pace',
 'steady']

In [125]:
bigrams[sentences]

<gensim.interfaces.TransformedCorpus at 0x19db17a82b0>

In [132]:
model = word2vec.Word2Vec(bigrams[sentences], vector_size=100, min_count=3, epochs=20)

In [135]:
model.wv['loss_innocence']

array([ 5.37660159e-03, -8.57379511e-02,  6.35730252e-02,  1.62193105e-01,
        3.37795392e-02, -6.01991825e-02,  1.23458862e-01,  3.47664893e-01,
       -9.27663371e-02, -7.73753300e-02, -2.42833290e-02, -3.86093259e-01,
       -7.90659413e-02,  1.62538126e-01,  2.34253898e-01, -1.92788355e-02,
        1.72935933e-01, -1.56283662e-01,  2.18619213e-01, -1.04504645e-01,
       -9.53197032e-02,  1.93334520e-01, -4.25904244e-02, -8.82215984e-03,
       -1.24667129e-02,  1.01535149e-01, -1.56986743e-01,  7.17929751e-02,
       -1.48863658e-01, -1.62518993e-02,  1.55176684e-01, -1.49309352e-01,
       -8.78539868e-03,  1.57659844e-01, -7.51879141e-02,  1.35555938e-01,
        1.68770030e-02, -1.20194942e-01, -4.70548496e-02, -2.23582044e-01,
       -1.36638477e-01, -3.17560643e-01, -9.11593363e-02,  3.37484271e-05,
       -3.69241178e-01, -9.99980420e-02, -1.96253359e-01,  2.30477914e-01,
        3.34470309e-02,  2.45604709e-01, -1.20367922e-01,  1.44190669e-01,
       -1.47528257e-02, -

In [136]:
model.wv.most_similar('loss_innocence')

[('subconcious', 0.6709465980529785),
 ('deterministic', 0.6427459120750427),
 ('rapprochement', 0.6254874467849731),
 ('aparna', 0.623039960861206),
 ('reawakening', 0.6219013929367065),
 ('strange_mix', 0.6216766238212585),
 ('unflinchingly', 0.6196311116218567),
 ('exerts', 0.6152264475822449),
 ('existentially', 0.6143515110015869),
 ('soviet_system', 0.614152193069458)]

In [139]:
vocab = list(model.wv.index_to_key)
X = model.wv[vocab]

In [140]:
X

array([[ 0.34588757, -0.13784072,  0.3516177 , ..., -0.14984451,
        -0.985363  ,  0.572493  ],
       [ 0.88112056, -0.04749514,  0.82115066, ..., -0.13122466,
        -0.6226288 ,  0.19617105],
       [ 0.23205109, -0.17073591,  0.01956977, ...,  0.94357175,
         0.08438815,  0.8226908 ],
       ...,
       [-0.10137186,  0.18437567,  0.08153899, ...,  0.05575505,
         0.03798603,  0.05178391],
       [-0.02015647,  0.13580744, -0.02591041, ..., -0.07198381,
        -0.11067206,  0.0127128 ],
       [ 0.09242156,  0.16810435,  0.17125428, ...,  0.0097965 ,
        -0.11631382,  0.18879065]], dtype=float32)

In [151]:
df_w2v = pd.DataFrame(X.T, columns=model.wv.key_to_index.keys())

In [154]:
df_w2v

Unnamed: 0,br,movie,film,one,like,good,would,really,even,story,...,queda,rublev,makeout,relaying,inferences,tinkers,illuminator,reordered,tomita,traucki
0,0.345888,0.881121,0.232051,0.072272,-0.618730,1.894272,1.160177,-0.078930,-0.008755,-2.239979,...,0.049745,-0.152769,0.107440,-0.185487,-0.068092,-0.044934,-0.347761,-0.101372,-0.020156,0.092422
1,-0.137841,-0.047495,-0.170736,-1.229624,0.000480,-0.179907,0.280765,-1.468865,0.316579,-0.813594,...,0.013332,-0.003325,-0.062973,0.363580,0.309466,0.163270,0.314801,0.184376,0.135807,0.168104
2,0.351618,0.821151,0.019570,1.020501,0.309054,1.049431,-1.320499,-0.546082,2.069016,-0.257063,...,0.016581,-0.091434,0.003126,0.110773,0.139314,-0.009184,0.048474,0.081539,-0.025910,0.171254
3,0.748901,0.932653,1.746379,1.116530,-0.374804,-1.063346,-0.135112,1.897010,1.014469,0.582943,...,-0.207589,-0.005667,-0.008189,-0.023161,0.011037,-0.082577,-0.072124,-0.002459,0.021552,0.021528
4,-1.424866,-1.942473,-0.669128,-0.462013,-0.579122,-1.413390,-0.914526,-2.027225,-0.490768,0.593418,...,0.155161,0.160144,0.021075,-0.028800,0.071013,0.035417,0.174935,0.040814,-0.040118,0.191040
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.756467,2.386793,2.001266,0.407688,-0.217027,0.298984,-0.706981,0.891169,0.679566,0.115251,...,-0.022692,-0.021079,0.345664,0.098786,0.319348,-0.016899,0.117044,0.083105,0.098318,0.107077
96,-1.557095,0.182611,0.296508,-0.020327,0.277769,-1.333994,-0.433009,-1.227534,-2.411454,-3.313325,...,0.206739,0.063443,0.138058,0.107660,0.107969,0.199314,0.248007,0.096689,-0.029855,-0.222777
97,-0.149845,-0.131225,0.943572,-0.095375,-0.173398,-0.143985,1.269788,-1.264884,1.062550,2.766558,...,0.003777,-0.081415,-0.027233,-0.154641,-0.046565,0.067991,-0.135598,0.055755,-0.071984,0.009797
98,-0.985363,-0.622629,0.084388,-1.199188,-1.698345,-0.915986,-0.666674,1.206713,0.592227,1.158394,...,-0.194797,0.040775,-0.055360,-0.114466,0.095046,-0.114042,0.134349,0.037986,-0.110672,-0.116314


In [150]:
X.T.shape

(100, 71864)