In [1]:
from gensim.models import Word2Vec

In [2]:
# Load a previously trained Word2Vec model from disk.
# NOTE(review): the file name suggests 300 features, min word count 40 and context
# window 10 -- confirm against the notebook/script that trained it.
model = Word2Vec.load("300features_40minwords_10context")

In [27]:
# Check which container type holds the model's syn1neg weights before inspecting them.
print(type(model.trainables.syn1neg))

<class 'numpy.ndarray'>


In [17]:
# Dimensions of the weight matrix as (rows, columns).
model.trainables.syn1neg.shape

(16731, 300)

#### Here, the number of rows equals the number of words in the vocabulary, and the number of columns equals the number of features chosen when creating the word2vec model.

In [18]:
# Peek at the first 20 rows only instead of dumping the whole matrix.
model.trainables.syn1neg[:20]

array([[-0.06483287,  0.49664444,  0.04160713, ...,  0.47206756,
        -0.03670612, -0.32836092],
       [ 0.06072508,  0.35780832,  0.2500123 , ...,  0.36538982,
        -0.00081854, -0.31415913],
       [-0.04417538,  0.45065188,  0.05245772, ...,  0.35061717,
         0.07676341, -0.33195263],
       ...,
       [ 0.13201468,  0.3253682 ,  0.05130057, ...,  0.2595399 ,
        -0.23634037, -0.5259711 ],
       [ 0.23742779,  0.4559812 , -0.02913145, ...,  0.3378038 ,
        -0.11888793, -0.23581652],
       [ 0.05001378,  0.4050595 , -0.07027879, ...,  0.4548652 ,
         0.06628785, -0.4862139 ]], dtype=float32)

In [47]:
# dir(model.wv)

In [36]:
# Check what kind of object stores the vocabulary.
type(model.wv.vocab)

dict

In [48]:
# model.wv.vocab.keys()

In [42]:
# Look up the learned vector for a single word.
model.wv["flower"]

array([ 4.42461371e-02, -9.60368605e-04, -3.59420218e-02,  2.81621870e-02,
       -1.16858836e-02,  4.02073376e-02, -8.47793221e-02, -8.81878138e-02,
        1.30200461e-01,  8.16996619e-02, -8.47724918e-03,  7.71136805e-02,
        3.28446142e-02, -6.40938953e-02,  4.06276658e-02, -1.04103964e-02,
        5.78908883e-02, -3.41312937e-03,  1.00964561e-01, -3.34585123e-02,
       -3.65545452e-02, -1.35103706e-04,  4.54250537e-02, -6.40790584e-03,
       -5.22153527e-02,  5.01779132e-02,  3.13289091e-02,  4.57836092e-02,
        9.20215994e-02, -6.66193813e-02,  1.51835475e-02, -7.45301321e-02,
        6.73574815e-03, -1.16896788e-02, -2.17651501e-02, -4.74685803e-02,
       -6.67587146e-02,  4.42431085e-02,  5.23999594e-02,  3.63302305e-02,
       -9.43686888e-02,  4.70142774e-02,  9.90421027e-02, -4.05959114e-02,
        4.88117598e-02, -2.82353647e-02,  6.33628806e-03,  7.67405331e-02,
       -5.27671017e-02, -8.51237699e-02,  2.76257806e-02,  1.29309297e-02,
       -6.88867271e-02,  

In [43]:
import numpy as np

In [44]:
def convert2featureVec(words, model, nFeatures):
    """
    Average the word vectors of one review into a single feature vector.

    Parameters
    ----------
    words : iterable of str
        Tokens of the review.
    model : object exposing ``model.wv``
        ``model.wv`` must list its vocabulary as ``index_to_key`` (gensim >= 4) or
        ``index2word`` (gensim 3.x) and return a word's vector via ``model.wv[word]``.
    nFeatures : int
        Length of each word vector.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``nFeatures`` holding the mean of the vectors of all
        tokens found in the vocabulary (repeated tokens count once per occurrence).
        Tokens missing from the vocabulary are ignored. If no token is known, the
        all-zero float32 vector is returned instead of dividing by zero, which would
        otherwise produce a NaN vector.
    """

    featureVec = np.zeros((nFeatures), dtype='float32')
    wv = model.wv

    # gensim 4 renamed index2word to index_to_key; accept either spelling.
    vocab_words = getattr(wv, 'index_to_key', None)
    if vocab_words is None:
        vocab_words = wv.index2word

    # converting to set to make searching faster
    known_words = set(vocab_words)

    nWords = 0
    for w in words:
        if w in known_words:
            nWords += 1
            featureVec = np.add(featureVec, wv[w])

    # Empty review or no in-vocabulary token: avoid 0/0 -> NaN.
    if nWords == 0:
        return featureVec

    return np.divide(featureVec, nWords)

In [46]:
def transformFeatureReview(reviews, model, nFeatures):
    """
    Build the averaged-vector matrix for a collection of reviews.

    Every review is handed to convert2featureVec and the resulting vector is stored
    as one row, so the returned float32 array has shape (len(reviews), nFeatures).
    A progress line is printed for every 1000th review, starting with review 0.
    """
    total = len(reviews)
    review_matrix = np.zeros((total, nFeatures), dtype='float32')

    for idx, tokens in enumerate(reviews):
        # Progress report at 0, 1000, 2000, ...
        if idx % 1000 == 0:
            print('Review number: ' + str(idx) + ' of ' + str(total) + ' reviews')
        review_matrix[idx] = convert2featureVec(tokens, model, nFeatures)

    return review_matrix

### Now we transform our reviews into a uniform shape using the functions defined above

In [59]:
import word2vec
from word2vec import transformSentence
import pandas as pd

In [54]:
# Read the training and test sets from tab-separated files.
# header=0 takes the column names from the first row; quoting=3 is csv.QUOTE_NONE,
# so quote characters inside a field are read as ordinary text.
train = pd.read_csv('labeledTrainData.tsv', header = 0, delimiter='\t', quoting = 3)
test = pd.read_csv('testData.tsv', header = 0, delimiter='\t', quoting = 3)

In [60]:
nFeatures = 300

# Training set: clean every raw review with transformSentence (stop words removed),
# then turn the cleaned reviews into one averaged vector per review.
print('Cleaning train reviews...')
clean_train_reviews = [
    transformSentence(raw_review, remove_stopwords=True)
    for raw_review in train["review"]
]
trainVec = transformFeatureReview(clean_train_reviews, model, nFeatures)

# Test set: identical pipeline.
print('Cleaning test reviews...')
clean_test_reviews = [
    transformSentence(raw_review, remove_stopwords=True)
    for raw_review in test["review"]
]
testVec = transformFeatureReview(clean_test_reviews, model, nFeatures)

Cleaning train reviews...
Review number: 0 of 25000 reviews
Review number: 1000 of 25000 reviews
Review number: 2000 of 25000 reviews
Review number: 3000 of 25000 reviews
Review number: 4000 of 25000 reviews
Review number: 5000 of 25000 reviews
Review number: 6000 of 25000 reviews
Review number: 7000 of 25000 reviews
Review number: 8000 of 25000 reviews
Review number: 9000 of 25000 reviews
Review number: 10000 of 25000 reviews
Review number: 11000 of 25000 reviews
Review number: 12000 of 25000 reviews
Review number: 13000 of 25000 reviews
Review number: 14000 of 25000 reviews
Review number: 15000 of 25000 reviews
Review number: 16000 of 25000 reviews
Review number: 17000 of 25000 reviews
Review number: 18000 of 25000 reviews
Review number: 19000 of 25000 reviews
Review number: 20000 of 25000 reviews
Review number: 21000 of 25000 reviews
Review number: 22000 of 25000 reviews
Review number: 23000 of 25000 reviews
Review number: 24000 of 25000 reviews
Cleaning test reviews...
Review numbe

### Modelling a classifier
#### Here we use random forest

In [61]:
# Random forest classifier from scikit-learn (package name is `sklearn`), aliased for brevity.
from sklearn.ensemble import RandomForestClassifier as rfc

  from numpy.core.umath_tests import inner1d


In [64]:
# A random forest is stochastic; pin the seed so a fresh
# Restart & Run All reproduces the same predictions. 42 is an arbitrary fixed value.
RF = rfc(n_estimators=100, random_state=42)

# train on the averaged word vectors of the training reviews
RF = RF.fit(trainVec, train["sentiment"])

# predict the sentiment of every test review
result = RF.predict(testVec)

In [65]:
# Pair each test id with its predicted sentiment and write the table to CSV.
# index=False leaves out the DataFrame index; quoting=3 is csv.QUOTE_NONE (no field quoting).
output = pd.DataFrame(data = {'id': test['id'], "sentiment" : result})
output.to_csv( "Word2Vec_AverageVectors.csv", index=False, quoting=3 )