In [58]:
import pandas as pd
import configparser
import re
import csv
import numpy as np
from gensim.test.utils import get_tmpfile
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from keras.engine.input_layer import Input
from keras.layers import Embedding, merge

In [2]:
config = configparser.ConfigParser()
config.read('../config.cfg')
train_df = csv.reader(open(config['FILES']['TRAIN'],'r'))
test_df = csv.reader(open(config['FILES']['TEST'], 'r'))

In [3]:
def save_csv(df):
    data = []
    for comment in df:
        data.append(comment)
    return data

In [4]:
train_df = save_csv(train_df)
test_df = save_csv(test_df)

In [5]:
def my_sentences(df):
    sentences = dict()
    for comment in df:
        if comment[0]!= 'id':
            sentences[comment[0]] = re.split('\s|\n',comment[1])
    return sentences
        

In [6]:
print(train_df[1])


['0000997932d777bf', "Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27", '0', '0', '0', '0', '0', '0']


In [7]:
sentences = my_sentences(train_df)
print(sentences['0000997932d777bf'])

['Explanation', 'Why', 'the', 'edits', 'made', 'under', 'my', 'username', 'Hardcore', 'Metallica', 'Fan', 'were', 'reverted?', 'They', "weren't", 'vandalisms,', 'just', 'closure', 'on', 'some', 'GAs', 'after', 'I', 'voted', 'at', 'New', 'York', 'Dolls', 'FAC.', 'And', 'please', "don't", 'remove', 'the', 'template', 'from', 'the', 'talk', 'page', 'since', "I'm", 'retired', 'now.89.205.38.27']


In [19]:
len(sentences)

159571

In [27]:
model = Word2Vec(sentences.values())

In [28]:
word_vectors = model.wv

In [29]:
fname = get_tmpfile("vectors.kv")
word_vectors.save(fname)
word_vectors = KeyedVectors.load(fname, mmap='r')

In [12]:
model.build_vocab(sentences=sentences, update=True)

In [31]:
model.vocabulary

<gensim.models.word2vec.Word2VecVocab at 0x7f161d56e2e8>

In [32]:
word_vectors.vocab

{'Explanation': <gensim.models.keyedvectors.Vocab at 0x7f161d6cb9e8>,
 'Why': <gensim.models.keyedvectors.Vocab at 0x7f161d56e160>,
 'the': <gensim.models.keyedvectors.Vocab at 0x7f161d56e4a8>,
 'edits': <gensim.models.keyedvectors.Vocab at 0x7f161d56e470>,
 'made': <gensim.models.keyedvectors.Vocab at 0x7f161d56e0b8>,
 'under': <gensim.models.keyedvectors.Vocab at 0x7f161d56e828>,
 'my': <gensim.models.keyedvectors.Vocab at 0x7f161d56e860>,
 'username': <gensim.models.keyedvectors.Vocab at 0x7f161d56e898>,
 'Hardcore': <gensim.models.keyedvectors.Vocab at 0x7f161d56e8d0>,
 'Metallica': <gensim.models.keyedvectors.Vocab at 0x7f161d56e908>,
 'Fan': <gensim.models.keyedvectors.Vocab at 0x7f161d56e940>,
 'were': <gensim.models.keyedvectors.Vocab at 0x7f161d56e978>,
 'reverted?': <gensim.models.keyedvectors.Vocab at 0x7f161d56e9b0>,
 'They': <gensim.models.keyedvectors.Vocab at 0x7f161d56e9e8>,
 "weren't": <gensim.models.keyedvectors.Vocab at 0x7f161d56ea20>,
 'just': <gensim.models.keyedv

In [33]:
'fuck' in word_vectors.vocab

True

In [49]:
len(word_vectors['fuck'])

100

In [37]:
word_vectors.most_similar('fuck')

[('hell', 0.7634199857711792),
 ('suck', 0.7432746291160583),
 ('fucking', 0.6960117220878601),
 ('Fuck', 0.6941840052604675),
 ('ass', 0.6930743455886841),
 ('bitch', 0.6835107207298279),
 ('shit', 0.6786351203918457),
 ('dick', 0.6692777276039124),
 ('fagget.', 0.6593355536460876),
 ('kiss', 0.6322335004806519)]

In [38]:
# get the most common words
print(word_vectors.index2word[0], word_vectors.index2word[1], word_vectors.index2word[2])

 the to


In [39]:
# get the least common words
vocab_size = len(model.wv.vocab)
print(word_vectors.index2word[vocab_size - 1], word_vectors.index2word[vocab_size - 2], word_vectors.index2word[vocab_size - 3])

M.; immunoreactivity Vellalars


In [43]:
# find the index of the 2nd most common word ("of")
print('Index of "of" is: {}'.format(model.wv.vocab['of'].index))
print('Index of "to" is: {}'.format(model.wv.vocab['to'].index))

Index of "of" is: 3
Index of "to" is: 2


In [42]:
# some similarity fun
print(model.wv.similarity('woman', 'man'))

0.8106027


In [45]:
# what doesn't fit?
print(model.wv.doesnt_match(['kiss', 'fuck', 'nice', 'suck']))

nice


In [50]:
# convert the wv word vectors into a numpy matrix that is suitable for insertion
# into our TensorFlow and Keras models
vector_dim=100
embedding_matrix = np.zeros((len(model.wv.vocab), vector_dim))
for i in range(len(model.wv.vocab)):
    embedding_vector = model.wv[model.wv.index2word[i]]
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [60]:
valid_size = 16  # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
# input words - in this case we do sample by sample evaluations of the similarity
valid_word = Input((1,), dtype='int32')
other_word = Input((1,), dtype='int32')
# setup the embedding layer
embeddings = Embedding(input_dim=embedding_matrix.shape[0], output_dim=embedding_matrix.shape[1],
                      weights=[embedding_matrix])
embedded_a = embeddings(valid_word)
embedded_b = embeddings(other_word)
similarity = merge([embedded_a, embedded_b], mode='cos', dot_axes=0)
# create the Keras model
k_model = Model(input=[valid_word, other_word], output=similarity)

def get_sim(valid_word_idx, vocab_size):
    sim = np.zeros((vocab_size,))
    in_arr1 = np.zeros((1,))
    in_arr2 = np.zeros((1,))
    in_arr1[0,] = valid_word_idx
    for i in range(vocab_size):
        in_arr2[0,] = i
        out = k_model.predict_on_batch([in_arr1, in_arr2])
        sim[i] = out
    return sim

# now run the model and get the closest words to the valid examples
for i in range(valid_size):
    valid_word = wv.index2word[valid_examples[i]]
    top_k = 8  # number of nearest neighbors
    sim = get_sim(valid_examples[i], len(wv.vocab))
    nearest = (-sim).argsort()[1:top_k + 1]
    log_str = 'Nearest to %s:' % valid_word
    for k in range(top_k):
        close_word = wv.index2word[nearest[k]]
        log_str = '%s %s,' % (log_str, close_word)
    print(log_str)

TypeError: 'module' object is not callable