In [4]:
import sys
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

import numpy as np
import gensim
import string

from keras.callbacks import LambdaCallback
from keras.layers.recurrent import LSTM
from keras.layers.embeddings import Embedding
from keras.layers import Dense, Activation
from keras.models import Sequential
from keras.utils.data_utils import get_file

path="company_file_preprocessed_small.txt"

print('\nPreparing the sentences...')
max_sentence_len = 40
with open(path) as file_:
    docs = file_.readlines()
sentences = [[word for word in doc.lower().translate(string.punctuation).split()[:max_sentence_len]] for doc in docs]
print('Num sentences:', len(sentences))

print('\nTraining word2vec...')
word_model = gensim.models.Word2Vec(sentences, size=100, min_count=1, window=5, iter=100)
pretrained_weights = word_model.wv.syn0
vocab_size, emdedding_size = pretrained_weights.shape
print('Result embedding shape:', pretrained_weights.shape)
print('Checking similar words:')
for word in ['model', 'network', 'train', 'learn']:
    most_similar = ', '.join('%s (%.2f)' % (similar, dist) for similar, dist in word_model.most_similar(word)[:8])
    print('  %s -> %s' % (word, most_similar))

def word2idx(word):
    return word_model.wv.vocab[word].index
def idx2word(idx):
    return word_model.wv.index2word[idx]

print('\nPreparing the data for LSTM...')
train_x = np.zeros([len(sentences), max_sentence_len], dtype=np.int32)
train_y = np.zeros([len(sentences)], dtype=np.int32)
for i, sentence in enumerate(sentences):
    for t, word in enumerate(sentence[:-1]):
        train_x[i, t] = word2idx(word)
    train_y[i] = word2idx(sentence[-1])
print('train_x shape:', train_x.shape)
print('train_y shape:', train_y.shape)

print('\nTraining LSTM...')
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=emdedding_size, weights=[pretrained_weights]))
model.add(LSTM(units=emdedding_size))
model.add(Dense(units=vocab_size))
model.add(Activation('softmax'))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

def sample(preds, temperature=1.0):
    if temperature <= 0:
        return np.argmax(preds)
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

def generate_next(text, num_generated=10):
    word_idxs = [word2idx(word) for word in text.lower().split()]
    for i in range(num_generated):
        prediction = model.predict(x=np.array(word_idxs))
        idx = sample(prediction[-1], temperature=0.7)
        word_idxs.append(idx)
    return ' '.join(idx2word(idx) for idx in word_idxs)

def on_epoch_end(epoch, _):
    print('\nGenerating text after epoch: %d' % epoch)
    texts = ['san jose','stockholm stock exchange','headquarter','finance']
    for text in texts:
        sample = generate_next(text)
        print('%s... -> %s' % (text, sample))

model.fit(train_x, train_y,
          batch_size=128,
          epochs=25,
          callbacks=[LambdaCallback(on_epoch_end=on_epoch_end)])


Preparing the sentences...
Num sentences: 2708

Training word2vec...
Result embedding shape: (15306, 100)
Checking similar words:
  model -> focus (0.60), comprehensive (0.57), segmentsh (0.57), centralized (0.57), cheap (0.56), freemium (0.55), helps (0.55), take (0.54)
  network -> radio (0.58), service (0.57), satellite (0.54), system (0.53), internet (0.53), tdlte (0.51), telecommunications (0.51), channel (0.49)
  train -> passenger (0.75), gw (0.66), railway (0.61), atsumi (0.61), luncheon (0.61), crossplatform (0.58), gauldal (0.58), cruises (0.57)
  learn -> votizens (0.94), elections (0.82), calls (0.78), reach (0.72), caters (0.71), assigned (0.69), securely (0.69), looking (0.69)

Preparing the data for LSTM...
train_x shape: (2708, 40)
train_y shape: (2708,)

Training LSTM...
Epoch 1/25

Generating text after epoch: 0
san jose... -> san jose metropolitanvickers usa+ premises wars funding host blends java menashy malt
stockholm stock exchange... -> stockholm stock exchange 

finance... -> finance amstrad cameroon akemann donetsk rhinewestphalia johns shirts impax soup2nuts intarcia
Epoch 14/25

Generating text after epoch: 13
san jose... -> san jose breaches masculine vacuums catholic patents world world ratpac wasp technologiesh
stockholm stock exchange... -> stockholm stock exchange issues jy come newsprint editing+ produce luchtverkeersleiding moyes albers joyce
headquarter... -> headquarter grinding fifteen dakota formulation unibet anna fabrics 1766 minister men
finance... -> finance stretch interconnect valuations intermodal glendora motorcars affected wolffpetersen orleans consultants
Epoch 15/25

Generating text after epoch: 14
san jose... -> san jose fairey stec johns suites secondhand looking central cooperates print partap
stockholm stock exchange... -> stockholm stock exchange primary hrsholm everard motovudu australian helps sogna pax bro rinat
headquarter... -> headquarter milchan yazoo tacos experiment engineer caterers bulk talk administeri

<keras.callbacks.History at 0x148aa52e8>

In [13]:
print(sentences[0])

['cmc', 'connect', 'bursonmarsteller', 'is', 'a', 'premier', 'perception', 'management', 'firm', 'that', 'provides', 'communication', 'solutions', 'the', 'company', 'was', 'founded', 'in', '1995', 'by', 'yomi', 'badejo', 'okusanya', 'in', 'ikeja', 'lagos', 'nigeria', 'cmc', 'connect', 'have', 'exclusive', 'affiliation', 'with', 'bursonmarsteller', 'a', 'leading', 'global', 'public', 'relations', 'firm']


In [16]:
word_model['connect']

  """Entry point for launching an IPython kernel.


array([-6.18535936e-01, -2.48911992e-01, -7.34964013e-01, -9.22909856e-01,
        4.22672540e-01, -4.03853744e-01, -5.30699268e-04, -4.55677629e-01,
       -2.70245434e-03,  3.66581529e-01,  3.39169532e-01,  6.94007695e-01,
        6.69669285e-02, -1.64970737e-02,  1.48683218e-02, -1.34070471e-01,
        1.73866805e-02, -3.40689152e-01, -4.07985538e-01,  5.53207457e-01,
        9.71282125e-02,  2.59613335e-01, -7.63405085e-01,  1.15798032e+00,
        5.36187291e-01, -6.00239396e-01,  4.66108434e-02,  6.54145718e-01,
       -3.19673300e-01, -1.13624819e-02,  1.30110517e-01,  4.55839247e-01,
        3.93253118e-01,  6.09181039e-02, -4.06336725e-01, -4.03716266e-01,
       -8.05642828e-02,  2.59448767e-01, -2.60547381e-02,  9.38183188e-01,
       -8.59171301e-02,  5.14897764e-01,  4.12076013e-03, -5.78205764e-01,
        1.09521024e-01,  8.61775279e-02, -3.37719023e-01,  8.63650262e-01,
       -2.75409818e-01,  5.53615272e-01, -3.60831410e-01,  6.33384427e-03,
       -1.65890530e-01,  

In [34]:
print(word_model.similarity('broadband', 'internet'))
print(word_model.similarity('company', 'firm'))
print(word_model.similarity('book', 'title'))
print(word_model.similarity('finance', 'title'))

0.34940127
0.4806268
0.45864826
-0.006742549


  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [48]:
import sys

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")
    
word_model.most_similar('broadband')[:5]

[('wireless', 0.6458026766777039),
 ('highspeed', 0.6268453598022461),
 ('eiker', 0.612562894821167),
 ('janitorial', 0.6116457581520081),
 ('messaging', 0.6089986562728882)]