# create custom word2vec embeddings

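Train word2vec embeddings on a mix of in-domain data (the merged corpus sentences) and general-domain data from the Brown corpus. Note: the cells shown below only load the in-domain corpus; the Brown corpus is not read anywhere in this notebook.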

In [1]:
import pandas as pd
import numpy as np
from data.preprocessing import get_vocab, index_sents
from data.embedding import create_embeddings

Using TensorFlow backend.
  return f(*args, **kwds)


## in-domain text - lowercased

In [2]:
# Load the in-domain corpus; dtype=object keeps every column as read so that
# the dialog/turn numbers are not coerced to numeric types.
data = pd.read_csv("data/merged_corpus.csv",  dtype=object)
# Lowercase every message. str() also turns a missing value into the literal
# string 'nan', so empty cells end up as the token 'nan' in the corpus.
data['message'] = [str(s).lower() for s in data['message'].tolist()]

# Build one key per (dialog, turn) so that all phrases belonging to the same
# turn can be merged into a single training sentence.
dnum = data['dialog_num'].tolist()
tnum = data['turn_num'].tolist()
join = [str(d) + '-' + str(t) for d, t in zip(dnum, tnum)]
data['diag-turn'] = join

# Select the grouped columns with a *list*: selecting with a bare tuple
# (`['filename', 'message']`) warns since pandas 1.0 and raises in pandas 2.x.
# The string 'max' selects the same reduction the builtin `max` was mapped to,
# without the FutureWarning newer pandas emits for builtin callables.
# groupby sorts by key by default, so the rows come out ordered by the
# 'diag-turn' string rather than by position in the file.
agg = (
    data[['filename', 'diag-turn', 'message']]
    .groupby('diag-turn')[['filename', 'message']]
    .agg({'filename': 'max', 'message': lambda x: " ".join(x)})
)

# One merged, lowercased message per dialog turn: the text used for embedding.
messagedata = agg['message'].tolist()

## replace every digit with the generic token `#`

In [3]:
# Map each ASCII digit to '#', so that e.g. '10:30' becomes '##:##' and all
# numbers of the same shape share one token. The list is updated in place.
digit_to_hash = str.maketrans('1234567890', '#' * 10)
for pos in range(len(messagedata)):
    messagedata[pos] = messagedata[pos].translate(digit_to_hash)

In [4]:
from collections import Counter

# Whitespace-tokenise every message.
sent_lists = list(map(str.split, messagedata))

# Flatten into one long token stream (despite its name, `vocab` still holds
# duplicates at this point).
vocab = []
for tokens in sent_lists:
    vocab.extend(tokens)

# Frequency table and the set of distinct tokens.
vcount = Counter(vocab)
vset = set(vcount)

# Show the vocabulary size and the ten most frequent tokens.
len(vset), vcount.most_common(10)

(2094,
 [('#', 2266),
  ('the', 2123),
  ('to', 1592),
  ('and', 1304),
  ('you', 1299),
  ('a', 1049),
  ('ok', 1008),
  ('i', 884),
  ('that', 824),
  ('is', 806)])

## make embeddings

In [5]:
# sentence embeddings

# Dump the corpus to disk, one merged message per line, so the embedding
# helper can read it back from a file path.
sent_text_path = 'embeddings/sent_text.txt'
with open(sent_text_path, 'w') as corpus_file:
    corpus_file.writelines(message + '\n' for message in messagedata)

# Build the embeddings from that file. The helper returns the vocabulary and
# the model; presumably it also writes them to the two paths passed below
# (TODO confirm against data.embedding.create_embeddings).
w2v_vocab, w2v_model = create_embeddings(
    sent_text_path,
    embeddings_path='embeddings/text_embeddings.gensimmodel',
    vocab_path='embeddings/text_mapping.json',
    min_count=1,
    workers=3,
    window=2,
    size=200,
    iter=100,
)

## testing

In [6]:
# Sanity check: nearest neighbours of a place name. Query through `.wv`,
# because `most_similar` on the model object itself is deprecated in gensim
# (it is what triggers the warning shown under this cell) and is removed in
# gensim 4.
w2v_model.wv.most_similar('london')

  """Entry point for launching an IPython kernel.


[('nuneaton', 0.7189363837242126),
 ('belgium', 0.664460301399231),
 ('oxenholme', 0.6525389552116394),
 ('leicester', 0.6495381593704224),
 ('liver...', 0.6245436072349548),
 ('crewe', 0.6223902702331543),
 ('st', 0.6179313659667969),
 ('milton', 0.6135971546173096),
 ('newcastle', 0.6130064725875854),
 ('wilmslow', 0.6126270294189453)]

In [7]:
# Sanity check: nearest neighbours of a month name. Query through `.wv`,
# because `most_similar` on the model object itself is deprecated in gensim
# and is removed in gensim 4.
w2v_model.wv.most_similar('october')

  """Entry point for launching an IPython kernel.


[('oc...', 0.7103371620178223),
 ('oct...', 0.6883060336112976),
 ('september', 0.6130481958389282),
 ('#rd', 0.6111972332000732),
 ('july', 0.6088545322418213),
 ('##thh', 0.5986887216567993),
 ('august', 0.5922772288322449),
 ('sound', 0.5872155427932739),
 ('june', 0.5760065317153931),
 ('tue...', 0.5300031900405884)]

In [8]:
# Sanity check: nearest neighbours of a domain term. Query through `.wv`,
# because `most_similar` on the model object itself is deprecated in gensim
# and is removed in gensim 4.
w2v_model.wv.most_similar('fare')

  """Entry point for launching an IPython kernel.


[('no-penalty', 0.48533979058265686),
 ('###', 0.4440538287162781),
 ('cost', 0.4330407679080963),
 ('highest', 0.42370176315307617),
 ('airfare', 0.4071648120880127),
 ('confirm...', 0.39975813031196594),
 ('dollars', 0.3952631950378418),
 ('price', 0.39081698656082153),
 ('lowest', 0.3906240463256836),
 ('ticket', 0.36182576417922974)]

In [9]:
# Sanity check: nearest neighbours of a digit-masked clock time ('#' stands
# for any digit). Query through `.wv`, because `most_similar` on the model
# object itself is deprecated in gensim and is removed in gensim 4.
w2v_model.wv.most_similar('#:##')

  """Entry point for launching an IPython kernel.


[('##:##', 0.7114031314849854),
 ('chicago', 0.5114167332649231),
 ('ottawa', 0.5102850198745728),
 ('toronto', 0.4970610737800598),
 ('pittsburg', 0.48194795846939087),
 ('newark', 0.4803212881088257),
 ('atlanta', 0.474539577960968),
 ('p', 0.4721252918243408),
 ('leaves', 0.469747930765152),
 ('leaving', 0.46557193994522095)]

## save word tokens and their vectors as a database

In [10]:
# Freeze the vocabulary keys into a list so tokens and vectors can be stored
# in matching order.
vocab = [token for token in w2v_vocab.keys()]

In [11]:
# Look up the vector of every vocabulary token, in `vocab` order. Index via
# `.wv`: subscripting the model object directly is deprecated in gensim (the
# source of the warning shown under this cell) and is removed in gensim 4.
vectors = [w2v_model.wv[w] for w in vocab]

  """Entry point for launching an IPython kernel.


In [12]:
# Pair every token with its vector as (token, vector) tuples.
tuples = list(zip(vocab, vectors))

In [13]:
# Persist the vectors and their tokens as two parallel .npy files; entry i of
# the token file labels entry i of the vector file.
for target, payload in (
    ("dbases/w2v_word_vectors.npy", vectors),
    ("dbases/w2v_word_tokens.npy", vocab),
):
    np.save(target, payload)