# Create custom word2vec embeddings

Use a mix of in-domain data (the merged corpus sentences) and general-domain data from the Brown corpus.

In [3]:
import pandas as pd
import numpy as np
from data.preprocessing import get_vocab, index_sents
from data.embedding import create_embeddings

## in-domain text - lowercased

In [28]:
# Read the in-domain corpus. dtype=object loads every column without numeric
# type inference, so dialog/turn numbers keep their textual form.
data = pd.read_csv("data/merged_corpus.csv", dtype=object)

# Lowercase every message; str() guards against non-string cells
# (a missing value becomes the literal string 'nan').
data['message'] = [str(s).lower() for s in data['message']]

# Key each row by "<dialog_num>-<turn_num>" so that all phrases belonging to
# the same dialog turn can be merged into a single sentence.
data['diag-turn'] = [
    str(d) + '-' + str(t)
    for d, t in zip(data['dialog_num'], data['turn_num'])
]

# One row per dialog turn: keep the (lexicographically) largest filename and
# join the turn's phrases with single spaces.
# - Columns are selected with a *list*; tuple-style selection on a GroupBy
#   (`gb['a', 'b']`) is deprecated and rejected by pandas 2.x.
# - 'max' is passed by name; handing the builtin `max` to agg() is what pandas
#   already dispatched to its own groupby max, and now warns about.
agg = data.groupby('diag-turn')[['filename', 'message']].agg(
    {'filename': 'max', 'message': " ".join}
)

# Sentences used for embedding training, in groupby (sorted-key) order.
messagedata = agg['message'].tolist()

## replace every digit with a generic '#' token

In [31]:
# Mask every ASCII digit with '#' so that all numbers collapse onto a few
# generic tokens (e.g. clock times become '#:##'). The list is updated in
# place, one translated sentence per original sentence.
digit_mask = str.maketrans('1234567890', '#' * 10)
messagedata[:] = [sent.translate(digit_mask) for sent in messagedata]

In [32]:
from collections import Counter

# Whitespace-tokenise each sentence, then flatten into one token stream.
sent_lists = list(map(str.split, messagedata))
vocab = []
for tokens in sent_lists:
    vocab.extend(tokens)

# Token frequencies and the set of distinct token types.
vcount = Counter(vocab)
vset = set(vcount)

# Cell output: (number of distinct tokens, ten most frequent tokens).
len(vset), vcount.most_common(10)

(2094,
 [('#', 2266),
  ('the', 2123),
  ('to', 1592),
  ('and', 1304),
  ('you', 1299),
  ('a', 1049),
  ('ok', 1008),
  ('i', 884),
  ('that', 824),
  ('is', 806)])

## make embeddings

In [40]:
# Sentence embeddings: dump one sentence per line to a plain-text file and
# hand that file to the project's create_embeddings helper, which returns the
# vocabulary and the trained model.
sent_path = 'embeddings/sent_text.txt'
with open(sent_path, 'w') as f:
    f.writelines(s + '\n' for s in messagedata)

# NOTE(review): the keyword arguments below look like gensim Word2Vec
# hyper-parameters forwarded by create_embeddings — confirm in data.embedding.
w2v_vocab, w2v_model = create_embeddings(
    sent_path,
    embeddings_path='embeddings/text_embeddings.gensimmodel',
    vocab_path='embeddings/text_mapping.json',
    min_count=1,
    workers=3,
    window=2,
    size=200,
    iter=100,
)

## testing

In [41]:
# Nearest neighbours of 'london' in the embedding space. Queried through .wv
# because the model-level most_similar is deprecated (removed in gensim 4).
# Assumes w2v_model is a gensim Word2Vec — TODO confirm in data.embedding.
w2v_model.wv.most_similar('london')

  """Entry point for launching an IPython kernel.


[('picadilly', 0.6531261205673218),
 ('newton', 0.6403654217720032),
 ('nuneaton', 0.6250064969062805),
 ('oxenholme', 0.6003653407096863),
 ('manchester', 0.59767085313797),
 ('durham', 0.5939592719078064),
 ('abbot', 0.5903021097183228),
 ('stockport', 0.5874806046485901),
 ('lime', 0.5781711935997009),
 ('leeds', 0.5557770133018494)]

In [42]:
# Nearest neighbours of 'october' in the embedding space. Queried through .wv
# because the model-level most_similar is deprecated (removed in gensim 4).
# Assumes w2v_model is a gensim Word2Vec — TODO confirm in data.embedding.
w2v_model.wv.most_similar('october')

  """Entry point for launching an IPython kernel.


[('#rd', 0.6840100288391113),
 ('oc...', 0.6128817200660706),
 ('oxenholme', 0.5798947215080261),
 ('departing', 0.5255531668663025),
 ('september', 0.5176909565925598),
 ('oct...', 0.5145875215530396),
 ('july', 0.5116774439811707),
 ('sound', 0.5103180408477783),
 ('august', 0.500715970993042),
 ('june', 0.4903396666049957)]

In [45]:
# Nearest neighbours of 'fare' in the embedding space. Queried through .wv
# because the model-level most_similar is deprecated (removed in gensim 4).
# Assumes w2v_model is a gensim Word2Vec — TODO confirm in data.embedding.
w2v_model.wv.most_similar('fare')

  """Entry point for launching an IPython kernel.


[('###', 0.4405091106891632),
 ('lowest', 0.4356326162815094),
 ('no-penalty', 0.40968069434165955),
 ('dollars', 0.39331701397895813),
 ('rate', 0.39215022325515747),
 ('highest', 0.389152467250824),
 ('price', 0.3886524438858032),
 ('pay', 0.3725178837776184),
 ('breakdown', 0.36974477767944336),
 ('####', 0.35946911573410034)]

In [43]:
# Nearest neighbours of the digit-masked clock-time token '#:##'. Queried
# through .wv because the model-level most_similar is deprecated (removed in
# gensim 4). Assumes w2v_model is a gensim Word2Vec — TODO confirm.
w2v_model.wv.most_similar('#:##')

  """Entry point for launching an IPython kernel.


[('##:##', 0.6567469835281372),
 ('m', 0.5426019430160522),
 ('dallas', 0.5395098924636841),
 ('p', 0.5222725868225098),
 ('chicago', 0.5026193261146545),
 ('non-stop', 0.4850623607635498),
 ('atlanta', 0.47605758905410767),
 ('greenbay', 0.47013577818870544),
 ('ottawa', 0.4697186350822449),
 ('toronto', 0.4678325951099396)]

## save words as dbase

In [51]:
# Rebind `vocab` (previously the flat token list) to the keys of the
# vocabulary returned by create_embeddings.
# NOTE(review): presumably w2v_vocab is a word -> index mapping — confirm in data.embedding.
vocab = list(w2v_vocab.keys())

In [57]:
# Look up the embedding of every vocabulary word, in `vocab` order. Indexing
# goes through .wv because model-level item access is deprecated (removed in
# gensim 4). Assumes w2v_model is a gensim Word2Vec — TODO confirm.
vectors = [w2v_model.wv[w] for w in vocab]

  """Entry point for launching an IPython kernel.


In [58]:
# Pair each word with its vector; vocab and vectors are index-aligned because
# vectors was built by iterating over vocab.
tuples = list(zip(vocab, vectors))

In [62]:
# Persist the word vectors and their tokens as two index-aligned .npy files.
for npy_path, payload in (
    ("dbases/w2v_word_vectors.npy", vectors),
    ("dbases/w2v_word_tokens.npy", vocab),
):
    np.save(npy_path, payload)