# create custom word2vec embeddings

Use a mix of in-domain data (the merged corpus sentences) and general-domain data from the Brown corpus.

In [1]:
import pickle
import pandas as pd
import numpy as np
from dataset import get_vocab, index_sents
from embedding import create_embeddings

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## in-domain text - lowercased

In [2]:
# Read in the in-domain token sequences and their POS-tag sequences.
# Context managers make sure both file handles are closed after loading.
# NOTE(review): pickle.load can execute arbitrary code -- only load files
# from a trusted source.
with open('../00_data/snips/train_sents.pkl', 'rb') as sents_file:
    alltoks = pickle.load(sents_file)
with open('../00_data/snips/train_tags.pkl', 'rb') as tags_file:
    alltags = pickle.load(tags_file)

In [3]:
# Turn every token sequence into one lowercased, space-joined string.
# Sequences with a single token (or none) are skipped.
string_toks = [
    ' '.join(words).lower()
    for words in ([str(tok) for tok in sent] for sent in alltoks)
    if len(words) > 1
]

In [4]:
# Turn every tag sequence into one space-joined string (case is kept).
# Sequences with a single tag (or none) are skipped.
string_tags = [
    ' '.join(tags)
    for tags in ([str(tag) for tag in tag_seq] for tag_seq in alltags)
    if len(tags) > 1
]

In [5]:
# Number of in-domain sentences kept (sequences with more than one token).
len(string_toks)

13784

## supplement with generic text - lowercased

In [6]:
from nltk.corpus import brown

In [7]:
# Brown corpus categories used as the general-domain supplement.
brown_categories = ['news', 'editorial', 'reviews', 'government']

# Deletion table for the punctuation characters , ! ? . :
strip_punct = str.maketrans('', '', ',!?.:')

# Join each Brown sentence into a lowercased string and delete the
# punctuation characters in a single pass.
brownsents = [
    ' '.join(words).lower().translate(strip_punct)
    for words in brown.sents(categories=brown_categories)
]
len(brownsents)

12403

In [8]:
# Combined training corpus: Brown sentences first, then the in-domain sentences.
sentence_text = [*brownsents, *string_toks]

In [9]:
from collections import Counter

# Whitespace-tokenize every sentence of the combined corpus.
sent_lists = [sentence.split() for sentence in sentence_text]

# Flat list of all tokens, in corpus order.
vocab = []
for tokens in sent_lists:
    vocab.extend(tokens)

# Token frequencies; the distinct tokens are exactly the counter's keys.
vcount = Counter(vocab)
vset = set(vcount)

# Vocabulary size and the ten most frequent tokens.
len(vset), vcount.most_common(10)

(28618,
 [('the', 25371),
  ('of', 11811),
  ('in', 10146),
  ('to', 9654),
  ('a', 9418),
  ('and', 7607),
  ('for', 6266),
  ('is', 4993),
  ('at', 3679),
  ('be', 3271)])

## make embeddings

In [10]:
# Write the combined corpus to a text file, one sentence per line, and
# pass that file to create_embeddings.
corpus_path = '../00_data/embeddings/snips_text.txt'

with open(corpus_path, 'w') as corpus_file:
    for sentence in sentence_text:
        corpus_file.write(sentence + '\n')

# Training options passed through to create_embeddings as keyword arguments.
# NOTE(review): presumably forwarded to gensim's Word2Vec -- confirm in embedding.py.
w2v_params = dict(min_count=1, workers=4, size=300, iter=10)

w2v_vocab, w2v_model = create_embeddings(
    corpus_path,
    embeddings_path='../00_data/embeddings/snips_embeddings.gensimmodel',
    vocab_path='../00_data/embeddings/snips_mapping.json',
    **w2v_params
)

## testing

In [11]:
# Sanity check: nearest neighbours of the word 'play'.
# Query through the model's `wv` keyed vectors: calling most_similar directly
# on the model is deprecated in gensim and emits a warning.
# NOTE(review): assumes w2v_model is a gensim Word2Vec model -- confirm.
w2v_model.wv.most_similar('play')

  """Entry point for launching an IPython kernel.


[('strays', 0.9344868659973145),
 ('ismol', 0.9261569976806641),
 ('pull', 0.921419084072113),
 ('martyr', 0.915220320224762),
 ('watchers', 0.903829038143158),
 ('mccoy', 0.9032540321350098),
 ('liar', 0.9026849269866943),
 ('lousy', 0.8959661722183228),
 ('friend:', 0.8959379196166992),
 ('ventura:', 0.8947344422340393)]

In [12]:
# Sanity check: nearest neighbours of the word 'movie'.
# Query through the model's `wv` keyed vectors: calling most_similar directly
# on the model is deprecated in gensim and emits a warning.
# NOTE(review): assumes w2v_model is a gensim Word2Vec model -- confirm.
w2v_model.wv.most_similar('movie')

  """Entry point for launching an IPython kernel.


[('schedule', 0.9217237234115601),
 ('closest', 0.8835597038269043),
 ('cinema', 0.8757786154747009),
 ('nearest', 0.8755965828895569),
 ('movies', 0.8676316142082214),
 ('showtimes', 0.8673536777496338),
 ('schedules', 0.8636676073074341),
 ('showing', 0.8634284138679504),
 ('times', 0.8582863807678223),
 ('films', 0.8550107479095459)]