# create custom word2vec embeddings

Use a mix of in-domain data (= the merged corpus sentences) and general-domain data from the Brown corpus.

In [1]:
import pickle
import pandas as pd
import numpy as np
from dataset import get_vocab, index_sents
from embedding import create_embeddings

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## in-domain text - lowercased

In [2]:
# read in in-domain text, POS-tags
# Context managers make sure both file handles are closed after loading.
# NOTE(review): pickle.load can execute arbitrary code -- only load files
# from a trusted source.
with open('../00_data/snips/train_sents.pkl', 'rb') as sents_file:
    alltoks = pickle.load(sents_file)
with open('../00_data/snips/train_tags.pkl', 'rb') as tags_file:
    alltags = pickle.load(tags_file)

In [3]:
# One lowercased, space-joined string per in-domain sentence; sequences
# with at most one token are dropped.
string_toks = [
    ' '.join(words).lower()
    for words in ([str(tok) for tok in sent] for sent in alltoks)
    if len(words) > 1
]

In [4]:
# Space-join the tag sequence of every sentence that has more than one tag.
# Unlike the token strings, the tags keep their original casing.
string_tags = []
for tag_seq in alltags:
    tag_strs = list(map(str, tag_seq))
    if len(tag_strs) <= 1:
        continue
    string_tags.append(' '.join(tag_strs))

In [5]:
# number of in-domain sentences kept (sequences with more than one token)
len(string_toks)

13784

## supplement with generic text - lowercased

In [6]:
from nltk.corpus import brown

In [7]:
# Deletion table for the punctuation characters stripped from the Brown text.
# Characters are removed wherever they occur, including inside tokens.
_punct_table = str.maketrans('', '', ',!?.:')

# Join each Brown sentence into one lowercased string and strip the
# punctuation in a single pass.
brownsents = [
    ' '.join(words).lower().translate(_punct_table)
    for words in brown.sents(categories=['news', 'editorial', 'reviews', 'government'])
]
len(brownsents)

12403

In [8]:
# Combined training corpus: generic Brown sentences first, then the in-domain ones.
sentence_text = [*brownsents, *string_toks]

In [9]:
from collections import Counter

# Whitespace-tokenise every sentence of the combined corpus.
sent_lists = list(map(str.split, sentence_text))

# Flatten into a single token stream, keeping corpus order.
vocab = []
for tokens in sent_lists:
    vocab.extend(tokens)

# Token frequencies and the set of distinct tokens.
vcount = Counter(vocab)
vset = set(vcount)

# Vocabulary size and the ten most frequent tokens.
(len(vset), vcount.most_common(10))

(29894,
 [('the', 24754),
  ('of', 11817),
  ('to', 9711),
  ('a', 9651),
  ('in', 8748),
  ('and', 7596),
  ('for', 5714),
  ('is', 4286),
  ('at', 3407),
  ('on', 2852)])

## make embeddings

In [10]:
# word embeddings trained on the combined sentence corpus

# Write the corpus to disk, one sentence per line, so create_embeddings
# can be pointed at the file.
corpus_path = '../00_data/embeddings/snips_text.txt'
with open(corpus_path, 'w') as corpus_file:
    corpus_file.writelines(sent + '\n' for sent in sentence_text)

# NOTE(review): the keyword arguments look like gensim Word2Vec settings
# forwarded by create_embeddings -- confirm against embedding.py.
# If so, min_count=1 keeps every token, including one-off typos.
w2v_vocab, w2v_model = create_embeddings(
    corpus_path,
    embeddings_path='../00_data/embeddings/snips_embeddings.gensimmodel',
    vocab_path='../00_data/embeddings/snips_mapping.json',
    min_count=1,
    workers=4,
    size=300,
    iter=10,
)

## testing

In [11]:
# Sanity check: nearest neighbours of an in-domain word.
# Query through `.wv`: the warning recorded with this cell's output appears to
# be gensim's deprecation notice for the model-level `most_similar`
# (assumes w2v_model is a gensim Word2Vec model -- TODO confirm).
w2v_model.wv.most_similar('play')

  """Entry point for launching an IPython kernel.


[('music', 0.8984863758087158),
 ('plkay', 0.8885617852210999),
 ('bundrick', 0.881790041923523),
 ('song', 0.8685442209243774),
 ('aitken', 0.8653161525726318),
 ('laurel', 0.8648514151573181),
 ('jpop', 0.8640094995498657),
 ('formalities', 0.8636695146560669),
 ('siouxsie', 0.8613051176071167),
 ('boyett', 0.857239842414856)]

In [12]:
# Sanity check: nearest neighbours of a second in-domain word.
# Query through `.wv`: the warning recorded with this cell's output appears to
# be gensim's deprecation notice for the model-level `most_similar`
# (assumes w2v_model is a gensim Word2Vec model -- TODO confirm).
w2v_model.wv.most_similar('movie')

  """Entry point for launching an IPython kernel.


[('cineexport', 0.9186242818832397),
 ('schedule', 0.902472198009491),
 ('maltio', 0.8923336267471313),
 ('castolon', 0.8736733198165894),
 ('nearest', 0.8685910701751709),
 ('closest', 0.8668645620346069),
 ('schedules', 0.8630369901657104),
 ('greensburg', 0.858954906463623),
 ('looking', 0.8565113544464111),
 ('kenedy', 0.8554991483688354)]