# create custom word2vec embeddings

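The word embeddings are trained on a mix of in-domain data (the merged corpus sentences) and general-domain data from the Brown corpus; the POS-tag embeddings are trained on the in-domain POS-tag sequences only.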

In [1]:
import pandas as pd
import numpy as np
from dataset import get_vocab, index_sents
from embedding import create_embeddings

Using TensorFlow backend.


## in-domain text - lowercased

In [2]:
# Load the encoded in-domain corpus from disk: the token sequences and the
# matching POS-tag sequences, each unpacked into a plain Python list.
alltoks = [*np.load('../00_data/encoded/add_tokens.npy')]
alltags = [*np.load('../00_data/encoded/add_postags.npy')]

In [3]:
# One lowercased, space-joined string per token sequence.
# Every element is stringified first; sequences with fewer than two
# elements are left out.
string_toks = [
    ' '.join(words).lower()
    for words in ([str(tok) for tok in toks] for toks in alltoks)
    if len(words) > 1
]

In [4]:
# One space-joined string per POS-tag sequence (case is kept as-is).
# Every element is stringified first; sequences with fewer than two
# elements are left out, mirroring the filter used for the tokens.
string_tags = [
    ' '.join(labels)
    for labels in ([str(tag) for tag in tags] for tags in alltags)
    if len(labels) > 1
]

In [5]:
# number of in-domain sentences kept (sequences with fewer than two tokens were skipped)
len(string_toks)

15513

## supplement with generic text - lowercased

In [6]:
from nltk.corpus import brown

In [7]:
# Brown corpus sentences from four categories, each re-joined with single
# spaces and lowercased so they match the in-domain text.
brownsents = [
    ' '.join(words).lower()
    for words in brown.sents(categories=['news', 'editorial', 'reviews', 'government'])
]

# number of general-domain sentences
len(brownsents)

12403

## replace every digit with the generic number token `#`

In [8]:
# Translation table that maps each ASCII digit to '#', so numbers collapse
# into shape tokens such as '####' or '#:##'.
digit_mask = str.maketrans('0123456789', '#' * 10)

# Brown sentences come first, followed by the in-domain sentences.
sentence_text = [sent.translate(digit_mask) for sent in brownsents + string_toks]

In [9]:
from collections import Counter

# Split every sentence on whitespace, then flatten into a single token stream.
sent_lists = [sentence.split() for sentence in sentence_text]
vocab = [token for tokens in sent_lists for token in tokens]

# token frequencies and the set of distinct tokens
vcount = Counter(vocab)
vset = set(vocab)

# vocabulary size and the ten most frequent tokens
len(vset), vcount.most_common(10)

(21983,
 [('the', 23000),
  (',', 13677),
  ('to', 11078),
  ('of', 10966),
  ('.', 10553),
  ('and', 9721),
  ('a', 7573),
  ('in', 6528),
  ('that', 4928),
  ('#', 4795)])

## make embeddings

In [10]:
# word embeddings trained on the combined sentence corpus

sent_text_path = '../00_data/embeddings/sent_text.txt'

# Dump the corpus to disk, one sentence per line, as input for training.
with open(sent_text_path, 'w') as corpus_file:
    corpus_file.writelines(line + '\n' for line in sentence_text)

# NOTE(review): the keyword arguments are presumably forwarded to the
# underlying word2vec trainer -- confirm in embedding.create_embeddings.
w2v_vocab, w2v_model = create_embeddings(
    sent_text_path,
    embeddings_path='../00_data/embeddings/text_embeddings.gensimmodel',
    vocab_path='../00_data/embeddings/text_mapping.json',
    min_count=1,
    workers=4,
    size=160,
    iter=100,
)

In [11]:
# POS-tag embeddings trained on the in-domain tag sequences

postag_text_path = '../00_data/embeddings/postag_text.txt'

# Dump the tag sequences to disk, one sequence per line, as input for training.
with open(postag_text_path, 'w') as tag_file:
    tag_file.writelines(line + '\n' for line in string_tags)

# NOTE(review): the keyword arguments are presumably forwarded to the
# underlying word2vec trainer -- confirm in embedding.create_embeddings.
w2v_pvocab, w2v_pmodel = create_embeddings(
    postag_text_path,
    embeddings_path='../00_data/embeddings/postag_embeddings.gensimmodel',
    vocab_path='../00_data/embeddings/postag_mapping.json',
    min_count=3,
    workers=4,
    size=32,
    iter=20,
)

## testing

In [12]:
# sanity check: entries the text model ranks as most similar to a city name
w2v_model.most_similar('london')

[('mumbai', 0.546605110168457),
 ('nuneaton', 0.5448371171951294),
 ('manila', 0.5345560312271118),
 ('seoul', 0.5336037874221802),
 ('budapest', 0.5319229960441589),
 ('cairo', 0.5309597253799438),
 ('madrid', 0.5206683874130249),
 ('minsk', 0.5083528161048889),
 ('daegu', 0.49895063042640686),
 ('stockport', 0.49597442150115967)]

In [13]:
# sanity check: entries the text model ranks as most similar to a month name
w2v_model.most_similar('october')

[('february', 0.7360303997993469),
 ('november', 0.7329068183898926),
 ('september', 0.7270194888114929),
 ('january', 0.7232068777084351),
 ('april', 0.7010664343833923),
 ('december', 0.6981652975082397),
 ('august', 0.6908720135688782),
 ('march', 0.6824270486831665),
 ('june', 0.6813216805458069),
 ('july', 0.6104220151901245)]

In [14]:
# sanity check on a digit-masked token: '#:##' is what a string like '9:30' becomes after digits are replaced by '#'
w2v_model.most_similar('#:##')

[('##:##', 0.6680556535720825),
 ('madrid', 0.5936460494995117),
 ('flight', 0.5899808406829834),
 ('nagoya', 0.5555135011672974),
 ('osaka', 0.5316229462623596),
 ('mumbai', 0.5308681130409241),
 ('tehran', 0.5255060791969299),
 ('daegu', 0.515431821346283),
 ('manila', 0.5147073864936829),
 ('busan', 0.5084124803543091)]

In [15]:
# sanity check on a digit-masked token: '####' is what any four-digit number becomes after digits are replaced by '#'
w2v_model.most_similar('####')

[("####'s", 0.37837108969688416),
 ('ending', 0.36077120900154114),
 ('year', 0.34104183316230774),
 ('sales', 0.3399466276168823),
 ('y.', 0.33605659008026123),
 ('months', 0.33141085505485535),
 ('rose', 0.32533833384513855),
 ('#%', 0.32455965876579285),
 ('bills', 0.31682369112968445),
 ('filed', 0.3165268898010254)]

In [16]:
# sanity check: tags the POS-tag model ranks as most similar to 'NN'
w2v_pmodel.most_similar('NN')

[('FW', 0.5746904611587524),
 ('JJ', 0.497397780418396),
 ('UH', 0.49557361006736755),
 ('WP$', 0.46631431579589844),
 ('PRP', 0.463824987411499),
 ('WRB', 0.4284180700778961),
 ('NNS', 0.41058939695358276),
 ('VB', 0.3847822844982147),
 ('CC', 0.3781307339668274),
 ('RB', 0.35533303022384644)]