# create custom word2vec embeddings

The training text is general-domain data from the NLTK Brown corpus.

In [1]:
import numpy as np
import re
from nltk.corpus import brown
from mltools.preprocessing import Tokenizer
from mltools.embeddings import create_embeddings

Using TensorFlow backend.


## Read tokenized sentences from the Brown corpus and lowercase them

In [2]:
# Sentences (as token lists) from four Brown corpus categories.
browndata = brown.sents(
    categories=['news', 'editorial', 'reviews', 'government'],
)

# Project tokenizer configured with lower=True / regex=True; the sample output
# suggests it lowercases and drops punctuation — TODO confirm against mltools.
tokenizer = Tokenizer(lower=True, regex=True)
browntoks = tokenizer.fit_transform(browndata)

# One whitespace-joined string per sentence, used later as the training text.
brownsents = list(map(' '.join, browntoks))

# Number of sentences (displayed as the cell result).
len(brownsents)

12403

In [3]:
# Sanity check: show the first five tokenized sentences.
print(browntoks[:5])

[['the', 'fulton', 'county', 'grand', 'jury', 'said', 'friday', 'an', 'investigation', 'of', 'atlantas', 'recent', 'primary', 'election', 'produced', 'no', 'evidence', 'that', 'any', 'irregularities', 'took', 'place'], ['the', 'jury', 'further', 'said', 'in', 'termend', 'presentments', 'that', 'the', 'city', 'executive', 'committee', 'which', 'had', 'overall', 'charge', 'of', 'the', 'election', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'city', 'of', 'atlanta', 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted'], ['the', 'septemberoctober', 'term', 'jury', 'had', 'been', 'charged', 'by', 'fulton', 'superior', 'court', 'judge', 'durwood', 'pye', 'to', 'investigate', 'reports', 'of', 'possible', 'irregularities', 'in', 'the', 'hardfought', 'primary', 'which', 'was', 'won', 'by', 'mayornominate', 'ivan', 'allen', 'jr'], ['only', 'a', 'relative', 'handful', 'of', 'such', 'reports', 'was', 'received', 'the', 'jury', 'said', 'considering', 'the', 'wi

## make embeddings

In [4]:
# Write the corpus to disk, one space-joined sentence per line, so that
# create_embeddings can read it from a file path.
with open('sent_text.txt', 'w') as f:
    for s in brownsents:
        f.write(s + '\n')

# Build the embeddings from the sentence file. The keyword names (min_count,
# workers, size, iter) look like gensim Word2Vec options — presumably they are
# forwarded to it; verify against mltools.embeddings.create_embeddings.
w2v_vocab, w2v_model = create_embeddings(
    'sent_text.txt',
    embeddings_path='text_embeddings.gensimmodel',
    vocab_path='text_mapping.json',
    min_count=2,
    workers=2,
    size=200,
    iter=100,
)

## testing

In [9]:
# Spot check: nearest neighbours of 'committee' in the trained embedding space.
# NOTE(review): if w2v_model is a gensim Word2Vec model, recent gensim versions
# expose this query as w2v_model.wv.most_similar — confirm the installed version.
w2v_model.most_similar('committee')

[('republican', 0.47915521264076233),
 ('committees', 0.45635107159614563),
 ('board', 0.4550314247608185),
 ('votes', 0.41035938262939453),
 ('caucus', 0.40127965807914734),
 ('senate', 0.3961509168148041),
 ('council', 0.3793870806694031),
 ('democraticendorsed', 0.3771839737892151),
 ('liberal', 0.3702690303325653),
 ('representatives', 0.3634282052516937)]

In [6]:
# Spot check: nearest neighbours of 'october' in the trained embedding space.
# NOTE(review): if w2v_model is a gensim Word2Vec model, recent gensim versions
# expose this query as w2v_model.wv.most_similar — confirm the installed version.
w2v_model.most_similar('october')

[('september', 0.6374590992927551),
 ('june', 0.5300639867782593),
 ('december', 0.526236355304718),
 ('1960', 0.5068605542182922),
 ('february', 0.503847599029541),
 ('november', 0.4959608316421509),
 ('january', 0.4883143901824951),
 ('1959', 0.48575159907341003),
 ('1951', 0.47862890362739563),
 ('1952', 0.47386273741722107)]