# Word embeddings

### Resources

- Word2vec (Mikolov et al., 2013): https://code.google.com/archive/p/word2vec/
- fastText: http://www.fasttext.cc/ (+sub-word information, +multilingual)
- GloVe (Pennington, Socher, Manning, 2014): http://nlp.stanford.edu/projects/glove/

- Multilingual embeddings trained on Wikipedia: https://github.com/facebookresearch/MUSE



Gensim documentation: https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec

Embeddings visualizer: https://projector.tensorflow.org/


In [None]:
import gensim


In [None]:
# Print the name of every pretrained model that gensim-data offers
import gensim.downloader
available_models = gensim.downloader.info()['models']
print(list(available_models))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [None]:
# Fetch the pretrained vectors by name (one of the model names printed above)
embeddings = gensim.downloader.load('fasttext-wiki-news-subwords-300') # Note: maybe try ones with fewer dimensions if you want them to be downloaded more quickly



In [None]:
# Vocabulary size = number of rows in the embedding matrix (~1mil words).
# `.vectors` is used rather than `.index2word` because the latter was removed in
# gensim 4 (renamed to `index_to_key`), whereas `.vectors` exists in both 3.x and 4.x.
print(len(embeddings.vectors))

999999


In [None]:
# Look up the raw embedding of a single word (displayed as a NumPy array)
embeddings.get_vector('cat')

array([ 4.7426e-02, -4.2203e-02,  2.8491e-02, -4.4481e-02, -2.6467e-02,
        3.3557e-02,  1.7173e-01, -1.4773e-01,  7.1133e-02,  3.8385e-02,
       -7.9921e-02, -1.3236e-03,  1.6185e-01, -1.6389e-01, -6.5456e-02,
        2.7030e-02,  1.2193e-01, -7.1632e-02,  7.9642e-02, -1.4602e-01,
        1.0369e-02, -1.0468e-01,  3.6734e-02,  7.9116e-02,  1.8241e-01,
       -2.4902e-03,  1.3818e-02,  1.2378e-01,  1.7348e-04,  1.6646e-02,
       -1.3513e-02, -7.5532e-02,  1.0060e-01,  7.1226e-02, -2.7468e-02,
       -7.8517e-02,  3.3769e-02, -1.6082e-01, -2.2747e-02, -1.3238e-01,
        1.1198e-02, -1.2838e-01, -5.8910e-02, -5.1399e-02,  3.9177e-02,
        5.8243e-02,  2.2071e-02,  7.1876e-02,  2.8166e-02, -1.8918e-02,
        3.4445e-02,  1.5664e-01,  6.9836e-02, -2.0858e-01,  2.4413e-03,
       -8.9110e-02, -6.0705e-02,  6.5106e-02, -8.3036e-02,  1.7491e-02,
       -3.4527e-02, -2.5026e-01,  1.8777e-01, -3.6235e-02,  1.2452e-01,
        2.4356e-02, -4.4517e-03,  5.5572e-02, -8.6161e-03, -1.61

In [None]:
# Nearest neighbours of 'cat' in the embedding space, as (word, similarity) pairs
embeddings.most_similar('cat')

[('cats', 0.8368596434593201),
 ('housecat', 0.7674711346626282),
 ('-cat', 0.7602992057800293),
 ('dog', 0.7502298355102539),
 ('kitten', 0.7480818033218384),
 ('feline', 0.7353992462158203),
 ('super-cat', 0.7305205464363098),
 ('supercat', 0.7163283824920654),
 ('pet', 0.7090284824371338),
 ('moggy', 0.7057286500930786)]

In [None]:
# Compute the cosine similarity by hand with scikit-learn.
# cosine_similarity works on 2-D inputs, so each vector becomes a single-row matrix.
from sklearn.metrics.pairwise import cosine_similarity
cat_row = embeddings.get_vector('cat')[None, :]
kitten_row = embeddings.get_vector('kitten')[None, :]
cosine_similarity(cat_row, kitten_row)[0, 0]

0.7480817

In [None]:
# Built-in shortcut for the pairwise similarity of two words (compare with the sklearn result)
embeddings.similarity('cat', 'kitten')

0.74808174

#### Synonymy vs relatedness

In [None]:
# Two kinship terms: related, but clearly not synonyms
embeddings.similarity('uncle', 'sister')

0.7174434

In [None]:
# Hypernym / hyponym pair: a sister is a kind of sibling
embeddings.similarity('sibling', 'sister')

0.76012087

In [None]:
# Antonyms: opposite meanings, yet the similarity score is high (relatedness, not synonymy)
embeddings.similarity('good', 'bad')

0.8503089

In [None]:
# Members of the same category (days of the week) that are not interchangeable
embeddings.similarity('Monday', 'Sunday')

0.85126

### Normalization & correcting typos

In [None]:
# Query a misspelled word: its neighbours are mostly other misspellings
embeddings.most_similar("defenitly")

[('defenitely', 0.8840630650520325),
 ('defenetly', 0.845188558101654),
 ('defently', 0.8433688282966614),
 ('definitly', 0.770545482635498),
 ('definetly', 0.7526832222938538),
 ('politly', 0.7457025051116943),
 ('defnitely', 0.7323918342590332),
 ('probebly', 0.7254762649536133),
 ('definatly', 0.7249115109443665),
 ('usuallly', 0.7225544452667236)]

In [None]:
# Query an informal, elongated spelling: neighbours are similar chat-style variants
embeddings.most_similar("hiii")

[('hiiii', 0.9428294897079468),
 ('Hiii', 0.8571420311927795),
 ('Hiiii', 0.8538976311683655),
 ('hii', 0.7178279161453247),
 ('iiii', 0.7104865908622742),
 ('plzzz', 0.6919779777526855),
 ('plzzzz', 0.685299813747406),
 ('hhh', 0.6836094260215759),
 ('wazzup', 0.6829248070716858),
 ('plzz', 0.6775454878807068)]

#### Word analogies

In [None]:
# king - man + woman
# `positive` words pull the result towards them, `negative` words push it away;
# the *_cosmul variant combines them multiplicatively rather than by plain vector arithmetic.
embeddings.most_similar_cosmul(positive=['king', 'woman'], negative=['man'])

[('queen', 0.9390855431556702),
 ('queen-mother', 0.9078598618507385),
 ('king-', 0.8828967213630676),
 ('queen-consort', 0.882541835308075),
 ('child-king', 0.8680858016014099),
 ('monarch', 0.8670082688331604),
 ('ex-queen', 0.8654636740684509),
 ('princess', 0.8628991842269897),
 ('queen-', 0.8613532781600952),
 ('boy-king', 0.8604660630226135)]

In [None]:
# Rome - Italy + France
# i.e. "Rome is to Italy as ? is to France"
embeddings.most_similar_cosmul(positive=['Rome', 'France'], negative=['Italy'])

[('Paris', 0.9433913826942444),
 ('Meaux', 0.8968006372451782),
 ('Avignon', 0.8785353302955627),
 ('Saint-Denis', 0.8744451403617859),
 ('Rouen', 0.8729321956634521),
 ('Lyon', 0.8724958300590515),
 ('Louville', 0.8714753985404968),
 ('Toulouse', 0.8713098168373108),
 ('Beauvais', 0.8704253435134888),
 ('Blois', 0.8698782920837402)]

In [None]:
# programmer - man + woman
# Same analogy pattern applied to an occupation, to see what the vectors associate with gender
embeddings.most_similar_cosmul(positive=['programmer', 'woman'], negative=['man'])

[('non-programmer', 0.864444375038147),
 ('programmers', 0.857123851776123),
 ('Programmer', 0.8313319683074951),
 ('non-programmers', 0.8269447088241577),
 ('writer', 0.8260436058044434),
 ('coder', 0.8254762887954712),
 ('programer', 0.8239933848381042),
 ('nonprogrammers', 0.8238459229469299),
 ('web-designer', 0.8198288679122925),
 ('researcher', 0.8139449954032898)]

## Training embeddings

In [None]:
# Construct a Word2Vec model with every hyperparameter spelled out, to show what can be tuned.
# No corpus is passed (sentences=None, corpus_file=None) — NOTE(review): presumably the model
# stays untrained until a corpus is supplied; confirm against the gensim docs.
# NOTE: `size` is the gensim 3.x keyword; gensim 4 renamed it to `vector_size`.
my_embeddings = gensim.models.Word2Vec(sentences=None, corpus_file=None,
                                size=100, alpha=0.025, window=5, min_count=5,
                                max_vocab_size=None, sample=0.001, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0,
                                negative=5, ns_exponent=0.75, cbow_mean=1, null_word=0, trim_rule=None,
                                sorted_vocab=1, batch_words=10000, compute_loss=False, callbacks=(),
                                max_final_vocab=None)


<gensim.models.word2vec.Word2Vec at 0x7f1c0dc88fd0>

# Texts as embedding sequences

Moving on from one-hot encodings and frequency vectors.

In [None]:
# Toy corpus: four short "document" sentences plus one tweet-style string
# containing a mention, a hashtag and an emoticon.
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
    "@user This one is a tweet #meta ;)",
]

In [None]:
# First tokenize the text... see previous notebook!
# Note: do we still need lemmatization if we use embeddings? Debatable.
import nltk
# word_tokenize needs the Punkt tokenizer data. Older NLTK releases read the
# 'punkt' package, recent ones (3.9+) look for 'punkt_tab' instead, so fetch both.
nltk.download('punkt')
nltk.download('punkt_tab')
# One list of tokens per sentence of the corpus
corpus_tokenized = [nltk.word_tokenize(s) for s in corpus]
print(corpus_tokenized)

In [None]:
# Turn every sentence into a sequence of embedding vectors (one per token).
corpus_encoded = []
skipped_tokens = []  # tokens that have no pretrained vector
for sent in corpus_tokenized:
  sentence_encoded = []
  for word in sent:
    # Indexing the pretrained vectors with an unknown word raises KeyError,
    # so out-of-vocabulary tokens are skipped (and reported) instead of crashing.
    if word in embeddings:
      sentence_encoded.append(embeddings[word])
    else:
      skipped_tokens.append(word)
  corpus_encoded.append(sentence_encoded)
if skipped_tokens:
  print("Skipped out-of-vocabulary tokens:", skipped_tokens)
print(corpus_encoded) # Ready for machine learning!

## False friends

In [None]:
!wget https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.en.vec # English embeddings, prealigned
!wget https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.es.vec # Spanish embeddings, prealigned

--2023-03-19 19:52:56--  https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.en.vec
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.74.142, 104.22.75.142, 172.67.9.4, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.74.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 628614720 (599M) [text/plain]
Saving to: ‘wiki.multi.en.vec’


2023-03-19 19:53:06 (61.1 MB/s) - ‘wiki.multi.en.vec’ saved [628614720/628614720]

--2023-03-19 19:53:06--  https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.es.vec
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.74.142, 104.22.75.142, 172.67.9.4, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.74.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 628826336 (600M) [text/plain]
Saving to: ‘wiki.multi.es.vec’


2023-03-19 19:53:16 (62.6 MB/s) - ‘wiki.multi.es.vec’ saved [628826336/628826336]



In [None]:
# Load the two downloaded .vec files (word2vec text format) as separate
# English and Spanish keyed-vector models.
from gensim.models import KeyedVectors
en_model = KeyedVectors.load_word2vec_format('wiki.multi.en.vec')  # Watch out for your RAM...
es_model = KeyedVectors.load_word2vec_format('wiki.multi.es.vec')


In [None]:
# Look up the two English words and the Spanish false friend.
# Mind the spelling of "embarrassed" (double r, double s): the common misspelling
# "embarassed" also has a vector, so a typo here fails silently.
embarrassed_vec = en_model.get_vector("embarrassed")
pregnant_vec = en_model.get_vector("pregnant")
embarazada_vec = es_model.get_vector("embarazada")

In [None]:
# Pairwise cosine similarities between the three vectors, printed as a labelled table.
from sklearn.metrics.pairwise import cosine_similarity
words = ("embarrassed", "pregnant", "embarazada")
similarity_matrix = cosine_similarity([embarrassed_vec, pregnant_vec, embarazada_vec])

# Header row first, then one row per word with its similarity to each column word
print("%27s %15s %15s" % words)
for label, scores in zip(words, similarity_matrix):
  print("%15s %.13f %.13f %.13f" % (label, *scores))

                embarrassed        pregnant      embarazada
    embarrassed 0.9999999403954 0.2995810210705 0.3134102821350
       pregnant 0.2995810210705 1.0000000000000 0.8302524089813
     embarazada 0.3134102821350 0.8302524089813 1.0000000000000


In [None]:
# Query the Spanish model with an English vector; this is meaningful only because
# the two embedding files were downloaded pre-aligned.
es_model.similar_by_vector(embarrassed_vec)

[('parecerme', 0.6807215213775635),
 ('insultarme', 0.6778432726860046),
 ('sinceramente', 0.675748884677887),
 ('discúlpame', 0.6600538492202759),
 ('equivocarme', 0.6517462730407715),
 ('digo', 0.6503289937973022),
 ('diciéndome', 0.6467952132225037),
 ('disculpo', 0.6451693773269653),
 ('disculparme', 0.6407719850540161),
 ('ridículo', 0.6402308940887451)]

# Contextual embeddings and language models... to be continued :)

**Extra links**:

Using contextual embeddings for detecting pejorative words on Twitter: https://aclanthology.org/2021.findings-emnlp.296.pdf


> "This workshop is *trash*!"




A lexicon of pejorative words (including Romanian): https://nlp.unibuc.ro/resources.html#pejor