# Embedding - spaCy

In [1]:
!pip install spacy



In [2]:
import spacy
from spacy.lang.es.stop_words import STOP_WORDS

! python -m spacy download es_core_news_md

nlp = spacy.load('es_core_news_md')

✔ Download and installation successful
You can now load the model via spacy.load('es_core_news_md')


## Text Cleaning? No thanks

In [3]:
# Sanity check: a document vector should be the average of the vectors of its
# words, so the component sums of both must agree.
doc2vec = nlp('hola me llamo Pedro').vector.sum()
word2vec = sum(nlp(word).vector for word in ('hola', 'me', 'llamo', 'Pedro')).sum() / 4

print(doc2vec, word2vec, sep='\n')

3.4278557
3.4278557300567627


In [4]:
# Similarity between the two raw (uncleaned) sentences.
(
    nlp('quiero la muerte de todos los humanos')
    .similarity(nlp('quiero matar'))
)

0.7411545992521026

## Text Cleaning? Yes please

In [5]:
import re

def spacy_tokenizer(sentence):
    """Return the cleaned, lower-cased lemmas of ``sentence``.

    The text is run through the module-level ``nlp`` pipeline. Each token's
    lemma is lower-cased and stripped of surrounding whitespace, and is kept
    only when it is not in ``STOP_WORDS`` and consists solely of letters.

    Accented letters count as letters, so Spanish words such as "canción" or
    "niño" are kept; an ASCII-only pattern like ``^[a-zA-Z]+$`` would discard
    them.

    Args:
        sentence: Text to tokenize.

    Returns:
        list[str]: The kept lemmas, in the order their tokens appear.
    """
    lemmas = (token.lemma_.lower().strip() for token in nlp(sentence))

    # str.isalpha() is Unicode-aware and False for the empty string, so
    # á/é/í/ó/ú/ü/ñ pass while digits, punctuation and mixed tokens are
    # still rejected.
    return [
        lemma
        for lemma in lemmas
        if lemma not in STOP_WORDS and lemma.isalpha()
    ]

In [6]:
# Clean both sentences with the same tokenizer before comparing them.
tokens_1, tokens_2 = [
    spacy_tokenizer(text)
    for text in ('quiero la muerte de todos los humanos', 'quiero matar')
]

In [7]:
# Similarity between the two sentences rebuilt from their cleaned lemmas.
(
    nlp(' '.join(tokens_1))
    .similarity(nlp(' '.join(tokens_2)))
)

0.8602272999051866