### División de texto

In [None]:
import spacy
nlp = spacy.load("es_core_news_md")

In [None]:
doc = nlp("La gata de Juan es blanca.")

División en *tokens*

In [None]:
[t for t in doc]

In [None]:
[t.text for t in doc]

In [None]:
[t.lower_ for t in doc]

División en frases

In [None]:
doc = nlp("la vaca come hierba. El perro come longanizas.")

In [None]:
[s for s in doc.sents]

In [None]:
[s.text for s in doc.sents]

### Limpieza de acentos

In [None]:
import unicodedata

data = 'Sómě Áccěntěd tëxt'
# NFKD decomposition separates each accented letter into its base letter plus
# a combining mark; encoding to ASCII with errors='ignore' then drops the marks.
# The result is a bytes object, so print() shows it with a b'...' prefix.
normal = (
    unicodedata.normalize('NFKD', data)
    .encode(encoding='ASCII', errors='ignore')
)
print(normal)

In [None]:
def remove_accents(text):
    """Return ``text`` with accents stripped, keeping only ASCII characters.

    The string is decomposed with Unicode NFKD so that an accented letter
    becomes its base letter followed by a combining mark. Every code point
    that is still non-ASCII after that step (the marks, and any character
    without an ASCII decomposition) is dropped.

    Args:
        text: The string to clean.

    Returns:
        A ``str`` containing only the ASCII part of the decomposed input.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    # Back to str: the previous step returns bytes
    return ascii_bytes.decode('utf-8', 'ignore')

In [None]:
remove_accents(data)

In [None]:
from gensim.utils import deaccent
#https://radimrehurek.com/gensim/utils.html#gensim.utils.deaccent

In [None]:
deaccent(data)

In [None]:
help(deaccent)

### Limpieza de caracteres especiales

In [None]:
import re, string

def remove_special_characters(text):
    """Delete every ASCII punctuation character from ``text``.

    The characters removed are exactly those listed in
    ``string.punctuation``; letters, digits and whitespace are left as they
    are, so removing a symbol that stood between two spaces leaves both
    spaces in place.

    Args:
        text: The string to clean.

    Returns:
        ``text`` without punctuation characters.
    """
    # re.escape keeps characters such as ']' or '^' literal inside the class
    punctuation_class = '[' + re.escape(string.punctuation) + ']'
    return re.compile(punctuation_class).sub('', text)
 
remove_special_characters("007 Not sure@ if this % was #fun! 558923 What do# you think** of it.? $500USD!")

In [None]:
string.punctuation

In [None]:
text = "Mr. #Potato! is cool."

In [None]:
# Load the English pipeline and tokenize `text` with it
nlp_en = spacy.load("en_core_web_md")
doc = nlp_en(text)
# Iterating a Doc yields its tokens, so list() shows them all
list(doc)

In [None]:
[(t, t.is_punct) for t in doc]

In [None]:
remove_special_characters(text)

In [None]:
[t for t in doc if not t.is_punct]

### Expandir contracciones
Hay que instalar la librería [`contractions`](https://github.com/kootenpv/contractions) con `pip install contractions`.

In [None]:
import contractions
contractions.fix("you're happy now, aren't you?")

In [None]:
# Tokenize the contracted sentence with the English pipeline and list its tokens
doc = nlp_en("you're happy now, aren't you?")
list(doc)

### Stop-words

In [None]:
from spacy.lang.en.stop_words import STOP_WORDS

In [None]:
len(STOP_WORDS)

In [None]:
print(STOP_WORDS)

In [None]:
print(len(nlp.Defaults.stop_words))
print(nlp.Defaults.stop_words)

In [None]:
# Pair every token of the sentence with its stop-word flag
doc = nlp("La gata de Juan es blanca.")
[(token, token.is_stop) for token in doc]

In [None]:
# Words can be added to or removed from the pipeline's default stop-word set.

# add
nlp.Defaults.stop_words.add("my_new_stopword")
nlp.Defaults.stop_words |= {"my_new_stopword1", "my_new_stopword2"}

# remove
# discard() instead of remove(): remove() raises KeyError when the word is
# already absent, which made this cell fail the second time it was executed.
nlp.Defaults.stop_words.discard("tuya")
nlp.Defaults.stop_words -= {"tuya", "mia"}

In [None]:
"tuya" in nlp.Defaults.stop_words

In [None]:
from gensim.parsing.preprocessing import remove_stopwords
import gensim

# gensim's built-in stop-word collection
gensim_stopwords = gensim.parsing.preprocessing.STOPWORDS
text = (
    "The first time I saw Catherine she was wearing a vivid crimson dress and was nervously "
    "leafing through a magazine in my waiting room."
)
print(f"Original Text : {text}")
# The text is lowercased before filtering
# NOTE(review): presumably because the stop-word match is case-sensitive — confirm
print(f"Text without stopwords : {remove_stopwords(text.lower())}")
print(f"Total count of stopwords in Gensim is {len(gensim_stopwords)}")

In [None]:
len(gensim_stopwords)

In [None]:
print(gensim_stopwords)

In [None]:
'not' in gensim_stopwords

### Corrección ortográfica
Librería `pyspellchecker` (se importa como `spellchecker`). La instalamos con
`pip install pyspellchecker`

In [None]:
from spellchecker import SpellChecker

spell = SpellChecker(language='es')  # Spanish dictionary
# Read the count through the public `unique_words` property rather than the
# private `_unique_words` attribute, which is an implementation detail.
print(f"Hay {spell.word_frequency.unique_words} palabras en el diccionario")

In [None]:
spell.correction('mañnaa')

In [None]:
spell.candidates('mañnaa')

In [None]:
#si una palabra está en el diccionario devuelve su frecuencia relativa:
spell['mañana']  #equivale a spell.word_frequency['mañana']

In [None]:
spell['mañna']

In [None]:
spell.correction('mañana')

In [None]:
spell.candidates("adiós")

In [None]:
spell["adios"]

In [None]:
spell["adiós"]

### Lematizado

In [None]:
# Lemma of every token in the sentence
doc = nlp("los gatos son blancos")
[token.lemma_ for token in doc]

In [None]:
# Lemma together with the part-of-speech tag of every token
doc = nlp("La salida se ha bloqueado. La salida está bloqueada.")
[(token.lemma_, token.pos_) for token in doc]

### Funciones de normalización

In [None]:
texto = "@Graffitera23 qué hermoso!,es bueno desviar la mirada al cielo y a las nubes de vez en cuando,abajo está jodido.Preciosa foto,mil abrazos "

In [None]:
# en spacy
import re
import spacy
nlp=spacy.load('es_core_news_md')
               
def normalize_document(doc):
    """Lowercase ``doc`` and drop stop words, punctuation and whitespace tokens.

    The text is tokenized with the module-level spaCy pipeline ``nlp``.

    Args:
        doc: Raw text to normalize.

    Returns:
        The lowercased tokens that were kept, joined by single spaces.
    """
    # split into tokens
    tokens = nlp(doc)
    # Drop stop words, punctuation and whitespace tokens. The explicit
    # is_space check is what keeps whitespace-only tokens (e.g. runs of
    # spaces or newlines) out of the result; without it they were joined
    # back in as extra spaces.
    filtered_tokens = [
        t.lower_
        for t in tokens
        if not (t.is_stop or t.is_punct or t.is_space)
    ]
    # join the surviving tokens back into one string
    return ' '.join(filtered_tokens)

In [None]:
print(texto)

In [None]:
normalize_document(texto)

Con esta función no se eliminan los signos de puntuación que no forman un token independiente; para eliminarlos debemos usar una expresión regular.

In [None]:
import string

stop_words = ['es', 'y', 'a']

# Character class matching any single ASCII punctuation character
pat = '[{}]'.format(re.escape(string.punctuation))

def normalize_document_remove_punct(doc):
    """Lowercase ``doc``, strip punctuation and drop the custom stop words.

    The text is tokenized with the module-level spaCy pipeline ``nlp``.
    Punctuation that is glued to a word (and therefore is not a token of its
    own) is removed with the ``pat`` regular expression.

    Args:
        doc: Raw text to normalize.

    Returns:
        The remaining lowercase words joined by single spaces.
    """
    # split into tokens
    tokens = nlp(doc)
    # Punctuation inside a token is turned into a space, so one token may
    # yield several words; split() separates them and also discards the
    # empty pieces and stray spaces the substitution leaves behind.
    words = (
        word
        for t in tokens
        if not t.is_punct
        for word in re.sub(pat, ' ', t.lower_).split()
    )
    # Stop words are checked on the lowercased pieces so that capitalised
    # forms and words that were glued to punctuation are filtered as well.
    return ' '.join(word for word in words if word not in stop_words)

In [None]:
normalize_document_remove_punct(texto)

In [None]:
from gensim.utils import simple_preprocess
#https://radimrehurek.com/gensim/utils.html#gensim.utils.simple_preprocess

help(simple_preprocess)

In [None]:
simple_preprocess(texto, deacc=True)

In [None]:
from gensim.utils import tokenize
#https://radimrehurek.com/gensim/utils.html#gensim.utils.tokenize

help(tokenize)

In [None]:
tokenize(texto)

In [None]:
list(tokenize(texto, deacc=True, lowercase=True))

In [None]:
from gensim.parsing.preprocessing import preprocess_string
#https://radimrehurek.com/gensim/parsing/preprocessing.html#gensim.parsing.preprocessing.preprocess_string
help(preprocess_string)

In [None]:
preprocess_string("<i>Hel 9lo</i> <b>Wo9 rld</b>! Th3     weather_is really g00d today, isn't it?")

In [None]:
preprocess_string("Transformer is behind the recent NLP developments, including Google’s BERT")

In [None]:
# NOTE(review): this wildcard import is what brings `stem_text` (used in the
# next cell) into scope; importing the needed names explicitly would make
# that dependency visible.
from gensim.parsing.preprocessing import *

# Run the text through an explicit filter list containing only remove_stopwords
preprocess_string(
    "Transformer is behind the recent NLP developments, including Google’s BERT",
    [remove_stopwords],
)

In [None]:
preprocess_string("Transformer is behind the recent NLP developments, including Google’s BERT", [remove_stopwords, stem_text])