### División de texto

In [1]:
import spacy
import es_core_news_md
nlp = es_core_news_md.load()

In [2]:
doc = nlp("La gata de Juan es blanca.")

División en *tokens*

In [3]:
[t for t in doc]

[La, gata, de, Juan, es, blanca, .]

In [4]:
[t.text for t in doc]

['La', 'gata', 'de', 'Juan', 'es', 'blanca', '.']

In [5]:
[t.lower_ for t in doc]

['la', 'gata', 'de', 'juan', 'es', 'blanca', '.']

División en frases

In [6]:
doc = nlp("la vaca come hierba. El perro come longanizas.")

In [7]:
[s for s in doc.sents]

[la vaca come hierba., El perro come longanizas.]

In [8]:
[s.text for s in doc.sents]

['la vaca come hierba.', 'El perro come longanizas.']

### Limpieza de acentos

In [9]:
import unicodedata

# Sample string whose letters carry several kinds of diacritics.
data = 'Sómě Áccěntěd tëxt'
# NFKD-normalize, then encode to ASCII with errors='ignore'.
# Note that `normal` is a bytes object, not a str.
normal = (
    unicodedata.normalize('NFKD', data)
    .encode('ASCII', 'ignore')
)
print(normal)

b'Some Accented text'


In [10]:
def remove_accents(text):
    """Return ``text`` with accents stripped and non-ASCII characters dropped.

    The string is NFKD-normalized, encoded to ASCII while ignoring anything
    that cannot be encoded, and decoded back into a ``str``.
    """
    ascii_bytes = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

In [11]:
remove_accents(data)

'Some Accented text'

In [12]:
from gensim.utils import deaccent
#https://radimrehurek.com/gensim/utils.html#gensim.utils.deaccent

In [13]:
deaccent(data)

'Some Accented text'

In [14]:
help(deaccent)

Help on function deaccent in module gensim.utils:

deaccent(text)
    Remove letter accents from the given string.
    
    Parameters
    ----------
    text : str
        Input string.
    
    Returns
    -------
    str
        Unicode string without accents.
    
    Examples
    --------
    .. sourcecode:: pycon
    
        >>> from gensim.utils import deaccent
        >>> deaccent("Šéf chomutovských komunistů dostal poštou bílý prášek")
        u'Sef chomutovskych komunistu dostal postou bily prasek'



### Limpieza de caracteres especiales

In [15]:
import re, string

def remove_special_characters(text):
    """Delete every character of ``string.punctuation`` from ``text``.

    Letters, digits and whitespace are left untouched, so removing a
    punctuation mark that sat between two spaces leaves both spaces behind.
    """
    punctuation_class = f'[{re.escape(string.punctuation)}]'
    return re.compile(punctuation_class).sub('', text)
 
remove_special_characters("007 Not sure@ if this % was #fun! 558923 What do# you think** of it.? $500USD!")

'007 Not sure if this  was fun 558923 What do you think of it 500USD'

In [16]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [17]:
text = "Mr. #Potato! is cool."

In [18]:
nlp_en = spacy.load("en_core_web_md")
doc = nlp_en(text)
[t for t in doc]

[Mr., #, Potato, !, is, cool, .]

In [19]:
[(t, t.is_punct) for t in doc]

[(Mr., False),
 (#, True),
 (Potato, False),
 (!, True),
 (is, False),
 (cool, False),
 (., True)]

In [20]:
remove_special_characters(text)

'Mr Potato is cool'

In [21]:
[t for t in doc if not t.is_punct]

[Mr., Potato, is, cool]

### Expandir contracciones
Hay que instalar la librería [contractions](https://github.com/kootenpv/contractions) con `pip install contractions`.

In [22]:
import contractions
contractions.fix("you're happy now, aren't you?")

'you are happy now, are not you?'

In [23]:
doc = nlp_en("you're happy now, aren't you?")
[t for t in doc]

[you, 're, happy, now, ,, are, n't, you, ?]

### Stop-words

In [24]:
from spacy.lang.en.stop_words import STOP_WORDS

In [25]:
len(STOP_WORDS)

326

In [26]:
print(STOP_WORDS)

{'once', 'elsewhere', 'twelve', 'well', '‘s', 'already', 'nowhere', 'thru', 'which', 'while', 'is', 'when', 'myself', 'done', 'becoming', 'does', 'others', '‘re', 'never', 'latter', 'will', "'m", 'afterwards', 'amount', 'ours', 'has', 'against', 'had', 'hence', 'due', 'on', 'you', 'nobody', 'many', 'cannot', 'less', 'fifty', 'some', 'n‘t', 'perhaps', 'yours', 'me', 'during', 'get', 'such', 'former', 'much', 'something', 'twenty', 'still', 'empty', 'among', 'rather', 'should', 'under', 'those', 'keep', '’m', '’s', 'any', 'also', 'in', 'give', 'where', 'herself', '‘ll', 'even', 'without', 'their', 'anyhow', 'one', 'anything', 'i', 'why', 'the', 'all', 'mostly', 'five', 'other', 'may', 'yet', 'so', 'take', 'throughout', 'not', "'d", 'amongst', 'various', 'enough', 'part', 'because', 'several', 'via', 'herein', 'whether', 'into', 'fifteen', 'after', 'than', 'ever', '‘ve', 'this', "'s", 'just', 'meanwhile', 'own', 'upon', 'front', 'make', 'serious', 'whereupon', 'his', 'although', 'whence',

In [27]:
print(len(nlp.Defaults.stop_words))
print(nlp.Defaults.stop_words)

521
{'once', 'ir', 'verdadera', 'era', 'cinco', 'ser', 'como', 'vais', 'siempre', 'estados', 'sido', 'conocer', 'quienes', 'otras', 'cierta', 'días', 'el', 'propia', 'usan', 'última', 'algo', 'dia', 'misma', 'nosotros', 'paìs', 'nunca', 'mías', 'otros', 'nuevas', 'sola', 'cualquier', 'primeros', 'podría', 'podrían', 'consiguen', 'esas', 'fue', 'mencionó', 'mas', 'sean', 'tras', 'toda', 'fin', 'haceis', 'poco', 'varias', 'cuantas', 'uno', 'mejor', 'serán', 'podrias', 'tampoco', 'vamos', 'cuatro', 'una', 'último', 'agregó', 'ningunos', 'que', 'te', 'adelante', 'me', 'por', 'teneis', 'mía', 'vez', 'sea', 'ellas', 'yo', 'añadió', 'antes', 'mientras', 'existe', 'hoy', 'pues', 'están', 'cuánto', 'voy', 'cuánta', 'hasta', 'apenas', 'nosotras', 'y', 'arriba', 'sabe', 'sigue', 'o', 'ver', 'sé', 'cuándo', 'si', 'ésas', 'vuestra', 'muchos', 'dias', 'todavía', 'vosotras', 'incluso', 'creo', 'menos', 'mucho', 'tuyos', 'él', 'quizas', 'encima', 'modo', 'hace', 'será', 'éstas', 'algún', 'demás', 'cue

In [28]:
doc = nlp("La gata de Juan es blanca.")
[(t, t.is_stop) for t in doc]

[(La, True),
 (gata, False),
 (de, True),
 (Juan, False),
 (es, True),
 (blanca, False),
 (., False)]

In [29]:
# Words can be added to or removed from the stop-word set.

# Add: a single word with add(), several at once with the |= set union.
nlp.Defaults.stop_words.add("my_new_stopword")
nlp.Defaults.stop_words |= {"my_new_stopword1", "my_new_stopword2"}

# Remove: discard() is used instead of remove() so that re-running this cell
# does not raise KeyError once "tuya" is already gone. The -= set difference
# drops several words at once and ignores the ones that are not present.
nlp.Defaults.stop_words.discard("tuya")
nlp.Defaults.stop_words -= {"tuya", "mia"}

In [30]:
"tuya" in nlp.Defaults.stop_words

False

In [31]:
from gensim.parsing.preprocessing import remove_stopwords
import gensim

# Gensim's stop-word collection.
gensim_stopwords = gensim.parsing.preprocessing.STOPWORDS
# Plain string literals: nothing is interpolated here, so no f-prefix is needed.
text = (
    "The first time I saw Catherine she was wearing a vivid crimson dress and was nervously "
    "leafing through a magazine in my waiting room."
)
print(f"Original Text : {text}")
# The text is lower-cased before the stop words are removed.
print(f"Text without stopwords : {remove_stopwords(text.lower())}")
# len() works directly on the collection; no intermediate list copy is required.
print(f"Total count of stopwords in Gensim is {len(gensim_stopwords)}")

Original Text : The first time I saw Catherine she was wearing a vivid crimson dress and was nervously leafing through a magazine in my waiting room.
Text without stopwords : time saw catherine wearing vivid crimson dress nervously leafing magazine waiting room.
Total count of stopwords in Gensim is 337


In [32]:
len(gensim_stopwords)

337

In [33]:
print(gensim_stopwords)

frozenset({'once', 'elsewhere', 'twelve', 'well', 'already', 'nowhere', 'thru', 'which', 'while', 'is', 'when', 'myself', 'done', 'becoming', 'does', 'others', 'never', 'latter', 'will', 'afterwards', 'amount', 'ours', 'has', 'against', 'had', 'hence', 'ie', 'due', 'on', 'you', 'nobody', 'many', 'cannot', 'less', 'fifty', 'some', 'perhaps', 'yours', 'me', 'during', 'inc', 'get', 'such', 'former', 'much', 'something', 'twenty', 'still', 'empty', 'among', 'rather', 'should', 'under', 'etc', 'those', 'keep', 'any', 'also', 'where', 'give', 'in', 'herself', 'even', 'without', 'didn', 'their', 'anyhow', 'one', 'kg', 'anything', 'i', 'why', 'the', 'mill', 'all', 'mostly', 'con', 'don', 'five', 'other', 'may', 'yet', 'ltd', 'so', 'take', 'throughout', 'not', 'system', 'amongst', 'various', 'enough', 'part', 'several', 'because', 'via', 'fire', 'herein', 'whether', 'into', 'fifteen', 'hasnt', 'after', 'than', 'ever', 'this', 'just', 'thin', 'meanwhile', 'own', 'upon', 'front', 'make', 'serious

In [34]:
'not' in gensim_stopwords

True

### Corrección ortográfica
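Librería `pyspellchecker` (se importa como `spellchecker`). Instalamos con
```pip install pyspellchecker```
Ojo: el paquete `spellchecker` de PyPI es otro distinto; si es ese el que está instalado, el `import` falla con `No module named 'indexer'`.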

In [35]:
from spellchecker import SpellChecker

# Spell checker configured with the Spanish dictionary.
spell = SpellChecker(language='es')
# Read the dictionary size through the public `unique_words` property rather
# than the private `_unique_words` attribute.
print(f"Hay {spell.word_frequency.unique_words} palabras en el diccionario")

ModuleNotFoundError: No module named 'indexer'

In [None]:
spell.correction('mañnaa')

In [None]:
spell.candidates('mañnaa')

In [None]:
#si una palabra está en el diccionario devuelve su frecuencia relativa:
spell['mañana']  #equivale a spell.word_frequency['mañana']

In [None]:
spell['mañna']

In [None]:
spell.correction('mañana')

In [None]:
spell.candidates("adiós")

In [None]:
spell["adios"]

In [None]:
spell["adiós"]

### Lematizado

In [None]:
doc = nlp("los gatos son blancos")
[t.lemma_ for t in doc]

In [None]:
doc = nlp("La salida se ha bloqueado. La salida está bloqueada.")
[(t.lemma_, t.pos_) for t in doc]

### Funciones de normalización

In [None]:
texto = "@Graffitera23 qué hermoso!,es bueno desviar la mirada al cielo y a las nubes de vez en cuando,abajo está jodido.Preciosa foto,mil abrazos "

In [None]:
# en spacy
import re
import spacy
nlp=spacy.load('es_core_news_md')
               
def normalize_document(doc):
    """Lower-case a text and drop stop words, punctuation and whitespace tokens.

    Parameters
    ----------
    doc : str
        Raw text to normalize.

    Returns
    -------
    str
        The surviving tokens, lower-cased and joined with single spaces.
    """
    # Split the text into tokens with the module-level spaCy pipeline.
    tokens = nlp(doc)
    # Keep only content tokens. Whitespace tokens are excluded explicitly:
    # they are not flagged as punctuation, so without the is_space check
    # they would end up in the output.
    kept_tokens = [
        t.lower_
        for t in tokens
        if not (t.is_stop or t.is_punct or t.is_space)
    ]
    # Join the tokens back into a single string.
    return ' '.join(kept_tokens)

In [None]:
print(texto)

In [None]:
normalize_document(texto)

Con esta función no se eliminan los signos de puntuación que no forman un token independiente; para eliminarlos debemos usar una expresión regular.

In [None]:
import string

# Small hand-picked list of stop words.
stop_words = [
    'es',
    'y',
    'a',
]

# Regex character class matching any single character of string.punctuation.
pat = f'[{re.escape(string.punctuation)}]'

def normalize_document_remove_punct(doc):
    """Lower-case a text, drop custom stop words and strip punctuation characters.

    Unlike filtering on ``is_punct`` alone, this also removes punctuation that
    is embedded inside a token, by applying the module-level ``pat`` regex to
    every kept token.

    Parameters
    ----------
    doc : str
        Raw text to normalize.

    Returns
    -------
    str
        The cleaned tokens, lower-cased and separated by single spaces.
    """
    # Split the text into tokens with the module-level spaCy pipeline.
    tokens = nlp(doc)
    # Compare the lower-cased form against `stop_words` so that capitalized
    # occurrences are filtered too, and skip stand-alone punctuation tokens.
    cleaned_tokens = [
        re.sub(pat, ' ', t.lower_)
        for t in tokens
        if t.lower_ not in stop_words and not t.is_punct
    ]
    # The substitution replaces punctuation with spaces, which can leave
    # leading, trailing or repeated blanks; split()/join collapses them.
    return ' '.join(' '.join(cleaned_tokens).split())

In [None]:
normalize_document_remove_punct(texto)

In [None]:
from gensim.utils import simple_preprocess
#https://radimrehurek.com/gensim/utils.html#gensim.utils.simple_preprocess

help(simple_preprocess)

In [None]:
simple_preprocess(texto, deacc=True)

In [None]:
from gensim.utils import tokenize
#https://radimrehurek.com/gensim/utils.html#gensim.utils.tokenize

help(tokenize)

In [None]:
tokenize(texto)

In [None]:
list(tokenize(texto, deacc=True, lowercase=True))

In [None]:
from gensim.parsing.preprocessing import preprocess_string
#https://radimrehurek.com/gensim/parsing/preprocessing.html#gensim.parsing.preprocessing.preprocess_string
help(preprocess_string)

In [None]:
preprocess_string("<i>Hel 9lo</i> <b>Wo9 rld</b>! Th3     weather_is really g00d today, isn't it?")

In [None]:
preprocess_string("Transformer is behind the recent NLP developments, including Google’s BERT")

In [None]:
# NOTE(review): wildcard import. The cells visible here only need
# `remove_stopwords` and `stem_text` from it (`preprocess_string` is imported
# explicitly in an earlier cell); consider importing those names explicitly.
from gensim.parsing.preprocessing import *
# Pass an explicit filter list so that only `remove_stopwords` is applied.
preprocess_string("Transformer is behind the recent NLP developments, including Google’s BERT", [remove_stopwords])

In [None]:
preprocess_string("Transformer is behind the recent NLP developments, including Google’s BERT", [remove_stopwords, stem_text])