# NLTK

Descargar corpus y modelos.

In [5]:
import nltk
# Called with no arguments; the recorded output of this cell shows NLTK reading
# the nltk_data package index.
# NOTE(review): presumably this opens NLTK's interactive downloader and needs
# manual selection of the packages — confirm before relying on "Run All".
nltk.download()
# install the gutenberg corpus and the punkt model (tokenizer and sentence segmenter)

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

Alternativamente:

In [3]:
import nltk
# Download the two required packages by id: the punkt model and the gutenberg
# corpus.
nltk.download('punkt')
nltk.download('gutenberg')

[nltk_data] Downloading package punkt to /home/francolq/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package gutenberg to
[nltk_data]     /home/francolq/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [10]:
from nltk.corpus import gutenberg
# List the ids of the files that make up the gutenberg corpus.
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [None]:
# Sentences of 'austen-emma.txt'; the counting cells further down iterate over
# each sentence as a sequence of word tokens.
gutenberg.sents('austen-emma.txt')

# Estadísticas Básicas

Versión básica con diccionarios:

In [12]:
# Frequency table kept in a plain dict: token -> number of occurrences.
count = {}

for sent in gutenberg.sents('austen-emma.txt'):
    for word in sent:
        # .get() supplies 0 for a token that has not been seen yet, which
        # replaces the explicit membership test.
        count[word] = count.get(word, 0) + 1
count

{'fearless': 1,
 'involvement': 1,
 'instigator': 1,
 'uninterruptedly': 1,
 'Hughes': 3,
 'Farmer': 1,
 'Making': 1,
 'divisions': 3,
 'unpleasant': 13,
 'fried': 2,
 'short': 67,
 'threaten': 1,
 'convenient': 5,
 'Something': 8,
 'mails': 1,
 'eaten': 3,
 'faultless': 1,
 'distance': 25,
 'Extracts': 2,
 'grandpapa': 2,
 'memory': 10,
 'reproached': 1,
 'mine': 25,
 'taking': 28,
 'travels': 1,
 'dinner': 47,
 'pet': 1,
 'villain': 1,
 'Smiths': 1,
 'insulted': 1,
 'were': 591,
 'Day': 2,
 'motto': 1,
 'crossed': 6,
 'claiming': 1,
 'transcribed': 2,
 'marriages': 1,
 'excepting': 11,
 'assuming': 4,
 'cope': 1,
 'spirited': 3,
 'knowing': 26,
 'positively': 10,
 'canvassing': 1,
 'bounds': 1,
 'sufficiency': 1,
 'wanting': 33,
 'fuss': 1,
 'kings': 2,
 'considerably': 4,
 'fidgeting': 1,
 'collation': 1,
 'inseparably': 1,
 'commission': 5,
 'Cowper': 1,
 'civil': 13,
 'altogether': 22,
 'Wiltshire': 1,
 'damps': 1,
 'pleased': 47,
 'carriages': 11,
 'altar': 1,
 'bewitching': 2,
 

Versión mejorada con defaultdicts:

In [22]:
from collections import defaultdict

# int() returns 0, so a token that is not yet a key starts from zero the first
# time it is incremented.
count = defaultdict(int)

# Walk the corpus as a single flat stream of tokens, sentence by sentence.
for word in (token for sent in gutenberg.sents('austen-emma.txt') for token in sent):
    count[word] += 1

In [23]:
# Ten most frequent tokens: sort the (token, frequency) pairs by frequency in
# descending order (the sort is stable, so ties keep insertion order).
print('10 palabras más frecuentes:', sorted(count.items(), key=lambda item: item[1], reverse=True)[:10])
# Vocabulary size = number of distinct tokens.
print('Vocabulario:', len(count))
# Total number of tokens = sum of all frequencies.
print('Tokens:', sum(count.values()))

10 palabras más frecuentes: [(',', 11454), ('.', 6928), ('to', 5183), ('the', 4844), ('and', 4672), ('of', 4279), ('I', 3178), ('a', 3004), ('was', 2385), ('her', 2381)]
Vocabulario: 7806
Tokens: 192484


Versión usando clase Counter:

In [14]:
from collections import Counter

# Feed every token of every sentence to the Counter in a single pass; tokens
# are counted in the same order in which the corpus yields them.
count = Counter(
    word
    for sent in gutenberg.sents('austen-emma.txt')
    for word in sent
)

In [21]:
# Ask the Counter directly for the ten most common (token, frequency) pairs
# instead of sorting the whole vocabulary and slicing the result afterwards.
print('10 palabras más frecuentes:', count.most_common(10))
# Vocabulary size = number of distinct tokens.
print('Vocabulario:', len(count))
# Total number of tokens = sum of all frequencies.
print('Tokens:', sum(count.values()))

10 palabras más frecuentes: [(',', 11454), ('.', 6928), ('to', 5183), ('the', 4844), ('and', 4672), ('of', 4279), ('I', 3178), ('a', 3004), ('was', 2385), ('her', 2381)]
Vocabulario: 7806
Tokens: 192484


# Corpus de Texto Plano

- http://www.nltk.org/api/nltk.corpus.reader.html#nltk.corpus.reader.plaintext.PlaintextCorpusReader
- http://www.nltk.org/book/ch02.html

Primero hay que crear, en el directorio actual, el archivo example.txt con el siguiente contenido: "Estimados Sr. y sra. Gómez. Se los cita por el art. 32 de la ley 21.234."

In [24]:
from nltk.corpus import PlaintextCorpusReader

# Print the class documentation, including the constructor signature with its
# default word and sentence tokenizers.
help(PlaintextCorpusReader)

Help on class PlaintextCorpusReader in module nltk.corpus.reader.plaintext:

class PlaintextCorpusReader(nltk.corpus.reader.api.CorpusReader)
 |  Reader for corpora that consist of plaintext documents.  Paragraphs
 |  are assumed to be split using blank lines.  Sentences and words can
 |  be tokenized using the default tokenizers, or by custom tokenizers
 |  specificed as parameters to the constructor.
 |  
 |  This corpus reader can be customized (e.g., to skip preface
 |  sections of specific document formats) by creating a subclass and
 |  overriding the ``CorpusView`` class variable.
 |  
 |  Method resolution order:
 |      PlaintextCorpusReader
 |      nltk.corpus.reader.api.CorpusReader
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, root, fileids, word_tokenizer=WordPunctTokenizer(pattern='\\w+|[^\\w\\s]+', gaps=False, discard_empty=True, flags=56), sent_tokenizer=<nltk.tokenize.punkt.PunktSentenceTokenizer object at 0x7f60bb13f630>, para_block_re

In [28]:
# Root directory '.', file id 'example.txt'; the word and sentence tokenizers
# are left at the constructor defaults.
corpus = PlaintextCorpusReader('.', 'example.txt')

In [29]:
# Materialize the sentences (each one a list of tokens) to inspect how the
# default tokenizers split the example text.
list(corpus.sents())

[['Estimados', 'Sr', '.', 'y', 'sra', '.'],
 ['Gómez', '.'],
 ['Se', 'los', 'cita', 'por', 'el', 'art', '.'],
 ['32', 'de', 'la', 'ley', '21', '.', '234', '.']]

# Tokenización

- http://www.nltk.org/api/nltk.tokenize.html#nltk.tokenize.regexp.RegexpTokenizer
- http://www.nltk.org/book/ch03.html#regular-expressions-for-tokenizing-text

De la documentación de NLTK obtenemos una expresión regular para tokenizar:

In [30]:
# Tokenizer regex written in verbose mode. Alternatives are tried left to right
# at each position, so their order matters: a number without a leading '$' is
# consumed by the \w+ alternative before the currency alternative is tried.
#
# In the punctuation class the hyphen is placed last so that it is a literal
# '-'. Written as ':-_' it is a range from ':' to '_', which also matches
# characters such as '<', '=', '>', '@' and '^' and never matches '-' itself.
pattern = r'''(?x)    # set flag to allow verbose regexps
     (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
   | \w+(?:-\w+)*        # words with optional internal hyphens
   | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
   | \.\.\.            # ellipsis
   | [][.,;"'?():_`-]  # these are separate tokens; includes ], [
'''

Lo probamos:

In [32]:
from nltk.tokenize import RegexpTokenizer
# Tokenizer driven by the regular expression stored in `pattern`.
tokenizer = RegexpTokenizer(pattern)

# Read the same example file, now using the regexp tokenizer as the word
# tokenizer, and show the resulting sentences.
corpus = PlaintextCorpusReader('.', 'example.txt', word_tokenizer=tokenizer)
list(corpus.sents())

[['Estimados', 'Sr', '.', 'y', 'sra', '.'],
 ['Gómez', '.'],
 ['Se', 'los', 'cita', 'por', 'el', 'art', '.'],
 ['32', 'de', 'la', 'ley', '21', '.', '234', '.']]

Vemos que tokeniza mal todas las abreviaciones y el número "21.234".
Mejoramos la expresión regular y probamos:

In [33]:
# Improved tokenizer regex (verbose mode). Alternatives are tried left to right
# at each position, so the two new alternatives come first: dotted numbers such
# as 21.234 and a short list of Spanish abbreviations are matched before the
# generic word alternative can split them at the '.'.
#
# In the punctuation class the hyphen is placed last so that it is a literal
# '-'. Written as ':-_' it is a range from ':' to '_', which also matches
# characters such as '<', '=', '>', '@' and '^' and never matches '-' itself.
pattern = r'''(?x)    # set flag to allow verbose regexps
   (?:\d{1,3}(?:\.\d{3})+)  # numbers with '.' in the middle
   | (?:[Ss]r\.|[Ss]ra\.|art\.)  # common spanish abbreviations
   | (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
   | \w+(?:-\w+)*        # words with optional internal hyphens
   | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
   | \.\.\.            # ellipsis
   | [][.,;"'?():_`-]  # these are separate tokens; includes ], [
'''
# Same import as in the earlier tokenizer cell; repeated so this cell can be
# read on its own.
from nltk.tokenize import RegexpTokenizer
# Rebuild the tokenizer from the improved `pattern`.
tokenizer = RegexpTokenizer(pattern)

# Re-read the example file with the new word tokenizer and show the sentences.
corpus = PlaintextCorpusReader('.', 'example.txt', word_tokenizer=tokenizer)
list(corpus.sents())

[['Estimados', 'Sr.', 'y', 'sra.'],
 ['Gómez', '.'],
 ['Se', 'los', 'cita', 'por', 'el', 'art.'],
 ['32', 'de', 'la', 'ley', '21.234', '.']]

¡Ahora tokeniza bien!

(La segmentación en oraciones sigue estando mal, pero resolver eso queda fuera de esta clase.)