## (33) NLTK

### 1) Accessing text corpora

Gutenberg corpus

In [13]:
import nltk

In [1]:
# Load the Project Gutenberg sample corpus reader and list the file ids it provides.
from nltk.corpus import gutenberg
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [3]:
# Size of "Emma" at three granularities: raw characters, word tokens, sentences.
emma_id = 'austen-emma.txt'
for view in (gutenberg.raw, gutenberg.words, gutenberg.sents):
    print(len(view(emma_id)))

887071
192427
7752


In [6]:
# Load Macbeth as a sequence of sentences; each sentence is a list of word tokens.
macbeth = gutenberg.sents('shakespeare-macbeth.txt')
macbeth  # echo a truncated preview

[['[', 'The', 'Tragedie', 'of', 'Macbeth', 'by', 'William', 'Shakespeare', '1603', ']'], ['Actus', 'Primus', '.'], ...]

In [7]:
# The sentence view can be indexed; show the tokens of the sentence at index 1023.
macbeth[1023]

['Or',
 'be',
 'aliue',
 'againe',
 ',',
 'And',
 'dare',
 'me',
 'to',
 'the',
 'Desart',
 'with',
 'thy',
 'Sword',
 ':',
 'If',
 'trembling',
 'I',
 'inhabit',
 'then',
 ',',
 'protest',
 'mee',
 'The',
 'Baby',
 'of',
 'a',
 'Girle',
 '.']

Brown corpus

In [9]:
# Load the Brown corpus reader and list the genre categories it is divided into.
from nltk.corpus import brown
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [11]:
# The Brown corpus can be selected by category or by file id.
# Only the last expression of a cell is echoed, so the two `words` calls
# below run but their results are not displayed.
brown.words(categories = 'news')
brown.words(fileids = ['cg22'])
brown.sents(categories=['news', 'editorial'])


[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]

In [16]:
# Count how often each word occurs within each Brown genre.
# Every pair fed to the distribution is (condition, sample) = (genre, word).
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre)
)

In [19]:
# Genres (conditions) to show as the rows of the tabulated counts.
genres = [
    'news',
    'religion',
    'hobbies',
    'science_fiction',
    'romance',
]

In [20]:
# Modal verbs (samples) to show as the columns of the tabulated counts.
modals = [
    'can',
    'could',
    'may',
    'might',
    'must',
    'will',
]

In [21]:
# Print a genre-by-modal table of counts from the conditional frequency distribution.
cfd.tabulate(conditions = genres, samples = modals)

                  can could   may might  must  will 
           news    93    86    66    38    50   389 
       religion    82    59    78    12    54    71 
        hobbies   268    58   131    22    83   264 
science_fiction    16    49     4    12     8    16 
        romance    74   193    11    51    45    43 


WordNet

In [28]:
from nltk.corpus import wordnet as wn
# Synsets that contain the word 'motorcar'.
print(wn.synsets('motorcar'))
# Look the synset up once, then inspect its definition, lemmas and examples.
car = wn.synset('car.n.01')
print(car.definition())
print(car.lemmas())
print(car.lemma_names())
print(car.examples())
# Look up one specific lemma of that synset.
automobile = wn.lemma('car.n.01.automobile')
print(automobile)           # the lemma that corresponds to 'automobile'
print(automobile.synset())  # the synset the lemma belongs to (car.n.01)
print(automobile.name())

[Synset('car.n.01')]
a motor vehicle with four wheels; usually propelled by an internal combustion engine
[Lemma('car.n.01.car'), Lemma('car.n.01.auto'), Lemma('car.n.01.automobile'), Lemma('car.n.01.machine'), Lemma('car.n.01.motorcar')]
['car', 'auto', 'automobile', 'machine', 'motorcar']
['he needs a car to get to work']
Lemma('car.n.01.automobile')
Synset('car.n.01')
automobile


Accessing Text from the Web
- 표준 라이브러리 `urllib` 모듈의 `request`를 사용한다.
  (예) `from urllib import request`

In [31]:
from urllib import request
# NOTE(review): this URL is the ebook's HTML landing page, which is why the
# first tokens are HTML markup; point it at the plain-text edition of the
# book to tokenize the novel itself.
url = "http://www.gutenberg.org/ebooks/2554?msg=welcome_stranger"
# Use the response as a context manager so the connection is always closed,
# even if reading or decoding fails.
with request.urlopen(url) as response:
    raw = response.read().decode('utf8')
# The bare expressions below are useful when run one at a time; inside a
# single cell only the final expression is echoed.
type(raw)
len(raw)
raw[:74]
tokens = nltk.word_tokenize(raw)
type(tokens)
len(tokens)
tokens[:10]

['<',
 '!',
 'DOCTYPE',
 'html',
 'PUBLIC',
 '``',
 '-//W3C//DTD',
 'XHTML+RDFa',
 '1.0//EN',
 "''"]

### Tokenizing Text

In [32]:
from nltk.tokenize import sent_tokenize
# Split a short paragraph into its individual sentences.
sample_text = "Hello Word. It's good to see you. Thank you"
sent_tokenize(sample_text)

['Hello Word.', "It's good to see you.", 'Thank you']

In [36]:
# TreebankWordTokenizer: a word tokenizer that follows the Treebank conventions
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
tokenizer.tokenize('Hello World.')

['Hello', 'World', '.']

In [37]:
# word_tokenize is the function-style entry point for word-level tokenization.
from nltk.tokenize import word_tokenize
word_tokenize('Hello World.')

['Hello', 'World', '.']

In [38]:
from nltk.tokenize import word_tokenize
# A contraction is split into two tokens: 'ca' and "n't".
word_tokenize("can't")

['ca', "n't"]

In [42]:
from nltk.tokenize import WordPunctTokenizer # appears to be a tokenizer that splits punctuation into separate tokens
tokenizer = WordPunctTokenizer()
tokenizer.tokenize("can't")

['can', "'", 't']

Tokenizing sentences using regular expressions

In [47]:
from nltk.tokenize import RegexpTokenizer
# Raw string: "\w" inside a normal string literal is an invalid escape
# sequence (DeprecationWarning since Python 3.6, SyntaxWarning since 3.12).
# The pattern describes the tokens themselves: runs of word characters.
tokenizer = RegexpTokenizer(r"[\w]+")
tokenizer.tokenize("Can't is a contraction.")

['Can', 't', 'is', 'a', 'contraction']

In [46]:
from nltk.tokenize import regexp_tokenize
# regexp_tokenize is the function form of RegexpTokenizer. \w matches word
# characters only, so whitespace and punctuation (including the apostrophe)
# separate the tokens. The raw string avoids the invalid "\w" escape that a
# normal string literal would contain.
regexp_tokenize("Can't is a contraction.", r"[\w]+")

['Can', 't', 'is', 'a', 'contraction']

In [51]:
# RegexpTokenizer is the class this cell uses; importing it here lets the
# cell run on a fresh kernel.
from nltk.tokenize import RegexpTokenizer
# gaps=True: the pattern describes the separators (runs of whitespace), so the
# text between them becomes the tokens and punctuation stays attached.
# With gaps=False the pattern would describe the tokens themselves instead.
# Raw string avoids the invalid "\s" escape of a normal string literal.
tokenizer = RegexpTokenizer(r'\s+', gaps=True)
tokenizer.tokenize("Can't is a contraction.")

["Can't", 'is', 'a', 'contraction.']

Filtering stopwords in a tokenized sentence

In [52]:
from nltk.corpus import stopwords
# Keep only the tokens that are not in the English stop-word list.
# The comparison is exact (case-sensitive) membership in the set.
english_stops = set(stopwords.words('english'))
words = ["Can't", 'is', 'a', 'contraction']
[
    token
    for token in words
    if token not in english_stops
]

["Can't", 'contraction']

In [53]:
# List the languages for which a stop-word list is available.
stopwords.fileids()

['danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'hungarian',
 'italian',
 'kazakh',
 'norwegian',
 'portuguese',
 'romanian',
 'russian',
 'spanish',
 'swedish',
 'turkish']

#### Stemming Words<br> 예) from nltk.stem import 스테머

In [54]:
from nltk.stem import PorterStemmer
# Porter stemmer: print the stem of each sample word.
stemmer = PorterStemmer()
for word in ('cooking', 'cookery'):
    print(stemmer.stem(word))

'cookeri'

In [56]:
from nltk.stem import LancasterStemmer
# Lancaster stemmer: print the stem of each sample word.
stemmer = LancasterStemmer()
for word in ('cooking', 'cookery'):
    print(stemmer.stem(word))

cook
cookery


In [57]:
from nltk.stem import RegexpStemmer
# The stemmer removes whatever the regex matches, not only at the end of a
# word -- note that 'ingleside' loses its leading 'ing' as well.
stemmer = RegexpStemmer('ing')
for word in ('cooking', 'cookery', 'ingleside'):
    print(stemmer.stem(word))

cook
cookery
leside


In [58]:
from nltk.stem import SnowballStemmer
# The Snowball stemmer is selected by language name; here, Spanish.
spanish_stemmer = SnowballStemmer('spanish')
spanish_stemmer.stem('hola')

'hol'

#### __Lemmatizing__ words with WordNet

In [60]:
from nltk.stem import WordNetLemmatizer
# Lemmatize with WordNet; the optional `pos` argument changes the result.
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('cooking'))  # no pos given: comes back unchanged
print(lemmatizer.lemmatize('cooking', pos = 'v'))  # treated as a verb -> 'cook'
print(lemmatizer.lemmatize('cookbooks'))  # plural reduced to 'cookbook'

cooking
cook
cookbook


Stemming vs Lemmatization
- Stemming은 규칙에 따라 접사를 잘라 내므로, 결과(어간)가 실제 단어가 아닐 수 있다. (예: believes → believ)
- Lemmatization은 WordNet 사전을 참조해 실제로 존재하는 단어의 기본형(lemma)을 돌려준다. (예: believes → belief)

In [63]:
# Run the same word through both tools to compare their results.
# Relies on PorterStemmer and `lemmatizer` defined in earlier cells.
stemmer = PorterStemmer()
print(stemmer.stem('believes'))  # -> believ
print(lemmatizer.lemmatize('believes'))  # -> belief

believ
belief


In [64]:
import nltk
# Tokenize a sentence, then label every token with its part-of-speech tag.
text = nltk.word_tokenize("And now for something completely different")
nltk.pos_tag(text)

[('And', 'CC'),
 ('now', 'RB'),
 ('for', 'IN'),
 ('something', 'NN'),
 ('completely', 'RB'),
 ('different', 'JJ')]

In [68]:
# Build a Text over the lower-cased Brown corpus and list, for several query
# words, the words that appear in similar contexts.
# Text.similar() prints its results itself and returns None, so it is called
# directly; wrapping it in print() only adds a stray "None" line.
text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())
text.similar('woman')
print()
text.similar('bought')
print()
text.similar('over')
print()
text.similar('the')

man time day year car moment world house family child country boy
state job place way war girl work word
None

made said done put had seen found given left heard was been brought
set got that took in told felt
None

in on to of and for with from at by that into as up out down through
is all about
None

a his this their its her an that our any all one these my in your no
some other and
None


In [72]:
# str2tuple parses the 'word/TAG' notation into a (word, tag) tuple.
tagged_token = nltk.tag.str2tuple('fly/NN')
print(tagged_token)
print(tagged_token[0])  # the word
print(tagged_token[1])  # the tag

('fly', 'NN')
fly
NN
