In [2]:
import nltk
%matplotlib inline



### Language Processing and Python

In [None]:
# One-time setup: uncomment and run to fetch the NLTK data used below.
# Kept commented out so that "Run All" does not trigger the downloader.
# nltk.download()

In [None]:
from nltk.book import *

In [None]:
# Print occurrences of 'hi' in text5, each shown with its surrounding context.
text5.concordance('hi')

In [None]:
# List words that appear in contexts similar to those of 'hi' in text5.
text5.similar('hi')

In [None]:
# Show the contexts that 'hi' and 'lol' have in common in text5.
text5.common_contexts(['hi','lol'])

In [None]:
# Plot where each of the listed words occurs along text4.
text4.dispersion_plot(["citizens", "democracy", "freedom", "duties", "America"])

In [None]:
# Vocabulary size of text4: the number of distinct tokens.
# Sorting the set is unnecessary when only its size is needed.
len(set(text4))

In [None]:
# Total number of tokens in text4 (repeated tokens are counted each time).
len(text4)

In [None]:
# Count how often each token occurs in text1.
fdist1 = FreqDist(text1)

### Accessing Text Corpora and Lexical Resources

**Books**

In [None]:
# File identifiers of the texts available in the Gutenberg corpus.
nltk.corpus.gutenberg.fileids()

In [None]:
# Load the word tokens of the file 'austen-emma.txt' from the Gutenberg corpus.
emma = nltk.corpus.gutenberg.words('austen-emma.txt')

In [None]:
# Number of word tokens in emma.
len(emma)

In [None]:
# Inspect what kind of object words() returned.
type(emma)

In [None]:
# Access the corpus at different levels (raw characters, words, sentences)
# and print three summary figures for every file.
for fileid in gutenberg.fileids():
    file_words = gutenberg.words(fileid)  # read once, reused for the vocabulary below
    num_chars = len(gutenberg.raw(fileid))    # raw text length in characters
    num_words = len(file_words)               # number of word tokens
    num_sents = len(gutenberg.sents(fileid))  # number of sentences
    num_vocab = len({w.lower() for w in file_words})  # case-folded vocabulary size
    # Whole-number averages: characters per word, words per sentence and
    # words per distinct vocabulary item. A single formatted string prints
    # the same line under Python 2 and Python 3.
    print('%d %d %d %s' % (
        num_chars // num_words,
        num_words // num_sents,
        num_words // num_vocab,
        fileid,
    ))

**Web**

In [None]:
from nltk.corpus import webtext

# Preview the first 65 characters of every file in the web-text corpus.
for fileid in webtext.fileids():
    # A single formatted string prints the same line under Python 2 and Python 3.
    print('%s %s ...' % (fileid, webtext.raw(fileid)[:65]))

**Chat**

In [None]:
from nltk.corpus import nps_chat

# Preview the first 65 characters of the raw content of every chat file.
for fileid in nps_chat.fileids():
    # A single formatted string prints the same line under Python 2 and Python 3.
    print('%s %s ...' % (fileid, nps_chat.raw(fileid)[:65]))

**Brown Corpus**

The Brown Corpus was the first million-word electronic corpus of English, created in 1961 at Brown University. This corpus contains text from 500 sources, and the sources have been categorized by genre, such as news, editorial, and so on.

In [None]:
from nltk.corpus import brown

In [None]:
# Category (genre) labels available in the Brown corpus.
brown.categories()

In [None]:
# Word tokens restricted to the 'news' category.
brown.words(categories='news')

**Conditional Frequency Distributions**

A conditional frequency distribution is a collection of frequency distributions, each one for a different "condition". 

Whereas FreqDist() takes a simple list as input, ConditionalFreqDist() takes a list of (condition, event) pairs.

In [None]:
# Modal verbs to compare, and the genres to compare them across.
modals = ['can', 'could', 'may', 'might', 'must', 'will']
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']

# One (condition, event) pair per token of the Brown corpus:
# the genre is the condition and the word is the counted event.
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre)
)

# Table of counts: one row per selected genre, one column per modal verb.
cfd.tabulate(conditions=genres, samples=modals)

In [None]:
from nltk.corpus import inaugural

In [None]:
# File identifiers in the inaugural corpus.
inaugural.fileids()

In [None]:
# First four characters of every file id
# (presumably the year of the address; confirm against the ids listed above).
[fileid[:4] for fileid in inaugural.fileids()]

In [None]:
# For every word of every file, emit a (target, fileid[:4]) pair whenever the
# lowercased word starts with 'fight' or 'freedom'. The target is the condition
# and the four-character file-id prefix is the event being counted.
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['fight', 'freedom']
    if w.lower().startswith(target)) 
# Plot the counts recorded for each target.
cfd.plot()

**Bigrams**

The bigrams() function takes a list of words and builds a list of consecutive word pairs. The cells below use these pairs to generate text, repeatedly choosing the word that most often follows the current one.

In [None]:
# Word tokens of the inaugural corpus; no file id is given, so no file is singled out.
text = inaugural.words()

In [None]:
# Consecutive word pairs of the corpus. nltk.bigrams can return a one-shot
# generator, so materialise it into a list: the pairs then survive being
# iterated, and cells that consume them can be re-run without re-running
# this one first.
bigrams = list(nltk.bigrams(text))

In [None]:
# Each bigram is read as a (condition, event) pair: the first word is the
# condition and the word that follows it is the counted event.
cfd = nltk.ConditionalFreqDist(bigrams)

In [None]:
def generate_model(cfdist, word, num=15):
    """Print a chain of words, always moving to the most frequent successor.

    Starting from ``word``, each step looks up the distribution stored under
    the current word in ``cfdist`` and moves on to its ``max()`` sample.

    Args:
        cfdist: Mapping from a word to a frequency distribution (an object
            with a ``max()`` method) of the words that follow it, e.g. a
            ``ConditionalFreqDist`` built from bigrams.
        word: Seed word; it is always the first word printed.
        num: Maximum number of words to print (default 15).

    Returns:
        None. The words are printed on one line, separated by spaces.

    The chain stops early when the current word has no recorded successors
    instead of raising from ``max()`` on an empty distribution. Membership is
    tested with ``in`` so that an unknown word does not add an empty entry to
    a defaultdict-style ``cfdist``.
    """
    chain = []
    for _ in range(num):
        chain.append(word)
        successors = cfdist[word] if word in cfdist else None
        if not successors:
            # Dead end: nothing is known to follow this word.
            break
        word = successors.max()
    if chain:
        # One pre-joined string prints identically under Python 2 and Python 3.
        print(' '.join(chain))

In [None]:
# Generate a word chain seeded with 'god', using the default length (num=15).
# NOTE(review): cfd is built from the corpus tokens as written (no lowercasing),
# so confirm that 'god' occurs in exactly this lowercase form.
generate_model(cfd, 'god')

In [None]:
# Distribution of the words recorded as following 'living'.
cfd['living']

**Lexical Resources**

A lexicon, or lexical resource, is a collection of words and/or phrases along with associated information such as part of speech and sense definitions.

**WordNet**

WordNet is a semantically-oriented dictionary of English with 155,287 words and 117,659 synonym sets.

In [3]:
from nltk.corpus import wordnet as wn

synonyms

In [4]:
# Look up the synsets (synonym sets) that 'motorcar' belongs to.
wn.synsets('motorcar')

[Synset('car.n.01')]

In [6]:
# Lemma names of the synset, i.e. the words grouped together as synonyms.
wn.synset('car.n.01').lemma_names()

[u'car', u'auto', u'automobile', u'machine', u'motorcar']

In [8]:
# Definition text of the synset.
wn.synset('car.n.01').definition()

u'a motor vehicle with four wheels; usually propelled by an internal combustion engine'

In [10]:
# Example sentences attached to the synset.
wn.synset('car.n.01').examples()

[u'he needs a car to get to work']

### Categorizing and Tagging Words

**POS Tagging**

The process of classifying words into their parts of speech and labeling them accordingly is known as part-of-speech tagging, POS-tagging, or simply tagging.

In [11]:
# Split the sentence into word tokens.
text = nltk.word_tokenize("And now for something completely different")

In [12]:
# Tag every token with its part of speech; the result is a list of (token, tag) pairs.
nltk.pos_tag(text)

[('And', 'CC'),
 ('now', 'RB'),
 ('for', 'IN'),
 ('something', 'NN'),
 ('completely', 'RB'),
 ('different', 'JJ')]

Tag legend: CC = conjunction, RB = adverb, IN = preposition, NN = noun, JJ = adjective, VB = verb.


**Tagged Corpora**

In [14]:
# (word, tag) pairs from the Brown corpus.
nltk.corpus.brown.tagged_words()

[(u'The', u'AT'), (u'Fulton', u'NP-TL'), ...]