## Accessing text corpora 

### Brown corpus

In [1]:
# load the Brown Corpus
from nltk.corpus import brown                     #Downloading the Brown library
print('Total Categories:', len(brown.categories()))

Total Categories: 15


In [2]:
print(brown.categories()) #Printing the Categories Seperately

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [3]:
# tokenized sentences
brown.sents(categories='mystery') #Having a clear idea on tokenization

[['There', 'were', 'thirty-eight', 'patients', 'on', 'the', 'bus', 'the', 'morning', 'I', 'left', 'for', 'Hanover', ',', 'most', 'of', 'them', 'disturbed', 'and', 'hallucinating', '.'], ['An', 'interne', ',', 'a', 'nurse', 'and', 'two', 'attendants', 'were', 'in', 'charge', 'of', 'us', '.'], ...]

In [4]:
# POS tagged sentences
brown.tagged_sents(categories='mystery') #Using mystery Category

[[('There', 'EX'), ('were', 'BED'), ('thirty-eight', 'CD'), ('patients', 'NNS'), ('on', 'IN'), ('the', 'AT'), ('bus', 'NN'), ('the', 'AT'), ('morning', 'NN'), ('I', 'PPSS'), ('left', 'VBD'), ('for', 'IN'), ('Hanover', 'NP'), (',', ','), ('most', 'AP'), ('of', 'IN'), ('them', 'PPO'), ('disturbed', 'VBN'), ('and', 'CC'), ('hallucinating', 'VBG'), ('.', '.')], [('An', 'AT'), ('interne', 'NN'), (',', ','), ('a', 'AT'), ('nurse', 'NN'), ('and', 'CC'), ('two', 'CD'), ('attendants', 'NNS'), ('were', 'BED'), ('in', 'IN'), ('charge', 'NN'), ('of', 'IN'), ('us', 'PPO'), ('.', '.')], ...]

In [5]:
# get sentences in natural form
sentences = brown.sents(categories='mystery') #Printing Sentences
sentences = [''.join(sentence_token) for sentence_token in sentences]
print(sentences[0:5])

['Therewerethirty-eightpatientsonthebusthemorningIleftforHanover,mostofthemdisturbedandhallucinating.', 'Aninterne,anurseandtwoattendantswereinchargeofus.', "IfeltlonelyanddepressedasIstaredoutthebuswindowatChicago'sgrim,dirtyWestSide.", 'Itseemedincredible,asIlistenedtothemonotonousdroneofvoicesandsmelledthefetidodorscomingfromthepatients,thattechnicallyIwasawardofthestateofIllinois,goingtoahospitalforthementallyill.', 'IsuddenlythoughtofMaryJaneBrennan,thewayherprettyeyescouldflashwithanger,herquietcompetence,thegentlenessandsweetnessthatlayjustbeneaththesurfaceofherdefenses.']


In [6]:
# get tagged words
tagged_words = brown.tagged_words(categories='mystery')
nouns = [(word,tag) for word, tag in tagged_words if any (noun_tag in tag for noun_tag in ['NP','NN'])]
print(nouns[0:10])

[('patients', 'NNS'), ('bus', 'NN'), ('morning', 'NN'), ('Hanover', 'NP'), ('interne', 'NN'), ('nurse', 'NN'), ('attendants', 'NNS'), ('charge', 'NN'), ('bus', 'NN'), ('window', 'NN')]


### Reuters Corpus

#### Investigating categories in Reuters corpus

In [7]:
from nltk.corpus import reuters
#reuters library investigate
print(reuters.categories())

['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', 'dmk', 'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-oil', 'heat', 'hog', 'housing', 'income', 'instal-debt', 'interest', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-oil', 'livestock', 'lumber', 'meal-feed', 'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-oil', 'palmkernel', 'pet-chem', 'platinum', 'potato', 'propane', 'rand', 'rape-oil', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'rye', 'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'sun-meal', 'sun-oil', 'sunseed', 'tea', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc']


In [8]:
# Filled based access

print(reuters.fileids(categories=['housing','income']))
print(reuters.sents(fileids=[u'test/16118',u'test/18534']))

['test/16118', 'test/18534', 'test/18540', 'test/18664', 'test/18665', 'test/18672', 'test/18911', 'test/19875', 'test/20106', 'test/20116', 'training/1035', 'training/1036', 'training/10602', 'training/10604', 'training/11170', 'training/11665', 'training/2618', 'training/29', 'training/3105', 'training/3708', 'training/3720', 'training/3723', 'training/3898', 'training/5883', 'training/5886', 'training/6000', 'training/6067', 'training/6197', 'training/7005', 'training/7006', 'training/7015', 'training/7036', 'training/7098', 'training/7099', 'training/9615']
[['YUGOSLAV', 'ECONOMY', 'WORSENED', 'IN', '1986', ',', 'BANK', 'DATA', 'SHOWS', 'National', 'Bank', 'economic', 'data', 'for', '1986', 'shows', 'that', 'Yugoslavia', "'", 's', 'trade', 'deficit', 'grew', ',', 'the', 'inflation', 'rate', 'rose', ',', 'wages', 'were', 'sharply', 'higher', ',', 'the', 'money', 'supply', 'expanded', 'and', 'the', 'value', 'of', 'the', 'dinar', 'fell', '.'], ['The', 'trade', 'deficit', 'for', '1986'

### WordNet corpus

In [9]:
# load the Wordnet Corpus
from nltk.corpus import wordnet as wn
word = 'hike'
# get word synsets
word_synsets = wn.synsets(word)
print(word_synsets)

[Synset('hike.n.01'), Synset('rise.n.09'), Synset('raise.n.01'), Synset('hike.v.01'), Synset('hike.v.02')]


In [10]:
# get details for each synonym in synset
for synset in word_synsets:
    print('Synset Name:',synset.pos())
    print('POS Tag:',synset.pos())
    print('Definition:',synset.definition())
    print('Examples:',synset.examples())
    print()

Synset Name: n
POS Tag: n
Definition: a long walk usually for exercise or pleasure
Examples: ['she enjoys a hike in her spare time']

Synset Name: n
POS Tag: n
Definition: an increase in cost
Examples: ['they asked for a 10% rise in rates']

Synset Name: n
POS Tag: n
Definition: the amount a salary is increased
Examples: ['he got a 3% raise', 'he got a wage hike']

Synset Name: v
POS Tag: v
Definition: increase
Examples: ['The landlord hiked up the rents']

Synset Name: v
POS Tag: v
Definition: walk a long way, as for pleasure or physical exercise
Examples: ['We were hiking in Colorado', 'hike the Rockies']



## Frequency Distributions

In [11]:
from nltk.book import gutenberg
from nltk.book import FreqDist

emmawords = gutenberg.words("austen-emma.txt")

fdist = FreqDist(emmawords)
fdist.most_common(50)

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


[(',', 11454),
 ('.', 6928),
 ('to', 5183),
 ('the', 4844),
 ('and', 4672),
 ('of', 4279),
 ('I', 3178),
 ('a', 3004),
 ('was', 2385),
 ('her', 2381),
 (';', 2199),
 ('it', 2128),
 ('in', 2118),
 ('not', 2101),
 ('"', 2004),
 ('be', 1970),
 ('she', 1778),
 ('that', 1730),
 ('you', 1677),
 ('had', 1606),
 ('as', 1387),
 ('--', 1382),
 ('he', 1365),
 ('for', 1321),
 ('have', 1301),
 ('is', 1220),
 ('with', 1187),
 ('Mr', 1153),
 ('very', 1151),
 ('but', 1148),
 ('."', 1138),
 ('his', 1088),
 ("'", 1007),
 ('at', 997),
 ('s', 933),
 ('so', 924),
 ('Emma', 865),
 ('all', 835),
 ('could', 825),
 ('would', 815),
 ('been', 759),
 ('him', 758),
 ('Mrs', 699),
 ('.--', 685),
 ('on', 677),
 ('any', 651),
 ('my', 619),
 ('no', 616),
 ('Miss', 592),
 ('were', 591)]

In [12]:
import re

def makeAlphaFreqDist(words):
 adist = FreqDist()
 pattern = re.compile('.*[^a-z].*')
 for word in words:
  if not pattern.match(word):
   adist.update([word])
 return adist

In [13]:
adist = makeAlphaFreqDist(emmawords)
adist.most_common(50)

[('to', 5183),
 ('the', 4844),
 ('and', 4672),
 ('of', 4279),
 ('a', 3004),
 ('was', 2385),
 ('her', 2381),
 ('it', 2128),
 ('in', 2118),
 ('not', 2101),
 ('be', 1970),
 ('she', 1778),
 ('that', 1730),
 ('you', 1677),
 ('had', 1606),
 ('as', 1387),
 ('he', 1365),
 ('for', 1321),
 ('have', 1301),
 ('is', 1220),
 ('with', 1187),
 ('very', 1151),
 ('but', 1148),
 ('his', 1088),
 ('at', 997),
 ('s', 933),
 ('so', 924),
 ('all', 835),
 ('could', 825),
 ('would', 815),
 ('been', 759),
 ('him', 758),
 ('on', 677),
 ('any', 651),
 ('my', 619),
 ('no', 616),
 ('were', 591),
 ('do', 580),
 ('must', 564),
 ('me', 564),
 ('will', 559),
 ('by', 558),
 ('which', 552),
 ('from', 535),
 ('or', 490),
 ('said', 484),
 ('much', 478),
 ('more', 464),
 ('an', 452),
 ('are', 447)]

In [14]:
for word,freq in adist.most_common(30):
 print(word, freq)

to 5183
the 4844
and 4672
of 4279
a 3004
was 2385
her 2381
it 2128
in 2118
not 2101
be 1970
she 1778
that 1730
you 1677
had 1606
as 1387
he 1365
for 1321
have 1301
is 1220
with 1187
very 1151
but 1148
his 1088
at 997
s 933
so 924
all 835
could 825
would 815


In [15]:
# import Moby Dick

from nltk.book import text1
text1

<Text: Moby Dick by Herman Melville 1851>

In [16]:
# creating the frequency distribution
mbdist = FreqDist(text1)
# printing all of the keys
mbdist.keys()



In [17]:
# get the frequency of words 'nations', 'magic' and 'papers'
print("frequency of the word \'nations\': "+str(mbdist['nations']))
print("frequency of the word \'magic\': "+str(mbdist['magic']))
print("frequency of the word \'papers\': "+str(mbdist['papers']))

frequency of the word 'nations': 12
frequency of the word 'magic': 9
frequency of the word 'papers': 10


In [18]:
mbdFreq = makeAlphaFreqDist(text1)
mbdFreq.most_common(50)

[('the', 13721),
 ('of', 6536),
 ('and', 6024),
 ('a', 4569),
 ('to', 4542),
 ('in', 3916),
 ('that', 2982),
 ('his', 2459),
 ('it', 2209),
 ('s', 1739),
 ('is', 1695),
 ('he', 1661),
 ('with', 1659),
 ('was', 1632),
 ('as', 1620),
 ('all', 1462),
 ('for', 1414),
 ('this', 1280),
 ('at', 1231),
 ('by', 1137),
 ('but', 1113),
 ('not', 1103),
 ('him', 1058),
 ('from', 1052),
 ('be', 1030),
 ('on', 1005),
 ('so', 918),
 ('whale', 906),
 ('one', 889),
 ('you', 841),
 ('had', 767),
 ('have', 760),
 ('there', 715),
 ('or', 697),
 ('were', 680),
 ('now', 646),
 ('which', 640),
 ('me', 627),
 ('like', 624),
 ('their', 612),
 ('are', 586),
 ('they', 586),
 ('an', 582),
 ('some', 578),
 ('then', 571),
 ('my', 564),
 ('when', 553),
 ('upon', 538),
 ('out', 529),
 ('into', 520)]