### Accessing Copus Data

In [1]:
# Load the Brown Corpus
from nltk.corpus import brown

# Find Categories
print(len(brown.categories()))
print(brown.categories())


15
['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [2]:
# Get Code snippet from where mystery categoreis
brown.sents(categories ='mystery')

[['There', 'were', 'thirty-eight', 'patients', 'on', 'the', 'bus', 'the', 'morning', 'I', 'left', 'for', 'Hanover', ',', 'most', 'of', 'them', 'disturbed', 'and', 'hallucinating', '.'], ['An', 'interne', ',', 'a', 'nurse', 'and', 'two', 'attendants', 'were', 'in', 'charge', 'of', 'us', '.'], ...]

In [3]:
# get POS tagged sentences
brown.tagged_sents(categories ='mystery')

[[('There', 'EX'), ('were', 'BED'), ('thirty-eight', 'CD'), ('patients', 'NNS'), ('on', 'IN'), ('the', 'AT'), ('bus', 'NN'), ('the', 'AT'), ('morning', 'NN'), ('I', 'PPSS'), ('left', 'VBD'), ('for', 'IN'), ('Hanover', 'NP'), (',', ','), ('most', 'AP'), ('of', 'IN'), ('them', 'PPO'), ('disturbed', 'VBN'), ('and', 'CC'), ('hallucinating', 'VBG'), ('.', '.')], [('An', 'AT'), ('interne', 'NN'), (',', ','), ('a', 'AT'), ('nurse', 'NN'), ('and', 'CC'), ('two', 'CD'), ('attendants', 'NNS'), ('were', 'BED'), ('in', 'IN'), ('charge', 'NN'), ('of', 'IN'), ('us', 'PPO'), ('.', '.')], ...]

In [4]:
# get the Sentences in natural from
from nltk.tokenize import sent_tokenize

sentences = brown.sents(categories='mystery')
sentences = [' '.join(sent_tokenize) for sent_tokenize in sentences]

#print(sentences) # view Full text

#print(sentences[:5]) # Viewing fist 5 sentences

In [5]:
# Get the Nouns from tagged words
tagged_words = brown.tagged_words(categories='mystery')
nouns = [(word,tag) for word, tag in tagged_words if any(noun_tag in tag for noun_tag in ['NP','NN'])]

#Viewing the first 10 snouns
print(nouns[0:10])

[('patients', 'NNS'), ('bus', 'NN'), ('morning', 'NN'), ('Hanover', 'NP'), ('interne', 'NN'), ('nurse', 'NN'), ('attendants', 'NNS'), ('charge', 'NN'), ('bus', 'NN'), ('window', 'NN')]


In [6]:
from nltk import FreqDist

# Building Frequency distribution for nouns

nouns_freq = FreqDist([word for word,tag in nouns])

#Viewing top 10 occuring nouns
nouns_freq.most_common(10)

[('man', 106),
 ('time', 82),
 ('door', 80),
 ('car', 69),
 ('room', 65),
 ('Mr.', 63),
 ('way', 61),
 ('office', 50),
 ('eyes', 48),
 ('hand', 46)]

In [13]:
# Accessing the Reuters Corpus
from nltk.corpus import reuters

# Get the length of the categories
# print(len(reuters.categories()))
# print the Categories 
# print(reuters.categories())
# Get sentences in housing and income
sentences = reuters.sents(categories = ['housing','income'])

# Display in Natural Language
sentences = [' '.join(sent_tokenize) for sent_tokenize in sentences]

# Viewing first 5 sentences
# sentences[0:5]

# Filed ID based access 
# print(reuters.fileids(categories = ['housing','income']))

# print(reuters.sents(fileids = [u'test/16118',u'test/18534']))

sentences[:4]

["YUGOSLAV ECONOMY WORSENED IN 1986 , BANK DATA SHOWS National Bank economic data for 1986 shows that Yugoslavia ' s trade deficit grew , the inflation rate rose , wages were sharply higher , the money supply expanded and the value of the dinar fell .",
 'The trade deficit for 1986 was 2 . 012 billion dlrs , 25 . 7 pct higher than in 1985 .',
 'The trend continued in the first three months of this year as exports dropped by 17 . 8 pct , in hard currency terms , to 2 . 124 billion dlrs .',
 'Yugoslavia this year started quoting trade figures in dinars based on current exchange rates , instead of dollars based on a fixed exchange rate of 264 . 53 dinars per dollar .']

In [11]:
# Accessing Wordnet Corpus
from nltk.corpus import wordnet as wn

word = 'hike'

# get word synsets
word_synsets = wn.synsets(word)
print(word_synsets)

[Synset('hike.n.01'), Synset('rise.n.09'), Synset('raise.n.01'), Synset('hike.v.01'), Synset('hike.v.02')]


In [14]:
from nltk.corpus import gutenberg
bible = gutenberg.open('bible-kjv.txt')

bible = bible.readlines()

token = [item.split() for item in bible]

token[0:1]

[['[The', 'King', 'James', 'Bible]']]

In [15]:
from nltk import pos_tag

text = "It  a beautiful home"
print(pos_tag(text.split()))


[('It', 'PRP'), ('a', 'DT'), ('beautiful', 'JJ'), ('home', 'NN')]


In [16]:
# Print Stop words
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))
print(stopwords)

{"hadn't", 'few', "shouldn't", 'won', "don't", 'ours', 'isn', 'who', 'the', 'being', 'if', 'mightn', "you've", 'no', "she's", 'o', 'about', 'same', 'its', 'we', 'wasn', 'this', 'against', 'up', 'she', 'where', 'my', 'himself', 'be', 'there', 'm', "aren't", 'hasn', 'when', 'some', 'been', 'am', 'that', 'by', 'other', 'me', 'him', "wasn't", 'for', 'which', 'wouldn', 'such', 've', 'don', 're', 'each', 'further', 'than', 'hers', 'or', 'down', 'haven', "you're", 'herself', 'doing', "isn't", 'y', 'before', 'with', 'her', 'on', "mightn't", 'only', "that'll", 'll', 'aren', 'off', 'an', 'out', 'your', 'themselves', 'here', 'all', "hasn't", 'ourselves', 'did', 'more', 'nor', 'in', 'just', 'not', "won't", 'very', 'what', 'into', "mustn't", 'will', 'of', 'yourself', 'shan', 'a', "needn't", "wouldn't", 'you', 'itself', 'has', 'was', 'then', "you'll", 'now', 'because', 'were', 'should', 'ma', 'having', 'whom', 'can', 'didn', "shan't", 'most', 'from', 'while', 'his', 'those', 'at', 'how', 'they', "di

In [17]:
from nltk.stem import PorterStemmer

words = ['game','gaming','gamed','games']
stemwords = [PorterStemmer().stem(v) for v in words]
print(stemwords)

['game', 'game', 'game', 'game']


### Corpus_Entailments

In [18]:
# Entaiments : An entailment is an Implication.
# Example : Looking implices seeing, Buying Implies choosing and paying 

from nltk.corpus import wordnet as wn
print(wn.synset('look.v.01').entailments()) # out : [Synset('see.v.01')]

print(wn.synset('listen.v.01').entailments()) # output : [Synset('hear.v.01')]

print(wn.synset('buy.v.01').entailments()) #  output : [Synset('choose.v.01'), Synset('pay.v.01')]

[Synset('see.v.01')]
[Synset('hear.v.01')]
[Synset('choose.v.01'), Synset('pay.v.01')]


### Corpus_FrequencyDistribution

In [19]:
import nltk
from nltk.corpus import webtext
#print(webtext.fileids())

fileid = 'singles.txt'
wbt_words = webtext.words('singles.txt') # Get the words based on file id
# print(wbt_words)

fdist = nltk.FreqDist(wbt_words) 

# Count of the maximum appearing token
print(fdist.max(),fdist[fdist.max()])

#Total number of distinct tokens in the bag
print("bag:",fdist.N())

# Most common 10 words in the bag
print(fdist.most_common(10))

# Plot Graph
fdist.plot(cumulative = True)


, 539
bag: 4867
[(',', 539), ('.', 353), ('/', 110), ('for', 99), ('and', 74), ('to', 74), ('lady', 68), ('-', 66), ('seeks', 60), ('a', 52)]


<Figure size 640x480 with 1 Axes>

<matplotlib.axes._subplots.AxesSubplot at 0x22c6f6eeb88>

### Corpus_MeronymsAndHolonyms

In [20]:
# Meronym : Meronym is a word that denotes a constituent part or a member of something.
# Example : apple is a meronym of apple tree

# Holonymous : The opposite of a meronym is a holonym - The name of the whole of which the meronym
# Appletree is a holonym of apple 

from nltk.corpus import wordnet as wn
wn.synset('bed.n.01').part_holonyms()

wn.synset('bed.n.01').part_meronyms()


[Synset('bedstead.n.01'), Synset('mattress.n.01')]

### Corpus_WordSimilarity

In [None]:
# Word Similarity : Compute the similarity betwen words based on the distance between words in the wordNet network.
# The Smaller the distance,the more similar the words

from nltk.corpus import wordnet as wn

dog = wn.synset('dog.v.01')
cat = wn.synset('cat.v.01')

print(wn.path_similarity(dog,cat))

phone = wn.synset('phone.v.01')

print(wn.path_similarity(phone,dog))

In [None]:
## Other Corpus data

In [21]:
# Import corpus
from nltk.corpus import brown

# Get the categories in the corpus
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [22]:
# Get the length of the categories
len(brown.categories())

15

In [23]:
# Get the Senetence code snippet where category is 'mystery'
sentences = brown.sents(categories = 'mystery')

# Get the POS tagged sentences
brown.tagged_sents(categories = 'mystery')

[[('There', 'EX'), ('were', 'BED'), ('thirty-eight', 'CD'), ('patients', 'NNS'), ('on', 'IN'), ('the', 'AT'), ('bus', 'NN'), ('the', 'AT'), ('morning', 'NN'), ('I', 'PPSS'), ('left', 'VBD'), ('for', 'IN'), ('Hanover', 'NP'), (',', ','), ('most', 'AP'), ('of', 'IN'), ('them', 'PPO'), ('disturbed', 'VBN'), ('and', 'CC'), ('hallucinating', 'VBG'), ('.', '.')], [('An', 'AT'), ('interne', 'NN'), (',', ','), ('a', 'AT'), ('nurse', 'NN'), ('and', 'CC'), ('two', 'CD'), ('attendants', 'NNS'), ('were', 'BED'), ('in', 'IN'), ('charge', 'NN'), ('of', 'IN'), ('us', 'PPO'), ('.', '.')], ...]

In [24]:
# Get the Sentence in Natural Language
sentences = [' '.join(sent_tokenize) for sent_tokenize in sentences]

# Get first five sentences
sentences[0:5]

['There were thirty-eight patients on the bus the morning I left for Hanover , most of them disturbed and hallucinating .',
 'An interne , a nurse and two attendants were in charge of us .',
 "I felt lonely and depressed as I stared out the bus window at Chicago's grim , dirty West Side .",
 'It seemed incredible , as I listened to the monotonous drone of voices and smelled the fetid odors coming from the patients , that technically I was a ward of the state of Illinois , going to a hospital for the mentally ill .',
 'I suddenly thought of Mary Jane Brennan , the way her pretty eyes could flash with anger , her quiet competence , the gentleness and sweetness that lay just beneath the surface of her defenses .']

In [27]:
# Get the Noun form tagged words
taggedwords = brown.tagged_words(categories = 'mystery')

nouns = [(word,tags) for word,tags in taggedwords if any(noun_tag in tags for noun_tag in ['NP','NN'])]

nouns[:10]

[('patients', 'NNS'),
 ('bus', 'NN'),
 ('morning', 'NN'),
 ('Hanover', 'NP'),
 ('interne', 'NN'),
 ('nurse', 'NN'),
 ('attendants', 'NNS'),
 ('charge', 'NN'),
 ('bus', 'NN'),
 ('window', 'NN')]

In [28]:
from nltk.probability import FreqDist

# Get First 10 occuring Noun

nouns_freq = FreqDist([word for word,tag in nouns])

nouns_freq.most_common(10)

[('man', 106),
 ('time', 82),
 ('door', 80),
 ('car', 69),
 ('room', 65),
 ('Mr.', 63),
 ('way', 61),
 ('office', 50),
 ('eyes', 48),
 ('hand', 46)]

In [29]:
from nltk.corpus import reuters

# get Sentenses
sentences = reuters.sents(categories = ['housing','income'])

sentences

[['YUGOSLAV', 'ECONOMY', 'WORSENED', 'IN', '1986', ',', 'BANK', 'DATA', 'SHOWS', 'National', 'Bank', 'economic', 'data', 'for', '1986', 'shows', 'that', 'Yugoslavia', "'", 's', 'trade', 'deficit', 'grew', ',', 'the', 'inflation', 'rate', 'rose', ',', 'wages', 'were', 'sharply', 'higher', ',', 'the', 'money', 'supply', 'expanded', 'and', 'the', 'value', 'of', 'the', 'dinar', 'fell', '.'], ['The', 'trade', 'deficit', 'for', '1986', 'was', '2', '.', '012', 'billion', 'dlrs', ',', '25', '.', '7', 'pct', 'higher', 'than', 'in', '1985', '.'], ...]

In [30]:
# Filed ID based access 

reuters.fileids(categories = ['housing','income'])

['test/16118',
 'test/18534',
 'test/18540',
 'test/18664',
 'test/18665',
 'test/18672',
 'test/18911',
 'test/19875',
 'test/20106',
 'test/20116',
 'training/1035',
 'training/1036',
 'training/10602',
 'training/10604',
 'training/11170',
 'training/11665',
 'training/2618',
 'training/29',
 'training/3105',
 'training/3708',
 'training/3720',
 'training/3723',
 'training/3898',
 'training/5883',
 'training/5886',
 'training/6000',
 'training/6067',
 'training/6197',
 'training/7005',
 'training/7006',
 'training/7015',
 'training/7036',
 'training/7098',
 'training/7099',
 'training/9615']

In [31]:
reuters.sents(fileids = [u'test/16118',u'test/18534'])

[['YUGOSLAV', 'ECONOMY', 'WORSENED', 'IN', '1986', ',', 'BANK', 'DATA', 'SHOWS', 'National', 'Bank', 'economic', 'data', 'for', '1986', 'shows', 'that', 'Yugoslavia', "'", 's', 'trade', 'deficit', 'grew', ',', 'the', 'inflation', 'rate', 'rose', ',', 'wages', 'were', 'sharply', 'higher', ',', 'the', 'money', 'supply', 'expanded', 'and', 'the', 'value', 'of', 'the', 'dinar', 'fell', '.'], ['The', 'trade', 'deficit', 'for', '1986', 'was', '2', '.', '012', 'billion', 'dlrs', ',', '25', '.', '7', 'pct', 'higher', 'than', 'in', '1985', '.'], ...]