<a href="https://colab.research.google.com/github/RajkumarGalaxy/rajkumar_nltk/blob/master/frequency_of_words.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Frequency of words in a document
## A Python NLP example
### Using the NLTK API
#### Created in [Colab](https://colab.research.google.com)
Created by [Rajkumar Lakshmanamoorthy](https://github.com/RajkumarGalaxy/)

In [60]:
# import necessary libraries
import nltk
from nltk.probability import FreqDist
import pandas as pd
import numpy as np
from pprint import pprint

### Download the corpora

In [61]:
# Fetch the Gutenberg corpus package; 'bryant-stories.txt' is read from it below.
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [62]:
# Fetch the stop-word lists; the English list is used below to filter tokens.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Fetch words from the corpus

In [63]:
# Read the already-tokenized 'bryant-stories.txt' text from the Gutenberg corpus.
words = nltk.Text(nltk.corpus.gutenberg.words('bryant-stories.txt'))

# English stop words, held in a set for fast membership tests.
# The corpus reader is addressed through `nltk.corpus` because the bare name
# `stopwords` is never imported in this notebook and would raise NameError
# on a fresh kernel.
stop = set(nltk.corpus.stopwords.words('english'))
pprint(stop)
print(len(stop))

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [64]:
# Keep only purely alphabetic tokens, lower-cased, then drop the stop words.
# The stop-word test runs on the lower-cased form, so no second lower() is needed.
lowered_tokens = (token.lower() for token in words if token.isalpha())
words = [token for token in lowered_tokens if token not in stop]

In [65]:
# Size of the filtered corpus: every remaining token vs. distinct tokens.
total_count = len(words)
vocabulary_count = len(set(words))
print('Total number of words: ', total_count)
print('Vocabulary count:      ', vocabulary_count)

Total number of words:  21718
Vocabulary count:       3688


### Find most frequent words in the document

In [66]:
# Count occurrences of every remaining word.
fdist = FreqDist(words)
top = 50  # number of most frequent words to report

print('frequency            --- count of the word')
print('normalized_frequency --- ratio of frequency to vocabulary count')
print('count_document_ratio --- ratio of frequency to total count of words in document')
print('-'*80)

# Build the table in one step from the (word, count) pairs instead of filling a
# pre-allocated frame row by row: the columns get proper numeric dtypes, and no
# empty NaN rows are left over when the vocabulary has fewer than `top` words.
table = pd.DataFrame(fdist.most_common(top), columns=['word', 'frequency'])
table['frequency'] = table['frequency'].astype('int64')
table.index = pd.RangeIndex(1, len(table) + 1)  # 1-based rank, as before

# len(fdist) is the number of distinct words; len(words) is the total token count.
table['normalized_frequency'] = table['frequency'] / len(fdist)
table['count_document_ratio'] = table['frequency'] / len(words)
print(table)

frequency            --- count of the word
normalized_frequency --- ratio of frequency to vocabulary count
count_document_ratio --- ratio of frequency to total count of words in document
--------------------------------------------------------------------------------
           word frequency normalized_frequency count_document_ratio
1        little       597             0.161876            0.0274887
2          said       453             0.122831            0.0208583
3          came       191            0.0517896           0.00879455
4           one       183            0.0496204           0.00842619
5         could       158            0.0428416           0.00727507
6          king       141            0.0382321           0.00649231
7          went       122            0.0330803           0.00561746
8         would       112            0.0303688           0.00515701
9         great       110            0.0298265           0.00506492
10          day       107             0.029013      

#### We will look at some more advanced techniques later