# Frequency Distribution
***

A frequency distribution records how often each word appears in a text. NLTK supports this through its `FreqDist` and `ConditionalFreqDist` classes.

##### Getting started

In [1]:
import nltk
from nltk import FreqDist, ConditionalFreqDist

In [2]:
# Tongue-twister used as a small sample text for the examples below.
s = (
    "I saw Susie sitting in a shoeshine shop. "
    "Where she shines she sits, "
    "and where she sits she shines."
)

##### Split the string into a list of words

In [3]:
# Naive tokenization: cut the sentence at every single space.
# Case is preserved and punctuation stays attached to the word, so
# 'Where'/'where' and 'shines'/'shines.' end up as distinct tokens.
s_low = s.split(sep=' ')

In [4]:
# Show the raw tokens produced by the split above.
print(s_low)

['I', 'saw', 'Susie', 'sitting', 'in', 'a', 'shoeshine', 'shop.', 'Where', 'she', 'shines', 'she', 'sits,', 'and', 'where', 'she', 'sits', 'she', 'shines.']


##### Frequency distribution of list of words

In [5]:
# Count how many times each token occurs in the sample sentence.
s_fd = FreqDist(samples=s_low)

In [6]:
# Display the distribution: the repr shows token -> count pairs and
# abbreviates the remaining entries with '...'.
s_fd

FreqDist({'she': 4, 'I': 1, 'saw': 1, 'Susie': 1, 'sitting': 1, 'in': 1, 'a': 1, 'shoeshine': 1, 'shop.': 1, 'Where': 1, ...})

##### Top n most common words

In [7]:
# The two most frequent tokens as (token, count) pairs.
# Every token except 'she' occurs exactly once in this sentence, so the
# second entry is just one of the many tokens tied at a count of 1.
s_fd.most_common(2)

[('she', 4), ('I', 1)]

##### Frequency of words in a corpus

In [8]:
from nltk.corpus import brown

In [9]:
# List the category labels available in the Brown corpus.
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [10]:
# Tokens of every Brown corpus text filed under the 'humor' category.
# NOTE: needs the Brown corpus data to be installed locally; if this raises
# LookupError, run nltk.download('brown') once and re-run the cell.
lw_humor = brown.words(
    categories=['humor'],
)

In [11]:
# Printing the word list shows only its first few tokens followed by '...'.
print(lw_humor)

['It', 'was', 'among', 'these', 'that', 'Hinkle', ...]


In [12]:
# Total number of tokens in the humor category; punctuation marks are
# tokens too, as the most-common listing further down shows.
len(lw_humor)

21695

In [13]:
# Count the occurrences of every token in the humor category.
fd_humor = FreqDist(samples=lw_humor)

In [14]:
# Ten most frequent tokens as (token, count) pairs.
# Nothing was filtered out beforehand, so punctuation and very common
# words are counted alongside everything else.
fd_humor.most_common(10)

[(',', 1331),
 ('the', 930),
 ('.', 877),
 ('of', 515),
 ('and', 512),
 ('a', 505),
 ('to', 463),
 ('``', 343),
 ("''", 340),
 ('in', 334)]

##### List of words that appear only once in a text (hapaxes)

In [15]:
# Every token that occurs exactly once in the humor category.
# NOTE(review): the full list is long; consider slicing it (e.g. [:20])
# when only a preview is needed.
fd_humor.hapaxes()

['Hinkle',
 'fancying',
 'marriages',
 'listed',
 'Mormon',
 'Beard',
 'instigation',
 'fourth',
 'victim',
 'beard',
 'secured',
 'report',
 'trail',
 'Cal',
 'gas',
 'main',
 'boulevards',
 'attendants',
 'Ye',
 'Olde',
 'Gasse',
 'Filling',
 'Station',
 'Avocado',
 'Avenue',
 'paused',
 'route',
 'Diego',
 'headed',
 'direction',
 'Juan',
 'Capistrano',
 'By-the-Sea',
 'quaint',
 'Spanish',
 'Mission',
 'Drive-in',
 'eating',
 'tamale',
 'convertible',
 'quest',
 'stolen',
 'rug',
 'robe',
 'pawnshop',
 'Glendale',
 'placed',
 'informing',
 'questioning',
 'apprehended',
 'larceny',
 'suspect',
 'murders',
 'uncovered',
 'meaningless',
 'phrases',
 'souls',
 'detective',
 'commenting',
 'behavior',
 'myriad',
 'citizens',
 'community',
 'mentally',
 'unhinged',
 'harmless',
 'awaiting',
 'failed',
 'endeavoring',
 'wrists',
 'fainted',
 'sight',
 'blood',
 'authorities',
 'significance',
 'episode',
 'offered',
 'whisky',
 'lifelong',
 'teetotaler',
 'vaulting',
 'mayorship',
 'esca

##### Most frequent word

In [16]:
# Token with the highest count. Punctuation is counted like any other
# token, which is why the result here is a comma.
fd_humor.max()

','

##### Relative frequency of word

In [17]:
# Share of all counted tokens that are the word 'in'; this matches the
# count-divided-by-total computation shown at the end of this section.
fd_humor.freq('in')

0.01539525236229546

##### Absolute Frequency of a word

In [18]:
# Number of times the word 'in' occurs.
# Indexing is used instead of .get() so that a word absent from the text
# yields a count of 0 rather than None.
fd_humor['in']

334

##### Normalized frequency of a word in the text (relative frequency, computed manually)

In [19]:
# Relative frequency computed by hand: occurrences of the word divided by
# the total number of tokens counted by the distribution.
# Indexing returns 0 for an unseen word, whereas .get() would return None
# and make the division raise TypeError. N() is the total the distribution
# itself recorded, so the ratio does not depend on the original word list
# still being in scope.
fd_humor['in'] / fd_humor.N()

0.01539525236229546