# Gutenberg corpus with NLTK package
### Step 1: Download and Install Gutenberg Corpus

    Install NLTK: run sudo pip install -U nltk
    Install Numpy (optional): run sudo pip install -U numpy
    Test installation: run python then type import nltk

### Step 2: Import source data from the package NLTK
#### The Brown Corpus was the first million-word electronic corpus of English, created in 1961 at Brown University. This corpus contains text from 500 sources, and the sources have been categorized by genre, such as news, editorial, and so on.

In [4]:
import nltk as NLTK
import warnings
# Suppress every warning category for the rest of the session so that
# library warnings do not clutter the cell outputs.
warnings.filterwarnings('ignore')

In [6]:
from nltk.corpus import brown
# Category labels of the Brown corpus; `cats` is reused further down as the
# list of conditions for the tabulated conditional frequency distribution.
cats = brown.categories()
print(cats)

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


### Step 3: Relative frequency distribution and conditional frequency of modals

In [8]:
# Frequency distribution of the modal verbs in the Brown 'news' category.
modals = ['can', 'could', 'may', 'might', 'will', 'would', 'should']
text = brown.words(categories='news')
# Tokens are lower-cased before counting, so 'Will' and 'will' share a count.
fdist = NLTK.FreqDist(token.lower() for token in text)
print('Frequency Distribution of modals \n')
# Emit all "modal: count" pairs on one line, each followed by a single space.
print(' '.join('{}: {}'.format(modal, fdist[modal]) for modal in modals), end=' ')

Frequency Distribution of modals 

can: 94 could: 87 may: 93 might: 38 will: 389 would: 246 should: 61 

In [9]:
# Pair every token of the Brown corpus with the category it comes from.
# Tokens are counted exactly as they appear (no lower-casing), so 'May' and
# 'may' are distinct samples in this distribution.
genre_word_pairs = (
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre)
)
cFrqDist = NLTK.ConditionalFreqDist(genre_word_pairs)

# One table row per category, one column per modal verb.
genres = cats
print('Conditional frequency Distribution for modals across all genres\n')
cFrqDist.tabulate(conditions=genres, samples=modals)

Conditional frequency Distribution for modals across all genres

                   can  could    may  might   will  would should 
      adventure     46    151      5     58     50    191     15 
 belles_lettres    246    213    207    113    236    392    102 
      editorial    121     56     74     39    233    180     88 
        fiction     37    166      8     44     52    287     35 
     government    117     38    153     13    244    120    112 
        hobbies    268     58    131     22    264     78     73 
          humor     16     30      8      8     13     56      7 
        learned    365    159    324    128    340    319    171 
           lore    170    141    165     49    175    186     76 
        mystery     42    141     13     57     20    186     29 
           news     93     86     66     38    389    244     59 
       religion     82     59     78     12     71     68     45 
        reviews     45     40     45     26     58     47     18 
        rom

### Inaugural corpus

In [12]:
from nltk.corpus import inaugural

In [13]:
# File identifiers of the inaugural corpus (one text file per address).
inaguralCats = inaugural.fileids()
print(inaguralCats)

['1789-Washington.txt', '1793-Washington.txt', '1797-Adams.txt', '1801-Jefferson.txt', '1805-Jefferson.txt', '1809-Madison.txt', '1813-Madison.txt', '1817-Monroe.txt', '1821-Monroe.txt', '1825-Adams.txt', '1829-Jackson.txt', '1833-Jackson.txt', '1837-VanBuren.txt', '1841-Harrison.txt', '1845-Polk.txt', '1849-Taylor.txt', '1853-Pierce.txt', '1857-Buchanan.txt', '1861-Lincoln.txt', '1865-Lincoln.txt', '1869-Grant.txt', '1873-Grant.txt', '1877-Hayes.txt', '1881-Garfield.txt', '1885-Cleveland.txt', '1889-Harrison.txt', '1893-Cleveland.txt', '1897-McKinley.txt', '1901-McKinley.txt', '1905-Roosevelt.txt', '1909-Taft.txt', '1913-Wilson.txt', '1917-Wilson.txt', '1921-Harding.txt', '1925-Coolidge.txt', '1929-Hoover.txt', '1933-Roosevelt.txt', '1937-Roosevelt.txt', '1941-Roosevelt.txt', '1945-Roosevelt.txt', '1949-Truman.txt', '1953-Eisenhower.txt', '1957-Eisenhower.txt', '1961-Kennedy.txt', '1965-Johnson.txt', '1969-Nixon.txt', '1973-Nixon.txt', '1977-Carter.txt', '1981-Reagan.txt', '1985-Reaga

In [14]:
# Fetch the 'stopwords' data package before importing its corpus reader.
NLTK.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /Users/raam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
# Keep only the long (more than 7 characters), purely alphabetic,
# non-stopword tokens of the '1789-Washington.txt' address.
#
# The stopword list is materialised once as a set: evaluating
# stopwords.words('english') inside the comprehension would rebuild the list
# and scan it linearly for every single token.
english_stopwords = set(stopwords.words('english'))
inauguralText = [
    word
    for word in inaugural.words(fileids='1789-Washington.txt')
    if len(word) > 7
    and word.isalpha()
    # NLTK's English stopword list is lower-case, so compare case-insensitively
    # to also drop capitalised stopwords such as 'Themselves'.
    and word.lower() not in english_stopwords
]

In [16]:
# Rank the filtered inaugural words by the count each one has in `fdist`,
# highest first. Despite its name, `topTen` holds the full ranking as a list
# of (word, count) pairs; ties keep first-occurrence order (stable sort).
#
# NOTE(review): `fdist` is the distribution built from the Brown 'news'
# category with lower-cased keys, so these are Brown counts, and any inaugural
# word containing an upper-case letter looks up as 0. Confirm whether ranking
# by the inaugural text's own frequencies was intended instead.
brown_news_counts = {word: fdist[word] for word in inauguralText}
topTen = sorted(brown_news_counts.items(), key=lambda pair: pair[1], reverse=True)
print(topTen)

[('government', 73), ('national', 72), ('administration', 62), ('expected', 40), ('received', 33), ('department', 32), ('possible', 29), ('together', 27), ('republican', 26), ('question', 22), ('resolution', 21), ('necessary', 18), ('executive', 18), ('official', 17), ('security', 17), ('effective', 15), ('important', 14), ('decision', 13), ('addition', 12), ('considered', 12), ('personal', 12), ('citizens', 11), ('required', 11), ('included', 11), ('advantage', 10), ('ordinary', 9), ('conflict', 8), ('constitutional', 8), ('therefore', 8), ('advanced', 7), ('established', 7), ('attention', 7), ('sufficient', 6), ('confidence', 6), ('resulted', 6), ('compared', 6), ('governments', 6), ('recommendations', 6), ('experience', 6), ('produced', 6), ('essential', 5), ('employed', 5), ('voluntary', 5), ('circumstances', 5), ('entering', 5), ('prospect', 5), ('judgment', 5), ('summoned', 4), ('experienced', 4), ('consulted', 4), ('independent', 4), ('accomplished', 4), ('communities', 4), ('th

In [18]:
# Frequency of each (lower-cased) word within the filtered inaugural text.
inauguralFDist = NLTK.FreqDist(inTxt.lower() for inTxt in inauguralText)

# Take the first ten words of the ranking held in `topTen` (a list of
# (word, count) pairs sorted by count, descending) instead of hand-copying
# them from that cell's printed output.
topWords = [word for word, _count in topTen[:10]]

print('Frequency Distribution of Top 10 words whose character length in > 7 \n')
# One "word: count" pair per line, with no stray leading/trailing spaces.
for top10 in topWords:
    print(top10 + ':', inauguralFDist[top10])

Frequency Distribution of Top 10 words whose character length in > 7 

government: 8 
 national: 1 
 administration: 2 
 expected: 1 
 received: 1 
 department: 2 
 possible: 1 
 together: 1 
 republican: 1 
 question: 1 
 