In [1]:
import nltk
from nltk.corpus import brown
from collections import Counter

In [2]:
# Download Brown corpus if not already downloaded
nltk.download('brown')

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\Praveena\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [3]:
# 1. Explore the sections (categories) in the Brown Corpus
sections = brown.categories()
print("Brown Corpus Sections:", sections)

Brown Corpus Sections: ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [4]:
# 2. Categorize documents in each section
section_documents = {section: len(brown.fileids(categories=section)) for section in sections}
print("\nNumber of documents in each section:")
for section, count in section_documents.items():
    print(f"{section}: {count}")


Number of documents in each section:
adventure: 29
belles_lettres: 75
editorial: 27
fiction: 29
government: 30
hobbies: 36
humor: 9
learned: 80
lore: 48
mystery: 24
news: 44
religion: 17
reviews: 17
romance: 29
science_fiction: 6


In [10]:
# 3. Choose a section and analyze it (e.g., 'news')
section_name = 'news'
file_ids = brown.fileids(categories=section_name)
# Count the number of words in the chosen section
word_count = sum(len(brown.words(fileid)) for fileid in file_ids)
print("Word count is",word_count)
# Count the number of sentences in the chosen section
sentence_count = sum(len(brown.sents(fileid)) for fileid in file_ids)
print("Sentence count is",sentence_count)
# Find all genres in the chosen section
unique_genres = set(brown.categories(fileids=file_ids))
print("No. of Genres:",unique_genres)


Word count is 100554
Sentence count is 4623
No. of Genres: {'news'}


In [12]:
# 4. Count 'wh' words in the chosen section
wh_words = ['what', 'why', 'who', 'whom', 'where', 'when', 'which', 'whose', 'whether', 'how']
wh_word_count = Counter(word.lower() for fileid in file_ids for word in brown.words(fileid) if word.lower() in wh_words)
print(f"\n'Wh' word counts in '{section_name}' section:")
for wh_word, count in wh_word_count.items():
    print(f"{wh_word}: {count}")


'Wh' word counts in 'news' section:
which: 245
when: 169
who: 268
whether: 18
where: 59
what: 95
why: 14
whom: 8
how: 42
whose: 22


In [14]:
# 1. Get all file IDs in the corpus
file_ids = brown.fileids()
print("\nFile IDs:")
print(file_ids)

# 2. Get file IDs corresponding to specific categories
categories = brown.categories()
print("\nCategories in Brown Corpus:")
print(categories)

file_ids_in_news = brown.fileids(categories=['news'])
print("\nFile IDs in 'news' category:")
print(file_ids_in_news)

# 3. Get categories corresponding to specific file IDs
categories_for_files = brown.categories(fileids=file_ids[:5])
print("\nCategories for first 5 files:")
print(categories_for_files)

# 4. Get raw content of the whole corpus
raw_content = brown.raw()
print("\nRaw content sample:")
print(raw_content[:500])  # Print first 500 characters

# 5. Get raw content of specific files
raw_content_files = brown.raw(fileids=['ca01', 'ca02'])
print("\nRaw content of files 'ca01' and 'ca02':")
print(raw_content_files[:500])  # Print first 500 characters





File IDs:
['ca01', 'ca02', 'ca03', 'ca04', 'ca05', 'ca06', 'ca07', 'ca08', 'ca09', 'ca10', 'ca11', 'ca12', 'ca13', 'ca14', 'ca15', 'ca16', 'ca17', 'ca18', 'ca19', 'ca20', 'ca21', 'ca22', 'ca23', 'ca24', 'ca25', 'ca26', 'ca27', 'ca28', 'ca29', 'ca30', 'ca31', 'ca32', 'ca33', 'ca34', 'ca35', 'ca36', 'ca37', 'ca38', 'ca39', 'ca40', 'ca41', 'ca42', 'ca43', 'ca44', 'cb01', 'cb02', 'cb03', 'cb04', 'cb05', 'cb06', 'cb07', 'cb08', 'cb09', 'cb10', 'cb11', 'cb12', 'cb13', 'cb14', 'cb15', 'cb16', 'cb17', 'cb18', 'cb19', 'cb20', 'cb21', 'cb22', 'cb23', 'cb24', 'cb25', 'cb26', 'cb27', 'cc01', 'cc02', 'cc03', 'cc04', 'cc05', 'cc06', 'cc07', 'cc08', 'cc09', 'cc10', 'cc11', 'cc12', 'cc13', 'cc14', 'cc15', 'cc16', 'cc17', 'cd01', 'cd02', 'cd03', 'cd04', 'cd05', 'cd06', 'cd07', 'cd08', 'cd09', 'cd10', 'cd11', 'cd12', 'cd13', 'cd14', 'cd15', 'cd16', 'cd17', 'ce01', 'ce02', 'ce03', 'ce04', 'ce05', 'ce06', 'ce07', 'ce08', 'ce09', 'ce10', 'ce11', 'ce12', 'ce13', 'ce14', 'ce15', 'ce16', 'ce17', 'ce18', 'ce1

In [15]:
# 6. Get raw content of specific categories
raw_content_categories = brown.raw(categories=['news', 'editorial'])
print("\nRaw content of 'news' and 'editorial' categories:")
print(raw_content_categories[:500])  # Print first 500 characters

# 7. Get all words in the corpus
words_in_corpus = brown.words()
print("\nTotal words in the corpus:")
print(len(words_in_corpus))

# 8. Get words from specific file IDs
words_in_files = brown.words(fileids=['ca01', 'ca02'])
print("\nWords in files 'ca01' and 'ca02':")
print(words_in_files[:50])  # Print first 50 words

# 9. Get words from specific categories
words_in_categories = brown.words(categories=['news', 'editorial'])
print("\nWords in 'news' and 'editorial' categories:")
print(words_in_categories[:50])  # Print first 50 words

# 10. Get all sentences in the corpus
sentences_in_corpus = brown.sents()
print("\nTotal sentences in the corpus:")
print(len(sentences_in_corpus))

# 11. Get sentences from specific file IDs
sentences_in_files = brown.sents(fileids=['ca01', 'ca02'])
print("\nSentences in files 'ca01' and 'ca02':")
print(sentences_in_files[:3])  # Print first 3 sentences

# 12. Get sentences from specific categories
sentences_in_categories = brown.sents(categories=['news', 'editorial'])
print("\nSentences in 'news' and 'editorial' categories:")
print(sentences_in_categories[:3])  # Print first 3 sentences


Raw content of 'news' and 'editorial' categories:


	The/at Fulton/np-tl County/nn-tl Grand/jj-tl Jury/nn-tl said/vbd Friday/nr an/at investigation/nn of/in Atlanta's/np$ recent/jj primary/nn election/nn produced/vbd ``/`` no/at evidence/nn ''/'' that/cs any/dti irregularities/nns took/vbd place/nn ./.


	The/at jury/nn further/rbr said/vbd in/in term-end/nn presentments/nns that/cs the/at City/nn-tl Executive/jj-tl Committee/nn-tl ,/, which/wdt had/hvd over-all/jj charge/nn of/in the/at election/nn ,/, ``/`` deserves/vbz the/at praise/nn and/c

Total words in the corpus:
1161192

Words in files 'ca01' and 'ca02':
['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.', 'The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'ove

In [17]:
# 13. Get sentences from specific file IDs
sentences_files_specific = brown.sents(fileids=['ca01', 'ca02'])
print("\nSentences from files 'ca01' and 'ca02':")
print(sentences_files_specific[:3])





Sentences from files 'ca01' and 'ca02':
[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ['The', 'September-October', 'term', 'jury', 'had', 'been', 'charged', 'by', 'Fulton', 'Superior', 'Court', 'Judge', 'Durwood', 'Pye', 'to', 'investigate', 'reports', 'of', 'possible', '``', 'irregularities', "''", 'in', 'the', 'hard-fought', 'primary', 'which', 'was', 'won', 'by', 'Mayor-nominate', 'Ivan', 'Allen', 'Jr.', '.']]


In [18]:
# 14. Get sentences from specific categories
sentences_categories_specific = brown.sents(categories=['news', 'hobbies'])
print("\nSentences from 'news' and 'hobbies' categories:")
print(sentences_categories_specific[:3])




Sentences from 'news' and 'hobbies' categories:
[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ['The', 'September-October', 'term', 'jury', 'had', 'been', 'charged', 'by', 'Fulton', 'Superior', 'Court', 'Judge', 'Durwood', 'Pye', 'to', 'investigate', 'reports', 'of', 'possible', '``', 'irregularities', "''", 'in', 'the', 'hard-fought', 'primary', 'which', 'was', 'won', 'by', 'Mayor-nominate', 'Ivan', 'Allen', 'Jr.', '.']]


In [19]:
# 15. Get the absolute path of a file
file_path = brown.abspath('ca01')
print("\nAbsolute path of file 'ca01':")
print(file_path)




Absolute path of file 'ca01':
C:\Users\Praveena\AppData\Roaming\nltk_data\corpora\brown\ca01


In [20]:
# 16. Get the encoding of a file
file_encoding = brown.encoding('ca01')
print("\nEncoding of file 'ca01':")
print(file_encoding)




Encoding of file 'ca01':
ascii


In [21]:
# 17. Open a stream for reading a corpus file
file_stream = brown.open('ca01')
print("\nOpened file stream for 'ca01':")
print(file_stream.read(500))  # Read first 500 characters
file_stream.close()




Opened file stream for 'ca01':


	The/at Fulton/np-tl County/nn-tl Grand/jj-tl Jury/nn-tl said/vbd Friday/nr an/at investigation/nn of/in Atlanta's/np$ recent/jj primary/nn election/nn produced/vbd ``/`` no/at evidence/nn ''/'' that/cs any/dti irregularities/nns took/vbd place/nn ./.


	The/at jury/nn further/rbr said/vbd in/in term-end/nn presentments/nns that/cs the/at City/nn-tl Executive/jj-tl Committee/nn-tl ,/, which/wdt had/hvd over-all/jj charge/nn of/in the/at election/nn ,/, ``/`` deserves/vbz the/at praise/nn and/c


In [25]:
# 18. Get the root path of the corpus
root_path = brown.root
print("\nRoot path of the Brown Corpus:")
print(root_path)




Root path of the Brown Corpus:
C:\Users\Praveena\AppData\Roaming\nltk_data\corpora\brown


In [23]:
# 19. Get the contents of the README file of the corpus
readme_content = brown.readme()
print("\nContents of README file:")
print(readme_content)


Contents of README file:
BROWN CORPUS

A Standard Corpus of Present-Day Edited American
English, for use with Digital Computers.

by W. N. Francis and H. Kucera (1964)
Department of Linguistics, Brown University
Providence, Rhode Island, USA

Revised 1971, Revised and Amplified 1979

http://www.hit.uib.no/icame/brown/bcm.html

Distributed with the permission of the copyright holder,
redistribution permitted.

