# Assignment 1 - NLTK and corpus functions

### Subcorpus for Play Script

In [63]:
import nltk
from nltk.corpus import PlaintextCorpusReader

In [64]:
# Point a plaintext corpus reader at the data directory; the '.*' pattern
# selects every file found there.
corpus_root = "./data"
reviews = PlaintextCorpusReader(root=corpus_root, fileids='.*')

# Token sequence (words and punctuation) of the play script.
play_corpus = reviews.words(fileids='play.txt')

In [65]:
# Rebuild the reader and bind the play script's tokens to `playWords`,
# the name the analysis cells below read from.
reviews = PlaintextCorpusReader(fileids='.*', root=corpus_root)
playWords = reviews.words(fileids='play.txt')

#### 1. The length

In [67]:
# Size of the play script measured in tokens, as returned by the corpus reader.
play_corpus_length = len(playWords)
print("The corpus length:", play_corpus_length)

The corpus length: 33264


#### 2. The lexical diversity

In [69]:
def lexical_diversity(playWords):
    """Return the type/token ratio of a token sequence.

    Args:
        playWords: Sized iterable of hashable tokens, e.g. a corpus word list.

    Returns:
        float: Number of distinct tokens divided by the total number of
        tokens, or 0.0 when the sequence is empty.
    """
    total_tokens = len(playWords)
    if total_tokens == 0:
        # An empty text has no vocabulary; report 0.0 instead of raising
        # ZeroDivisionError.
        return 0.0
    return len(set(playWords)) / total_tokens
    
# Report the play script's type/token ratio (distinct tokens / total tokens).
print("The lexical diversity:", lexical_diversity(playWords))

The lexical diversity: 0.12785594035594036


#### 3. Top 10 most frequent words and their counts

In [71]:
from nltk import FreqDist

In [72]:
# Keep only purely alphabetic tokens, lower-cased so that capitalised and
# lower-case spellings of a word share a single count.
filtered_words = [token.lower() for token in playWords if token.isalpha()]
fdist_play = FreqDist(filtered_words)

# Ten highest-count words, most frequent first.
most_common_words = fdist_play.most_common(10)

print("10 most frequent words in play script:")
for word, count in most_common_words:
    print(f"word: '{word}', count: {count}")

10 most frequent words in play script:
word: 'and', count: 720
word: 'the', count: 686
word: 'i', count: 646
word: 'to', count: 539
word: 'a', count: 453
word: 'of', count: 385
word: 'my', count: 355
word: 'that', count: 353
word: 'is', count: 348
word: 'romeo', count: 327


#### 4. Words that are at least 10 characters long and their counts

In [93]:
# Alphabetic tokens of ten or more characters, lower-cased before counting.
play_long_words = [
    token.lower()
    for token in playWords
    if len(token) >= 10 and token.isalpha()
]
fdist_play_long_words = FreqDist(play_long_words)

# most_common() without an argument lists every word, highest count first.
play_long_words_counts = fdist_play_long_words.most_common()

print("Words in the play script that are 10 or more characters long and their counts:")
for word, frequency in play_long_words_counts:
    print(f"{word}: {frequency}")

Words in the play script that are 10 or more characters long and their counts:
servingman: 31
apothecary: 10
servingmen: 7
banishment: 6
churchyard: 6
lamentable: 5
slaughtered: 5
gentlewoman: 4
shakespeare: 3
affections: 3
oppression: 3
inconstant: 3
confession: 3
philosophy: 3
nightingale: 3
unaccustomed: 3
bridegroom: 3
attendants: 2
torchbearers: 2
gentlewomen: 2
households: 2
underneath: 2
maidenhead: 2
disposition: 2
misfortune: 2
discourses: 2
henceforth: 2
flattering: 2
counterfeit: 2
circumstance: 2
henceforward: 2
everlasting: 2
quarreling: 2
immediately: 2
lamentation: 2
threatened: 2
disobedient: 2
prosperous: 2
detestable: 2
instruments: 2
misadventure: 2
shakespeares: 1
characters: 1
misadventured: 1
overthrows: 1
continuance: 1
maidenheads: 1
flourishes: 1
rebellious: 1
pernicious: 1
mistempered: 1
interchanging: 1
augmenting: 1
artificial: 1
portentous: 1
importuned: 1
transgression: 1
preserving: 1
accustomed: 1
unattainted: 1
transparent: 1
remembered: 1
lammastide: 1

#### 5. The longest sentence (type the sentence and give the number of words)

In [76]:
play_sentences = reviews.sents('play.txt')

# Each sentence is a list of tokens, so len() here counts punctuation marks
# and numeric tokens as well as words.
longest_len = max(len(s) for s in play_sentences)
longest_play_sents = [s for s in play_sentences if len(s) == longest_len]

# max() has already raised on an empty corpus, so at least one sentence always
# matches. Assign unconditionally so a stale value left in the kernel by an
# earlier run can never be printed by mistake.
joined_longest_sents = ' '.join(longest_play_sents[0])

print(f"Longest sentence:\n{joined_longest_sents}")
print()
print(f"Number of words: {longest_len}")
# The dashes mark pauses in the spoken line (this is a play script); they do
# not end the sentence, which is why this whole passage is treated as a single
# sentence and comes out as the longest one.

Longest sentence:
Or , if I live , is it not very like The horrible conceit of death and night , Together with the terror of the place — As in a vault , an ancient receptacle 40 Where for this many hundred years the bones Of all my buried ancestors are packed ; Where bloody Tybalt , yet but green in earth , Lies fest ’ ring in his shroud ; where , as they say , At some hours in the night spirits resort — 45 Alack , alack , is it not like that I , So early waking , what with loathsome smells , And shrieks like mandrakes torn out of the earth , That living mortals , hearing them , run mad — O , if I wake , shall I not be distraught , 50 Environèd with all these hideous fears , And madly play with my forefathers ’ joints , And pluck the mangled Tybalt from his shroud , And , in this rage , with some great kinsman ’ s bone , As with a club , dash out my desp ’ rate brains ?

Number of words: 191


In [77]:
from nltk.tokenize import word_tokenize

# Sentences of the play script, each one a list of tokens.
play_sentences = reviews.sents(fileids='play.txt')

def count_words(sentence):
    """Count the alphabetic word tokens in an already-tokenized sentence.

    Args:
        sentence: Iterable of string tokens, such as one sentence returned by
            the corpus reader's ``sents()``.

    Returns:
        int: Number of tokens made up solely of letters; punctuation marks
        and numbers are not counted.
    """
    # The tokens are used as given. Joining them back into a string and
    # tokenizing again would be slower, would require the Punkt model, and
    # could split a single token such as "cannot" in two, inflating the count.
    return sum(1 for token in sentence if token.isalpha())

# Count each sentence once and reuse the counts for both the maximum and the
# filter, rather than running count_words over the whole corpus twice.
play_sentence_word_counts = [count_words(s) for s in play_sentences]
longest_len = max(play_sentence_word_counts)
longest_play_sents = [
    s
    for s, n_words in zip(play_sentences, play_sentence_word_counts)
    if n_words == longest_len
]

# max() has already raised on an empty corpus, so at least one sentence always
# matches. Assign unconditionally so a stale value left in the kernel by an
# earlier run can never be printed by mistake.
joined_longest_sents = ' '.join(longest_play_sents[0])

print(f"Longest sentence:\n{joined_longest_sents}")
print()
# The first match has exactly `longest_len` words by construction.
print(f"Number of words: {longest_len}")

Longest sentence:
Or , if I live , is it not very like The horrible conceit of death and night , Together with the terror of the place — As in a vault , an ancient receptacle 40 Where for this many hundred years the bones Of all my buried ancestors are packed ; Where bloody Tybalt , yet but green in earth , Lies fest ’ ring in his shroud ; where , as they say , At some hours in the night spirits resort — 45 Alack , alack , is it not like that I , So early waking , what with loathsome smells , And shrieks like mandrakes torn out of the earth , That living mortals , hearing them , run mad — O , if I wake , shall I not be distraught , 50 Environèd with all these hideous fears , And madly play with my forefathers ’ joints , And pluck the mangled Tybalt from his shroud , And , in this rage , with some great kinsman ’ s bone , As with a club , dash out my desp ’ rate brains ?

Number of words: 152


#### 6. A stemmed version of the longest sentence

In [79]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
play_sentences = reviews.sents('play.txt')

# One pass over the corpus: max() returns the first sentence with the highest
# word count, i.e. the same sentence the two-pass "max, then filter, then take
# element 0" search selects. Assigning it unconditionally also means
# `longest_sentence` can never be left unset or stale.
longest_sentence = max(play_sentences, key=count_words)

# Keep alphabetic tokens only, so punctuation and numbers are not stemmed.
words_in_sentence = word_tokenize(' '.join(longest_sentence))
words_only = [word for word in words_in_sentence if word.isalpha()]

stemmed_words = [stemmer.stem(word) for word in words_only]
stemmed_sentence = ' '.join(stemmed_words)

print(f"Original longest sentence:\n{' '.join(longest_sentence)}\n")
print(f"Stemmed version of the longest sentence:\n{stemmed_sentence}")


Original longest sentence:
Or , if I live , is it not very like The horrible conceit of death and night , Together with the terror of the place — As in a vault , an ancient receptacle 40 Where for this many hundred years the bones Of all my buried ancestors are packed ; Where bloody Tybalt , yet but green in earth , Lies fest ’ ring in his shroud ; where , as they say , At some hours in the night spirits resort — 45 Alack , alack , is it not like that I , So early waking , what with loathsome smells , And shrieks like mandrakes torn out of the earth , That living mortals , hearing them , run mad — O , if I wake , shall I not be distraught , 50 Environèd with all these hideous fears , And madly play with my forefathers ’ joints , And pluck the mangled Tybalt from his shroud , And , in this rage , with some great kinsman ’ s bone , As with a club , dash out my desp ’ rate brains ?

Stemmed version of the longest sentence:
or if i live is it not veri like the horribl conceit of death and 

#### 7. Overall (not for each subcorpus): a reflection (about one paragraph)
What do the most frequent words, the longest words, and the longest sentence tell you about each of the 3 genres? How do you interpret the lexical diversity?