In [1]:
import nltk

In [2]:
from nltk.corpus import gutenberg 

In [3]:
# One-time resource setup: the Gutenberg corpus supplies the Moby Dick text
# loaded below; 'punkt' provides the models NLTK's tokenizers depend on.
nltk.download('gutenberg')
nltk.download('punkt')

[nltk_data] Downloading package gutenberg to /home/sv/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to /home/sv/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Raw, untokenized text of Moby Dick; every answer_* call below analyses this string.
moby_raw = gutenberg.raw('melville-moby_dick.txt') 

In [5]:
def answer_one(text):
    """Return the lexical diversity of ``text``.

    Lexical diversity is the number of distinct tokens divided by the total
    number of tokens. Tokens are compared case-sensitively, exactly as
    produced by ``nltk.word_tokenize`` (punctuation tokens included).

    Args:
        text: Raw text to analyse.

    Returns:
        A float in ``[0, 1]``; ``0.0`` when the text yields no tokens.
    """
    tokens = nltk.word_tokenize(text)
    # Empty or whitespace-only text produces no tokens; avoid dividing by zero.
    if not tokens:
        return 0.0
    return len(set(tokens)) / len(tokens)

In [6]:
# Lexical diversity (unique tokens / total tokens) of the full novel.
answer_one(moby_raw)

0.08133224587104161

In [7]:
from nltk.tokenize import word_tokenize

def answer_two(text, word='whale'):
    """Return the percentage of tokens in ``text`` that equal ``word``.

    Matching is case-insensitive, so with the default both ``'whale'`` and
    ``'Whale'`` are counted. Only exact token matches count (``'whales'``
    does not match ``'whale'``).

    Args:
        text: Raw text to analyse.
        word: Token to look for. Defaults to ``'whale'``.

    Returns:
        The share of matching tokens as a percentage (0-100); ``0.0`` when
        the text yields no tokens.
    """
    tokens = nltk.word_tokenize(text)
    # Empty or whitespace-only text produces no tokens; avoid dividing by zero.
    if not tokens:
        return 0.0

    target = word.lower()
    match_count = sum(1 for token in tokens if token.lower() == target)
    return (match_count / len(tokens)) * 100

In [8]:
# Percentage of all tokens that are 'whale' in any casing.
answer_two(moby_raw)

0.42583559452295433

In [9]:
from nltk import FreqDist

def answer_three(text, n=10):
    """Return the ``n`` most frequent tokens in ``text`` with their counts.

    Tokens are counted case-sensitively and include punctuation, exactly as
    produced by ``nltk.word_tokenize``.

    Args:
        text: Raw text to analyse.
        n: How many of the most frequent tokens to return. Defaults to 10.

    Returns:
        A list of ``(token, count)`` tuples ordered from most to least
        frequent.
    """
    tokens = nltk.word_tokenize(text)
    return FreqDist(tokens).most_common(n)

In [10]:
# Ten most frequent tokens (punctuation included) with their counts.
answer_three(moby_raw)

[(',', 19204),
 ('the', 13715),
 ('.', 7306),
 ('of', 6513),
 ('and', 6010),
 ('a', 4545),
 ('to', 4515),
 (';', 4173),
 ('in', 3908),
 ('that', 2978)]

In [11]:
from collections import Counter

def answer_four(text, min_length=6, min_frequency=150):
    """Return the long, frequent tokens of ``text`` in sorted order.

    A token qualifies when it has at least ``min_length`` characters and
    occurs strictly more than ``min_frequency`` times. Counting is
    case-sensitive.

    Args:
        text: Raw text to analyse.
        min_length: Minimum token length, inclusive. Defaults to 6.
        min_frequency: Frequency threshold, exclusive (a token must occur
            more often than this). Defaults to 150.

    Returns:
        An alphabetically sorted list of the qualifying tokens
        (uppercase sorts before lowercase, as with ``sorted`` on strings).
    """
    tokens = nltk.word_tokenize(text)
    counts = Counter(tokens)

    # Use the caller-supplied thresholds; the defaults reproduce the
    # previously hard-coded "longer than 5 characters, more than 150 times".
    return sorted(
        word
        for word, count in counts.items()
        if len(word) >= min_length and count > min_frequency
    )

In [12]:
# Tokens longer than 5 characters that occur more than 150 times, sorted.
answer_four(moby_raw)

['Captain',
 'Pequod',
 'Queequeg',
 'Starbuck',
 'almost',
 'before',
 'himself',
 'little',
 'seemed',
 'should',
 'though',
 'through',
 'whales',
 'without']

In [13]:
def answer_five(text):
    """Return the longest token in ``text`` together with its length.

    When several tokens share the maximum length, the one that appears
    first in the text is returned.

    Args:
        text: Raw text to analyse.

    Returns:
        A ``(token, length)`` tuple; ``('', 0)`` when the text yields no
        tokens.
    """
    tokens = word_tokenize(text)
    # `default` keeps max() from raising ValueError on an empty token list.
    longest_word = max(tokens, key=len, default='')
    return longest_word, len(longest_word)

In [14]:
# Longest token in the novel and its character length.
answer_five(moby_raw)

("twelve-o'clock-at-night", 23)

In [15]:
def answer_six(text, min_frequency=2000):
    """List the purely alphabetic words occurring more than ``min_frequency`` times.

    The text is lower-cased before tokenizing, so counts are
    case-insensitive. Tokens containing any non-letter character
    (punctuation, digits, hyphens, ...) are excluded.

    Args:
        text: Raw text to analyse.
        min_frequency: Frequency threshold, exclusive. Defaults to 2000.

    Returns:
        A list of ``(word, count)`` tuples in order of first appearance in
        the text (not sorted by frequency).
    """
    counts = Counter(word_tokenize(text.lower()))

    frequent_words = []
    for token, occurrences in counts.items():
        if token.isalpha() and occurrences > min_frequency:
            frequent_words.append((token, occurrences))
    return frequent_words

In [16]:
# Lower-cased alphabetic words occurring more than 2000 times (default threshold).
answer_six(moby_raw)

[('a', 4698),
 ('to', 4597),
 ('the', 14422),
 ('in', 4163),
 ('and', 6414),
 ('i', 2101),
 ('his', 2530),
 ('of', 6586),
 ('it', 2508),
 ('that', 3081)]

In [17]:
from nltk import sent_tokenize, word_tokenize
import numpy as np

def answer_seven(text):
    """Compute the mean number of tokens per sentence in ``text``.

    The text is split into sentences with ``sent_tokenize`` and each
    sentence is tokenized with ``word_tokenize``.

    Args:
        text: Raw text to analyse.

    Returns:
        The average token count per sentence, or ``0`` when no sentences
        are found.
    """
    sentences = sent_tokenize(text)
    # Nothing to average over: report 0 rather than dividing by zero.
    if not len(sentences) > 0:
        return 0

    token_total = 0
    for sentence in sentences:
        token_total += len(word_tokenize(sentence))
    return token_total / len(sentences)

In [18]:
# Average number of tokens per sentence across the novel.
answer_seven(moby_raw)

25.88591149005278

In [19]:
# Tagger model download; presumably the resource pos_tag needs at call time.
nltk.download('averaged_perceptron_tagger')
from nltk import pos_tag, word_tokenize
from collections import Counter

def answer_eight(text, n=5):
    """Return the ``n`` most common part-of-speech tags in ``text``.

    The text is tokenized with ``word_tokenize`` and tagged with
    ``pos_tag``; only the tags are counted, the tokens are discarded.

    Args:
        text: Raw text to analyse.
        n: Number of tags to return. Defaults to 5.

    Returns:
        A list of ``(tag, count)`` tuples ordered from most to least
        frequent.
    """
    tagged_tokens = pos_tag(word_tokenize(text))

    tag_counts = Counter()
    for _token, tag in tagged_tokens:
        tag_counts[tag] += 1
    return tag_counts.most_common(n)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/sv/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [20]:
# Five most frequent part-of-speech tags in the novel (default n=5).
answer_eight(moby_raw)

[('NN', 32727), ('IN', 28662), ('DT', 25879), (',', 19204), ('JJ', 17613)]

In [21]:
# Fetch the 'words' corpus that backs `words.words()` below.
nltk.download('words')
from nltk.corpus import words
from nltk.metrics import edit_distance

# Reference vocabulary that spelling_recommender searches for suggestions.
correct_spellings = words.words()

def spelling_recommender(misspelled_words):
    """Suggest a dictionary word for each misspelled word.

    For every input word, the candidates are the entries of
    ``correct_spellings`` that start with the same first character
    (case-sensitive). The candidate with the smallest ``edit_distance``
    (transpositions enabled) is chosen; on ties the earliest candidate in
    ``correct_spellings`` wins.

    Args:
        misspelled_words: Iterable of non-empty strings to correct.

    Returns:
        A list of strings formatted as
        ``'<misspelled>_recommendation: <suggestion>'``, one per input word
        and in the same order.

    Raises:
        IndexError: If an input word is the empty string.
        ValueError: If no dictionary entry shares the word's first character.
    """
    def recommend(target):
        # Restrict the search to entries sharing the first character.
        same_initial = list(
            filter(lambda entry: entry.startswith(target[0]), correct_spellings)
        )
        closest = min(
            same_initial,
            key=lambda entry: edit_distance(target, entry, transpositions=True),
        )
        return f'{target}_recommendation: {closest}'

    return [recommend(target) for target in misspelled_words]

# Demo: run the recommender on three sample misspellings and print the
# formatted '<word>_recommendation: <suggestion>' strings.
misspelled_words = ['cormulent', 'incendenece', 'validrate']
recommendations = spelling_recommender(misspelled_words)
print(recommendations)

[nltk_data] Downloading package words to /home/sv/nltk_data...
[nltk_data]   Package words is already up-to-date!


['cormulent_recommendation: corpulent', 'incendenece_recommendation: intendence', 'validrate_recommendation: validate']
