In [None]:
import numpy as np
import nltk
from nltk.corpus import gutenberg, words
from nltk.stem import WordNetLemmatizer
from nltk import FreqDist
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter

In [78]:
# Raw contents of 'melville-moby_dick.txt' from the NLTK Gutenberg corpus.
# The functions defined below read this module-level name directly.
moby_raw = gutenberg.raw('melville-moby_dick.txt') 

### Example 1
How many tokens (words and punctuation symbols) are in moby_raw?
This function should return an integer.

In [79]:
def example_one():
    """Return the total number of tokens (words and punctuation) in ``moby_raw``."""
    from nltk.tokenize import word_tokenize

    all_tokens = word_tokenize(moby_raw)
    return len(all_tokens)


In [80]:
# Show the token count with thousands separators.
print(format(example_one(), ','))

255,028


### Example 2
How many unique tokens (unique words and punctuation) does moby_raw have?
This function should return an integer.

In [81]:
def example_two():
    """Return the number of distinct tokens (words and punctuation) in ``moby_raw``."""
    distinct_tokens = set(nltk.word_tokenize(moby_raw))
    return len(distinct_tokens)

In [82]:
# Show the unique-token count with thousands separators.
print(format(example_two(), ','))

20,742


### Example 3
After lemmatizing the verbs, how many unique tokens does moby_raw have?
This function should return an integer.

In [None]:
def example_three():
    """Return the number of distinct tokens in ``moby_raw`` after verb lemmatization.

    Every token is passed through ``WordNetLemmatizer.lemmatize`` with the
    part-of-speech argument ``'v'`` before duplicates are collapsed.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    distinct_lemmas = {lemmatize(token, 'v') for token in nltk.word_tokenize(moby_raw)}
    return len(distinct_lemmas)

In [84]:
# Show the unique-token count after verb lemmatization, with thousands separators.
print(format(example_three(), ','))

16,887


<font color = green >

### Question 1

</font>


What is the lexical diversity of the given text input? (i.e. ratio of unique tokens to the total number of tokens)
<br>*This function should return a float.*


In [None]:
def answer_one():
    """Return the lexical diversity of ``moby_raw``.

    Lexical diversity is the number of distinct tokens divided by the total
    number of tokens.
    """
    all_tokens = word_tokenize(moby_raw)
    distinct_count = len(set(all_tokens))
    total_count = len(all_tokens)

    return distinct_count / total_count

In [88]:
# Display the ratio of unique tokens to total tokens.
answer_one()

0.08133224587104161

<font color = green >

### Question 2

</font>

What percentage of tokens is 'whale' or 'Whale'?
<br>*This function should return a float.*

In [None]:
def answer_two():
    """Return the percentage of tokens in ``moby_raw`` that are 'whale' or 'Whale'.

    Only those two exact spellings are counted; the result is scaled to
    percent (multiplied by 100).
    """
    all_tokens = word_tokenize(moby_raw)

    # Keep only the two exact spellings we are interested in.
    whale_tokens = [token for token in all_tokens if token in ("whale", "Whale")]

    return len(whale_tokens) / len(all_tokens)*100

In [93]:
# Display the share of 'whale'/'Whale' tokens, expressed in percent.
answer_two()

0.4125037250811676

<font color = green >

### Question 3

</font>

What are the 20 most frequently occurring (unique) tokens in the text? What is their frequency?
<br>*This function should return a list of 20 tuples where each tuple is of the form `(token, frequency)`. The list should be sorted in descending order of frequency.*

In [None]:
def answer_three():
    """Return the 20 most frequent tokens of ``moby_raw`` with their counts.

    The result is a list of ``(token, frequency)`` tuples as produced by
    ``FreqDist.most_common(20)``.
    """
    token_counts = FreqDist(word_tokenize(moby_raw))
    top_twenty = token_counts.most_common(20)
    return top_twenty

In [95]:
# Display the 20 most common tokens as (token, frequency) pairs.
answer_three()

[(',', 19204),
 ('the', 13715),
 ('.', 7306),
 ('of', 6513),
 ('and', 6010),
 ('a', 4545),
 ('to', 4515),
 (';', 4173),
 ('in', 3908),
 ('that', 2978),
 ('his', 2459),
 ('it', 2196),
 ('I', 2113),
 ('!', 1767),
 ('is', 1722),
 ('--', 1713),
 ('with', 1659),
 ('he', 1658),
 ('was', 1639),
 ('as', 1620)]

<font color = green >

### Question 4

</font>

What tokens have a length of greater than 5 and frequency of more than 150?
<br>*This function should return a sorted list of the tokens that match the above constraints. To sort your list, use `sorted()`*

In [None]:
def answer_four():
    """Return the sorted tokens of ``moby_raw`` that are long and frequent.

    A token qualifies when it has more than 5 characters and occurs more than
    150 times in the text.
    """
    token_counts = FreqDist(word_tokenize(moby_raw))

    qualifying = []
    for token, count in token_counts.items():
        if count > 150 and len(token) > 5:
            qualifying.append(token)

    # In-place sort gives the same ordering as sorted() on the same list.
    qualifying.sort()
    return qualifying

In [97]:
# Display the sorted tokens longer than 5 characters that occur more than 150 times.
answer_four()

['Captain',
 'Pequod',
 'Queequeg',
 'Starbuck',
 'almost',
 'before',
 'himself',
 'little',
 'seemed',
 'should',
 'though',
 'through',
 'whales',
 'without']

<font color = green >

### Question 5

</font>

Find the longest word in moby_raw and that word's length.
<br>
*This function should return a tuple `(longest_word, length)`.*


In [None]:
def answer_five():
    """Return ``(longest_token, length)`` for the longest token in ``moby_raw``.

    When several tokens share the maximum length, the one that appears first
    in the token sequence is returned.
    """
    all_tokens = word_tokenize(moby_raw)
    token_lengths = [len(token) for token in all_tokens]

    max_length = max(token_lengths)
    # list.index returns the first position holding the maximum, i.e. the
    # earliest token of that length.
    first_longest = all_tokens[token_lengths.index(max_length)]

    return (first_longest, max_length)

In [100]:
# Display the longest token together with its length.
answer_five()

("twelve-o'clock-at-night", 23)

<font color = green >

### Question 6

</font>

What unique words have a frequency of more than 2000? What is their frequency?
<br>*This function should return a list of tuples of the form `(frequency, word)` sorted in descending order of frequency.*


In [None]:
def answer_six():
    """Return ``(frequency, token)`` pairs for tokens occurring more than 2000 times.

    The pairs are ordered from highest to lowest; the tuples are compared as a
    whole, so frequency is the primary sort key.
    """
    token_counts = FreqDist(word_tokenize(moby_raw))

    frequent_pairs = []
    for token, count in token_counts.items():
        if count > 2000:
            frequent_pairs.append((count, token))

    frequent_pairs.sort(reverse=True)
    return frequent_pairs

In [104]:
# Display (frequency, token) pairs for tokens that occur more than 2000 times.
answer_six()

[(19204, ','),
 (13715, 'the'),
 (7306, '.'),
 (6513, 'of'),
 (6010, 'and'),
 (4545, 'a'),
 (4515, 'to'),
 (4173, ';'),
 (3908, 'in'),
 (2978, 'that'),
 (2459, 'his'),
 (2196, 'it'),
 (2113, 'I')]

<font color = green >

### Question 7

</font>

What is the average number of tokens per sentence?
<br>*This function should return a float.*

In [None]:
def answer_seven():
    """Return the mean number of tokens per sentence in ``moby_raw``.

    The text is split into sentences with ``sent_tokenize``; each sentence is
    then tokenized separately and the per-sentence counts are averaged with
    ``np.mean``.
    """
    sentence_lengths = []
    for sentence in sent_tokenize(moby_raw):
        sentence_lengths.append(len(word_tokenize(sentence)))

    return np.mean(sentence_lengths)

In [130]:
# Display the average number of tokens per sentence.
answer_seven()

25.88591149005278

<font color = green >

### Question 8

</font>

What are the 5 most frequent parts of speech in this text? What is their frequency?
<br>*This function should return a list of tuples of the form `(part_of_speech, frequency)` sorted in descending order of frequency.*

In [None]:
# Tokenize once at module level; answer_eight() reads this global.
tokens = word_tokenize(moby_raw)

def answer_eight():
    """Return the five most frequent part-of-speech tags with their counts.

    The module-level ``tokens`` list is tagged with ``nltk.pos_tag``. The
    result is a list of ``(tag, frequency)`` tuples ordered by descending
    frequency, with ties broken by ascending tag name.
    """
    tag_counts = Counter(tag for _word, tag in nltk.pos_tag(tokens))

    def ranking_key(item):
        tag, count = item
        # Negated count sorts high frequencies first; tag name breaks ties.
        return (-count, tag)

    ranked = sorted(tag_counts.items(), key=ranking_key)
    return ranked[:5]

In [122]:
# Display the five most frequent part-of-speech tags with their counts.
answer_eight()

[('NN', 32727), ('IN', 28662), ('DT', 25879), (',', 19204), ('JJ', 17613)]

<font color = green >

### Question 9

</font>

Create a spelling recommender that takes a list of misspelled words and recommends a correctly spelled word for every word in the list.

For every misspelled word, the recommender should find the word in `correct_spellings` that has the shortest `edit distance` (you may need to use `nltk.edit_distance(word_1, word_2, transpositions=True)`) and starts with the same letter as the misspelled word, and return that word as a recommendation.

Recommender should provide recommendations for the three words: `['cormulent', 'incendenece', 'validrate']`.
<br>*This function should return a list of length three:
`['cormulent_recommendation', 'incendenece_recommendation', 'validrate_recommendation']`.*

In [None]:
def answer_nine(default_words=['cormulent', 'incendenece', 'validrate']):
    """Recommend a correctly spelled word for each misspelled word.

    For every entry of ``default_words`` the recommendation is the word from
    the NLTK ``words`` corpus that starts with the same first character as the
    misspelled word and has the smallest edit distance to it (computed with
    ``nltk.edit_distance(..., transpositions=True)``). On ties, the candidate
    that comes first in the corpus list wins.

    Parameters
    ----------
    default_words : list of str
        Misspelled words to correct. The list is only read, never modified.

    Returns
    -------
    list of str
        One recommendation per input word, in the same order.
    """
    correct_spellings = words.words()
    recommendations = []

    for w in default_words:
        # Restrict the search to dictionary words sharing the first character.
        # w[:1] is '' for an empty string, which every candidate matches.
        first_char = w[:1]
        same_initial = [c for c in correct_spellings if c.startswith(first_char)]

        # If no dictionary word shares the first character, fall back to the
        # whole dictionary so a recommendation is still produced.
        candidates = same_initial or correct_spellings

        best_match = min(
            candidates,
            key=lambda c: nltk.edit_distance(w, c, transpositions=True),
        )
        recommendations.append(best_match)

    return recommendations

In [133]:
# Display the recommendations for the three default misspelled words.
answer_nine()

['corpulent', 'intendence', 'validate']