# NLP Practical Test
----

## Import libraries and read data

In [1]:
import string

import nltk
from nltk import TreebankWordTokenizer, SnowballStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK data this notebook reads. The stopword list is loaded later via
# stopwords.words('english'); on a machine without that corpus the lookup fails.
nltk.download('wordnet')
nltk.download('stopwords', quiet=True)  # quiet: no extra download log in the cell output

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/wahe3bru/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [26]:
# Read the whole book into memory. The context manager closes the file handle
# as soon as the read finishes instead of leaving it open for the kernel's lifetime.
with open('alice_in_wonderland.txt', 'r') as book_file:
    data = book_file.read()

----
## Convert to lowercase and remove punctuation

In [27]:
def remove_punctuation(words):
    """Return `words` lowercased with every `string.punctuation` character dropped.

    Characters are examined one at a time: punctuation is skipped, and each
    remaining character is lowercased before being joined back into a string.
    """
    excluded = frozenset(string.punctuation)
    kept = []
    for ch in words:
        if ch in excluded:
            continue
        kept.append(ch.lower())
    return "".join(kept)

In [29]:
# Overwrites `data` with its lowercased, punctuation-free form; the raw text
# is no longer available under this name after the cell runs.
data = remove_punctuation(data)

### Question 1: What is the 14th character in the book?

In [57]:
# Indexing is 0-based, so the 14th character sits at index 13.
print(f'The 14th character in the book is: "{data[13]}"')

The 14th character in the book is: "i"


----
## Tokenise the data

In [32]:
# Tokeniser instance used to split the cleaned text into words.
tokeniser = TreebankWordTokenizer()

### Question 2: What is the 34th word in the book?

In [33]:
# Tokenise the cleaned (lowercased, punctuation-free) text; every later
# word-level question indexes into or filters this sequence.
data_words = tokeniser.tokenize(data)

In [59]:
# 34th word -> index 33 (0-based).
print(f'The 34th word in the book is: {data_words[33]}')

The 34th word in the book is: reading


### Question 3: How many words are in the book?

In [56]:
# Every token produced by the tokeniser is counted as one word (duplicates included).
print(f'There are {len(data_words)} words in the book')

There are 26391 words in the book


----
## Stemming

In [36]:
# Snowball stemmer configured for English.
stemmer = SnowballStemmer('english')

### Question 4: What is the stem of the word '*writing*'?

In [37]:
# Stem a literal word directly; this does not depend on the book text.
print(f"The stem of the word 'writing' is: {stemmer.stem('writing')}")

The stem of the word 'writing' is: write


### Question 5: What is the stem of the 55th word in the book?

In [38]:
# 55th word -> index 54 (0-based); shows the token and its stem side by side.
print(f'The stem of the 55th word ({data_words[54]}) in the book is: {stemmer.stem(data_words[54])}')

The stem of the 55th word (pictures) in the book is: pictur


----
## Lemmatization

In [39]:
# WordNet-based lemmatiser; the 'wordnet' data is fetched in the imports cell.
lemmatizer = WordNetLemmatizer()

### Question 6: What is the lemma of the word '*hippopotami*'?

In [40]:
# No part-of-speech argument is passed, so the lemmatiser's default applies.
print(f"The lemma of the word 'hippopotami' is: {lemmatizer.lemmatize('hippopotami')}")

The lemma of the word 'hippopotami' is: hippopotamus


### Question 7: What is the lemma of the 389th word in the book?

In [42]:
# 389th word -> index 388 (0-based); shows the token and its lemma side by side.
print(f'The lemma of the 389th word ({data_words[388]}) in the book is: {lemmatizer.lemmatize(data_words[388])}')

The lemma of the 389th word (bookshelves) in the book is: bookshelf


## Stopwords

### Question 8: How many stopwords are in the book?

In [43]:
# Stored as a set so the per-token membership tests in the filters below are fast.
stopwords_set = set(stopwords.words('english'))

In [44]:
# Keep every token that is an English stopword (repeated occurrences are retained).
data_stopwords = [
    token
    for token in data_words
    if token in stopwords_set
]

In [45]:
# This is the number of stopword occurrences, not the number of distinct stopwords.
print(f'There are {len(data_stopwords)} stopwords in the book')

There are 13768 stopwords in the book


## Bag of Words

In [46]:
def bag_of_words_count(words, word_dict=None):
    """Count how many times each word occurs in ``words``.

    Args:
        words: iterable of hashable tokens to count.
        word_dict: optional dict of existing counts to update in place;
            when omitted, a new empty dict is created for this call.

    Returns:
        A dict mapping each word to its number of occurrences
        (the same object as ``word_dict`` when one was supplied).
    """
    # A literal ``{}`` default is created once and shared by every call, so
    # counts would leak between calls (and between re-runs of a notebook cell).
    if word_dict is None:
        word_dict = {}
    for word in words:
        word_dict[word] = word_dict.get(word, 0) + 1
    return word_dict

### Question 9: How many times does '*Alice*' appear in the book?

In [49]:
# Drop every token that is an English stopword; the remaining tokens keep their order.
data_no_stopwords = [
    token
    for token in data_words
    if token not in stopwords_set
]

In [51]:
# Start from an explicit empty dict so that re-running this cell always
# recounts from zero instead of adding to counts from a previous run.
data_bow = bag_of_words_count(data_no_stopwords, {})

In [53]:
# The text was lowercased before tokenising, so the capitalised form cannot occur.
# Both counts are looked up (rather than hard-coded) and default to 0 when absent.
print(f"'Alice' appears in the book: {data_bow.get('Alice', 0)} times \n'alice' appears in the book {data_bow.get('alice', 0)} times")

'Alice' appears in the book: 0 times 
'alice' appears in the book 386 times


### Question 10: What is the most common word in the book?

In [54]:
# Vocabulary ordered by descending count (stopwords were removed before counting).
data_common_words = sorted(
    data_bow.keys(),
    key=lambda word: data_bow[word],
    reverse=True,
)

In [55]:
# First entry of the descending-by-count ordering, i.e. the most frequent non-stopword.
print(f'The most common word in the book is: {data_common_words[0]}')

The most common word in the book is: said
