# Task 5:
    1. Download Alice in Wonderland by Lewis Carroll from Project Gutenberg's website http://www.gutenberg.org/files/11/11-0.txt
    2. Perform any necessary preprocessing on the text, including converting to lower case, removing stop words, numbers / non-alphabetic characters, lemmatization.
    3. Find Top 10 most important (for example, in terms of TF-IDF metric) words from each chapter in the text (not "Alice"); how would you name each chapter according to the identified tokens?
    4. Find the Top 10 most used verbs in sentences with Alice. What does Alice do most often?
    5. *(not necessary) Find Top 100 most used verbs in sentences with Alice. Get word vectors using a pre-trained word2vec model and visualize them. Compare the words using embeddings.

In [1]:
#!pip install spacy

In [2]:
#!python -m spacy download en_core_web_sm

In [3]:
import math
import urllib.request
import spacy

# Fetch the raw book text. The context manager closes the HTTP response as
# soon as the body has been read instead of leaving the connection open.
with urllib.request.urlopen("http://www.gutenberg.org/files/11/11-0.txt") as alice_handler:
    alice_text = alice_handler.read().decode('utf-8')

In [4]:
# Load the spaCy "en_core_web_sm" pipeline; the cells below use it for
# tokenization, part-of-speech tags, stop-word flags, and lemmas.
nlp = spacy.load("en_core_web_sm")

In [5]:
# Keep only the story itself: cut everything before the first chapter heading
# and everything from shortly before the closing Project Gutenberg marker,
# then lower-case what is left.

end = "*** END OF THIS PROJECT GUTENBERG EBOOK ALICE’S ADVENTURES IN WONDERLAND ***"

story_start = alice_text.index("CHAPTER I.\r")
alice_text = alice_text[story_start:]
# The cut sits len(end) characters ahead of the marker, so the text directly
# preceding the marker is dropped as well.
# NOTE(review): presumably this removes a closing line printed right before
# the marker -- confirm against the downloaded file.
story_stop = alice_text.index(end) - len(end)
alice_text = alice_text[:story_stop].lower()

In [6]:
# Run the spaCy pipeline over the whole trimmed, lower-cased text.
# NOTE(review): alice_text_nlp is not referenced by any of the cells visible
# here -- confirm it is still needed.
alice_text_nlp = nlp(alice_text)

In [7]:
def prepare_text(doc):
    """Count the nouns of *doc* that are not stop words.

    The text is lower-cased and run through the module-level spaCy pipeline
    ``nlp``. Tokens tagged ``NOUN`` or ``PROPN`` that are not flagged as stop
    words are kept; the bare token ``"_"`` is skipped.

    Args:
        doc: Raw text of one document (here: one chapter).

    Returns:
        A ``(vocab_dict, arr)`` tuple where ``vocab_dict`` maps each kept
        word to the number of times it occurs and ``arr`` is the key view of
        that dictionary (the distinct words, in order of first appearance).
    """
    from collections import Counter

    # Flag "alice" as a stop word so the heroine's name is filtered out by
    # the is_stop test below. This mutates the shared pipeline vocabulary.
    nlp.vocab["alice"].is_stop = True

    parsed = nlp(doc.lower())

    nouns = (
        str(token).strip()
        for token in parsed
        if token.pos_ in ("NOUN", "PROPN") and not token.is_stop
    )

    # Counter starts every word at 1 on first sight; the previous hand-rolled
    # loop started at 0 and therefore under-counted every word by one.
    vocab_dict = dict(Counter(word for word in nouns if word != "_"))
    arr = vocab_dict.keys()
    return vocab_dict, arr

In [8]:
def computeTF(wordDict,bow):
    """Scale every count in *wordDict* by the size of *bow*.

    Args:
        wordDict: Mapping of word -> count.
        bow: Sized collection whose length is used as the divisor.

    Returns:
        A new dict mapping each word to ``count / len(bow)`` as a float.
    """
    divisor = float(len(bow))
    return {term: occurrences / divisor for term, occurrences in wordDict.items()}


def computeIDF(doclist):
    """Compute an inverse-document-frequency style weight for every word.

    Args:
        doclist: List of dicts, each mapping word -> count for one document.

    Returns:
        Dict mapping every word seen in any document to ``log(N / df)``,
        where ``df`` is the number of documents in which the word has a
        positive count. Words with ``df == 0`` get ``0.0``. An empty
        *doclist* yields an empty dict.
    """
    # NOTE(review): N is the total number of (document, word) entries, not the
    # number of documents as in textbook IDF. It is kept because the visible
    # caller passes a single document and relies on this scaling.
    N = sum(len(doc) for doc in doclist)

    # Document frequency per word. Every word from every document gets an
    # entry, so a word that first appears in a later document no longer
    # raises KeyError (the table used to be seeded from doclist[0] only).
    doc_freq = {}
    for doc in doclist:
        for word, val in doc.items():
            doc_freq.setdefault(word, 0)
            if val > 0:
                doc_freq[word] += 1

    idfDict = {}
    for word, freq in doc_freq.items():
        idfDict[word] = math.log(N / float(freq)) if freq else 0.0

    return idfDict

def computeTfidf(tf,idf):
    """Return the ten highest-scoring words by ``tf * idf``.

    Args:
        tf: Mapping of word -> term-frequency value.
        idf: Mapping of word -> inverse-document-frequency value; it must
            contain every key of *tf*.

    Returns:
        List of at most ten words, ordered by descending score; equal scores
        are ordered by descending word.
    """
    scores = {term: weight * idf[term] for term, weight in tf.items()}
    ordered = sorted(scores.items(), key=lambda pair: (pair[1], pair[0]), reverse=True)
    return [term for term, _ in ordered[:10]]


In [9]:
# Cut the lower-cased text at every "chapter" match; the leading piece
# (whatever precedes the first match) is discarded.
import re
chapters = re.split(pattern='chapter *', string=alice_text)[1:]

Find the most important words using the TF-IDF metric. We will use pairs (pronoun+noun, verb+noun, etc.) in order to come up with meaningful chapter names.

In [10]:
import re

# For each chapter: count its nouns, score them, and print the ten
# best-scoring words as a stand-in chapter title.
print("Top 10 words per chapter (a.k.a. chapter names):")
for chapter_no, chapter in enumerate(chapters, start=1):
    vocab_dict, arr = prepare_text(chapter)
    tf = computeTF(vocab_dict, arr)
    idf = computeIDF([vocab_dict])
    chapter_name = " ".join(computeTfidf(tf, idf))

    print(f"CHAPTER {chapter_no}: {chapter_name}")

Top 10 words per chapter (a.k.a. chapter names):
CHAPTER 1: way time rabbit door key use things table people moment
CHAPTER 2: mouse way pool feet things cats tears time voice thing
CHAPTER 3: mouse dodo thing race lory course question prizes birds party
CHAPTER 4: rabbit bill window thing voice room puppy moment gloves door
CHAPTER 5: caterpillar pigeon youth size bit tone time serpent mouth father
CHAPTER 6: cat footman duchess baby way pig door cook thing tone
CHAPTER 7: hatter dormouse march hare time tea thing twinkle treacle table
CHAPTER 8: queen king head cat soldiers gardeners game voice rabbit minute
CHAPTER 9: turtle gryphon duchess queen day tone thing school moral course
CHAPTER 10: gryphon turtle dance soup voice whiting sea soo oop lobsters
CHAPTER 11: king hatter court dormouse witness queen rabbit jury voice march
CHAPTER 12: king jury queen rabbit sister head voice dream verses time


We will ignore verbs which probably refer to Alice, since there are really a lot of them in the book.

In [11]:
def find_alice_verbs(doc):
    """Collect the verbs that are the syntactic head of an "Alice" token.

    The text is upper-cased and parsed with the module-level spaCy pipeline
    ``nlp``. For every token whose lemma is "alice" (case-insensitive), the
    token's head is kept when it is tagged ``VERB`` and is not itself
    "alice".

    Args:
        doc: Raw text of one document (here: one chapter).

    Returns:
        Set of the distinct head-verb texts found, as they appear in the
        upper-cased text.
    """
    parsed = nlp(doc.upper())
    return {
        token.head.text
        for token in parsed
        if token.lemma_.lower() == "alice"
        and token.head.pos_ == "VERB"
        and token.head.text.lower() != 'alice'
    }

In [12]:
# Tally, for every verb, the number of chapters in which it was found
# (find_alice_verbs yields each verb at most once per chapter).
total_count_verbs = {}
for chapter in chapters:
    for verb in find_alice_verbs(chapter):
        total_count_verbs[verb] = total_count_verbs.get(verb, 0) + 1

In [13]:
# Verbs ordered from highest to lowest tally
dict(
    sorted(
        total_count_verbs.items(),
        key=lambda verb_and_tally: verb_and_tally[1],
        reverse=True,
    )
)

{'SAID': 12,
 'THOUGHT': 6,
 'WENT': 5,
 'BEGAN': 5,
 'HAD': 3,
 'THINK': 3,
 'BE': 3,
 'BEEN': 3,
 'CRIED': 3,
 'TURNING': 3,
 'WAITED': 3,
 'WAS': 3,
 'SEE': 3,
 'VENTURED': 2,
 'HEARD': 2,
 'LOOKING': 2,
 'SEEN': 2,
 'BEGINNING': 2,
 'TELL': 2,
 '’S': 2,
 'LIKE': 2,
 'MOVED': 2,
 'GOT': 2,
 'LOOKED': 2,
 'SHUTTING': 1,
 'ADVISE': 1,
 'STARTED': 1,
 'TOOK': 1,
 'UNDERSTAND': 1,
 'OUGHT': 1,
 'KEPT': 1,
 'CALLED': 1,
 'ALLOW': 1,
 'POINTING': 1,
 'SEEMED': 1,
 'WHISPERS': 1,
 'HAVE': 1,
 'CROUCHED': 1,
 'CAN’T': 1,
 'DID': 1,
 'SAW': 1,
 'KEEP': 1,
 'TRIED': 1,
 'KNOW': 1,
 'COMING': 1,
 'WISH': 1,
 'CONSIDERED': 1,
 'SUPPOSE': 1,
 'SHOUTED': 1,
 'GAVE': 1,
 'CAME': 1,
 'FOUND': 1,
 'WANTED': 1,
 'SPEAK': 1,
 'HEAR': 1,
 'FEEL': 1,
 'LEAVING': 1,
 'TAKING': 1,
 'DARE': 1,
 'THANK': 1,
 'ASKED': 1,
 'PANTED': 1,
 'IMAGINE': 1,
 'MADE': 1,
 'WAKE': 1,
 'LET': 1}