In [1]:
import os
import re
import numpy as np

from typing import List, Tuple
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prazd\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\prazd\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\prazd\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\prazd\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
text_path = os.path.join('data', 'alice_in_wonderland.txt')
with open(file=text_path, mode='r', encoding='utf-8') as file:
    book = file.readlines()

In [3]:
# convert book -> text corpus
corpus = [line.strip() for line in book]
corpus = ' '.join(corpus).split('THE END')[0]

In [4]:
# extract CHAPTER's lines
TRUE_CHAPTERS = [elem.strip().split(maxsplit=1) for elem in corpus.split('CHAPTER')[1:13]]

# Get CHAPTER's text
corpus = corpus.split('CHAPTER')[13:]
corpus = [elem.replace(TRUE_CHAPTERS[i][0], '').replace(TRUE_CHAPTERS[i][1], '') for i, elem in enumerate(corpus)]

### Text preprocessing

In [7]:
# replace " in sentences to get mush more clear ones
corpus = [elem.replace('“', '').replace('”', '') for elem in corpus]

# split chapters into sentences via '.', '?', '!'
sentences_re = r'(?<=[.!?;])\s+'
corpus = [re.split(sentences_re, elem) for elem in corpus]

In [8]:
corpus[0][0]

'     Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, and what is the use of a book, thought Alice without pictures or conversations?'

In [9]:
# preprocessing functions
def get_words(text: str) -> str:
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text

def lowercase(text: str) -> str:
    return text.lower()

from nltk.tokenize import word_tokenize
def tokenize(text: str) -> List[str]:
    return word_tokenize(text)

from nltk.corpus import wordnet
TAG_WORDNET_MAPPING = {
    "J": wordnet.ADJ,
    "N": wordnet.NOUN,
    "V": wordnet.VERB,
    "R": wordnet.ADV
}
def get_wordnet_pos(words: List[str]) -> List[Tuple[str]]:
    tags = nltk.pos_tag(words)
    return [(elem[0], TAG_WORDNET_MAPPING.get(elem[1][0].upper(), wordnet.NOUN)) for elem in tags]

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
def lemmatize(word: str, pos: str) -> str:
    return lemmatizer.lemmatize(word, pos)

from nltk.corpus import stopwords 
stopwords_english = stopwords.words('english') 
def stopwords_cleaner(words: List[str]) -> List[str]:
    return [elem for elem in words if elem not in stopwords_english]

def text_preprocessing(text: str) -> List[str]:
    text = get_words(text=text)
    text = lowercase(text=text)
    text = tokenize(text=text)
    text = get_wordnet_pos(words=text)
    text = [lemmatize(word=elem[0], pos=elem[1]) for elem in text]
    text = stopwords_cleaner(words=text)

    return text

### Top 10 most important words from each chapter in the text.

In [11]:
top_words = []
for temp_chapter in corpus:
    temp_preprocessed_test_corpus = [text_preprocessing(text=elem) for elem in temp_chapter]
    temp_preprocessed_test_corpus = [' '.join(elem) for elem in temp_preprocessed_test_corpus if len(elem) > 5]

    temp_tfidf = TfidfVectorizer(
        input="content",
        encoding="utf-8",
        lowercase=False,
        preprocessor=None,
        tokenizer=None,
        analyzer="word",
        stop_words=None,
        norm="l2",
        use_idf=True,
        smooth_idf=True
    )
    temp_matrix = temp_tfidf.fit_transform(temp_preprocessed_test_corpus)
    temp_words = temp_tfidf.get_feature_names_out()
    temp_top_words = [temp_words[i] for i in np.argsort(temp_matrix.toarray().sum(axis=0))[-10:] if temp_words[i] != 'alice']
    top_words.append(temp_top_words)

In [12]:
for temp_chapter, elem in zip(TRUE_CHAPTERS, top_words):
    print(f'{temp_chapter[1]}: {elem}')

Down the Rabbit-Hole: ['try', 'find', 'way', 'get', 'see', 'think', 'say', 'little', 'go']
The Pool of Tears: ['one', 'know', 'come', 'foot', 'im', 'mouse', 'little', 'go', 'say']
A Caucus-Race and a Long Tale: ['seem', 'think', 'look', 'one', 'get', 'know', 'dodo', 'mouse', 'say']
The Rabbit Sends in a Little Bill: ['rabbit', 'grow', 'one', 'come', 'say', 'little', 'make', 'get', 'go']
Advice from a Caterpillar: ['pigeon', 'minute', 'get', 'well', 'think', 'im', 'caterpillar', 'little', 'say']
Pig and Pepper: ['see', 'footman', 'like', 'get', 'think', 'little', 'cat', 'go', 'say']
A Mad Tea-Party: ['go', 'well', 'take', 'march', 'hare', 'time', 'hatter', 'dormouse', 'say']
The Queen’s Croquet-Ground: ['three', 'see', 'think', 'come', 'go', 'king', 'look', 'queen', 'say']
The Mock Turtle’s Story: ['make', 'think', 'duchess', 'queen', 'gryphon', 'go', 'mock', 'turtle', 'say']
The Lobster Quadrille: ['could', 'dance', 'go', 'lobster', 'would', 'gryphon', 'turtle', 'mock', 'say']
Who Stol

1. Down the Rabbit-Hole: Find the way
   
2. The Pool of Tears: One little mouse
   
3. A Caucus-Race and a Long Tale: Looking for dodo
   
4. The Rabbit Sends in a Little Bill: Grown rabbit
   
5. Advice from a Caterpillar: Сaterpillar thoughts
   
6. Pig and Pepper: Little cat
   
7. A Mad Tea-Party: Dormouse
   
8. The Queen’s Croquet-Ground: King and Queen
   
9.  The Mock Turtle’s Story: Turtle's mock
    
10. The Lobster Quadrille: Dance
    
11. Who Stole the Tarts?: Court
    
12. Alice’s Evidence: Looking for evidence

### Top 10 most used verbs in sentences with Alice.

In [13]:
# change some preprocessing functions
lemmatizer = WordNetLemmatizer()
def lemmatize(word: str, pos: str) -> str:
    return (lemmatizer.lemmatize(word, pos), pos)

stopwords_english = stopwords.words('english') 
def stopwords_cleaner(words: List[str]) -> List[str]:
    return [(elem[0], elem[1]) for elem in words if elem[0] not in stopwords_english]

def text_preprocessing(text: str) -> List[str]:
    text = get_words(text=text)
    text = lowercase(text=text)
    text = tokenize(text=text)
    text = get_wordnet_pos(words=text)
    text = [lemmatize(word=elem[0], pos=elem[1]) for elem in text]
    text = stopwords_cleaner(words=text)

    return text

In [14]:
counter = {}
for temp_chapter in corpus:
    for temp_sentence in temp_chapter:
        temp_preprocessed_sentence = text_preprocessing(text=temp_sentence)
        temp_words = [elem[0] for elem in temp_preprocessed_sentence]
        if 'alice' not in temp_words:
            continue
        for word in temp_preprocessed_sentence:
            if word[1] == 'v' and word[0] != 'alice':
                counter[word[0]] = counter.get(word[0], 0) + 1

print(f"Top 10 most used verbs in sentences with Alice: {sorted(list(counter.items()), key=lambda elem: -elem[1])[:10]}")

Top 10 most used verbs in sentences with Alice: [('say', 192), ('think', 61), ('go', 57), ('look', 43), ('get', 42), ('begin', 33), ('see', 28), ('know', 20), ('find', 19), ('make', 18)]
