### 1. Downloading text

In [72]:
import requests

url = "http://www.gutenberg.org/files/11/11-0.txt"
response = requests.get(url)
text = response.text


### 2. Preprocessing

In [6]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\matni\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\matni\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\matni\AppData\Roaming\nltk_data...


In [60]:
import re

def process_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove non-alphabetic characters and numbers
    text = re.sub(r'[^a-z ]', '', text)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return ' '.join(tokens)


### 3. Most important words

In [79]:
chapter_names = text.split('CHAPTER')[1:13]
chapters = text.split('CHAPTER')[13:]
chapter_names_processed = [process_text(name) for name in chapter_names]
chapters_processed = [process_text(_) for _ in chapters]


In [84]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Find top 10 words for each chapter
top_words_by_chapter = []
for chapter in chapters_processed:
    # Calculate TF-IDF scores for words in the chapter
    tfidf_matrix = tfidf_vectorizer.fit_transform([chapter])
    feature_names = tfidf_vectorizer.get_feature_names_out()
    top_words = [feature_names[i] for i in tfidf_matrix.sum(axis=0).argsort()[0, -10:][::-1]]
    top_words_by_chapter.append(top_words)


In [86]:
for i, top_words in enumerate(top_words_by_chapter):
    chapter_name = ', '.join(top_words[0][0])
    print(f"{chapter_names_processed[i]}: {chapter_name}")


rabbithole: nothing, door, thought, think, way, one, see, like, little, alice
ii pool tear: one, must, went, foot, dear, thing, said, mouse, little, alice
iii caucusrace long tale: lory, one, soon, long, thing, know, dodo, mouse, alice, said
iv rabbit sends little bill: get, bill, thought, heard, quite, one, rabbit, said, little, alice
v advice caterpillar: dont, serpent, ive, size, pigeon, im, little, caterpillar, alice, said
vi pig pepper: baby, went, little, much, footman, duchess, like, cat, alice, said
vii mad teaparty: went, thing, know, time, hare, march, dormouse, hatter, alice, said
viii queen croquetground: went, three, two, see, cat, king, head, alice, queen, said
ix mock turtle story: say, dont, queen, went, gryphon, duchess, turtle, mock, alice, said
x lobster quadrille: join, lobster, beautiful, wont, would, alice, gryphon, turtle, mock, said
xi stole tart: thought, court, rabbit, dormouse, queen, one, alice, hatter, king, said
xii alices evidence: state, copy, term, elec

Rabbithole Chronicles: Through the Door of Thought

The Pool of Tears: Footprints of the Dear Mouse

The Caucus Race and the Long Tale: Lory's Knowing Dodo

Rabbit Sends Little Bill: Chasing the White Rabbit's Thoughts

Advice from the Caterpillar: Conversations with the Wise Serpent

Pig and Pepper: Of Babies and Cheshire Cats

The Mad Teaparty: Hares, March Hares, and Dormice

The Queen's Croquet Ground: Heads, Cats, and Royal Decrees

The Mock Turtle's Story: Gryphons and Mock Turtles Speak

The Lobster Quadrille: Of Beautiful Lobsters and Unlikely Dances

Who Stole the Tart: Trials and Witnesses in Wonderland

Alice's Evidence: A State of Electronic Wonderland

#### 4. Most often words with Alice word

In [89]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\matni\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\matni\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [90]:
from nltk import pos_tag
from collections import Counter

# Tokenize sentences
sentences = re.split(r'[.!?]', text)
sentences = [process_text(_) for _ in sentences]
alice_verbs = []

# Extract verbs in sentences with "Alice"
for sentence in sentences:
    if 'alice' in sentence:
        words = word_tokenize(sentence)
        # Perform part-of-speech tagging
        tagged_words = pos_tag(words)
        # Extract verbs (VB*)
        verbs = [word for word, pos in tagged_words if pos.startswith('VB')]
        alice_verbs.extend(verbs)

# Count verb occurrences
verb_counts = Counter(alice_verbs)

# Print top 10 most used verbs with Alice
print("Top 10 most used verbs with Alice:")
print(verb_counts.most_common(10))


Top 10 most used verbs with Alice:
[('said', 166), ('thought', 35), ('went', 27), ('say', 20), ('looked', 18), ('began', 18), ('got', 16), ('know', 16), ('see', 15), ('think', 14)]


most often Alice says and then thinks :)