### 1. Downloading text

In [1]:
import requests

url = "http://www.gutenberg.org/files/11/11-0.txt"
response = requests.get(url)
text = response.text


### 2. Preprocessing

In [2]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\matni\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\matni\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\matni\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
import re

def process_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove non-alphabetic characters and numbers
    text = re.sub(r'[^a-z ]', '', text)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return ' '.join(tokens)


### 3. Most important words

In [4]:
chapter_names = text.split('CHAPTER')[1:13]
chapters = text.split('CHAPTER')[13:]
chapter_names_processed = [process_text(name) for name in chapter_names]
chapters_processed = [process_text(_) for _ in chapters]


In [6]:
' '.join(chapters_processed)




In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(input='content', analyzer='word').fit(chapters_processed)

# Find top 10 words for each chapter
top_words_by_chapter = []
for chapter in chapters_processed:
    # Calculate TF-IDF scores for words in the chapter
    tfidf_matrix = tfidf_vectorizer.transform([chapter])
    feature_names = tfidf_vectorizer.get_feature_names_out()
    top_words = [feature_names[i] for i in tfidf_matrix.sum(axis=0).argsort()[0, -10:][::-1]]
    top_words_by_chapter.append(top_words)


In [20]:
for i, top_words in enumerate(top_words_by_chapter):
    chapter_name = ', '.join(top_words[0][0])
    print(f"{chapter_names_processed[i]}: {chapter_name}")


rabbithole: way, one, see, door, eat, like, little, key, bat, alice
ii pool tear: oh, cried, said, cat, dear, swam, pool, little, alice, mouse
iii caucusrace long tale: know, course, thimble, dry, prize, lory, alice, dodo, said, mouse
iv rabbit sends little bill: ann, one, fan, said, bottle, little, bill, rabbit, puppy, alice
v advice caterpillar: caterpillarwell, father, size, egg, youth, serpent, pigeon, alice, caterpillar, said
vi pig pepper: cook, mad, like, pig, duchess, baby, alice, footman, cat, said
vii mad teaparty: asleep, draw, tea, twinkle, hare, alice, march, said, hatter, dormouse
viii queen croquetground: head, executioner, hedgehog, gardener, soldier, cat, king, alice, said, queen
ix mock turtle story: dont, went, queen, moral, duchess, gryphon, alice, turtle, said, mock
x lobster quadrille: wont, alice, whiting, beautiful, join, lobster, said, gryphon, turtle, mock
xi stole tart: rabbit, thecourt, juror, queen, dormouse, witness, court, hatter, said, king
xii alices ev

#### 4. Most often words with Alice word

In [21]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\matni\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\matni\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [22]:
from nltk import pos_tag
from collections import Counter

# Tokenize sentences
sentences = re.split(r'[.!?]', text)
sentences = [process_text(_) for _ in sentences]
alice_verbs = []

# Extract verbs in sentences with "Alice"
for sentence in sentences:
    if 'alice' in sentence:
        words = word_tokenize(sentence)
        # Perform part-of-speech tagging
        tagged_words = pos_tag(words)
        # Extract verbs (VB*)
        verbs = [word for word, pos in tagged_words if pos.startswith('VB')]
        alice_verbs.extend(verbs)

# Count verb occurrences
verb_counts = Counter(alice_verbs)

# Print top 10 most used verbs with Alice
print("Top 10 most used verbs with Alice:")
print(verb_counts.most_common(10))


Top 10 most used verbs with Alice:
[('said', 166), ('thought', 35), ('went', 27), ('say', 20), ('looked', 18), ('began', 18), ('got', 16), ('know', 16), ('see', 15), ('think', 14)]


most often Alice says and then thinks :)