1. Download Alice in Wonderland by Lewis Carroll from Project Gutenberg's website

In [7]:
import requests

url = "http://www.gutenberg.org/files/11/11-0.txt"
response = requests.get(url)
response.encoding = 'utf-8'
text = response.text

2. Perform any necessary preprocessing on the text, including converting to lower case, removing stop words, numbers / non-alphabetic characters, lemmatization

In [8]:
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Remove HTML tags using BeautifulSoup
text = BeautifulSoup(text, "html.parser").get_text()
text = text.replace('XII.', '12.')
text = text.replace('XI.', '11.')
text = text.replace('IX.', '9.')
text = text.replace('VIII.', '8.')
text = text.replace('VII.', '7.')
text = text.replace('VI.', '6.')
text = text.replace('IV.', '4.')
text = text.replace('X.', '10.')
text = text.replace('V.', '5.')
text = text.replace('III.', '3.')
text = text.replace('II.', '2.')
text = text.replace('I.', '1.')

# Tokenize the text
words = word_tokenize(text)

# Convert to lowercase and remove non-alphabetic characters
words = [word.lower() for word in words if word.isalpha()]

# Remove stopwords
stop_words = set(stopwords.words("english"))
words = [word for word in words if word not in stop_words]

# Lemmatize the words
lemmatizer = WordNetLemmatizer()
words = [lemmatizer.lemmatize(word) for word in words]

3. Find the top 10 important words for each chapter and name the chapters:

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
import re

# Split the text into chapters
chapters = re.split(r'chapter', ' '.join(words))
last_chapter = chapters[-1]
chapters[-1] = ' end '.join(last_chapter.split(' end ')[0:2])

# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=10)

chapter_names = []
top_words_per_chapter = []

# Process each chapter
for i, chapter in enumerate(chapters[13:]): 
    chapter_vector = tfidf_vectorizer.fit_transform([chapter])
    feature_names = tfidf_vectorizer.get_feature_names_out()
    
    # Sort and get top 10 words by TF-IDF score
    sorted_words = [feature_names[i] for i in chapter_vector.toarray().argsort()[0][::-1][:10]]
    
    # Naming chapters using top words
    chapter_name = f"Chapter {i + 1}: {' '.join(sorted_words)}"
    chapter_names.append(chapter_name)
    top_words_per_chapter.append(sorted_words)

# Print chapter names and top words
for i, chapter_name in enumerate(chapter_names):
    print(chapter_name)
    print(top_words_per_chapter[i])

Chapter 1: alice little way like see think door said one could
['alice', 'little', 'way', 'like', 'see', 'think', 'door', 'said', 'one', 'could']
Chapter 2: alice mouse little said go dear thing foot thought must
['alice', 'mouse', 'little', 'said', 'go', 'dear', 'thing', 'foot', 'thought', 'must']
Chapter 3: said alice mouse dodo know one soon lory long bird
['said', 'alice', 'mouse', 'dodo', 'know', 'one', 'soon', 'lory', 'long', 'bird']
Chapter 4: alice little rabbit said one bill thought sure heard get
['alice', 'little', 'rabbit', 'said', 'one', 'bill', 'thought', 'sure', 'heard', 'get']
Chapter 5: said alice caterpillar serpent pigeon well little minute think size
['said', 'alice', 'caterpillar', 'serpent', 'pigeon', 'well', 'little', 'minute', 'think', 'size']
Chapter 6: said alice cat like little duchess much footman would baby
['said', 'alice', 'cat', 'like', 'little', 'duchess', 'much', 'footman', 'would', 'baby']
Chapter 7: said alice hatter dormouse march hare time thing we

4. Find the Top 10 most used verbs in sentences with Alice.

In [15]:
import nltk
from collections import Counter

# Tokenize sentences
sentences = nltk.sent_tokenize(text)

# Find sentences containing "Alice" and extract verbs
alice_verbs = []
for sentence in sentences:
    if "Alice" in sentence:
        words = word_tokenize(sentence)
        tagged_words = nltk.pos_tag(words)
        verbs = [word for word, pos in tagged_words if pos.startswith("V") and word.isalpha() and word not in stop_words]
        alice_verbs.extend(verbs)

# Count verb occurrences
verb_counts = Counter(alice_verbs)

# Get the top 10 most used verbs with Alice
top_verbs = verb_counts.most_common(10)

# Print the results
print("Top 10 verbs used with Alice:")
for verb, count in top_verbs:
    print(f"{verb}: {count}")

Top 10 verbs used with Alice:
said: 254
thought: 50
went: 41
know: 37
say: 33
looked: 31
see: 31
got: 27
think: 25
began: 25
