In [1]:
#%%md 1. Harvest the Text
import urllib.request # To open and read URLs
import re # To match regular expressions
import unicodedata # To normalize texts

# Book of Romeo and Juliet and The Comedy of Errors from Project Gutenberg
url1 = 'https://www.gutenberg.org/cache/epub/1513/pg1513.txt'
url2 = 'https://www.gutenberg.org/cache/epub/23046/pg23046.txt'
try:
    with urllib.request.urlopen(url1) as f:
        text1 = f.read().decode('utf-8')
        print(text1[:500])  
    with urllib.request.urlopen(url2) as g:
        text2 = g.read().decode('utf-8')
        print(text2[:500])
except Exception as e:
    print("An error occurred:", e)
#%%md 2. Analyzing the text
#Step 1: Text Cleaning and Processing 
def clean_text(text):
# Remove text before and after the main content 
    start = re.search(r"\*\*\* START OF THE PROJECT GUTENBERG EBOOK .* \*\*\*", text)
    end = re.search(r"\*\*\* END OF THE PROJECT GUTENBERG EBOOK .*\*\*\*", text)
    if start and end: 
        cleaned = text[start.end():end.start()]
    else: 
        print("Warning: Could not find Gutenberg delimiters; cleaning entire text")
        cleaned = text
    return cleaned.strip()
try:
    cleaned_text1 = clean_text(text1)
    cleaned_text2 = clean_text(text2)
    print("Cleaned Text 1 Preview:\n", cleaned_text1[:500])
except Exception as e:
    print("Error during cleaning:", e)
# Step 3: Normalize unicode
try:
    cleaned_text1 = unicodedata.normalize("NFKD", cleaned_text1)
    cleaned_text2 = unicodedata.normalize("NFKD", cleaned_text2)
    # Step 4: Remove HTML tags and special characters
    cleaned_text1 = re.sub(r"<.*?>", " ", cleaned_text1)
    cleaned_text2 = re.sub(r"<.*?>", " ", cleaned_text2)  
    cleaned_text1 = re.sub(r"[^a-zA-Z0-9\s.,;:'\"!?-]", " ", cleaned_text1)
    cleaned_text2 = re.sub(r"[^a-zA-Z0-9\s.,;:'\"!?-]", " ", cleaned_text2)

# Step 4: Convert to lowercase
    cleaned_text1 = cleaned_text1.lower()
    cleaned_text2 = cleaned_text2.lower()

# Step 5: Replace multiple spaces/newlines with one space
    cleaned_text1 = re.sub(r"\s+", " ", cleaned_text1).strip()
    cleaned_text2 = re.sub(r"\s+", " ", cleaned_text2).strip()
        # Step 7: Tokenize into words
# Step 6: Tokenization
    token_1 = re.findall(r"\b\w+\b", cleaned_text1)
    token_2 = re.findall(r"\b\w+\b", cleaned_text2)
    
# Step 7: Print short preview
    print("Cleaned text preview Text 1:\n", cleaned_text1[:])
    print("Cleaned text preview Text 2:\n", cleaned_text2[:])
    print("\nTotal words Text 1:", len(token_1))
    print("\nTotal words Text 2:", len(token_2))
except Exception as e:
    print("An error occurred:", e)
#%%md Step 2: Removing Stop Words 
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
# Define the stop words set
stop_words = set(stopwords.words('english'))
# Filter out stop words
filtered_tokens1 = [word for word in token_1 if word.lower() not in stop_words]
filtered_tokens2 = [word for word in token_2 if word.lower() not in stop_words]
print("Filtered tokens:", filtered_tokens1[:50], filtered_tokens2[:50])
print("\nTotal Filtered:", len(filtered_tokens1, filtered_tokens2))
print("\nTotal Original:", len(token_1, token_2))

﻿The Project Gutenberg eBook of Romeo and Juliet
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eB
﻿The Project Gutenberg eBook of The Comedy of Errors
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using thi
Cleaned Text 1

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kchheav1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Filtered tokens: ['tragedy', 'romeo', 'juliet', 'william', 'shakespeare', 'contents', 'prologue', 'act', 'scene', 'public', 'place', 'scene', 'ii', 'street', 'scene', 'iii', 'room', 'capulet', 'house', 'scene', 'iv', 'street', 'scene', 'v', 'hall', 'capulet', 'house', 'act', 'ii', 'chorus', 'scene', 'open', 'place', 'adjoining', 'capulet', 'garden', 'scene', 'ii', 'capulet', 'garden', 'scene', 'iii', 'friar', 'lawrence', 'cell', 'scene', 'iv', 'street', 'scene', 'v'] ['transcriber', 'note', 'text', 'comedy', 'errors', 'volume', 'nine', 'volume', '1863', 'cambridge', 'edition', 'shakespeare', 'preface', 'e', 'text', '23041', 'plays', 'volume', 'available', 'separate', 'e', 'texts', 'general', 'notes', 'original', 'location', 'end', 'play', 'text', 'critical', 'notes', 'grouped', 'end', 'scene', 'line', 'numbers', 'original', 'text', 'line', 'breaks', 'dialogue', 'including', 'prose', 'passages', 'unchanged', 'brackets', 'also', 'unchanged', 'avoid', 'ambiguity']


TypeError: len() takes exactly one argument (2 given)

In [None]:
#%%md 1. Harvest the Text
import urllib.request # To open and read URLs
import re # To match regular expressions
import unicodedata # To normalize texts

# Book of Romeo and Juliet and The Comedy of Errors from Project Gutenberg
url1 = 'https://www.gutenberg.org/cache/epub/1513/pg1513.txt'
url2 = 'https://www.gutenberg.org/cache/epub/23046/pg23046.txt'
try:
    with urllib.request.urlopen(url1) as f:
        text1 = f.read().decode('utf-8')
        print(text1[:500])  
    with urllib.request.urlopen(url2) as g:
        text2 = g.read().decode('utf-8')
        print(text2[:500])
except Exception as e:
    print("An error occurred:", e)
#%%md 2. Analyzing the text
#Step 1: Text Cleaning and Processing 
def clean_text(text):
# Remove text before and after the main content 
    start = re.search(r"\*\*\* START OF THE PROJECT GUTENBERG EBOOK .* \*\*\*", text)
    end = re.search(r"\*\*\* END OF THE PROJECT GUTENBERG EBOOK .*\*\*\*", text)
    if start and end: 
        cleaned = text[start.end():end.start()]
    else: 
        print("Warning: Could not find Gutenberg delimiters; cleaning entire text")
        cleaned = text
    return cleaned.strip()
try:
    cleaned_text1 = clean_text(text1)
    cleaned_text2 = clean_text(text2)
    print("Cleaned Text 1 Preview:\n", cleaned_text1[:500])
except Exception as e:
    print("Error during cleaning:", e)
# Step 3: Normalize unicode
try:
    cleaned_text1 = unicodedata.normalize("NFKD", cleaned_text1)
    cleaned_text2 = unicodedata.normalize("NFKD", cleaned_text2)
    # Step 4: Remove HTML tags and special characters
    cleaned_text1 = re.sub(r"<.*?>", " ", cleaned_text1)
    cleaned_text2 = re.sub(r"<.*?>", " ", cleaned_text2)  
    cleaned_text1 = re.sub(r"[^a-zA-Z0-9\s.,;:'\"!?-]", " ", cleaned_text1)
    cleaned_text2 = re.sub(r"[^a-zA-Z0-9\s.,;:'\"!?-]", " ", cleaned_text2)

# Step 4: Convert to lowercase
    cleaned_text1 = cleaned_text1.lower()
    cleaned_text2 = cleaned_text2.lower()

# Step 5: Replace multiple spaces/newlines with one space
    cleaned_text1 = re.sub(r"\s+", " ", cleaned_text1).strip()
    cleaned_text2 = re.sub(r"\s+", " ", cleaned_text2).strip()
        # Step 7: Tokenize into words
# Step 6: Tokenization
    token_1 = re.findall(r"\b\w+\b", cleaned_text1)
    token_2 = re.findall(r"\b\w+\b", cleaned_text2)
    
# Step 7: Print short preview
    print("Cleaned text preview Text 1:\n", cleaned_text1[:])
    print("Cleaned text preview Text 2:\n", cleaned_text2[:])
    print("\nTotal words Text 1:", len(token_1))
    print("\nTotal words Text 2:", len(token_2))
except Exception as e:
    print("An error occurred:", e)
#%%md Step 2: Removing Stop Words 
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
# Define the stop words set
stop_words = set(stopwords.words('english'))
# Filter out stop words
filtered_tokens1 = [word for word in token_1 if word.lower() not in stop_words]
filtered_tokens2 = [word for word in token_2 if word.lower() not in stop_words]
print("Filtered tokens:", filtered_tokens1[:50], filtered_tokens2[:50])
print("\nTotal Filtered:", len(filtered_tokens1, filtered_tokens2))
print("\nTotal Original:", len(token_1, token_2))
'''#%%md Step 3: Frequency Analysis 
# # Step 7: (Optional) Rejoin filtered words into cleaned text
cleaned_text = " ".join(filtered_tokens)
# Step 8: Print results
print("Original text:\n", text[:50])
print("\nCleaned text:\n", cleaned_text[:50])
print("\nTokens before stopword removal:\n", tokens[:50])
print("\nTokens after stopword removal:\n", filtered_tokens[:50])

# %%md Step 3: Frequency Analysis
# Step 9: Count word frequencies
word_freq = {}
for word in filtered_tokens:
    if word in word_freq:
        word_freq[word] += 1
    else:
        word_freq[word] = 1
#%%md Step 4: Computing Summary Statistics 
#Print the top 50 words
top_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:50]
print("\nTop 50 most frequent words:\n" ,  top_words)

##Calculate average word length
total_length = sum(len(word) * count for word, count in word_freq.items())
total_words = sum(word_freq.values())
average_word_length = total_length / total_words if total_words > 0 else 0
print(f"\nAverage word length: {average_word_length:.2f}")

#Calculate vocabulary size
vocabulary_size = len(word_freq)
print(f"\nVocabulary size: {vocabulary_size}")

#Calculate vocabulary richness
vocabulary_richness = vocabulary_size / total_words if total_words > 0 else 0
print(f"\nVocabulary richness: {vocabulary_richness:.4f}")

#%%md Part 3: Learning with AI I will talk about two problems I ran 
# into while doing this assignment. The first problem I ran into was
# how to install nltk. The second problem was learning how to compare two pieces of text from 
# the Gutenburg project. 

# Problem one: While I was able to run my code before I forked the repository, 
# I was not able to rerun the same code after I forked the repositoty. The code 
# keep saying that no module named nltk was found. 

# Crafted prompts 1: It took me five prompts to get to the solution. 
# I first sent a screenshot of the error message I was getting which says
# "no module nltk found", and asked copilot how to fix it. Copilot suggested that I install nltk using pip. I followed the 
# the suggestion, and piped install nltk in the terminal. However, in when 
# I installed the package, I got a message that said "Requirement already satisfied".
# I was very confused, so I sent another prompt asking "what could be wrong?"
# if nltk was already installed. Copilot suggested that I checked my python
# environment, and make sure that I was using the same environment where nltk was located. 
# Copilot gave me an option to check my nltk installation by running "pip show nltk" 
# in the terminal and check the location of the package in jupyter notebook by running 
# import sysprint(sys.executable) and !{sys.executable} -m pip install nltk. The locations 
# did not match. I decided to run the "python -m pip install nltk" in the console
# instead of the terminal and that fixed the problem. 

# Review and verify(1): Copilot gave me different options and 
# I usually started with the simplest one first, usually the first or 
# second suggestion. In this case, it takes three prompts to get to the solution. 

# The process: Even though installing nltk does not seem hard, 
# because it only takes one command to install it, not being able to get 
# it to work was very confusing. Thankfully, copilot was able to guide me, giving 
# me a different suggestions based on my comfort level. 

# Problem two: When comparing two pieces of texts from the Gutenburg prohect, 
# I felt like my code was very repetitive and long. I somewhat have an idea on how 
# to work with one text, but did not know how to extend the code to work with 
# more than two texts or more. 

# Crafted prompts #2: 
# 1) I started by sending copilot my existing code that works with one text, 
# and asked "how do I clean and process two texts at the same time?" 
# 2) While I was normalizing the unicode, I got an error message 
# saying "an error occurred: invalid normalization form". I did not 
# understand what I did wrong, so I asked chat how to fix the problem
# including the screenshot of the error message. 





# Review and verify(2): 
# 1) Copilot suggested that I could create a function that takes in a text as 
# input and returns the cleaned and processed text. I know we covered this in
# class, so I incorporated the function into my code. It made my code so much more 
# concise and easier to read. 
# 2) Copilot pointed out that I made a type in the normalization 
# form, and suggested that I change "NFDK" to "NFKD. After this step, my code works fine. 
