In [None]:
!pip install requests beautifulsoup4 nltk



In [None]:
import os
import sys
import pickle

# Treat the session as Google Colab when the google.colab package is already
# loaded in this interpreter.
IN_COLAB = 'google.colab' in sys.modules

if not IN_COLAB:
    # Local run: replace the placeholder with your own project folder.
    base_dir = "path/to/your/local/project/folder" # add directory if running locally
else:
    # Colab run: mount Google Drive and keep project files in a Drive folder.
    from google.colab import drive
    drive.mount('/content/drive')
    base_dir = "/content/drive/MyDrive/Smiles Discourse Analysis"

The first step is to download two books by Samuel Smiles. These texts — Self-Help and Thrift — were written by the same author, as part of the same series, and on related themes. This makes them well-suited for comparison: we can expect both overlap and divergence in how key concepts are expressed.

While these two books serve as a compelling case study, any pair of corpora could be used for this type of analysis. What's important is that the comparison is guided by a clear research question or interpretive goal — beyond simply identifying similar sentences.

In [None]:
import requests
from bs4 import BeautifulSoup
import nltk
# Download NLTK's 'punkt_tab' resource. Presumably this is the tokenizer data that
# sent_tokenize needs at call time — confirm against the installed NLTK version.
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize

# Project Gutenberg serves these books as plain-text files, so requests alone is enough to fetch them;
# BeautifulSoup is only needed when the source is an HTML page.
# When working with other websites, extra care may be needed to handle inconsistent formatting,
# especially with headers, footers, and licensing text.
# Gutenberg wraps each book in "*** START OF" / "*** END OF" marker lines, so the function below can strip that boilerplate with minimal cleanup.


def _strip_gutenberg_boilerplate(raw_text):
    """Return the text between the Gutenberg START and END marker lines.

    If either marker is missing, the input is returned unchanged.
    """
    start_marker = "*** START OF"
    end_marker = "*** END OF"
    closing = "***"
    # Upper bound on how far past "*** START OF" the closing asterisks of the
    # marker line may sit; keeps a "***" inside the book body from being
    # mistaken for the end of the marker.
    scan_limit = 500

    start_idx = raw_text.find(start_marker)
    end_idx = raw_text.find(end_marker)
    if start_idx == -1 or end_idx == -1:
        return raw_text  # fallback: markers not found, keep everything

    # The START line has the form "*** START OF ... <TITLE> ***". Begin after
    # its closing asterisks so the marker text does not leak into the book.
    after_marker = start_idx + len(start_marker)
    closing_idx = raw_text.find(
        closing, after_marker, min(end_idx, after_marker + scan_limit)
    )
    if closing_idx != -1:
        body_start = closing_idx + len(closing)
    else:
        # No closing asterisks nearby: drop the rest of the marker line instead.
        newline_idx = raw_text.find("\n", after_marker, end_idx)
        body_start = newline_idx + 1 if newline_idx != -1 else after_marker

    return raw_text[body_start:end_idx]


def fetch_and_process_gutenberg_text(url, timeout=30):
    """Download a Project Gutenberg plain-text book and split it into sentences.

    Args:
        url: Address of the plain-text file to download.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``, so a
            stalled connection fails instead of hanging the notebook.

    Returns:
        A ``(sentences, clean_text)`` tuple: ``clean_text`` is the book body
        with the Gutenberg header/footer removed and all whitespace runs
        collapsed to single spaces; ``sentences`` is ``sent_tokenize(clean_text)``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    # Step 1: Fetch the text file; fail loudly rather than tokenising an error page.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Step 2: Remove Gutenberg headers/footers (including the marker lines).
    book_text = _strip_gutenberg_boilerplate(response.text)

    # Collapse line breaks and repeated whitespace; split() with no arguments
    # already treats \r, \n and runs of spaces as separators.
    clean_text = ' '.join(book_text.split())

    # Step 3: Tokenise into sentences
    sentences = sent_tokenize(clean_text)

    return sentences, clean_text

# Source files on Project Gutenberg
# 📘 Samuel Smiles: Self-Help
self_help_url = "https://www.gutenberg.org/files/935/935-0.txt"
# 📗 Samuel Smiles: Thrift
thrift_url = "https://www.gutenberg.org/cache/epub/14418/pg14418.txt"

# Download each book and keep both the sentence list and the cleaned full text.
self_help_sentences, self_help_fulltext = fetch_and_process_gutenberg_text(self_help_url)
thrift_sentences, thrift_fulltext = fetch_and_process_gutenberg_text(thrift_url)

# Sanity check that every variable is populated. len() of the full text counts
# characters, not words; it is only there to compare the sizes of the two books.
print(
    f"Self-Help - Total sentences: {len(self_help_sentences)}",
    f"Self-Help - Full text length: {len(self_help_fulltext)}",
    f"Thrift - Total sentences: {len(thrift_sentences)}",
    f"Thrift - Full text length: {len(thrift_fulltext)}",
    sep="\n",
)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Self-Help - Total sentences: 4485
Self-Help - Full text length: 818382
thrift - Total sentences: 5926
thrift - Full text length: 714525


In [None]:
# Pickles live in a "pickles" sub-folder of the project directory; create it on first run.
pickle_dir = os.path.join(base_dir, "pickles")
os.makedirs(pickle_dir, exist_ok=True)

# Write each book's sentence list to its own pickle file.
for filename, sentences in (
    ('self_help.pkl', self_help_sentences),
    ('thrift.pkl', thrift_sentences),
):
    with open(os.path.join(pickle_dir, filename), 'wb') as f:
        pickle.dump(sentences, f)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
