In [None]:
!pip install nltk spacy
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m58.9 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [40]:
import os
import sys
import pickle

# Detect if running in Google Colab
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    base_dir = "/content/drive/MyDrive/Smiles Discourse Analysis"
else:
    base_dir = "path/to/your/local/project/folder" # add directory if running locally
# Define save directory
pickle_dir = os.path.join(base_dir, "pickles")
os.makedirs(pickle_dir, exist_ok=True)
output_dir = os.path.join(base_dir, "output")



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
with open(os.path.join(pickle_dir, 'full_self_help.pkl'), 'rb') as f:
   self_help_fulltext = pickle.load(f)

with open(os.path.join(pickle_dir, 'full_thrift.pkl'), 'rb') as f:
    thrift_fulltext= pickle.load(f)


In [None]:
import nltk
nltk.download('punkt_tab')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
import re

def clean_text(text):
    text = re.sub(r"[^\w\s]", "", text)  # remove punctuation (.,?! etc.)
    text = re.sub(r"\d+", "", text)      # remove digits
    text = re.sub(r"\s+", " ", text)     # collapse multiple spaces
    return text.strip().lower()

In [None]:
self_help_fulltext = clean_text(self_help_fulltext)
thrift_fulltext = clean_text(thrift_fulltext)

In [None]:
sh_tokens = word_tokenize(self_help_fulltext)
thrift_tokens = word_tokenize(thrift_fulltext)
sh_tokens = [w for w in sh_tokens if not w in stop_words]
thrift_tokens = [w for w in thrift_tokens if not w in stop_words]

In [76]:
from collections import Counter
import pandas as pd
# Notice a problem with the list? It has 'men' and 'man' this could be resolved by using a lemmatiser
# Also other stopword lists could be used together to further reduce less valuable terms
# Stopwords are a tricky issue because some lists may remove useful words, so if you are unsure search the list for the term then remove it if necessary
sh_word_freq = Counter(sh_tokens)
thrift_word_freq = Counter(thrift_tokens)
top_sh_words = sh_word_freq.most_common(20)
top_thrift_words = thrift_word_freq.most_common(20)
print('Top Self-Help Words: ')
print(top_sh_words)
print('Top Thrift Words: ')
print(top_thrift_words)


Top Self-Help Words: 
[('one', 458), ('life', 386), ('man', 384), ('great', 346), ('time', 331), ('men', 322), ('said', 302), ('upon', 297), ('work', 275), ('may', 247), ('years', 233), ('many', 225), ('would', 210), ('first', 201), ('character', 197), ('made', 189), ('found', 181), ('much', 172), ('could', 171), ('industry', 169)]
Top Thrift Words: 
[('man', 409), ('one', 379), ('men', 372), ('may', 281), ('life', 277), ('money', 276), ('upon', 236), ('would', 232), ('much', 229), ('savings', 225), ('time', 223), ('many', 221), ('good', 213), ('mr', 209), ('great', 206), ('must', 185), ('said', 182), ('means', 179), ('people', 178), ('working', 177)]


In [None]:
with open(os.path.join(pickle_dir, 'self_help.pkl'), 'rb') as f:
    self_help_sentences = pickle.load(f)

with open(os.path.join(pickle_dir, 'thrift.pkl'), 'rb') as f:
    thrift_sentences = pickle.load(f)

In [59]:
import spacy
# So why didn't we begin with spacy? If you run the cell, you will see.
# The time for spacy is much longer, but the spacy lemmas will be most effective and not leave errors such as keeping 'men' and 'man' separate
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
sh_lemmas = []
th_lemmas = []
for sentence in self_help_sentences:
  for doc in nlp.pipe(sentence, batch_size=1000):
    lemmas = [token.lemma_ for token in doc if not token.is_punct and not token.is_space]
    sh_lemmas.append(lemmas)
for sentence in thift_sentences:
  for doc in nlp.pipe(thrift_fulltext, batch_size=1000):
    lemmas = [token.lemma_ for token in doc if not token.is_punct and not token.is_space]
    th_lemmas.append(lemmas)



In [73]:
# Lemmatise + remove punctuation, spaces, and stopwords
sh_lemmas = []
th_lemmas = []

# Self-Help
for doc in nlp.pipe(self_help_sentences, batch_size=1000):
    lemmas = [
        token.lemma_ for token in doc
        if not token.is_punct and not token.is_space and token.lemma_.lower() not in stop_words and len(token.lemma_) >2
    ]
    sh_lemmas.append(lemmas)

# Thrift
for doc in nlp.pipe(thrift_sentences, batch_size=1000):
    lemmas = [
        token.lemma_ for token in doc
        if not token.is_punct and not token.is_space and token.lemma_.lower() not in stop_words and len(token.lemma_) >2
    ]
    th_lemmas.append(lemmas)


In [74]:
# If you want to, you can load the lemmas into tokens for the rest of this notebook
# This will depend on the goals of the research
flat_sh_lemmas = [lemma for sent in sh_lemmas for lemma in sent]
flat_th_lemmas = [lemma for sent in th_lemmas for lemma in sent]

# Now this works perfectly
sh_word_freq = Counter(flat_sh_lemmas)
thrift_word_freq = Counter(flat_th_lemmas)

top_sh_words = sh_word_freq.most_common(20)
top_thrift_words = thrift_word_freq.most_common(20)

print('Top Self-Help Words: ')
print(top_sh_words)
print('Top Thrift Words: ')
print(top_thrift_words)

Top Self-Help Words: 
[('man', 781), ('work', 547), ('one', 493), ('great', 460), ('say', 431), ('life', 429), ('make', 393), ('time', 382), ('upon', 299), ('year', 285), ('find', 273), ('good', 272), ('well', 259), ('may', 246), ('many', 225), ('character', 215), ('become', 215), ('labour', 214), ('would', 210), ('first', 209)]
Top Thrift Words: 
[('man', 848), ('one', 421), ('work', 404), ('say', 360), ('make', 356), ('well', 329), ('good', 319), ('money', 306), ('life', 295), ('time', 286), ('may', 282), ('great', 276), ('year', 268), ('upon', 238), ('would', 236), ('much', 230), ('many', 221), ('become', 216), ('class', 214), ('Mr.', 213)]


In [75]:
# Subtract frequencies to get unique words in each corpus
unique_to_sh = sh_word_freq - thrift_word_freq
unique_to_thrift = thrift_word_freq - sh_word_freq

# This provides a crude way of looking at the differences between the corpora.
# However, this is only useful when comparing the this output to the most common words prior to subtraction
print(f"\n📘 Top 20 words that appear in the *Self-Help* corpus but not (or less often) in *Thrift* — based on frequency counts after subtracting shared words:")
print(unique_to_sh.most_common(20))

print(f"\n📗 Top 20 words that appear in the *Thrift* corpus but not (or less often) in *Self-Help* — again, based on frequency counts after subtraction:")
print(unique_to_thrift.most_common(20))


📘 Top 20 words that appear in the *Self-Help* corpus but not (or less often) in *Thrift* — based on frequency counts after subtracting shared words:
[('great', 184), ('study', 168), ('character', 157), ('work', 143), ('life', 134), ('find', 127), ('boy', 106), ('success', 104), ('Sir', 98), ('time', 96), ('mind', 96), ('industry', 90), ('learn', 88), ('first', 88), ('machine', 84), ('career', 75), ('one', 72), ('succeed', 72), ('though', 72), ('say', 71)]

📗 Top 20 words that appear in the *Thrift* corpus but not (or less often) in *Self-Help* — again, based on frequency counts after subtraction:
[('money', 230), ('saving', 166), ('debt', 154), ('class', 145), ('save', 140), ('bank', 134), ('pound', 134), ('pay', 128), ('wage', 126), ('society', 123), ('Mr.', 120), ('people', 119), ('spend', 110), ('home', 100), ('hundred', 99), ('workman', 98), ('poor', 93), ('woman', 93), ('house', 89), ('provide', 81)]


In [79]:
# Remember to base comparisons off percentages as each dataset has different word amounts
# Expect the highest frequencies to still be quite low, using lemmas will increase them
df_top_sh = pd.DataFrame(top_sh_words,columns=['word','freq'])
df_top_th = pd.DataFrame(top_thrift_words,columns=['word','freq'])
df_top_sh['percentage'] = round(df_top_sh['freq'] / df_top_sh['freq'].sum(),2)
df_top_th['percentage'] = round(df_top_th['freq'] / df_top_th['freq'].sum(),2)
print(f'self-help top words: ')
print(df_top_sh)
output_dir = os.path.join(base_dir, "output")
df_top_sh.to_csv(os.path.join(output_dir, 'self_help_top_words.csv'), index=False)
print(f'thrift top words: ')
print(df_top_th)
df_top_th.to_csv(os.path.join(output_dir, 'thrift_top_words.csv'), index=False)

self-help top words: 
         word  freq  percentage
0         one   458        0.09
1        life   386        0.07
2         man   384        0.07
3       great   346        0.07
4        time   331        0.06
5         men   322        0.06
6        said   302        0.06
7        upon   297        0.06
8        work   275        0.05
9         may   247        0.05
10      years   233        0.04
11       many   225        0.04
12      would   210        0.04
13      first   201        0.04
14  character   197        0.04
15       made   189        0.04
16      found   181        0.03
17       much   172        0.03
18      could   171        0.03
19   industry   169        0.03
thrift top words: 
       word  freq  percentage
0       man   409        0.08
1       one   379        0.08
2       men   372        0.08
3       may   281        0.06
4      life   277        0.06
5     money   276        0.06
6      upon   236        0.05
7     would   232        0.05
8      much   229

In [None]:
sh_bigrams = nltk.FreqDist(nltk.bigrams(sh_tokens))
thrift_bigrams = nltk.FreqDist(nltk.bigrams(thrift_tokens))
print(sh_bigrams.most_common(20))
print(thrift_bigrams.most_common(20))

[(('one', 'day'), 31), (('many', 'years'), 24), (('young', 'man'), 23), (('one', 'occasion'), 22), (('sir', 'john'), 22), (('several', 'years'), 21), (('set', 'work'), 17), (('young', 'men'), 16), (('robert', 'peel'), 16), (('early', 'life'), 15), (('indefatigable', 'industry'), 14), (('ten', 'years'), 14), (('men', 'business'), 13), (('years', 'old'), 13), (('like', 'manner'), 13), (('years', 'age'), 13), (('called', 'upon'), 13), (('granville', 'sharp'), 13), (('man', 'may'), 12), (('one', 'first'), 12)]
[(('savings', 'banks'), 67), (('savings', 'bank'), 67), (('working', 'classes'), 47), (('thousand', 'pounds'), 43), (('post', 'office'), 30), (('years', 'ago'), 28), (('penny', 'bank'), 28), (('working', 'men'), 27), (('working', 'man'), 27), (('per', 'cent'), 26), (('mr', 'sikes'), 25), (('hundred', 'years'), 23), (('working', 'people'), 23), (('mr', 'chadwick'), 22), (('penny', 'banks'), 22), (('old', 'age'), 21), (('office', 'savings'), 20), (('great', 'deal'), 20), (('pounds', 'y

In [None]:
searcher = input("Enter search term: ").lower()

# Filter bigrams where search term appears in either position of the tuple
filtered_sh_bigrams = [(k, v) for k, v in sh_bigrams.items() if searcher in k[0].lower() or searcher in k[1].lower()]
filtered_thrift_bigrams = [(k, v) for k, v in thrift_bigrams.items() if searcher in k[0].lower() or searcher in k[1].lower()]

# Sort by frequency and print top 20
print(f"\n🔍 Self-Help bigrams containing '{searcher}':")
print(sorted(filtered_sh_bigrams, key=lambda x: -x[1])[:20])

print(f"\n🔍 Thrift bigrams containing '{searcher}':")
print(sorted(filtered_thrift_bigrams, key=lambda x: -x[1])[:20])


Enter search term: work

🔍 Self-Help bigrams containing 'work':
[(('set', 'work'), 17), (('hard', 'work'), 7), (('worked', 'hard'), 7), (('worked', 'way'), 6), (('working', 'man'), 6), (('working', 'trade'), 5), (('great', 'works'), 5), (('working', 'power'), 4), (('work', 'great'), 4), (('upon', 'work'), 4), (('good', 'works'), 4), (('working', 'classes'), 4), (('true', 'worker'), 3), (('must', 'work'), 3), (('worked', 'trade'), 3), (('work', 'occupied'), 3), (('willing', 'work'), 3), (('part', 'work'), 3), (('work', 'even'), 3), (('working', 'qualities'), 3)]

🔍 Thrift bigrams containing 'work':
[(('working', 'classes'), 47), (('working', 'men'), 27), (('working', 'man'), 27), (('working', 'people'), 23), (('hard', 'work'), 8), (('english', 'workmen'), 7), (('worked', 'way'), 6), (('work', 'must'), 5), (('condition', 'working'), 5), (('colliers', 'ironworkers'), 5), (('skilled', 'workmen'), 5), (('amongst', 'working'), 5), (('work', 'hard'), 4), (('men', 'work'), 4), (('skilled', 'wo

In [None]:
sh_df_bigrams = pd.DataFrame(filtered_sh_bigrams, columns = ['word','freq'])
thrift_df_bigrams = pd.DataFrame(filtered_thrift_bigrams, columns = ['word','freq'])
print(f' Self-Help top bigrams: ')
print(sh_df_bigrams)
print(f' Thrift top bigrams: ')
print(thrift_df_bigrams)

 Self-Help top bigrams: 
                       word  freq
0            (true, worker)     3
1     (worker, stimulating)     1
2              (must, work)     3
3             (work, order)     1
4           (good, working)     1
...                     ...   ...
1187          (work, brief)     1
1188    (directed, working)     1
1189        (working, ship)     1
1190           (last, work)     1
1191      (work, completed)     1

[1192 rows x 2 columns]
 Thrift top bigrams: 
                                           word  freq
0                               (savings, work)     1
1                                (work, strive)     1
2                             (thrift, workmen)     1
3                      (workmen, capitalhabits)     1
4     (operativescolliers, ironworkersearnings)     1
...                                         ...   ...
1114                           (workmen, leads)     1
1115                       (workmen, tottenham)     1
1116                         (work

In [None]:
sh_trigrams = nltk.FreqDist(nltk.trigrams(sh_tokens))
thrift_trigrams = nltk.FreqDist(nltk.trigrams(thrift_tokens))
print(sh_trigrams.most_common(20))
print(thrift_trigrams.most_common(20))

[(('sir', 'walter', 'scott'), 10), (('sir', 'robert', 'peel'), 9), (('sir', 'joshua', 'reynolds'), 9), (('sir', 'humphry', 'davy'), 5), (('late', 'sir', 'robert'), 4), (('twenty', 'years', 'age'), 4), (('illustrations', 'power', 'perseverance'), 4), (('sir', 'charles', 'bell'), 4), (('sir', 'charles', 'napier'), 4), (('greater', 'less', 'degree'), 3), (('sir', 'john', 'sinclair'), 3), (('morning', 'till', 'nine'), 3), (('sir', 'samuel', 'romilly'), 3), (('conservatoire', 'des', 'arts'), 3), (('des', 'arts', 'et'), 3), (('arts', 'et', 'métiers'), 3), (('never', 'lost', 'sight'), 3), (('great', 'secret', 'success'), 3), (('east', 'india', 'company'), 3), (('remarkable', 'illustrations', 'power'), 3)]
[(('post', 'office', 'savings'), 19), (('hundred', 'years', 'ago'), 17), (('office', 'savings', 'banks'), 16), (('middle', 'upper', 'classes'), 7), (('hundred', 'thousand', 'pounds'), 7), (('_a', 'penny', 'day_'), 7), (('paris', 'universal', 'exhibition'), 6), (('five', 'per', 'cent'), 6), (

In [None]:
searcher = input("Enter search term: ").lower()

# Filter trigrams where the search term appears in any of the three words
filtered_sh_trigrams = [(k, v) for k, v in sh_trigrams.items() if any(searcher in word.lower() for word in k)]
filtered_thrift_trigrams = [(k, v) for k, v in thrift_trigrams.items() if any(searcher in word.lower() for word in k)]

# Sort by frequency and print top 20
print(f"\n🔍 Self-Help trigrams containing '{searcher}':")
print(sorted(filtered_sh_trigrams, key=lambda x: -x[1])[:20])

print(f"\n🔍 Thrift trigrams containing '{searcher}':")
print(sorted(filtered_thrift_trigrams, key=lambda x: -x[1])[:20])


In [None]:
sh_df_trigrams = pd.DataFrame(filtered_sh_trigrams, columns = ['word','freq'])
thrift_df_trigrams = pd.DataFrame(filtered_thrift_trigrams, columns = ['word','freq'])
print(f' Self-Help top trigrams: ')
print(sh_df_trigrams)
print(f' Thrift top trigrams: ')
print(thrift_df_trigrams)