In [46]:
import re
from pathlib import Path

import spacy, pandas as pd, gensim as gn
import gensim.corpora as corpora
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamulticore import LdaMulticore, LdaModel
# from gensim.models.ldamodel import LdaModel

# spacy internal setup
if not 'nlp' in locals():
    model_loaded: bool = False
    while not model_loaded:
        # we keep trying until this works...
        try:
            print("attempting to load model")
            # attempt to load the spacy model
            nlp: spacy.Language = spacy.load("en_core_web_sm")
            print("model loaded successfully")
            model_loaded = True
        except BaseException as e:
            print("model failed to load")
            # if we fail to load the model, we are going to make
            # sure the package is installed...
            from spacy.cli import download
            download("en_core_web_sm")
            # then loop around and try to load it again

In [2]:
path_root: Path = Path(".").resolve()

In [3]:
md_text_raw: str = path_root.joinpath("moby_dick.txt").read_text('utf-8')

# Preprocess the text

In [4]:
# DEFINE SOME USEFUL VARIABLES FOR PREPROCESSING
# utf-8 codes for some characters in the text
dq_op: str = "\u201C" # double quotes open
dq_cl: str = "\u201D" # double quotes close
sq_op: str = "\u2018" # single quote open
sq_cl: str = "\u2019" # single quote close
under: str = "\u005F" # underscore
hyphe: str = "\u002D" # hyphen/minus
mdash: str = "\u2014" # em dash

In [5]:
# replace newlines and carriage returns with whitespace
md_text: str = re.sub("[\n\r]", " ", md_text_raw)
# remove everything before the first chapter
md_text = re.sub("[\w\W]*(?=CHAPTER 1\. Loomings)", "", md_text, count=1)
# remove everything after the end of the epilogue
md_text = re.sub("\s*(\*\*\* END)[\w\W]+", "", md_text, count=1)
# there are some weird embedded books in the middle of the text
# first we remove the in-betweens of the embedded books
md_text = re.sub("(Thus ends BOOK[\w\W]+?)([\w\W]+?BOOK[\w\W]+?)(?=\s\s)", "", md_text)
# then we get rid of the chapter headings of the embedded books
md_text = re.sub("BOOK\s+[IV]+([\w\W]+?(?=CHAPTER)CHAPTER\s+[IV\d]+\.\s+[\w\W]+?(?=\.)\.)?", "", md_text)
# CLEANING UP IMPORTANT UTF-8 CHARACTERS
# replace all utf single quotes with ascii single quotes
md_text = re.sub(f"{sq_op}|{sq_cl}", "\'", md_text)
# replace all utf double quotes with ascii double quotes
md_text = re.sub(f"{dq_op}|{dq_cl}", "\"", md_text)
# replace utf underscores with ascii
md_text = re.sub(f"{under}", "_", md_text)
# replace utf hyphens with ascii
md_text = re.sub(f"{hyphe}", "-", md_text)
# replace utf em dash with hyphen
md_text = re.sub(f"{mdash}", "-", md_text)
# next (and I don't know if this is a good idea...) we're going to replace
# all punctuation that is NOT a:
# . or ! or ? or whitespace or "
md_text = re.sub("[^\w\d\s\.\"?!]", " ", md_text)
# finally, we want to replace repeated whitespace with single whitespace
md_text = re.sub("\s+", " ", md_text)

In [6]:
# defining a function to contain intermediate variables
def split_chapters(text: str) -> dict[str, str]:
    # build a list of the chapter titles and their contents
    list_results: list[str] = [result.strip() for result in re.split("(CHAPTER\s\d+\.\s)|(Epilogue)", text) if result != "" and result is not None]
    # use that list to create a dictionary that is {chapter_title: chapter_content}
    return {x: list_results[i+1] for i, x in enumerate(list_results) if re.match("(CHAPTER\s\d+\.)|(Epilogue)", x)}
# run the processed text through the chapter splitter
dict_chapters: dict[str, str] = split_chapters(md_text)
# dict_chapters

In [7]:
# instantiate the spacy nlp object
# this object contains all of the functionality for
# turning a string or list of strings into spacy "Documents"
"""we no longer need to do the following line as it's loaded with the imports"""
# nlp: spacy.Language = spacy.load("en_core_web_sm")

# we CAN add pipelines here though
nlp.add_pipe("merge_noun_chunks")

# our book is a little too long for the default
# processing limit so we increase it slightly
nlp.max_length = 1200000

In [8]:
# let spacy process the full text
# this produces a "Document"
# the document is a tokenized representation of the full
# text with a bunch of extra information attached to the tokens
doc = nlp(md_text)

In [9]:
# spacy stores things like parts of speech as integer tags
# we can use this reverse lookup table to get the string label
lookup: dict[int, str] = {y: x for x, y in spacy.symbols.IDS.items()}

In [10]:
# we can now build a list of all things that are like nouns for the full text
noun_alikes: list[str] = [x.text for x in doc if lookup[x.pos] in {"NOUN", "PROPN", "PRON"}]

In [11]:
# using pandas we build a series from the noun_alikes list
nouns: pd.Series = pd.Series(noun_alikes)
# that we can use to get the unique nouns and how many times they appear in the text
nouns_counts: pd.Series = nouns.value_counts()

In [12]:
nouns_counts

it                2187
I                 2074
he                1643
him               1054
that               863
                  ... 
one voyage           1
the logs             1
many hunters         1
given waters         1
another orphan       1
Name: count, Length: 27214, dtype: int64

In [13]:
# bigram_mod = gensim.models.phrases.Phraser()

In [28]:
[x for x in list(doc.sents)[0].as_doc()]

[CHAPTER, 1, .]

In [14]:
# culling the tokens to get rid of the punctuation tokens and stopwords.
tokens_filtered: list[str] = [
    token.lemma_ for token in doc if 
    not token.is_stop
    and not token.is_punct
    and not token.is_digit
]

In [35]:
texts: list[list[str]] = [
    [
        token.text for token in sentence.as_doc() if
        not token.is_stop
        and not token.is_punct
        and not token.is_digit
    ]
    for sentence in doc.sents
]

In [36]:
id2word = corpora.Dictionary(texts)

In [40]:
corpus = [id2word.doc2bow(text) for text in texts]

In [49]:
lda_model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=20, 
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True
)

In [50]:
doc_lda = lda_model[corpus]

In [51]:
lda_model.print_topics()

[(0,
  '0.046*"aloft" + 0.032*"high" + 0.013*"right" + 0.012*"leaning" + 0.011*"looking" + 0.007*"the bulwarks" + 0.002*"seated" + 0.002*"ships" + 0.000*"Ahab" + 0.000*"s"'),
 (1,
  '0.044*"way" + 0.023*"fixed" + 0.022*"all this" + 0.011*"view" + 0.010*"try" + 0.004*"needs" + 0.003*"my soul" + 0.002*"bringing" + 0.002*"knowing" + 0.002*"served"'),
 (2,
  '0.219*"like" + 0.024*"goes" + 0.007*"order" + 0.006*"jump" + 0.004*"sleep" + 0.000*"sleeps" + 0.000*"s" + 0.000*"Ahab" + 0.000*"Stubb" + 0.000*"upwards"'),
 (3,
  '0.044*"t" + 0.039*"long" + 0.026*"nigh" + 0.024*"thought" + 0.021*"sail" + 0.017*"years" + 0.017*"ago" + 0.009*"taking" + 0.008*"mind" + 0.005*"bear"'),
 (4,
  '0.029*"grow" + 0.021*"little" + 0.015*"pay" + 0.008*"the same time" + 0.008*"mean" + 0.000*"s" + 0.000*"smaller" + 0.000*"sir" + 0.000*"eight and forty hours" + 0.000*"the whale"'),
 (5,
  '0.038*"tell" + 0.000*"Ahab" + 0.000*"thee" + 0.000*"s" + 0.000*"muttered" + 0.000*"the whale" + 0.000*"swim" + 0.000*"these sha

In [53]:
import pyLDAvis as vis
import pyLDAvis.gensim_models as gensimvis
vis.enable_notebook()

In [56]:
vis.gensimvis.prepare(lda_model, corpus, )

TypeError: pyLDAvis._prepare.prepare() argument after ** must be a mapping, not LdaModel