In [93]:
import pandas as pd
import random
import spacy
import gensim
import pyLDAvis.gensim_models
from gensim import models, corpora
from gensim import similarities
from gensim.models.coherencemodel import CoherenceModel

In [94]:
nlp = spacy.load("en_core_web_md")

In [95]:
balzac_text = open("documents/HonoreDeBalzac.txt").read()
alice_text = open("documents/AliceBrown.txt").read()
chesterton_text = open("documents/Chesterton.txt").read()

# Create SpaCy documents
balzac_doc = nlp(balzac_text)
alice_doc = nlp(alice_text)
chesterton_doc = nlp(chesterton_text)

# Pre-processing function for SpaCy docs
The preprocess_and_filter function is designed to enhance the pre-processing of SpaCy documents by combining lemmatization and filtering based on linguistic characteristics. It removes stop words, punctuation, and non-alphabetic tokens while maintaining only those classified as nouns, verbs, or adjectives. This function is particularly useful for refining textual data in a way that retains essential linguistic information, ensuring that only meaningful words contribute to subsequent analyses. By employing this function on SpaCy documents, users can achieve cleaner and more focused text representations, conducive to various natural language processing tasks.

In [96]:
def preprocess_and_filter(doc):
    # Perform pre-processing and filtering tasks
    processed_text = " ".join(token.lemma_ for token in doc if
                              not token.is_stop and
                              token.is_alpha and
                              not token.is_punct and
                              not token.is_space and
                              token.pos_ in ['NOUN', 'VERB', 'ADJ'])
    return processed_text


# Apply preprocessing and filtering to each document
processed_balzac = preprocess_and_filter(balzac_doc)
processed_alice = preprocess_and_filter(alice_doc)
processed_chesterton = preprocess_and_filter(chesterton_doc)

In [97]:
print("Processed Balzac Text:", processed_balzac)



# Bag-of-Words Representation
* A `corpora.Dictionary` is created to represent the vocabulary of the processed documents. The dictionary is constructed by combining the pre-processed texts of three documents: `HonoreDeBalzac`, `AliceBrown`, and `Chesterton`.
* The resulting corpus_bow_pos is a list of BoW representations for each document, allowing for further analysis using topic modeling techniques.

In [98]:
dictionary = corpora.Dictionary([processed_balzac.split(),
                                 processed_alice.split(),
                                 processed_chesterton.split()])

corpus_bow_pos = [dictionary.doc2bow(article.split()) for article in
                  [processed_balzac, processed_alice, processed_chesterton]]

In [99]:

len(dictionary)

2373

# Latent Dirichlet Allocation
* An LDA topic model is trained on the Bag-of-Words (BoW) corpus (`corpus_bow_pos`) using the Gensim library. The `ldamodel.LdaModel` function is employed to create the model, with the specified parameters, including the number of topics (`num_topics=5`).


In [100]:
lda_model = models.ldamodel.LdaModel(corpus=corpus_bow_pos,
                                     id2word=dictionary,
                                     num_topics=5,
                                     random_state=1)

In [101]:
lda_model.print_topics()

[(0,
  '0.008*"man" + 0.007*"say" + 0.006*"old" + 0.005*"know" + 0.005*"go" + 0.005*"look" + 0.005*"little" + 0.005*"come" + 0.004*"time" + 0.004*"day"'),
 (1,
  '0.009*"man" + 0.009*"say" + 0.009*"old" + 0.005*"little" + 0.005*"day" + 0.005*"time" + 0.005*"think" + 0.005*"lie" + 0.005*"come" + 0.004*"look"'),
 (2,
  '0.009*"man" + 0.007*"say" + 0.006*"old" + 0.005*"come" + 0.005*"know" + 0.005*"look" + 0.005*"go" + 0.004*"time" + 0.004*"day" + 0.004*"little"'),
 (3,
  '0.011*"man" + 0.007*"old" + 0.006*"say" + 0.006*"little" + 0.006*"know" + 0.005*"come" + 0.005*"go" + 0.004*"lie" + 0.004*"think" + 0.003*"word"'),
 (4,
  '0.009*"man" + 0.008*"go" + 0.007*"say" + 0.007*"old" + 0.006*"day" + 0.005*"come" + 0.005*"know" + 0.005*"little" + 0.004*"hand" + 0.004*"time"')]

To aid in the interpretation of the various topics we have used the pyLDAvis library to visualize the topics and their associated terms. The size of the circles represents the prevalence of the topic in the corpus. The distance between the circles represents the similarity between the topics. The bars on the right represent the frequency of the terms in the corpus. The red bars represent the frequency of the term in the topic, while the blue bars represent the frequency of the term in the corpus.

In [102]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus_bow_pos, dictionary)
pyLDAvis.display(vis)

# Latent Semantic Indexing

In [103]:
lsa_model = models.LsiModel(corpus=corpus_bow_pos, id2word=dictionary, num_topics=5)


  sparsetools.csc_matvecs(
  out = (1 - tri(m.shape[0], m.shape[1], k - 1, m.dtype.char)) * m
  out = (1 - tri(m.shape[0], m.shape[1], k - 1, m.dtype.char)) * m
  out = (1 - tri(m.shape[0], m.shape[1], k - 1, m.dtype.char)) * m


In [104]:
lsa_model.print_topics()

[(0,
  '-0.315*"man" + -0.254*"say" + -0.236*"old" + -0.182*"go" + -0.175*"know" + -0.164*"come" + -0.160*"little" + -0.146*"day" + -0.141*"look" + -0.135*"time"'),
 (1,
  '0.209*"man" + -0.194*"go" + 0.186*"ear" + 0.170*"think" + 0.170*"table" + -0.134*"day" + 0.132*"wig" + 0.129*"family" + 0.126*"priest" + -0.125*"time"'),
 (2,
  '-0.255*"lie" + -0.188*"little" + -0.156*"grave" + -0.152*"year" + -0.151*"hand" + 0.117*"son" + -0.115*"set" + -0.114*"child" + 0.114*"room" + 0.108*"cry"')]

* A sample query is crafted to demonstrate the process of finding the document most akin to the provided text. The query, "The child went to the store to buy some candy," is tokenized and converted into a Bag-of-Words (BoW) representation using the dictionary established during pre-processing.

In [105]:
query_text = "The child went to the store to buy some candy."
query_bow = dictionary.doc2bow(query_text.split())


# Similarity Queries
* Cosine similarity scores are computed between the BoW representation of the query and each document in the corpus. The cossim function from the Gensim library facilitates this calculation. Both LSA and LDA models are employed to obtain similarity scores.
* The document index with the highest similarity score is determined for both LSA and LDA models. The max function, along with the range and lambda functions, is utilized for this purpose.
* In this case, HonoreDeBalzac is the first document (index 0), AliceBrown is the second document (index 1), and Chesterton is the third document (index 2).

In [106]:

# Get the similarity scores between the query and each document 
lsa_similarity_scores = [gensim.matutils.cossim(query_bow, doc_bow) for doc_bow in corpus_bow_pos]

lda_similarity_scores = [gensim.matutils.cossim(query_bow, doc_bow) for doc_bow in lda_model[corpus_bow_pos]]

# Identify the document with the highest similarity for each method
lsa_most_similar_doc = max(range(len(lsa_similarity_scores)), key=lambda i: lsa_similarity_scores[i])
lda_most_similar_doc = max(range(len(lda_similarity_scores)), key=lambda i: lda_similarity_scores[i])

print(f"LSA Most Similar Document: {lsa_most_similar_doc}")
print(f"LDA Most Similar Document: {lda_most_similar_doc}")


LSA Most Similar Document: 1
LDA Most Similar Document: 0
