In [1]:
# Install the topic-modelling dependencies into the kernel's environment.
# The first command pins numpy (<2), scipy, gensim, pyLDAvis and spaCy.
%pip install -q "numpy<2" \
    scipy==1.12.0 \
    gensim==4.3.1 \
    pyLDAvis==3.4.1 \
    spacy==3.7.2
# NOTE(review): matplotlib is installed without a version pin (and without -q),
# unlike the packages above — consider pinning it for reproducibility.
%pip install matplotlib

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import sys
import os
import string
import time
import multiprocessing as mp

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import spacy

import gensim
from gensim import corpora, models
from gensim.models import Phrases, LdaModel, LdaMulticore, CoherenceModel
from gensim.corpora import Dictionary

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Switch on pyLDAvis' notebook integration so prepared visualisations can be
# displayed directly in notebook output cells.
pyLDAvis.enable_notebook()

In [15]:
# CPU count reported by multiprocessing. compute_coherence_values passes
# num_processors - 1 as the LdaMulticore worker count, so the number printed
# here is the CPU count, not the exact number of LDA workers.
num_processors = mp.cpu_count()
print(f'Using {num_processors} workers')

Using 11 workers


In [3]:
# Read the data from the Parquet file; its "text_clean" column is what gets
# passed to clean_text further down.
df = pd.read_parquet('clean1.parquet')

In [5]:
# Use the function from Nick for cleaning the data
def clean_text(corpus, spacy_pipeline, as_string=True, n_proc=2, batch_size=200,
               allowed_pos=("NOUN", "VERB")):
    """
    Cleans a sequence of text by applying some simple processing techniques.

    Every document is streamed through ``spacy_pipeline.pipe`` (with the
    "ner" component disabled) and reduced to the lower-cased lemmas of the
    tokens that are not stop words, are purely alphabetic, are longer than
    one character and whose ``pos_`` tag is one of ``allowed_pos``.

    Args:
        corpus (Iterable): a sequence of text to be processed
        spacy_pipeline: the Spacy pipeline object for processing text
        as_string (bool): if True each cleaned document is returned as one
            space-joined string, otherwise as a list of token strings
        n_proc (int): the number of processors to use for parallel processing
        batch_size (int): the number of texts to process in a single batch
        allowed_pos (Iterable[str] | str | None): part-of-speech tags to
            keep. Defaults to nouns and verbs. A single tag may be given as a
            plain string; ``None`` disables the part-of-speech filter.

    Returns:
        corpus_clean (list): one entry per input document, in input order;
            a string when ``as_string`` is True, else a list of strings
    """
    # Normalise the POS filter once, outside the per-token loop. A bare string
    # is wrapped so that "NOUN" is not treated as the characters N, O, U, N.
    if allowed_pos is None:
        pos_filter = None
    elif isinstance(allowed_pos, str):
        pos_filter = frozenset((allowed_pos,))
    else:
        pos_filter = frozenset(allowed_pos)

    # container to store cleaned documents
    corpus_clean = []

    for doc in spacy_pipeline.pipe(
        corpus,
        disable=["ner"],
        n_process=n_proc,
        batch_size=batch_size
    ):

        # container for cleaned document tokens
        doc_tokens = [
            token.lemma_.lower()
            for token in doc
            if not token.is_stop
            and token.is_alpha
            and (len(token) > 1)
            and (pos_filter is None or token.pos_ in pos_filter)
        ]

        if as_string:
            corpus_clean.append(" ".join(doc_tokens))
        else:
            corpus_clean.append(doc_tokens)

    return corpus_clean

In [None]:
# Load the spaCy model, downloading it only when it is not installed yet so
# that re-running the notebook does not hit the network every time.
try:
    nlp = spacy.load("en_core_web_md")
except OSError:
    # spacy.load raises OSError when the model package cannot be found.
    spacy.cli.download("en_core_web_md")
    nlp = spacy.load("en_core_web_md")

In [7]:
# Run the spaCy cleaning step over the "text_clean" column, keeping every
# document as a list of tokens (as_string=False) and using 4 processes.
normalized_corpus = clean_text(
    df["text_clean"].tolist(),
    nlp,
    n_proc=4,
    as_string=False,
)

Python(88369) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(88370) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(88372) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(88373) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(88374) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


In [9]:
# Write the cleaned token lists to 'clean_lda.parquet'.
# NOTE(review): pd.DataFrame on a list of token lists presumably yields one row
# per document and one column per token position (shorter documents padded
# with missing values) — confirm this layout is what any reader of the file
# expects.
df_clean = pd.DataFrame(normalized_corpus)
df_clean.to_parquet('clean_lda.parquet')

In [10]:
# Create the gensim dictionary from the tokenised documents
dictionary = Dictionary(normalized_corpus)

# Filter out rare or very common words: no_below is set to 1% of the number of
# documents and no_above to 0.5.
# NOTE(review): no_below is passed as a float here; gensim documents it as an
# int document count — confirm the float threshold behaves as intended.
dictionary.filter_extremes(no_below=0.01*len(normalized_corpus), no_above=0.5)

In [14]:
# Convert every tokenised document into its bag-of-words representation
corpus = list(map(dictionary.doc2bow, normalized_corpus))

In [16]:
# Define Nick's function for calculating coherence score
def compute_coherence_values(corpus, dictionary, k, a, b, texts=None, workers=None):
    """
    Train an LdaMulticore model and return its c_v coherence score.

    Args:
        corpus: bag-of-words corpus passed to LdaMulticore as ``corpus``
        dictionary: gensim dictionary, used as ``id2word`` for the model and
            as ``dictionary`` for the coherence computation
        k (int): number of topics
        a: value passed to LdaMulticore as ``alpha``
            (LdaMulticore does not implement ``alpha='auto'``)
        b: value passed to LdaMulticore as ``eta``
        texts: tokenised documents used by CoherenceModel. Defaults to the
            notebook-level ``normalized_corpus`` when omitted.
        workers (int): number of LdaMulticore worker processes. Defaults to
            the notebook-level ``num_processors`` minus one, but never fewer
            than one.

    Returns:
        float: the coherence value from ``CoherenceModel.get_coherence()``
    """
    # Fall back to the notebook globals so existing five-argument calls keep
    # their behaviour.
    if texts is None:
        texts = normalized_corpus
    if workers is None:
        # On a single-CPU machine num_processors - 1 would be 0, which is not
        # a valid size for the worker pool; clamp to at least one worker.
        workers = max(1, num_processors - 1)

    lda_model = LdaMulticore(corpus=corpus,
                             id2word=dictionary,
                             num_topics=k,
                             random_state=100,  # fixed seed for repeatable runs
                             passes=10,
                             alpha=a,
                             eta=b,
                             workers=workers)

    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=texts,
                                         dictionary=dictionary,
                                         coherence='c_v')

    return coherence_model_lda.get_coherence()