In [1]:
import csv

# Toy corpus: five one-sentence documents that differ only in their ordinal.
text_data = [
    f"This is the {ordinal} text data."
    for ordinal in ("first", "second", "third", "fourth", "fifth")
]

# Persist the corpus as a single-column CSV: a "text" header followed by
# one row per document. newline="" lets the csv module control line endings.
with open("data.csv", "w", newline="") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["text"])
    writer.writerows([sentence] for sentence in text_data)


In [None]:


import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Optional: configure root logging so that only ERROR-level (and above)
# records are emitted, formatted as "time : level : message".
import logging
logging.basicConfig(
    level=logging.ERROR,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Silence every DeprecationWarning for the rest of the session.
import warnings
warnings.filterwarnings(action="ignore", category=DeprecationWarning)

# Load the spaCy English pipeline by its full package name. The bare 'en'
# shortcut link was removed in spaCy v3, where spacy.load('en') raises an
# OSError; the full name is accepted by both v2 and v3 as long as the model
# package is installed (python -m spacy download en_core_web_sm).
# The parser and NER components are disabled because only lemmas are used.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Define a function to preprocess the text data
def preprocess(text):
    """Turn one raw document into a list of lemmatized, stop-word-free tokens.

    Steps, in order: lowercase the text, delete every character that is
    neither a word character nor whitespace, split on whitespace, drop
    gensim stop words, and lemmatize each remaining word with the
    module-level spaCy pipeline ``nlp``.

    Args:
        text (str): Raw document text.

    Returns:
        list[str]: One lemma per surviving word, in the original order.
        If spaCy splits a word into several tokens, only the first
        token's lemma is kept.
    """
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    # Lowercase, then strip punctuation (anything that is not \w or \s)
    cleaned = re.sub(r'[^\w\s]', '', text.lower())
    # Tokenize on whitespace and discard stop words
    words = [word for word in cleaned.split() if word not in stop_words]
    # Lemmatize each word on its own, but stream them through nlp.pipe so the
    # pipeline processes them in batches instead of one nlp(word) call per
    # token; each word is still its own Doc, so the lemmas are unchanged.
    return [doc[0].lemma_ for doc in nlp.pipe(words)]

# Load the CSV written by the first cell. The previous placeholder path
# ("path/to/your/data.csv") does not exist, so a fresh Restart & Run All
# failed here with FileNotFoundError.
df = pd.read_csv("data.csv")
# Empty cells come back from pandas as NaN (a float with no .lower()), so
# drop them and coerce the rest to str before handing them to preprocess().
text_data = df['text'].dropna().astype(str).tolist()

# Preprocess the text data: one token list per document
processed_texts = [preprocess(text) for text in text_data]

# Build the token <-> id dictionary from the preprocessed documents, then
# encode every document as a bag of words via dictionary.doc2bow.
dictionary = corpora.Dictionary(processed_texts)
corpus = list(map(dictionary.doc2bow, processed_texts))

# Train the LDA model. Hyperparameters are gathered in one mapping so they
# can be read (and tuned) at a glance.
lda_params = dict(
    num_topics=10,
    random_state=100,      # fixed seed
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    **lda_params,
)

# Show the topics reported by the trained model
topics = ldamodel.print_topics()
pprint(topics)

# Score the model with the 'c_v' coherence measure, computed from the
# preprocessed token lists and the same dictionary used for training.
coherence_model_lda = CoherenceModel(
    model=ldamodel,
    texts=processed_texts,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

#
