# LDA DEMO

In [1]:
# Import necessary libraries
import spacy
import gensim
from gensim import corpora
from pprint import pprint

# Load spaCy's English NLP model once at module level; preprocess() below
# reuses this `nlp` pipeline for every document it tokenizes and lemmatizes.
nlp = spacy.load('en_core_web_sm')


# Sample documents for demonstration

In [2]:
# Toy corpus: each string is treated as one document.
documents = [
    "Natural language processing is a subfield of artificial intelligence.",
    "Latent Dirichlet Allocation is a generative probabilistic model.",
    "Topic modeling is used to identify topics present in a corpus of text.",
    "Gensim is a popular Python library for topic modeling and document similarity.",
]

# Preprocess the documents

In [3]:

def preprocess(text):
    """Return the lemmas of the tokens in ``text`` that survive filtering.

    The text is run through the module-level spaCy pipeline ``nlp``. A token
    is skipped when its ``is_stop`` flag is set or its ``is_alpha`` flag is
    not set; the ``lemma_`` of every remaining token is collected in
    document order.

    Args:
        text: Raw document text.

    Returns:
        A list of lemma strings (empty if no token passes the filter).
    """
    lemmas = []
    for token in nlp(text):
        # Guard clause: drop stop words and non-alphabetic tokens.
        if token.is_stop or not token.is_alpha:
            continue
        lemmas.append(token.lemma_)
    return lemmas

# Apply preprocessing to all documents
processed_documents = [preprocess(doc) for doc in documents]

# Create a dictionary and a bag-of-words corpus for LDA
dictionary = corpora.Dictionary(processed_documents)
corpus = [dictionary.doc2bow(doc) for doc in processed_documents]

# Tunable LDA settings, kept together so experiments only touch one place.
NUM_TOPICS = 3    # number of topics to fit
NUM_PASSES = 15   # training passes over the corpus
RANDOM_SEED = 42  # LDA training is stochastic; a fixed seed makes re-runs reproducible

# Build LDA model
lda_model = gensim.models.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=NUM_PASSES,
    random_state=RANDOM_SEED,
)

# Print topics and their keywords

In [4]:
# Pretty-print the fitted topics: each entry pairs a topic id with a string
# of weight*"word" terms for that topic.
pprint(lda_model.print_topics())

[(0,
  '0.061*"document" + 0.061*"popular" + 0.061*"Gensim" + 0.061*"library" + '
  '0.061*"similarity" + 0.061*"Python" + 0.061*"Allocation" + '
  '0.061*"generative" + 0.061*"Dirichlet" + 0.061*"Latent"'),
 (1,
  '0.095*"processing" + 0.095*"language" + 0.095*"natural" + '
  '0.095*"intelligence" + 0.095*"artificial" + 0.095*"subfield" + '
  '0.024*"modeling" + 0.024*"topic" + 0.024*"probabilistic" + 0.024*"model"'),
 (2,
  '0.157*"topic" + 0.089*"modeling" + 0.089*"text" + 0.089*"present" + '
  '0.089*"corpus" + 0.089*"identify" + 0.022*"Python" + 0.022*"library" + '
  '0.022*"similarity" + 0.022*"Gensim"')]


# Assign topics to documents

In [5]:
# Assign topics to documents: print each document's topic distribution as
# (topic id, probability) pairs. The model is queried with the bag-of-words
# vector, so iterate over `corpus` directly and number documents from 1.
for doc_number, bow in enumerate(corpus, start=1):
    print(f"Document {doc_number} - Topic: {lda_model.get_document_topics(bow)}")

Document 1 - Topic: [(0, 0.047823485), (1, 0.9042529), (2, 0.047923613)]
Document 2 - Topic: [(0, 0.90374076), (1, 0.048149187), (2, 0.048110057)]
Document 3 - Topic: [(0, 0.042918444), (1, 0.041926842), (2, 0.9151547)]
Document 4 - Topic: [(0, 0.9208642), (1, 0.037447672), (2, 0.0416881)]


#                   

# Mini Exercise hehe

Instructions:

Use the provided Python code to perform topic modeling on a set of sample documents.
Modify the sample documents or add your own to see how the results change.
Experiment with the number of topics (parameter: num_topics) in the LDA model. Observe how different numbers of topics impact the results.

Write a short insight on what you observed when you changed, increased, or decreased some of the parameters. (A short essay is enough.)