# LDA

In [1]:
# Import necessary libraries
import spacy
import gensim
from gensim import corpora
from pprint import pprint

# Load spaCy's English NLP model
# do this before running
# python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')

# Sample documents for demonstration

In [2]:
# documents = [
#    "Natural language processing is a subfield of artificial intelligence.",
#    "Latent Dirichlet Allocation is a generative probabilistic model.",
#    "Topic modeling is used to identify topics present in a corpus of text.",
#    "Gensim is a popular Python library for topic modeling and document similarity."
# ]

documents = [
    "The nice thing about Eventbrite is that it's free to use as long as you're not charging for the event. There is a fee if you are charging for the event -  2.5% plus a $0.99 transaction fee.",
    "We have the gold level plan and use it for everything, love the features! It is one of the best bang for buck possible.",
    "Machine learning is a branch of artificial intelligence (AI) and computer science which focuses on the use of data and algorithms to imitate the way that humans learn, gradually improving its accuracy. \n IBM has a rich history with machine learning. One of its own, Arthur Samuel, is credited for coining the term, “machine learning” with his research (link resides outside ibm.com) around the game of checkers. Robert Nealey, the self-proclaimed checkers master, played the game on an IBM 7094 computer in 1962, and he lost to the computer. Compared to what can be done today, this feat seems trivial, but its considered a major milestone in the field of artificial intelligence."
]

# Preprocess the documents

In [5]:
def preprocess(text):
    # Tokenize and lemmatize using spaCy
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]
    return tokens

# Apply preprocessing to all documents
processed_documents = [preprocess(doc) for doc in documents]

# Create a dictionary and corpus for LDA
dictionary = corpora.Dictionary(processed_documents)
corpus = [dictionary.doc2bow(doc) for doc in processed_documents]

# Build LDA model
lda_model_4 = gensim.models.LdaModel(corpus, num_topics=4, id2word=dictionary, passes=12)

lda_model_8 = gensim.models.LdaModel(corpus, num_topics=8, id2word=dictionary, passes=12)

lda_model_12 = gensim.models.LdaModel(corpus, num_topics=12, id2word=dictionary, passes=12)

# Print topics and their keywords

In [6]:
print('LDA Model 4 topics')
pprint(lda_model_4.print_topics())

LDA Model 4 topics
[(0,
  '0.075*"subfield" + 0.075*"natural" + 0.075*"artificial" + '
  '0.075*"intelligence" + 0.075*"language" + 0.075*"processing" + '
  '0.015*"Allocation" + 0.015*"model" + 0.015*"Dirichlet" + 0.015*"Latent"'),
 (1,
  '0.055*"modeling" + 0.055*"popular" + 0.055*"change" + 0.055*"spot" + '
  '0.055*"stay" + 0.055*"step" + 0.055*"Gensim" + 0.055*"similarity" + '
  '0.055*"library" + 0.055*"Python"'),
 (2,
  '0.095*"topic" + 0.053*"corpus" + 0.053*"text" + 0.053*"identify" + '
  '0.053*"present" + 0.053*"modeling" + 0.053*"generative" + '
  '0.053*"probabilistic" + 0.053*"Latent" + 0.053*"Dirichlet"'),
 (3,
  '0.047*"popular" + 0.047*"Dota" + 0.047*"gamplay" + 0.047*"Steam" + '
  '0.047*"friendly" + 0.047*"game" + 0.047*"difficult" + 0.047*"beginner" + '
  '0.047*"pc" + 0.047*"future"')]


In [7]:
print('LDA Model 8 topics')
pprint(lda_model_8.print_topics())

LDA Model 8 topics
[(0,
  '0.099*"probabilistic" + 0.099*"Allocation" + 0.099*"Latent" + '
  '0.099*"generative" + 0.099*"model" + 0.099*"Dirichlet" + 0.011*"future" + '
  '0.011*"thankful" + 0.011*"topic" + 0.011*"spot"'),
 (1,
  '0.153*"thankful" + 0.153*"future" + 0.017*"topic" + 0.017*"spot" + '
  '0.017*"stay" + 0.017*"change" + 0.017*"subfield" + 0.017*"modeling" + '
  '0.017*"popular" + 0.017*"processing"'),
 (2,
  '0.099*"natural" + 0.099*"artificial" + 0.099*"intelligence" + '
  '0.099*"language" + 0.099*"processing" + 0.099*"subfield" + 0.011*"future" + '
  '0.011*"thankful" + 0.011*"topic" + 0.011*"spot"'),
 (3,
  '0.023*"future" + 0.023*"topic" + 0.023*"spot" + 0.023*"thankful" + '
  '0.023*"stay" + 0.023*"Dirichlet" + 0.023*"subfield" + 0.023*"step" + '
  '0.023*"processing" + 0.023*"language"'),
 (4,
  '0.172*"topic" + 0.091*"modeling" + 0.091*"identify" + 0.091*"text" + '
  '0.091*"present" + 0.091*"corpus" + 0.010*"future" + 0.010*"thankful" + '
  '0.010*"stay" + 0.010*

In [8]:
print('LDA Model 12 topics')
pprint(lda_model_12.print_topics())

LDA Model 12 topics
[(0,
  '0.023*"thankful" + 0.023*"Allocation" + 0.023*"step" + 0.023*"future" + '
  '0.023*"Latent" + 0.023*"topic" + 0.023*"spot" + 0.023*"modeling" + '
  '0.023*"subfield" + 0.023*"intelligence"'),
 (1,
  '0.143*"step" + 0.143*"stay" + 0.143*"spot" + 0.143*"change" + '
  '0.011*"thankful" + 0.011*"modeling" + 0.011*"future" + 0.011*"topic" + '
  '0.011*"artificial" + 0.011*"friendly"'),
 (2,
  '0.113*"model" + 0.113*"probabilistic" + 0.113*"generative" + '
  '0.113*"Allocation" + 0.113*"Latent" + 0.113*"Dirichlet" + 0.009*"thankful" '
  '+ 0.009*"modeling" + 0.009*"topic" + 0.009*"future"'),
 (3,
  '0.023*"thankful" + 0.023*"future" + 0.023*"step" + 0.023*"topic" + '
  '0.023*"intelligence" + 0.023*"modeling" + 0.023*"spot" + 0.023*"change" + '
  '0.023*"beginner" + 0.023*"MOBA"'),
 (4,
  '0.062*"Gensim" + 0.062*"library" + 0.062*"similarity" + 0.062*"document" + '
  '0.062*"Python" + 0.062*"intelligence" + 0.062*"artificial" + '
  '0.062*"subfield" + 0.062*"langu

# Assign topics to documents

In [9]:
# Assign topics to documents
for i, doc in enumerate(processed_documents):
    print(f"Document {i+1} - Topic: {lda_model_4.get_document_topics(corpus[i])}")

Document 1 - Topic: [(0, 0.89277446), (1, 0.035743766), (2, 0.035742547), (3, 0.035739247)]
Document 2 - Topic: [(0, 0.035774115), (1, 0.035756335), (2, 0.8927196), (3, 0.03574993)]
Document 3 - Topic: [(0, 0.031292062), (1, 0.031857412), (2, 0.90557444), (3, 0.031276066)]
Document 4 - Topic: [(0, 0.027820146), (1, 0.9156434), (2, 0.028551973), (3, 0.0279845)]
Document 5 - Topic: [(0, 0.016696254), (1, 0.01677307), (2, 0.016687347), (3, 0.94984335)]
Document 6 - Topic: [(0, 0.08350026), (1, 0.08345448), (2, 0.08344903), (3, 0.74959624)]
Document 7 - Topic: [(0, 0.050079986), (1, 0.84981567), (2, 0.05005538), (3, 0.050049033)]


In [10]:
# Assign topics to documents
for i, doc in enumerate(processed_documents):
    print(f"Document {i+1} - Topic: {lda_model_8.get_document_topics(corpus[i])}")

Document 1 - Topic: [(0, 0.017857186), (1, 0.017857194), (2, 0.87499964), (3, 0.017857201), (4, 0.017857186), (5, 0.017857201), (6, 0.017857185), (7, 0.017857183)]
Document 2 - Topic: [(0, 0.87499964), (1, 0.017857194), (2, 0.017857186), (3, 0.017857201), (4, 0.017857186), (5, 0.017857201), (6, 0.017857185), (7, 0.017857183)]
Document 3 - Topic: [(0, 0.015625035), (1, 0.015625041), (2, 0.015625035), (3, 0.015625047), (4, 0.8906187), (5, 0.015625047), (6, 0.01563109), (7, 0.015625032)]
Document 4 - Topic: [(0, 0.013888942), (1, 0.01388895), (2, 0.013888942), (3, 0.01388896), (4, 0.013905542), (5, 0.01388896), (6, 0.90275776), (7, 0.013891902)]
Document 5 - Topic: [(7, 0.9416651)]
Document 6 - Topic: [(0, 0.041666735), (1, 0.70833284), (2, 0.041666735), (3, 0.04166676), (4, 0.041666735), (5, 0.04166676), (6, 0.04166673), (7, 0.041666728)]
Document 7 - Topic: [(0, 0.025000101), (1, 0.025000116), (2, 0.025000101), (3, 0.02500013), (4, 0.025000097), (5, 0.02500013), (6, 0.8249992), (7, 0.02

In [11]:
# Assign topics to documents
for i, doc in enumerate(processed_documents):
    print(f"Document {i+1} - Topic: {lda_model_8.get_document_topics(corpus[i])}")

Document 1 - Topic: [(0, 0.017857186), (1, 0.017857194), (2, 0.87499964), (3, 0.017857201), (4, 0.017857186), (5, 0.017857201), (6, 0.017857185), (7, 0.017857183)]
Document 2 - Topic: [(0, 0.87499964), (1, 0.017857192), (2, 0.017857185), (3, 0.0178572), (4, 0.017857185), (5, 0.0178572), (6, 0.01785718), (7, 0.017857179)]
Document 3 - Topic: [(0, 0.015625035), (1, 0.015625041), (2, 0.015625035), (3, 0.015625047), (4, 0.8906192), (5, 0.015625047), (6, 0.015630549), (7, 0.015625032)]
Document 4 - Topic: [(0, 0.013888942), (1, 0.01388895), (2, 0.013888942), (3, 0.01388896), (4, 0.013905821), (5, 0.01388896), (6, 0.90275747), (7, 0.013891902)]
Document 5 - Topic: [(7, 0.9416651)]
Document 6 - Topic: [(0, 0.041666735), (1, 0.70833284), (2, 0.041666735), (3, 0.04166676), (4, 0.041666735), (5, 0.04166676), (6, 0.04166673), (7, 0.041666728)]
Document 7 - Topic: [(0, 0.025000101), (1, 0.025000116), (2, 0.025000101), (3, 0.02500013), (4, 0.025000097), (5, 0.02500013), (6, 0.8249992), (7, 0.025000