In [29]:
# Import necessary libraries
import spacy
import gensim
from gensim import corpora
from pprint import pprint

# Load spaCy's English NLP model
nlp = spacy.load('en_core_web_sm')

In [30]:
documents = [
    "Machines and technology are evolving at a rapid pace as our society grows, and there is an ever-increasing need for more manpower, especially when it comes to manufacturing.",
    "Deep learning is a branch of machine learning where it teaches artificial intelligence (AI) to imitate the way a human gains knowledge.",
    "Deep learning performs nonlinear transformations to its input and uses what it learns to create a statistical model as output.",
    "Manufacturing is the backbone of industry, and thus requires quality control to make sure that the product has no defects.",
    "It is possible to improve detecting defects more accurately and efficiently using deep learning rather than relying on human sight which is prone to error."
]

<h2>Preprocess the documents</h2>

In [37]:
def preprocess(text):
    # Tokenize and lemmatize using spaCy
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]
    return tokens

# Apply preprocessing to all documents
processed_documents = [preprocess(doc) for doc in documents]

# Create a dictionary and corpus for LDA
dictionary = corpora.Dictionary(processed_documents)
corpus = [dictionary.doc2bow(doc) for doc in processed_documents]

# Build LDA model
lda_model = gensim.models.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=20)


<h2>Print topics and their keywords</h2>

In [38]:
pprint(lda_model.print_topics())

[(0,
  '0.033*"manufacturing" + 0.033*"pace" + 0.033*"evolve" + 0.033*"rapid" + '
  '0.033*"manpower" + 0.033*"technology" + 0.033*"society" + '
  '0.033*"especially" + 0.033*"need" + 0.033*"grow"'),
 (1,
  '0.019*"deep" + 0.019*"learning" + 0.019*"manufacturing" + 0.019*"defect" + '
  '0.019*"human" + 0.019*"machine" + 0.019*"require" + 0.019*"backbone" + '
  '0.019*"control" + 0.019*"product"'),
 (2,
  '0.038*"create" + 0.038*"output" + 0.038*"learn" + 0.038*"nonlinear" + '
  '0.038*"statistical" + 0.038*"transformation" + 0.038*"input" + '
  '0.038*"model" + 0.038*"use" + 0.038*"perform"'),
 (3,
  '0.019*"deep" + 0.019*"learning" + 0.019*"manufacturing" + 0.019*"defect" + '
  '0.019*"human" + 0.019*"machine" + 0.019*"require" + 0.019*"control" + '
  '0.019*"product" + 0.019*"industry"'),
 (4,
  '0.090*"learning" + 0.049*"deep" + 0.049*"human" + 0.049*"machine" + '
  '0.049*"knowledge" + 0.049*"intelligence" + 0.049*"branch" + 0.049*"teach" + '
  '0.049*"imitate" + 0.049*"AI"')]


<h2>Assign topics to documents</h2>

In [39]:
for i, doc in enumerate(processed_documents):
    print(f"Document {i+1} - Topic: {lda_model.get_document_topics(corpus[i])}")

Document 1 - Topic: [(0, 0.9427557), (1, 0.014294837), (2, 0.014321894), (3, 0.014294837), (4, 0.014332702)]
Document 2 - Topic: [(0, 0.013400027), (1, 0.0133385295), (2, 0.013370788), (3, 0.0133385295), (4, 0.94655216)]
Document 3 - Topic: [(0, 0.015444845), (1, 0.015393085), (2, 0.9382312), (3, 0.015393085), (4, 0.015537756)]
Document 4 - Topic: [(0, 0.020105876), (1, 0.020011099), (2, 0.9198672), (3, 0.020011099), (4, 0.020004647)]
Document 5 - Topic: [(0, 0.9425321), (1, 0.014294841), (2, 0.014390826), (3, 0.014294841), (4, 0.014487332)]
