## Import necessary modules

In [1]:
import gensim
from gensim import corpora

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

import string
from pprint import pprint

## Cleaning function

In [2]:
# English stopwords from the NLTK corpus, held in a set for fast membership tests.
STOP = set(stopwords.words("english"))
# Individual punctuation characters (string.punctuation) to strip from tokens.
PUNCTUATION = set(string.punctuation)
# One shared WordNet lemmatizer, reused by every call to clean().
lemma = WordNetLemmatizer()


def clean(doc):
    """Normalize a raw document into a space-separated string of lemmas.

    The text is lowercased and split on whitespace; each token is then

    1. dropped if it is a stopword as written (this catches stopwords that
       themselves contain punctuation, e.g. contractions such as "don't"),
    2. stripped of every character in ``PUNCTUATION``,
    3. dropped if nothing is left or if the stripped form is a stopword
       (e.g. "it." -> "it", which the first check alone would miss),
    4. lemmatized with the shared ``lemma`` instance.

    Args:
        doc: Raw document text.

    Returns:
        The surviving lemmas joined by single spaces ("" if none survive).
    """
    kept = []
    for token in doc.lower().split():
        if token in STOP:
            continue
        word = "".join(ch for ch in token if ch not in PUNCTUATION)
        # Re-check after stripping: a stopword glued to punctuation only
        # becomes recognizable once the punctuation is gone.
        if word and word not in STOP:
            kept.append(lemma.lemmatize(word))
    return " ".join(kept)

## Example

In [3]:
# Five short example documents used as the toy corpus.
doc1 = (
    "Sugar is bad to consume. "
    "My sister likes to have sugar, but not my father."
)
doc2 = (
    "My father spends a lot of time driving my sister around "
    "to dance practice."
)
doc3 = (
    "Doctors suggest that driving may cause increased stress "
    "and blood pressure."
)
doc4 = (
    "Sometimes I feel pressure to perform well at school, "
    "but my father never seems to drive my sister to do better."
)
doc5 = "Health experts say that Sugar is not good for your lifestyle."

# The full corpus, in document order.
doc_complete = [
    doc1,
    doc2,
    doc3,
    doc4,
    doc5,
]

In [4]:
# Clean every raw document, then split the normalized string into a token list.
doc_clean = [normalized.split() for normalized in map(clean, doc_complete)]

In [5]:
# Build the vocabulary: one integer id per unique token in the cleaned corpus.
dictionary = corpora.Dictionary(doc_clean)

# Convert each tokenized document to its bag-of-words representation.
doc_term_matrix = list(map(dictionary.doc2bow, doc_clean))

In [6]:
# Short alias for gensim's LDA model class.
LDA = gensim.models.ldamodel.LdaModel

# Fit a 3-topic model on the bag-of-words corpus.
# NOTE(review): no random_state is passed, so the learned topics are
# presumably not reproducible from run to run -- confirm if that matters.
ldamodel = LDA(
    doc_term_matrix,
    id2word=dictionary,
    num_topics=3,
    passes=50,
    iterations=50,
)

In [7]:
# Show all 3 topics, each summarized by 3 of its words with their weights.
# Left as a bare expression so the notebook displays the returned list.
ldamodel.print_topics(num_topics=3, num_words=3)

[(0, '0.099*"sugar" + 0.069*"sister" + 0.069*"father"'),
 (1, '0.029*"driving" + 0.029*"father" + 0.029*"sister"'),
 (2, '0.071*"pressure" + 0.041*"driving" + 0.041*"school"')]