In [None]:
import json
import os
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in ``model``.

    For each row of ``model.components_`` one line of the form
    ``Topic #<index>: <word> <word> ...`` is written to stdout, with the
    words ordered from largest weight to smallest.  A single blank line is
    printed after the last topic.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays that support ``argsort()``.
        feature_names: Indexable mapping from a column index to its word.
        n_top_words: How many words to show for each topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort() is ascending: flip it, then keep the leading entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        top_words = [feature_names[col] for col in ranked]
        print("Topic #%d: " % topic_idx + " ".join(top_words))
    print()

In [None]:
def read_text(directory):
    """Yield the concatenated text of every JSON document under ``directory``.

    The tree is walked recursively in sorted order so the sequence of
    documents is the same on every run and every filesystem.  Each file is
    parsed as UTF-8 JSON; for every entry of its ``'body'`` list the
    ``'title'`` and ``'text'`` values are collected, and the pieces are
    joined with newlines into one string per file.

    Files that cannot be decoded as UTF-8 or are not valid JSON are skipped.

    Args:
        directory: Root directory to search.

    Yields:
        str: One string per successfully parsed file.

    Raises:
        KeyError: If a parsed document has no ``'body'`` key, or one of its
            sections lacks ``'title'`` or ``'text'``.
    """
    for root, dirs, files in os.walk(directory):
        # Sorting in place makes os.walk descend in a deterministic order.
        dirs.sort()
        for file in sorted(files):
            file_name = os.path.join(root, file)
            try:
                # JSON is UTF-8 by specification; do not rely on the locale.
                with open(file_name, 'rt', encoding='utf-8') as f:
                    data = json.load(f)
            except (json.JSONDecodeError, UnicodeDecodeError):
                # Not a JSON text file (malformed or binary): ignore it.
                continue

            text = []
            for section in data['body']:
                text.append(section['title'])
                text.append(section['text'])
            # Join with a separator so the last word of one piece does not
            # fuse with the first word of the next during tokenisation.
            # The file is already closed by the time the caller gets control.
            yield '\n'.join(text)

In [None]:
# Location of the JSON corpus, asked for interactively.
path = input("Data directory:")

# Materialise the generator: the vectorizer needs the whole corpus at once.
data_samples = list(read_text(path))
n_samples = len(data_samples)
if n_samples == 0:
    # Fail early with a clear message instead of scikit-learn's
    # "empty vocabulary" error further down.
    raise ValueError("No readable JSON documents found under %r" % path)

n_features = 1000    # vocabulary size kept by the vectorizer
n_components = 10    # number of topics to fit
n_top_words = 20     # words printed per topic

# Raw term counts (LDA expects counts, not tf-idf weights).  Terms that
# appear in more than 95% of documents or in fewer than 2 are dropped.
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words='english')
tf = tf_vectorizer.fit_transform(data_samples)

# random_state is fixed so repeated runs on the same corpus agree.
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
lda.fit(tf)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer the replacement and fall back only on old installations.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tf_feature_names = tf_vectorizer.get_feature_names_out()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)