In [7]:
from pprint import pprint
from collections import defaultdict
from gensim import corpora
from gensim import models

In [8]:
text_corpus = [
   "Natural language processing is the process of converting a natural language into a computer language.",
   "Machine learning is the process of learning from data.",
   "Machine learning is the process of applying statistical algorithms to the data.",
   
]

In [9]:
def tokenize(corpus):
    stoplist = set('for a of the and to in'.split(' '))
    texts = [[word for word in document.lower().split() if word not in stoplist] for document in corpus]
    return texts

In [10]:
def get_frequency(texts):  
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    return frequency

In [11]:
texts = tokenize(text_corpus)
frequency = get_frequency(texts)
processed_corpus = [[token for token in text if frequency[token] > 1] for text in texts]

dictionary = corpora.Dictionary(processed_corpus)
corpus = [dictionary.doc2bow(text) for text in processed_corpus]

pprint(corpus)

[[(0, 1), (1, 2), (2, 2), (3, 1)],
 [(0, 1), (3, 1), (4, 1), (5, 2), (6, 1)],
 [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1)]]


In [13]:
tfidf = models.TfidfModel(corpus) 

corpus_tfidf = tfidf[corpus]
'''for doc in corpus_tfidf:
    print(doc)'''

lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=1)
corpus_lsi = lsi_model[corpus_tfidf]

lsi_model.print_topics(1)

[(0,
  '-0.707*"learning" + -0.500*"data." + -0.500*"machine" + 0.000*"is" + 0.000*"language" + -0.000*"natural" + 0.000*"process"')]