In [17]:
from pprint import pprint
from collections import defaultdict
from gensim import corpora
from gensim import models
from gensim.test.utils import simple_preprocess
from gensim.corpora.dictionary import Dictionary
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [18]:
def get_text_from_file(path, encoding=None):
    """Read a text file and return one list of preprocessed tokens per line.

    Every line is tokenized with NLTK's ``word_tokenize``, tokens whose
    lowercase form is an English stop word are dropped, and the remaining
    tokens are normalized by gensim's ``simple_preprocess`` (``deacc=True``).

    Parameters
    ----------
    path : str or path-like
        File to read; each line is treated as one document.
    encoding : str, optional
        Encoding forwarded to ``open``. ``None`` (the default) keeps the
        platform default, which is what the function used before this
        parameter existed.

    Returns
    -------
    list of list of str
        One entry per line of the file, in file order.
    """
    stop_words = set(stopwords.words('english'))
    documents = []
    # The with-block closes the file on exit, so no explicit close() is needed.
    with open(path, 'r', encoding=encoding) as f:
        for row in f:
            kept = [w for w in word_tokenize(row) if w.lower() not in stop_words]
            # Re-join the tokens as plain text rather than using str(list):
            # the list repr escapes non-printable characters (a BOM becomes
            # the literal text "\ufeff"), and simple_preprocess would then
            # pick those escape sequences up as part of a token.
            documents.append(simple_preprocess(' '.join(kept), deacc=True))
    return documents

In [19]:
def tokenize(corpus):
    """Lowercase and whitespace-split each document, dropping a fixed stop list.

    Parameters
    ----------
    corpus : iterable of str
        Raw documents.

    Returns
    -------
    list of list of str
        One token list per document, in input order. Tokens are split on
        whitespace only, so attached punctuation stays part of the token.
    """
    ignored = set('for a of the and to in'.split(' '))
    tokenized = []
    for document in corpus:
        kept = []
        for word in document.lower().split():
            if word in ignored:
                continue
            kept.append(word)
        tokenized.append(kept)
    return tokenized

In [20]:
def get_frequency(texts):
    """Count how often each token occurs across all tokenized documents.

    Parameters
    ----------
    texts : iterable of iterable of hashable
        Tokenized documents.

    Returns
    -------
    collections.defaultdict
        Maps token -> total number of occurrences; unseen tokens read as 0.
    """
    counts = defaultdict(int)
    # Flatten the documents into a single token stream and tally it.
    for token in (tok for doc in texts for tok in doc):
        counts[token] += 1
    return counts

In [21]:
# Build an LSI topic model from the preprocessed lines of ../topics.txt.
# `text` holds one token list per line of the file.
text = get_text_from_file('../topics.txt')
# Dictionary built from the token lists; used below to encode each line.
common_dictionary = Dictionary(text)
# Encode every line as a bag-of-words vector through the dictionary.
common_corpus = [common_dictionary.doc2bow(line) for line in text]

# Fit a 3-topic LSI model on the raw bag-of-words corpus (no TF-IDF weighting
# in this cell) and pretty-print its topics.
# NOTE(review): `lsi_model` is rebound by a later cell, so this 3-topic model
# is lost once that cell runs.
lsi_model = models.LsiModel(common_corpus, id2word = common_dictionary, num_topics=3)
pprint(lsi_model.print_topics(3))

[(0,
  '0.463*"first" + 0.220*"messi" + 0.203*"barcelona" + 0.202*"goal" + '
  '0.202*"team" + 0.193*"played" + 0.175*"goals" + 0.159*"may" + '
  '0.153*"scored" + 0.138*"season"'),
 (1,
  '-0.383*"th" + -0.274*"subspecies" + -0.233*"vary" + -0.233*"populations" + '
  '-0.221*"described" + -0.214*"within" + -0.191*"length" + -0.191*"size" + '
  '-0.191*"fur" + -0.191*"patterns"'),
 (2,
  '0.455*"throw" + 0.455*"free" + 0.395*"percentage" + 0.270*"season" + '
  '0.184*"career" + 0.156*"shooter" + 0.156*"high" + -0.144*"first" + '
  '0.144*"end" + 0.137*"curry"')]


In [22]:
# NOTE(review): `text_corpus` is not defined in any cell visible here —
# presumably a list of raw document strings from an earlier cell; confirm it
# exists after Restart Kernel -> Run All.
texts = tokenize(text_corpus)
frequency = get_frequency(texts)
# Keep only tokens that occur more than once across the whole corpus.
processed_corpus = [[token for token in text if frequency[token] > 1] for text in texts]

# Build a dictionary from the remaining tokens, then encode each document as
# a bag-of-words vector.
dictionary = corpora.Dictionary(processed_corpus)
corpus = [dictionary.doc2bow(text) for text in processed_corpus]

pprint(corpus)

[[(0, 1), (1, 2), (2, 2), (3, 1)],
 [(0, 1), (3, 1), (4, 1), (5, 2), (6, 1)],
 [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1)]]


In [23]:
# Re-weight the bag-of-words corpus with TF-IDF before fitting LSI.
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

# Fit a single-topic LSI model on the TF-IDF vectors and project the corpus
# through it.
# NOTE(review): this rebinds `lsi_model`, replacing the 3-topic model fitted
# on ../topics.txt in the earlier cell.
lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=1)
corpus_lsi = lsi_model[corpus_tfidf]

# Last expression of the cell: displayed as the cell output.
lsi_model.print_topics(1)

[(0,
  '-0.707*"learning" + -0.500*"data." + -0.500*"machine" + 0.000*"is" + -0.000*"natural" + -0.000*"language" + -0.000*"process"')]