In [185]:
from nltk.tokenize import sent_tokenize, word_tokenize, WordPunctTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.datasets import make_multilabel_classification
from nltk.corpus import stopwords

I imported all the necessary tools such as:  <br/>
- sent_tokenize, word_tokenize, WordPunctTokenizer for breaking text string into list of sentences or words.
- WordNetLemmatizer, PorterStemmer, LancasterStemmer, SnowballStemmer for converting words to it's root form
- LatentDirichletAllocation for training and calculating the top 5 words in each topic

In [186]:
stop_words = stopwords.words('english')
stop_words.append('.')
stop_words.append(',')
stop_words.append('\'s')
stop_words.append('\'\'')
stop_words.append('``')

I use nltk.corpus.stopwords to generate stop words. I also decided to add '.', ',', and ''s' becuase the stop word does not have it. 

In [187]:
input_text = []
count = 0
for line in open('data.txt', 'r').read().split('\n'):
    count += 1
    if line != '':
        input_text.append(line)

Read data.txt and store each lines in the text file into a list and ignoring the line with no text.

In [188]:
word_dict = {}
for index, line in enumerate(input_text):
    word_dict[index] = word_tokenize(line)

Use nltk.tokenize.word_tokenize to break the string text to words for each line and store word_dict with each line numbers as keys.

In [189]:
stem_word_dict = {}
for line in word_dict:
    stem_word_dict[line] = []
    for i in word_dict[line]:
        stem_word_dict[line].append(SnowballStemmer('english').stem(i))

I use nltk.stem.SnowballStemmer to convert words into it's root form this time with SnowballStemmer method instead of having multiple words with the same meaning but are in a different form such as eat, eating, ate, eaten, etc. SnowballStemmer is 'the most reasonable one' but it takes a long time to process and perform task. Since the word for stem is not long I decided to use it.

In [190]:
num = 2
bow = {}
for i in range(2):
    for w in stem_word_dict[i]:
        if w not in bow and w not in stop_words:
            bow[w] = 0

Create a dictionary for bag of word method from the first 2 lines of code

In [191]:
bow_dict = {}
for line in word_dict:
    bow_dict[line] = bow.copy()

For each list of words I made a copy of the bag of word I created and store them in bow_dict.

In [192]:
for w in stem_word_dict:
    for ww in stem_word_dict[w]:
        if ww in bow_dict[w]:
            bow_dict[w][ww] += 1

I count all occurence of each topics that are in bow_dict

In [193]:
X = []
for w in bow_dict:
    X.append(list(bow_dict[w].values()))

I convert all values bow_dict to list

In [194]:
lda = LatentDirichletAllocation(n_components=5)
lda.fit(X)

LatentDirichletAllocation(n_components=5)

I called LatentDirichletAllocation. I set the number of conponents to 5 for top 5 words in each topics.

In [195]:
words = list(bow.keys())
top = 5
www = []
print('Top 5 words of each sentence are:')
for n, topic in enumerate(lda.components_):
    sort_topic = topic.argsort()
    for i in range(5):
        www.append(words[sort_topic[(-1) - i]])
        print('Topic', n + 1, ':', words[sort_topic[(-1) - i]])

Top 5 words of each sentence are:
Topic 1 : food
Topic 1 : tri
Topic 1 : mother
Topic 1 : chang
Topic 1 : said
Topic 2 : angela
Topic 2 : lunch
Topic 2 : plain
Topic 2 : croissant
Topic 2 : tri
Topic 3 : food
Topic 3 : like
Topic 3 : tri
Topic 3 : sinc
Topic 3 : lunch
Topic 4 : food
Topic 4 : croissant
Topic 4 : plain
Topic 4 : pasta
Topic 4 : like
Topic 5 : food
Topic 5 : like
Topic 5 : tri
Topic 5 : sinc
Topic 5 : lunch


In [196]:
lemma_word_dict = {}
for line in word_dict:
    lemma_word_dict[line] = []
    lemmatizer = WordNetLemmatizer()
    for i in word_dict[line]:
        lemma_word_dict[line].append(lemmatizer.lemmatize(i, pos='n'))


num = 2
bow = {}
for i in range(2):
    for w in lemma_word_dict[i]:
        if w not in bow and w not in stop_words:
            bow[w] = 0

bow_dict = {}
for line in word_dict:
    bow_dict[line] = bow.copy()

for w in lemma_word_dict:
    for ww in lemma_word_dict[w]:
        if ww in bow_dict[w]:
            bow_dict[w][ww] += 1

X = []
for w in bow_dict:
    X.append(list(bow_dict[w].values()))

lda = LatentDirichletAllocation(n_components=5)
# lda.fit(X[:2])
lda.fit(X)

words = list(bow.keys())
top = 5
wwww = []
print('Top 5 words of each sentence are:')
for n, topic in enumerate(lda.components_):
    sort_topic = topic.argsort()
    for i in range(5):
        wwww.append(words[sort_topic[(-1) - i]])
        print('Topic', n + 1, ':', words[sort_topic[(-1) - i]])

Top 5 words of each sentence are:
Topic 1 : The
Topic 1 : food
Topic 1 : Ciarra
Topic 1 : new
Topic 1 : said
Topic 2 : food
Topic 2 : ha
Topic 2 : mother
Topic 2 : tried
Topic 2 : said
Topic 3 : food
Topic 3 : croissant
Topic 3 : plain
Topic 3 : pasta
Topic 3 : The
Topic 4 : lunch
Topic 4 : Angela
Topic 4 : like
Topic 4 : plain
Topic 4 : croissant
Topic 5 : The
Topic 5 : food
Topic 5 : Ciarra
Topic 5 : new
Topic 5 : said


I repeat the same steps except that this time I'm using Lemmatizer instead of Stemmer. The result are about the same

In [197]:
print('Stemmer ', list(set(www)))
print('Stemmer ', len(list(set(www))))
print('Lemmatizer', list(set(wwww)))
print('Lemmatizer', len(list(set(wwww))))


Stemmer  ['food', 'lunch', 'angela', 'pasta', 'tri', 'mother', 'chang', 'said', 'like', 'croissant', 'plain', 'sinc']
Stemmer  12
Lemmatizer ['food', 'lunch', 'tried', 'pasta', 'ha', 'mother', 'new', 'Ciarra', 'Angela', 'said', 'like', 'croissant', 'plain', 'The']
Lemmatizer 14


I count all the unique words in lemmatizer and stemmer and I see that Stemmer has 16 unique words and 17 unique words for lemmatizer.<br/>
Stemmer  ['food', 'lunch', 'daili', 'star', 'pasta', 'angela', 'tri', 'new', 'chang', 'said', 'ciarra', 'like', 'croissant', 'plain', 'franco', 'sinc']<br/>
Stemmer  16<br/>
Lemmatizer ['food', 'lunch', 'tried', 'pasta', 'ha', 'mother', 'new', 'Ciarra', 'Angela', 'said', 'like', 'croissant', 'plain', 'The']<br/>
Lemmatizer 14<br/>
There are 8 words in each list that match. For all the top 5 contributing words have 5 related words. All topics are related to food.