https://dsfabric.org/topic-modeling-in-python-latent-semantic-analysis

In [1]:
import re
from pprint import pprint

import nltk
import numpy as np
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models import LsiModel
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from scipy import linalg
from sklearn import decomposition
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load the toy corpus, supplying the column labels explicitly.
# NOTE(review): passing `names` without `header` makes pandas treat the first
# line as data, so the CSV is presumably headerless - confirm against the file.
column_names = ['Label', 'Title', 'Text']
my_doc_list = pd.read_csv('./my_corpus_english.csv', names=column_names)

my_doc_list.head()

Unnamed: 0,Label,Title,Text
0,fruit,some fruit1,apple apple apple apple apple apple apple appl...
1,fruit,some fruit2,apple apple apple apple apple apple apple appl...
2,vehicle,some vehicle1,grape grape grape car car car truck truck tr...
3,vehicle,some vehicle2,banana banana car car truck truck truck t...
4,mix,mix1,apple apple grape grape train train train


In [3]:
# Display the (rows, columns) shape of the loaded corpus frame.
my_doc_list.shape

(5, 3)

In [4]:
# Keep only the raw document text; every model below is fitted on this Series.
df_text = my_doc_list.loc[:, 'Text']

df_text.head()

0    apple apple apple apple apple apple apple appl...
1    apple apple apple apple apple apple apple appl...
2    grape grape grape car car car truck  truck  tr...
3    banana  banana  car car truck  truck  truck  t...
4          apple apple grape grape train  train  train
Name: Text, dtype: object

# LSI(SVD) - gensim

In [5]:
# Clean each raw document: trim surrounding whitespace and remove digit characters.
# Bug fix: the digit-stripped string used to be computed and then thrown away,
# because the untouched stripped sentence was appended instead.
documents_list = [re.sub(r"\d", "", line.strip()) for line in df_text]

print(documents_list[0])

apple apple apple apple apple apple apple apple apple banana  banana  grape


In [6]:
# Standard English stop words from NLTK, held in a set so the per-token
# membership test in the preprocessing loop is a hash lookup.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be downloaded
# beforehand (nltk.download('stopwords')) - confirm for a fresh environment.
stop_words = set(stopwords.words('english'))

In [7]:
# Tokenise, remove stop words and lemmatise every cleaned document.
lemmatizer = WordNetLemmatizer()


def _normalise(text):
    """Return the lower-cased, stop-word-free, noun-lemmatised tokens of `text`."""
    kept = (tok for tok in word_tokenize(text.lower()) if tok not in stop_words)
    return [lemmatizer.lemmatize(tok, pos="n") for tok in kept]


# One token list per document, in the same order as documents_list.
processed_list = [_normalise(doc) for doc in documents_list]

print(processed_list[0])

['apple', 'apple', 'apple', 'apple', 'apple', 'apple', 'apple', 'apple', 'apple', 'banana', 'banana', 'grape']


In [8]:
# Build the gensim vocabulary (token <-> integer id mapping) from the token lists.
word_dictionary = Dictionary(processed_list)

print(word_dictionary)

Dictionary(6 unique tokens: ['apple', 'banana', 'grape', 'car', 'train']...)


In [9]:
# Encode each document as a sparse bag of words: a list of (token_id, count) pairs.
document_word_matrix = list(map(word_dictionary.doc2bow, processed_list))
document_word_matrix

[[(0, 9), (1, 2), (2, 1)],
 [(0, 8), (1, 2), (2, 2), (3, 1)],
 [(2, 3), (3, 3), (4, 8), (5, 4)],
 [(1, 2), (3, 2), (4, 7), (5, 4)],
 [(0, 2), (2, 2), (4, 3)]]

In [10]:
# Number of latent topics to extract.
NUM_TOPICS = 2

# Fit gensim's LSI model on the bag-of-words corpus.
lsi_model = LsiModel(
    corpus=document_word_matrix,
    id2word=word_dictionary,
    num_topics=NUM_TOPICS,
)

In [11]:
# formatted=False returns each topic as (topic_id, [(word, weight), ...])
# instead of a pre-formatted string, so the weights can be inspected directly.
lsi_topics = lsi_model.show_topics(num_topics=NUM_TOPICS, formatted=False)

pprint(lsi_topics)

[(0,
  [('train', 0.6611919499942059),
   ('apple', 0.54942707037681),
   ('truck', 0.31968498514330346),
   ('grape', 0.25908187452359976),
   ('car', 0.23314174989662073),
   ('banana', 0.19307541692929353)]),
 (1,
  [('apple', -0.788067788509512),
   ('train', 0.5194983492858765),
   ('truck', 0.2763164859167143),
   ('car', 0.1322965145937798),
   ('banana', -0.12091104772479874),
   ('grape', -0.024453551440314407)])]


模型得到两个主题：查看每个主题中权重（绝对值）最高的前 3 个词，据此判断该主题的含义。

# LSI(SVD) - sklearn + scipy

In [12]:
# Re-display the raw text that this section vectorises.
df_text.head()

0    apple apple apple apple apple apple apple appl...
1    apple apple apple apple apple apple apple appl...
2    grape grape grape car car car truck  truck  tr...
3    banana  banana  car car truck  truck  truck  t...
4          apple apple grape grape train  train  train
Name: Text, dtype: object

In [13]:
# Build a document-term count matrix using scikit-learn's built-in English stop list.
vectorizer = CountVectorizer(stop_words='english')

# fit_transform returns a SciPy sparse matrix. Densify it with .toarray() so the
# result is a plain ndarray; .todense() yields the legacy np.matrix type, which
# NumPy no longer recommends and recent scikit-learn estimators reject.
document_word_matrix = vectorizer.fit_transform(df_text).toarray()

# Rows are documents, columns are vocabulary terms.
print(document_word_matrix)

[[9 2 0 1 0 0]
 [8 2 1 2 0 0]
 [0 0 3 3 8 4]
 [0 2 2 0 7 4]
 [2 0 0 2 3 0]]


In [14]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# dtype=str keeps the fixed-width unicode array the original call produced.
vocab = np.array(feature_names, dtype=str)

pprint(vocab)

array(['apple', 'banana', 'car', 'grape', 'train', 'truck'], dtype='<U6')


In [15]:
# Reduced SVD (full_matrices=False) of the document-term count matrix.
# The rows of Vh hold the per-term loadings that are ranked as topics below.
U, s, Vh = linalg.svd(document_word_matrix, full_matrices=False)

In [16]:
# Report the shape of each SVD factor first, then the factor values themselves.
for title, value in (('Shape of U', U.shape), ('Shape of s', s.shape), ('Shape of Vh', Vh.shape)):
    print(title, value)

for title, value in (("U", U), ("s", s), ("Vh", Vh)):
    print(title, value)

Shape of U (5, 5)
Shape of s (5,)
Shape of Vh (5, 6)
U [[ 0.4107263   0.60121503  0.18922803 -0.22447225 -0.61939435]
 [ 0.40652332  0.52801984 -0.06683286  0.4288696   0.60624918]
 [ 0.59109588 -0.45627215 -0.42795813  0.39279538 -0.32401392]
 [ 0.49665007 -0.38925788  0.69568819 -0.2357803   0.24948468]
 [ 0.26455069  0.00543689 -0.54092135 -0.74552164  0.2856308 ]]
s [13.6102225  12.24002292  3.03332537  1.53361399  0.6250181 ]
Vh [[ 0.54942707  0.19307542  0.23314175  0.25908187  0.66119195  0.31968499]
 [ 0.78806779  0.12091105 -0.13229651  0.02445355 -0.51949835 -0.27631649]
 [ 0.02853195  0.53939704  0.01340745 -0.76159148 -0.0582238   0.35305156]
 [-0.05238392 -0.0409268   0.74053519  0.20905509 -0.48556158  0.40952958]
 [-0.24526344  0.75626453  0.21307669  0.30770926  0.0178776  -0.4769733 ]]


In [17]:
num_top_words = 3


def show_topics(a, words=None, top_n=None):
    """Return the highest-weighted words of each topic as space-joined strings.

    Parameters
    ----------
    a : 2-D array-like
        One row per topic, one column per vocabulary entry.
    words : sequence of str, optional
        Vocabulary aligned with the columns of ``a``. Defaults to the
        notebook-level ``vocab``, looked up at call time. Pass it explicitly
        to avoid depending on whichever cell last rebound ``vocab``.
    top_n : int, optional
        Number of words to keep per topic. Defaults to the notebook-level
        ``num_top_words``.

    Returns
    -------
    list of str
        One string per row of ``a``, words ordered from largest weight down.

    Notes
    -----
    Ranking uses the signed weight, so strongly negative loadings are never
    selected. SVD components are only defined up to sign, so a sign-flipped
    component lists different words.
    """
    if words is None:
        words = vocab
    if top_n is None:
        top_n = num_top_words

    topics = []
    for row in a:
        # argsort is ascending; the reversed slice takes the top_n largest.
        best = np.argsort(row)[:-top_n - 1:-1]
        topics.append(' '.join(words[i] for i in best))
    return topics

In [18]:
# Print the top words of the first two right-singular vectors, one heading per topic.
for heading, rows in (("topic 0", Vh[0:1]), ("topic 1", Vh[1:2])):
    print(heading)
    print('\n'.join(show_topics(rows)))

topic 0
train apple truck
topic 1
apple banana grape


# NMF

In [19]:
# Re-display the raw text that this section vectorises.
df_text.head()

0    apple apple apple apple apple apple apple appl...
1    apple apple apple apple apple apple apple appl...
2    grape grape grape car car car truck  truck  tr...
3    banana  banana  car car truck  truck  truck  t...
4          apple apple grape grape train  train  train
Name: Text, dtype: object

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weighted document-term matrix with the built-in English stop list.
vectorizer_tfidf = TfidfVectorizer(stop_words='english')

# Densify with .toarray() (plain ndarray). .todense() returns np.matrix, which
# recent scikit-learn estimators - including the NMF fit below - refuse with a TypeError.
vectors_tfidf = vectorizer_tfidf.fit_transform(df_text).toarray()  # (documents, vocab)
vectors_tfidf.shape

(5, 6)

In [21]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer_tfidf, "get_feature_names_out"):
    tfidf_feature_names = vectorizer_tfidf.get_feature_names_out()
else:
    tfidf_feature_names = vectorizer_tfidf.get_feature_names()

# dtype=str keeps the fixed-width unicode array the original call produced.
vocab = np.array(tfidf_feature_names, dtype=str)
print("vocab",vocab)
print("vocab shape",vocab.shape)

vocab ['apple' 'banana' 'car' 'grape' 'train' 'truck']
vocab shape (6,)


In [22]:
import pandas as pd
def get_nmf_topics(vectorizer_tfidf, model, num_topics, n_top_words):
    """Tabulate the top-weighted words of each topic of a fitted model.

    Parameters
    ----------
    vectorizer_tfidf : fitted vectorizer
        Must expose ``get_feature_names_out()`` (scikit-learn >= 1.0) or the
        older ``get_feature_names()``; the names must align with the columns
        of ``model.components_``.
    model : fitted decomposition model
        Only ``model.components_`` is read: one row of term weights per topic.
    num_topics : int
        Number of leading rows of ``model.components_`` to report.
    n_top_words : int
        Number of words to list per topic.

    Returns
    -------
    pandas.DataFrame
        One column per topic, labelled ``'Topic # 01'``, ``'Topic # 02'``, ...,
        with words ordered from highest weight down.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the new
    # accessor and fall back only for old installations.
    if hasattr(vectorizer_tfidf, "get_feature_names_out"):
        feat_names = vectorizer_tfidf.get_feature_names_out()
    else:
        feat_names = vectorizer_tfidf.get_feature_names()

    word_dict = {}
    for i in range(num_topics):
        # argsort is ascending; the reversed slice keeps the n_top_words largest.
        words_ids = model.components_[i].argsort()[:-n_top_words - 1:-1]
        word_dict['Topic # ' + '{:02d}'.format(i + 1)] = [feat_names[key] for key in words_ids]

    return pd.DataFrame(word_dict)

In [23]:
from sklearn import decomposition

# Configure an NMF model with d components (topics).
# NOTE(review): random_state is pinned, presumably so the factorisation is
# reproducible between runs - confirm this is the intent.
d = 2 # num topics
clf = decomposition.NMF(n_components=d, random_state=1)

In [24]:
# Fit NMF on the TF-IDF matrix: W1 is the fit_transform result (one row per
# document), H1 is the fitted components_ array (one row per topic).
W1 = clf.fit_transform(vectors_tfidf)
H1 = clf.components_

print(W1.shape)
print(H1.shape)

(5, 2)
(2, 6)


In [25]:
# Tabulate the top words of each NMF topic, one column per topic.
num_topics = 2
n_top_words = 4
df2 = get_nmf_topics(
    vectorizer_tfidf=vectorizer_tfidf,
    model=clf,
    num_topics=num_topics,
    n_top_words=n_top_words,
)
df2.head(10)

Unnamed: 0,Topic # 01,Topic # 02
0,train,apple
1,truck,grape
2,car,banana
3,grape,train


NMF 的效果似乎比 LSI(SVD) 要好：NMF 的两个主题分别以交通工具（train、truck、car）和水果（apple、grape、banana）为主，而 LSI 的第一个主题把 train 和 apple 混在了一起。