In [2]:
!pip3 install gensim
from collections import defaultdict
from gensim import corpora

documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# 去除停用词
stoplist = set('for a of the and to in'.split())
texts = [
    [word for word in document.lower().split() if word not in stoplist]
    for document in documents
]

# 去除只出现一次的词
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
        
#处理后的Bag-of-words文本
texts = [
    [token for token in text if frequency[token] > 1]
    for text in texts
]

print(texts)

Collecting gensim
  Downloading gensim-4.2.0-cp38-cp38-macosx_10_9_x86_64.whl (24.0 MB)
[K     |████████████████████████████████| 24.0 MB 98 kB/s eta 0:00:0111
Collecting smart-open>=1.8.1
  Downloading smart_open-6.2.0-py3-none-any.whl (58 kB)
[K     |████████████████████████████████| 58 kB 4.3 MB/s eta 0:00:011
Installing collected packages: smart-open, gensim
Successfully installed gensim-4.2.0 smart-open-6.2.0
[['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time'], ['eps', 'user', 'interface', 'system'], ['system', 'human', 'system', 'eps'], ['user', 'response', 'time'], ['trees'], ['graph', 'trees'], ['graph', 'minors', 'trees'], ['graph', 'minors', 'survey']]


In [3]:
#建立字典
dictionary = corpora.Dictionary(texts)
print(dictionary.token2id)
print()
#基于上述字典建立corpus
corpus = [dictionary.doc2bow(text) for text in texts]
print(corpus)

{'computer': 0, 'human': 1, 'interface': 2, 'response': 3, 'survey': 4, 'system': 5, 'time': 6, 'user': 7, 'eps': 8, 'trees': 9, 'graph': 10, 'minors': 11}

[[(0, 1), (1, 1), (2, 1)], [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(2, 1), (5, 1), (7, 1), (8, 1)], [(1, 1), (5, 2), (8, 1)], [(3, 1), (6, 1), (7, 1)], [(9, 1)], [(9, 1), (10, 1)], [(9, 1), (10, 1), (11, 1)], [(4, 1), (10, 1), (11, 1)]]


In [4]:
from gensim import models
#tf-idf表达

#初始化tf-idf模型，主要是计算IDF
tfidf = models.TfidfModel(corpus)  
print(tfidf)

TfidfModel<num_docs=9, num_nnz=28>


In [5]:
#得到每一个文档的TF-IDF表达，以稀疏矩阵的形式存储
corpus_tfidf = tfidf[corpus]
for doc, as_text in zip(corpus_tfidf, documents):
    print(doc, as_text)

[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)] Human machine interface for lab abc computer applications
[(0, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.3244870206138555), (6, 0.44424552527467476), (7, 0.3244870206138555)] A survey of user opinion of computer system response time
[(2, 0.5710059809418182), (5, 0.4170757362022777), (7, 0.4170757362022777), (8, 0.5710059809418182)] The EPS user interface management system
[(1, 0.49182558987264147), (5, 0.7184811607083769), (8, 0.49182558987264147)] System and human system engineering testing of EPS
[(3, 0.6282580468670046), (6, 0.6282580468670046), (7, 0.45889394536615247)] Relation of user perceived response time to error measurement
[(9, 1.0)] The generation of random binary unordered trees
[(9, 0.7071067811865475), (10, 0.7071067811865475)] The intersection graph of paths in trees
[(9, 0.5080429008916749), (10, 0.5080429008916749), (11, 0.695546419520037)] Graph minors I

In [6]:
#运行LSI, 以TFIDF表达为输入，2个话题
lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)  # 初始化LSI模型参数, K=2
corpus_lsi = lsi_model[corpus_tfidf]  #基于corpus_tfidf训练LSI模型

#打印出学习到的latent topic，一共两个
lsi_model.print_topics(2)

[(0,
  '0.703*"trees" + 0.538*"graph" + 0.402*"minors" + 0.187*"survey" + 0.061*"system" + 0.060*"time" + 0.060*"response" + 0.058*"user" + 0.049*"computer" + 0.035*"interface"'),
 (1,
  '-0.460*"system" + -0.373*"user" + -0.332*"eps" + -0.328*"interface" + -0.320*"time" + -0.320*"response" + -0.293*"computer" + -0.280*"human" + -0.171*"survey" + 0.161*"trees"')]

In [7]:
# 打印文档的topic表达，每一个文档表示成2维的topic向量
for doc, as_text in zip(corpus_lsi, documents):
    print(doc, as_text)

[(0, 0.0660078339609013), (1, -0.5200703306361856)] Human machine interface for lab abc computer applications
[(0, 0.19667592859142244), (1, -0.7609563167700053)] A survey of user opinion of computer system response time
[(0, 0.08992639972446148), (1, -0.7241860626752515)] The EPS user interface management system
[(0, 0.07585847652177895), (1, -0.6320551586003436)] System and human system engineering testing of EPS
[(0, 0.10150299184979919), (1, -0.5737308483002957)] Relation of user perceived response time to error measurement
[(0, 0.7032108939378316), (1, 0.1611518021402551)] The generation of random binary unordered trees
[(0, 0.8774787673119839), (1, 0.16758906864659073)] The intersection graph of paths in trees
[(0, 0.9098624686818587), (1, 0.1408655362871869)] Graph minors IV Widths of trees and well quasi ordering
[(0, 0.6165825350569285), (1, -0.053929075663895716)] Graph minors A survey


In [8]:

from gensim.models import nmf
corpus_nmf = nmf.Nmf(corpus_tfidf, num_topics=2)
print(corpus_nmf)
corpus_nmf.print_topics(2)

<gensim.models.nmf.Nmf object at 0x7fa5c5a1b670>


[(0,
  '0.205*"5" + 0.191*"8" + 0.157*"7" + 0.137*"2" + 0.085*"3" + 0.085*"6" + 0.076*"1" + 0.041*"9" + 0.023*"0" + 0.000*"10"'),
 (1,
  '0.216*"11" + 0.183*"10" + 0.149*"4" + 0.134*"0" + 0.099*"9" + 0.078*"1" + 0.075*"2" + 0.029*"6" + 0.029*"3" + 0.008*"7"')]

In [9]:
print(corpus_nmf.get_term_topics(word_id=0))#得到某一个词的topic表达，输入为词id

print(corpus_nmf.get_document_topics([(0, 1), (1, 1), (2, 1)])) #得到文档的topic表示，输入为文档的bag of words表达

[(0, 0.11679245615549401), (1, 0.883207543844506)]
[(0, 0.6320229562765965), (1, 0.3679770437234034)]


In [10]:
# 基于texts数据集合训练一个word2vec模型, 隐维度为2
import gensim
print(texts)
w2v = gensim.models.Word2Vec(texts, min_count=1, vector_size= 2)
print(w2v)

[['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time'], ['eps', 'user', 'interface', 'system'], ['system', 'human', 'system', 'eps'], ['user', 'response', 'time'], ['trees'], ['graph', 'trees'], ['graph', 'minors', 'trees'], ['graph', 'minors', 'survey']]
Word2Vec<vocab=12, vector_size=2, alpha=0.025>


In [12]:
#查询词向量

for i, word in enumerate(w2v.wv.key_to_index):
    if i == 20:
        break
    print(word, ":", w2v.wv[word])

system : [-0.02681136  0.01182151]
graph : [0.25516748 0.45046365]
trees : [-0.4651475  -0.35584044]
user : [0.32290065 0.44859102]
minors : [-0.2507714  -0.18816864]
eps : [ 0.36902523 -0.07667363]
time : [-0.22683072  0.32770252]
response : [-0.24300753 -0.09080752]
survey : [0.14382899 0.04959369]
computer : [-0.41438133 -0.47257847]
interface : [0.3655466 0.2534694]
human : [0.33788466 0.03814328]


In [13]:
# 计算两个词的相似度
pairs = [
    ('human', 'computer'),   
    ('interface', 'computer'),  
]
for w1, w2 in pairs:
    print('%r\t%r\t%.2f' % (w1, w2, w2v.wv.similarity(w1, w2)))

'human'	'computer'	-0.74
'interface'	'computer'	-0.97
