In [1]:
# Singular Value Decomposition (SVD)
# Given a matrix A with shape (m, n), np.linalg.svd(A) (default full_matrices=True) returns a tuple (U, S, Vᵗ):
# U is a matrix with shape (m, m) containing the left singular vectors in its columns
# S is a vector of size k = min(m, n) containing the singular values in descending order
# Vᵗ is a matrix with shape (n, n) containing the right singular vectors in its rows
import numpy as np

# Example matrix with m = 3 rows and n = 2 columns
A = np.array([
    [1, 0],
    [0, 1],
    [2, 2],
])
print(A)
# Bare last expression so the notebook displays the full decomposition result
np.linalg.svd(A)

[[1 0]
 [0 1]
 [2 2]]


SVDResult(U=array([[-2.35702260e-01,  7.07106781e-01, -6.66666667e-01],
       [-2.35702260e-01, -7.07106781e-01, -6.66666667e-01],
       [-9.42809042e-01, -1.11022302e-16,  3.33333333e-01]]), S=array([3., 1.]), Vh=array([[-0.70710678, -0.70710678],
       [ 0.70710678, -0.70710678]]))

In [2]:
# Truncated SVD keeps only the k largest singular values and their singular vectors.
# It is particularly effective because the truncated matrix Aₖ is the best rank-k
# approximation of the matrix A in terms of the Frobenius norm (Eckart–Young theorem)
# Full decomposition first; the truncation below keeps only the leading k components
U, S, Vt = np.linalg.svd(A)

k = 1  # target rank
# First k columns of U, the k leading singular values as a diagonal matrix, first k rows of Vt
U_k, S_k, Vt_k = U[:, :k], np.diag(S[:k]), Vt[:k, :]

# One factor per line, exactly as three separate print calls would produce
print(U_k, S_k, Vt_k, sep='\n')

# Multiply the truncated factors back together to obtain the rank-k matrix
A_k = (U_k @ S_k) @ Vt_k
print(A_k)

[[-0.23570226]
 [-0.23570226]
 [-0.94280904]]
[[3.]]
[[-0.70710678 -0.70710678]]
[[0.5 0.5]
 [0.5 0.5]
 [2.  2. ]]


In [3]:
# Stop-word list loader
def get_stopword_list(stop_word_path='E:/Teaching/数据挖掘与最优化/Data/stopword.txt'):
    """Load the stop-word list from a text file containing one word per line.

    Args:
        stop_word_path: Path of the stop-word file. Defaults to the location
            this notebook has always used, so existing calls without
            arguments keep working.

    Returns:
        A list of strings, one per line of the file, with the newline
        character removed.
    """
    # Read as UTF-8 so the words compare equal to the tokens they are matched against.
    # The context manager closes the file handle even if reading fails.
    with open(stop_word_path, encoding='utf-8') as stopword_file:
        return [line.replace('\n', '') for line in stopword_file]

In [4]:
# Remove stop words
def word_filter(seg_list, stopword_list=None):
    """Return the tokens of ``seg_list`` that are worth keeping.

    A token is kept when it is not a stop word and is at least two
    characters long.

    Args:
        seg_list: Iterable of token strings.
        stopword_list: Optional iterable of stop words. When omitted, the
            list is loaded with ``get_stopword_list()``, as before. Passing
            it explicitly lets a caller load the file once and reuse it.

    Returns:
        A list of the kept tokens, in their original order.
    """
    if stopword_list is None:
        stopword_list = get_stopword_list()
    # A set gives constant-time membership tests instead of scanning a list per token
    stopwords = set(stopword_list)
    # Filter out stop words as well as tokens shorter than two characters
    return [word for word in seg_list if len(word) > 1 and word not in stopwords]

In [6]:
import jieba
# Data loading; corpus_path is the path of the dataset file
def load_data(corpus_path='E:/Teaching/数据挖掘与最优化/Data/corpus.txt'):
    """Read the corpus and return one space-joined token string per line.

    Each line of the file is stripped, segmented with ``jieba.cut`` and
    passed through ``word_filter``; the kept tokens are joined with single
    spaces.

    Args:
        corpus_path: Path of the UTF-8 corpus file, read line by line.

    Returns:
        A list of strings, one per line of the corpus, in file order.
    """
    doc_list = []
    # The context manager closes the corpus file even if segmentation raises
    with open(corpus_path, 'r', encoding='utf-8') as corpus_file:
        for line in corpus_file:
            content = line.strip()
            seg_list = list(jieba.cut(content))
            # After filtering, each document keeps only its non-stop-word tokens
            filter_list = word_filter(seg_list)
            doc_list.append(" ".join(filter_list))

    return doc_list

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

# One string per corpus line: the kept tokens joined by single spaces
text = load_data()

# fit_transform fits the vectorizer and builds the TF-IDF matrix in a single
# pass over the corpus, instead of processing it once in fit() and again in transform()
tfidfmodel = TfidfVectorizer()
# toarray() returns a plain ndarray; todense() would return the legacy np.matrix type
input_matrix = tfidfmodel.fit_transform(text).toarray()
vocab = tfidfmodel.get_feature_names_out()

In [9]:
# Printed one per line: the TF-IDF matrix, its shape, the vocabulary, the vocabulary size
print(input_matrix, input_matrix.shape, vocab, vocab.size, sep='\n')

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(164, 8651)
['00' '000' '0000' ... '齐声' '龙头' '龟山']
8651


In [12]:
from sklearn.decomposition import TruncatedSVD
# https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html
svd_modeling = TruncatedSVD(n_components=10, n_iter=100, random_state=122)
# Fit once: fit_transform returns the reduced document representation and leaves
# the fitted components on the estimator, so a separate fit() call is redundant
document_topic = svd_modeling.fit_transform(np.asarray(input_matrix))
topic_word = svd_modeling.components_

In [13]:
# Component matrix taken from the fitted TruncatedSVD model (its components_ attribute)
print(topic_word)
# Bare last expression so the notebook echoes the matrix shape
topic_word.shape

[[ 3.14622907e-03  1.32150050e-03  2.30722625e-03 ...  1.88475879e-03
   6.58291393e-04  3.57015722e-04]
 [-5.32513219e-03 -6.04240750e-04 -6.16468825e-04 ...  2.43655269e-03
  -1.25686469e-03 -5.90555932e-04]
 [-2.24833902e-03 -3.29246634e-06  2.45635684e-03 ... -9.16699480e-04
  -2.97458380e-04 -3.96218422e-05]
 ...
 [-5.81916987e-04 -1.25829558e-03 -5.86296347e-03 ...  2.67705371e-04
   8.22727671e-04 -2.41734210e-04]
 [ 1.09091662e-03  4.35373986e-04 -8.96612749e-03 ... -2.75672171e-04
   1.37406221e-04  5.97560498e-04]
 [-1.35853912e-03 -3.60755904e-04  2.36026531e-03 ...  1.43067092e-05
  -2.75576761e-04 -3.29264847e-04]]


(10, 8651)

In [14]:
# Reduced representation returned by TruncatedSVD.fit_transform on the TF-IDF matrix
print(document_topic)
# Bare last expression so the notebook echoes the matrix shape
document_topic.shape

[[ 8.37139928e-02 -1.03437730e-01 -4.11584750e-02 ... -5.05402961e-02
   3.63615386e-02  2.26393359e-02]
 [ 4.00131203e-02 -4.90227860e-02 -3.90512539e-03 ...  6.35680806e-04
  -1.31030443e-02 -9.91497252e-03]
 [ 1.01378024e-01 -2.08565483e-02 -9.54862265e-03 ...  1.13429506e-02
  -5.29737656e-02 -5.29017212e-02]
 ...
 [ 3.98908711e-02 -5.03295208e-02 -1.65647828e-02 ...  2.13179067e-02
  -3.95027322e-02  3.37989064e-02]
 [ 7.30539819e-02 -3.66572583e-02  2.38672573e-02 ...  3.95708990e-02
   5.20358487e-02  1.15777214e-01]
 [ 6.83154580e-02 -7.13793735e-02  1.24959514e-02 ...  2.84030688e-01
   2.16127026e-01  7.78270506e-01]]


(164, 10)

In [15]:
def get_topics(topic_word, top_n=5, terms=None):
    """Build one text label per component from its highest-weighted terms.

    Args:
        topic_word: Iterable of weight rows; each row holds one weight per
            vocabulary term.
        top_n: Number of highest-weighted terms to put in each label.
            Defaults to 5, the value that was previously hard-coded.
        terms: Sequence of vocabulary terms aligned with the columns of
            ``topic_word``. When omitted, the notebook-level ``vocab`` is
            used, as before.

    Returns:
        A list of strings, one per row of ``topic_word``. Each label is two
        leading spaces followed by the selected terms separated by single
        spaces (the format this function has always produced).
    """
    if terms is None:
        terms = vocab
    topic_word_list = []
    for weights in topic_word:
        # Pair every term with its weight and rank from largest to smallest weight
        ranked = sorted(zip(terms, weights), key=lambda pair: pair[1], reverse=True)
        # One space to start, then a space before every term
        label = " " + "".join(" " + term for term, _ in ranked[:top_n])
        topic_word_list.append(label)
    return topic_word_list

In [16]:
# Display one label per SVD component, built from its five highest-weighted terms
get_topics(topic_word)

['  孩子 运动鞋 公益 活动 详细',
 '  孩子 运动鞋 送双 奔跑 快乐',
 '  六一 搜狐 孤儿 338 522',
 '  艾滋病 红丝 活动 详细 主题',
 '  公益 2011 创新奖 获奖 韩国',
 '  孤儿 指标 健康 试用 捐款',
 '  孤儿 艾滋病 红丝 捐款 助养',
 '  指标 试用 专人 出水 得不到',
 '  指标 盖茨 试用 现场 承诺',
 '  试用 报告 提交 2032 申请']

In [17]:
def get_document_topics(document_topic, topic_labels=None, top_n=2):
    """Describe each document by the labels of its highest-weighted topics.

    Args:
        document_topic: Iterable of weight rows; each row holds one weight
            per topic.
        topic_labels: Sequence of topic labels aligned with the columns of
            ``document_topic``. When omitted, the labels are computed with
            ``get_topics(topic_word)`` from the notebook-level
            ``topic_word``, as before.
        top_n: Number of highest-weighted topics to report per document.
            Defaults to 2, the value that was previously hard-coded.

    Returns:
        A list of strings, one per row of ``document_topic``. Each string
        is one leading space followed by the selected topic labels, each
        preceded by a space.
    """
    # The labels do not depend on the document, so compute them once up front
    # instead of once per row
    if topic_labels is None:
        topic_labels = get_topics(topic_word)
    document_topic_list = []
    for weights in document_topic:
        # Pair every topic label with its weight and rank from largest to smallest
        ranked = sorted(zip(topic_labels, weights), key=lambda pair: pair[1], reverse=True)
        summary = " " + "".join(" " + label for label, _ in ranked[:top_n])
        document_topic_list.append(summary)
    return document_topic_list

In [22]:
# Display, for every document, the labels of its two highest-weighted topics
get_document_topics(document_topic)

164