In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import pandas as pd

df = ["He is a good dog.", "The dog is too lazy.",
         "That is a brown cat.", "The cat is very active.", "I have brown cat and dog."]

tv = TfidfVectorizer(stop_words='english', smooth_idf=True)

tv_fit = tv.fit_transform(df)
# 原始单词文本矩阵
print(tv.get_feature_names_out())
print(tv_fit.toarray())

['active' 'brown' 'cat' 'dog' 'good' 'lazy']
[[0.         0.         0.         0.55645052 0.83088075 0.        ]
 [0.         0.         0.         0.55645052 0.         0.83088075]
 [0.         0.76944707 0.63871058 0.         0.         0.        ]
 [0.83088075 0.         0.55645052 0.         0.         0.        ]
 [0.         0.64846263 0.53828256 0.53828256 0.         0.        ]]


In [3]:
# SVD represent documents and terms in vectors
svd_model = TruncatedSVD(n_components=2, algorithm='randomized', n_iter=100)
lsa = svd_model.fit_transform(tv_fit)

dictionary = tv.get_feature_names_out()
encoding_matrix = pd.DataFrame(svd_model.components_, index = ["topic_1","topic_2"], columns = (dictionary)).T
dictionary
encoding_matrix

Unnamed: 0,topic_1,topic_2
active,0.200354,-0.242441
brown,0.596512,-0.20181
cat,0.629338,-0.329886
dog,0.415831,0.616903
good,0.132383,0.453377
lazy,0.132383,0.453377


In [8]:
import re
raw_documents = [
    "He is a good dog.",
    "The dog is too lazy.",
    "That is a brown cat.",
    "The cat is very active.",
    "I have brown cat and dog."
]

In [9]:
# 转为 DataFrame
df = pd.DataFrame({'raw_documents': raw_documents})
print("原始数据：")
print(df)

原始数据：
               raw_documents
0          He is a good dog.
1       The dog is too lazy.
2       That is a brown cat.
3    The cat is very active.
4  I have brown cat and dog.


In [10]:
# 获取 sklearn 内置的英文停用词（更完整）
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
print("Sklearn 英文停用词（部分）：", list(ENGLISH_STOP_WORDS)[:10])

Sklearn 英文停用词（部分）： ['move', 'a', 'to', 'though', 'sometimes', 'ever', 'neither', 'thus', 'however', 'perhaps']


In [11]:
def preprocess_text(text):
    """
    预处理函数：去标点、转小写、分词、去除停用词、重新组合
    """
    # 1. 转小写
    text = text.lower()
    
    # 2. 去除标点符号（保留字母和空格）
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # 3. 分词（按空格切分）
    words = text.split()
    
    # 4. 去除停用词
    words_clean = [word for word in words if word not in ENGLISH_STOP_WORDS]
    
    # 5. 重新组合成字符串
    return ' '.join(words_clean)

In [12]:
# 应用预处理函数
df['clean_documents'] = df['raw_documents'].apply(preprocess_text)

print("清洗后的数据：")
print(df[['raw_documents', 'clean_documents']])
print()

清洗后的数据：
               raw_documents clean_documents
0          He is a good dog.        good dog
1       The dog is too lazy.        dog lazy
2       That is a brown cat.       brown cat
3    The cat is very active.      cat active
4  I have brown cat and dog.   brown cat dog



In [13]:
# 构建 TfidfVectorizer 矩阵
# 使用 TfidfVectorizer（注意：这里也可以不再去停用词，因为我们已经预处理过了）
vectorizer = TfidfVectorizer(smooth_idf=True)  # 已清洗，无需再设 stop_words
X = vectorizer.fit_transform(df['clean_documents'])

# 查看词汇表
print("词汇表（词汇 -> 对应的列索引）:")
print(vectorizer.vocabulary_)
print()

# 查看 TF-IDF 矩阵（转为数组）
print("TF-IDF 矩阵（每一行是一个文档的向量表示）:")
print(X.toarray())
print()

# 转为 DataFrame 更直观
feature_names = vectorizer.get_feature_names_out()
df_tfidf = pd.DataFrame(X.toarray(), columns=feature_names)
print("TF-IDF 矩阵（DataFrame 形式）:")
print(df_tfidf.round(3))  # 保留三位小数便于查看

词汇表（词汇 -> 对应的列索引）:
{'good': 4, 'dog': 3, 'lazy': 5, 'brown': 1, 'cat': 2, 'active': 0}

TF-IDF 矩阵（每一行是一个文档的向量表示）:
[[0.         0.         0.         0.55645052 0.83088075 0.        ]
 [0.         0.         0.         0.55645052 0.         0.83088075]
 [0.         0.76944707 0.63871058 0.         0.         0.        ]
 [0.83088075 0.         0.55645052 0.         0.         0.        ]
 [0.         0.64846263 0.53828256 0.53828256 0.         0.        ]]

TF-IDF 矩阵（DataFrame 形式）:
              active              brown                cat                dog  \
0 0.0000000000000000 0.0000000000000000 0.0000000000000000 0.5560000000000000   
1 0.0000000000000000 0.0000000000000000 0.0000000000000000 0.5560000000000000   
2 0.0000000000000000 0.7690000000000000 0.6390000000000000 0.0000000000000000   
3 0.8310000000000000 0.0000000000000000 0.5560000000000000 0.0000000000000000   
4 0.0000000000000000 0.6480000000000000 0.5380000000000000 0.5380000000000000   

                good       

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english', smooth_idf=True)
X = vectorizer.fit_transform(df['clean_documents'])
X

<5x6 sparse matrix of type '<class 'numpy.float64'>'
	with 11 stored elements in Compressed Sparse Row format>

In [16]:
vectorizer.get_feature_names_out()

array(['active', 'brown', 'cat', 'dog', 'good', 'lazy'], dtype=object)

In [17]:
X.toarray()

array([[0.        , 0.        , 0.        , 0.55645052, 0.83088075,
        0.        ],
       [0.        , 0.        , 0.        , 0.55645052, 0.        ,
        0.83088075],
       [0.        , 0.76944707, 0.63871058, 0.        , 0.        ,
        0.        ],
       [0.83088075, 0.        , 0.55645052, 0.        , 0.        ,
        0.        ],
       [0.        , 0.64846263, 0.53828256, 0.53828256, 0.        ,
        0.        ]])

In [18]:
# SVD 分解，假设有 2 个主题
# SVD represent documents and terms in vectors 
svd_model = TruncatedSVD(n_components=2, algorithm='randomized', n_iter=100)
lsa = svd_model.fit_transform(X)

In [20]:
dictionary = vectorizer.get_feature_names_out()
encoding_matrix = pd.DataFrame(svd_model.components_, index = ["topic_1","topic_2"], columns = (dictionary)).T

In [22]:
dictionary
encoding_matrix

Unnamed: 0,topic_1,topic_2
active,0.2003541259081117,-0.2424408501618362
brown,0.5965117122287048,-0.2018098984872578
cat,0.6293380994160956,-0.3298859088715316
dog,0.4158307960649448,0.6169033286639758
good,0.132382602846649,0.4533766476433695
lazy,0.1323826028466497,0.4533766476433689


In [23]:
pd.options.display.float_format = '{:,.16f}'.format
topic_encoded_df = pd.DataFrame(lsa, columns = ["topic_1", "topic_2"])
topic_encoded_df["documents"] = df['clean_documents']
display(topic_encoded_df[["documents", "topic_1", "topic_2"]])

Unnamed: 0,documents,topic_1,topic_2
0,good dog,0.3413834191239962,0.7199781067501037
1,dog lazy,0.3413834191239967,0.7199781067501032
2,brown cat,0.860949091930217,-0.3659836550739513
3,cat active,0.5166658991993216,-0.3850046207843261
4,brown cat dog,0.9494117370834868,0.023630294066115
