# one-hot表示（基于numpy实现）

In [1]:
import jieba

data = ["我爱我的祖国",'我喜欢祖国的大好河山']
samples = [' '.join(jieba.cut(doc)) for doc in data]
print(samples)

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\priv_\AppData\Local\Temp\jieba.cache
Loading model cost 0.541 seconds.
Prefix dict has been built successfully.


['我 爱 我 的 祖国', '我 喜欢 祖国 的 大好河山']


In [3]:
# 词级别的one-hot编码
import numpy as np

token_index = {}  # 构造一个空的索引集合
for sample in samples:
    for word in sample.split():
        if word not in token_index:
            token_index[word] = len(token_index)  # 给每个唯一单词指定一个唯一索引
max_length = len(token_index)  # 对样本进行分词，只考虑每个样本前max-length个单词
results = np.zeros(shape=(len(samples), max_length, max(token_index.values())+1))

for i, sample in enumerate(samples):
    for j, word in list(enumerate(sample.split()))[:max_length]:
        index = token_index.get(word)
        results[i, j, index] = 1
print(token_index)
print(results)

{'我': 0, '爱': 1, '的': 2, '祖国': 3, '喜欢': 4, '大好河山': 5}
[[[1. 0. 0. 0. 0. 0.]
  [0. 1. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0.]
  [0. 0. 0. 0. 0. 0.]]

 [[1. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 1. 0.]
  [0. 0. 0. 1. 0. 0.]
  [0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 0.]]]


# Bag of words表示（基于sklearn的实现）

In [6]:
samples

['我 爱 我 的 祖国', '我 喜欢 祖国 的 大好河山']

In [9]:
import jieba
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(token_pattern='(?u)\\b\\w+\\b', binary=False)
X = vectorizer.fit_transform(samples)
print(vectorizer.get_feature_names())
print(X.toarray())

['喜欢', '大好河山', '我', '爱', '的', '祖国']
[[0 0 2 1 1 1]
 [1 1 1 0 1 1]]


# TF_IDF表示（基于sklearn实现）