# 文本表示（word representation）
## 离散表示
### One-hot

In [91]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
import numpy as np

In [103]:
# OrdinalEncoder (supplementary material): each column is encoded independently,
# replacing every category with its integer index in that column's category list.
X = [
    ['Sean', 'love', 'nlp'],
    ['I', 'like', 'badminton'],
    ['Mom', 'likes', 'singing'],
]  # training samples
Ordinal_enc = OrdinalEncoder().fit(X)  # fit() returns the fitted encoder itself
Ordinal_array = Ordinal_enc.transform([['Sean', 'like', 'badminton']])
print('Sean like badminton 的向量为:\n{}'.format(Ordinal_array))
# Last expression of the cell: show the categories learned for each column.
Ordinal_enc.categories_

Sean like badminton 的向量为:
[[2. 0. 0.]]


[array(['I', 'Mom', 'Sean'], dtype=object),
 array(['like', 'likes', 'love'], dtype=object),
 array(['badminton', 'nlp', 'singing'], dtype=object)]

In [119]:
# OneHot 热独编码
Onehot_enc = OneHotEncoder()
X = [['Sean', 'love', 'nlp'],['I', 'like', 'badminton'],['Mom', 'love', 'Sean']] # 数据
Onehot_enc.fit(X)
print(Onehot_enc.categories_)
Onehot_array = Onehot_enc.transform([['Sean', 'like', 'badminton']]).toarray()
print('Sean like badminton 的向量为:\n{}'.format(Onehot_array))

[array(['I', 'Mom', 'Sean'], dtype=object), array(['like', 'love'], dtype=object), array(['Sean', 'badminton', 'nlp'], dtype=object)]
Sean like badminton 的向量为:
[[0. 0. 1. 1. 0. 0. 1. 0.]]


In [122]:
# One-hot encoding with the default handle_unknown='error':
# transforming a sample that contains a category never seen during fit raises ValueError.
Onehot_enc = OneHotEncoder()  # handle_unknown is NOT set to 'ignore', so unknown categories are rejected
X = [['Sean', 'love', 'nlp'],['I', 'like', 'badminton'],['Mom', 'love', 'Sean']]  # training samples
Onehot_enc.fit(X)
print(Onehot_enc.categories_)
try:
    # 'money' does not occur in the third column of X, so this call is expected to fail.
    Onehot_array = Onehot_enc.transform([['Sean', 'like', 'money']]).toarray()
except ValueError as err:
    # The error is the point of this cell: display it without aborting "Run All".
    print('ValueError: {}'.format(err))
else:
    print('Sean like money 的向量为:\n{}'.format(Onehot_array))

[array(['I', 'Mom', 'Sean'], dtype=object), array(['like', 'love'], dtype=object), array(['Sean', 'badminton', 'nlp'], dtype=object)]


ValueError: Found unknown categories ['money'] in column 2 during transform

In [123]:
# One-hot encoding that tolerates categories unseen during fit.
Onehot_enc = OneHotEncoder(handle_unknown='ignore')  # with 'ignore', unknown categories no longer raise during transform
X = [['Sean', 'love', 'nlp'],['I', 'like', 'badminton'],['Mom', 'love', 'Sean']]  # training samples
Onehot_enc.fit(X)
print(Onehot_enc.categories_)
# 'money' was never seen in the third column, so all of that column's one-hot slots stay 0.
Onehot_array = Onehot_enc.transform([['Sean', 'like', 'money']]).toarray()
# The label now names the sample that is actually encoded (it previously said "badminton").
print('Sean like money 的向量为:\n{}'.format(Onehot_array))

[array(['I', 'Mom', 'Sean'], dtype=object), array(['like', 'love'], dtype=object), array(['Sean', 'badminton', 'nlp'], dtype=object)]
Sean like badminton 的向量为:
[[0. 0. 1. 1. 0. 0. 0. 0.]]


### 词袋(Bag-of-Word)

In [196]:
from sklearn.feature_extraction.text import CountVectorizer
# CountVectorizer converts a collection of text documents into a sparse matrix of token counts.

# A tiny corpus: each string is one document.
text = [
    "Sean Sean Sean love nlp",
    "Sean like badminton",
    "I love you",
]

# Learn the vocabulary (token -> column index) from the corpus.
# With the default settings tokens are lower-cased and single-character tokens such as "I" are dropped.
CountVec = CountVectorizer().fit(text)  # fit() returns the fitted vectorizer itself
print(CountVec.vocabulary_)  # inspect the learned index

# Encode every document as one row of per-token counts.
X = CountVec.transform(text)
print('维度为',X.shape)
X.toarray()

{'sean': 4, 'love': 2, 'nlp': 3, 'like': 1, 'badminton': 0, 'you': 5}
维度为 (3, 6)


array([[0, 0, 1, 1, 3, 0],
       [1, 1, 0, 0, 1, 0],
       [0, 0, 1, 0, 0, 1]])

In [227]:
# Note: fit_transform() learns the vocabulary and returns the document-term matrix in one call,
# so fit() and transform() do not have to be called one after the other.
text = [
    "Sean Sean Sean love nlp",
    "Sean like badminton",
    "I love you",
]

CountVec = CountVectorizer()  # instantiate CountVectorizer
X = CountVec.fit_transform(text)  # sparse document-term matrix
X.toarray()

array([[0, 0, 1, 1, 3, 0],
       [1, 1, 0, 0, 1, 0],
       [0, 0, 1, 0, 0, 1]])

### N-gram
N-gram 是在词袋模型的基础上加入大小为 N 的滑动窗口（本例以词为单位），把每 N 个相邻的词组成一个片段，作为一个特征。  
当N = 2时, 例句 `Sean like badminton I like you`为  
[Sean like, like badminton, badminton I, I like, like you]

N-gram可以通过调整`CountVectorizer()`里面的参数实现。

In [215]:
text = [
    "Sean Sean Sean love nlp",
    "Sean like badminton",
    "I love you",
]

# ngram_range=(2, 2) keeps only word bigrams as features.
CountVec = CountVectorizer(analyzer = 'word', ngram_range = (2,2)).fit(text)  # fit() builds the bigram index
print(CountVec.vocabulary_)  # inspect the learned index

# Count how often each bigram occurs in every document.
X = CountVec.transform(text)
print('维度为',X.shape)
X.toarray()

{'sean sean': 5, 'sean love': 4, 'love nlp': 1, 'sean like': 3, 'like badminton': 0, 'love you': 2}
维度为 (3, 6)


array([[0, 1, 0, 0, 1, 2],
       [1, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 0]])

### TF-Idf
使用 `TfidfVectorizer` 计算 tf-idf。tf 表示词频（term frequency），tf-idf 表示词频乘以逆文档频率（inverse document frequency）。这是信息检索中常用的一种词项加权方案，在文档分类中也得到了很好的应用。

In [243]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer on `text` (defined in an earlier cell) and build the weighted document-term matrix.
TfidfVec = TfidfVectorizer()  # instantiate TfidfVectorizer
Y = TfidfVec.fit_transform(text)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# get_feature_names_out() returns an ndarray, so convert it to a list to keep the printed form.
print(TfidfVec.get_feature_names_out().tolist())
Y.toarray()

['badminton', 'like', 'love', 'nlp', 'sean', 'you']


array([[0.        , 0.        , 0.29199216, 0.3839346 , 0.87597648,
        0.        ],
       [0.62276601, 0.62276601, 0.        , 0.        , 0.4736296 ,
        0.        ],
       [0.        , 0.        , 0.60534851, 0.        , 0.        ,
        0.79596054]])

In [244]:
# TfidfVectorizer is equivalent to CountVectorizer followed by TfidfTransformer.
from sklearn.feature_extraction.text import TfidfTransformer

# Step 1: raw token counts (`text` is defined in an earlier cell).
CountVec = CountVectorizer()
X = CountVec.fit_transform(text)

# Step 2: re-weight the counts with TF-IDF.
tfidf_transform = TfidfTransformer()
Y = tfidf_transform.fit_transform(X)

# The displayed matrix is the same as the one produced by TfidfVectorizer.
Y.toarray()

array([[0.        , 0.        , 0.29199216, 0.3839346 , 0.87597648,
        0.        ],
       [0.62276601, 0.62276601, 0.        , 0.        , 0.4736296 ,
        0.        ],
       [0.        , 0.        , 0.60534851, 0.        , 0.        ,
        0.79596054]])

***小结***:  
本质上来说，可以有以下几种组合  
BOW, BOW + NGram, BOW + TFIDF, BOW + NGram + TFIDF  
BOW (`CountVectorizer()`)  
BOW + NGram (`CountVectorizer(ngram_range = (2,2))`)  
BOW + TFIDF (`TfidfVectorizer()`)  
BOW + NGram + TFIDF (`TfidfVectorizer(ngram_range = (2,2))`)  
其实还可以再推进一步：BOW + NGram + TFIDF + SVD()  

## 分布式表示