# 特征提取

## 从类别变量中提取特征

In [15]:
# Use the DictVectorizer class to perform one-hot encoding of categorical features
from sklearn.feature_extraction import DictVectorizer
onehot_encoder = DictVectorizer()


In [16]:
# Each sample is a dict mapping a feature name to its categorical value.
X = [
    {'city': 'New York'},
    {'city': 'San Francisco'},
    {'city': 'Chapel Hill'},
]
# fit_transform() encodes every sample as a one-hot row (one column per distinct
# city value); toarray() converts that result into a dense array for printing.
print(onehot_encoder.fit_transform(X).toarray())


[[0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]]


## 特征标准化 
———— StandardScaler类、RobustScaler类、preprocessing模块的scale函数

In [17]:
import numpy as np
from sklearn import preprocessing

# Toy feature matrix: three rows of six numeric values each.
X = np.array([
    [0., 0., 5., 13., 9., 1.],
    [0., 0., 13., 15., 10., 15.],
    [0., 3., 15., 2., 0., 11.],
])
# scale() is a function of the preprocessing module that standardizes the data
# along a single axis; it is called here with its default arguments.
X_scaled = preprocessing.scale(X)
print(X_scaled)

[[ 0.         -0.70710678 -1.38873015  0.52489066  0.59299945 -1.35873244]
 [ 0.         -0.70710678  0.46291005  0.87481777  0.81537425  1.01904933]
 [ 0.          1.41421356  0.9258201  -1.39970842 -1.4083737   0.33968311]]


In [18]:
# RobustScaler类是StandardScaler类的另一个选择。
# StandardScaler类会在每个实例值上减去特征均值，然后除以特征值标准差。
# RobustScaler类会减去中位数，然后除以四分位距（IQR）。

## 从文本中提取特征

### 词袋模型
1.最常用的文本表示法，可看作是one-hot编码的一种扩展，对文本中关注的**每一个词创建一个特征**     

2.使用一个**特征向量**表示每个文档，其中的每个元素和词表的一个单词相对应

In [26]:
# A two-document corpus (a corpus is a collection of documents) used to try out
# the bag-of-words model. It contains 10 words in total, 8 of them distinct, so
# each document is represented by a feature vector with 8 elements; the number
# of elements is the dimension of the vector.
corpus = [
    'UNC played Duke in basketball',
    'Duke lost the basketball game',
]


In [27]:
# CountVectorizer converts a collection of text documents into a matrix of token counts.
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()  # instance used to turn text into numeric vectors
# Each entry of the result is the number of times a vocabulary word occurs in a
# document (not just a 0/1 presence flag); in this corpus no word repeats within
# a document, so every entry happens to be 0 or 1.
# Many scikit-learn text feature extractors, CountVectorizer included, return a
# sparse matrix by default; toarray() converts it to a dense NumPy array that is
# easier to read (and avoids the legacy numpy.matrix type returned by todense()).
print(vectorizer.fit_transform(corpus).toarray())
# vocabulary_ is a dict mapping each word found in the corpus to the index of
# its column in the matrix returned by fit_transform().
print(vectorizer.vocabulary_)

[[1 1 0 1 0 1 0 1]
 [1 1 1 0 1 0 1 0]]
{'unc': 7, 'played': 5, 'duke': 1, 'in': 3, 'basketball': 0, 'lost': 4, 'the': 6, 'game': 2}


In [28]:
# Add one more document to the corpus. The membership check keeps this cell
# idempotent: re-running it does not append the same document a second time.
new_document = 'I ate a sandwich'
if new_document not in corpus:
    corpus.append(new_document)
print(vectorizer.fit_transform(corpus).toarray())
# The vocabulary now holds ten distinct words. 'I' and 'a' are missing because
# they do not match the vectorizer's token regular expression (by default it
# only keeps tokens of two or more alphanumeric characters).
print(vectorizer.vocabulary_)

[[0 1 1 0 1 0 1 0 0 1]
 [0 1 1 1 0 1 0 0 1 0]
 [1 0 0 0 0 0 0 1 0 0]]
{'unc': 9, 'played': 6, 'duke': 2, 'in': 4, 'basketball': 1, 'lost': 5, 'the': 8, 'game': 3, 'ate': 0, 'sandwich': 7}


In [31]:
# Compare documents by the Euclidean (L2) distance between their feature vectors.
from sklearn.metrics.pairwise import euclidean_distances

X = vectorizer.fit_transform(corpus)
# Each entry pairs the printed label with the row indices of the two documents
# whose distance is computed.
document_pairs = (
    ('Distance between 1st and 2nd documents:', 0, 1),
    ('Distance between 1st and 3rd documents:', 0, 2),
    ('Distance between 2nd and 3rd documents:', 1, 2),
)
for label, first, second in document_pairs:
    print(label, euclidean_distances(X[first], X[second]))

Distance between 1st and 2nd documents: [[2.44948974]]
Distance between 1st and 3rd documents: [[2.64575131]]
Distance between 2nd and 3rd documents: [[2.64575131]]


In [32]:
# Alternative: compute the Euclidean (L2) distances between all rows of X in a
# single call, then look up each pair of documents in the resulting matrix.
distances = euclidean_distances(X)

# Print the distance for each pair of documents.
pair_labels = (
    ('Distance between 1st and 2nd documents:', (0, 1)),
    ('Distance between 1st and 3rd documents:', (0, 2)),
    ('Distance between 2nd and 3rd documents:', (1, 2)),
)
for label, (row, col) in pair_labels:
    print(label, distances[row, col])

Distance between 1st and 2nd documents: 2.449489742783178
Distance between 1st and 3rd documents: 2.6457513110645907
Distance between 2nd and 3rd documents: 2.6457513110645907
