# 特征提取

## 从类别变量中提取特征

In [2]:
# Use the DictVectorizer class to one-hot encode categorical features
from sklearn.feature_extraction import DictVectorizer
onehot_encoder = DictVectorizer()


In [3]:
# Each sample is a dict holding one categorical feature, 'city'
city_names = ['New York', 'San Franciso', 'Chapel Hill']
X = [{'city': city_name} for city_name in city_names]
# toarray() converts the result of fit_transform() into a dense NumPy array for printing
encoded_cities = onehot_encoder.fit_transform(X)
print(encoded_cities.toarray())


[[0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]]


## 特征标准化 
———— tandardScaler类、RobustScalar类、preprocessing类的scale方法

In [4]:
from sklearn import preprocessing
import numpy as np

# Three samples (rows), six features (columns)
X = np.array([
    [0., 0., 5., 13., 9., 1.],
    [0., 0., 13., 15., 10., 15.],
    [0., 3., 15., 2., 0., 11.]
])
# scale() is a function of the preprocessing module; it can standardize along either axis
# and works on the columns (features) by default
X_scaled = preprocessing.scale(X)
print(X_scaled)

[[ 0.         -0.70710678 -1.38873015  0.52489066  0.59299945 -1.35873244]
 [ 0.         -0.70710678  0.46291005  0.87481777  0.81537425  1.01904933]
 [ 0.          1.41421356  0.9258201  -1.39970842 -1.4083737   0.33968311]]


In [5]:
# RobustScalar类是StandardScaler类的另一个选择。
# StandardScaler类会在每个实例值上减去特征均值，然后除以特征值标准差。
# obustScalar类会减去中位数，然后除以四分位差。

## 从文本中提取特征

### 词袋模型
1.最常用的文本表示法，可看作是one-hot编码的一种扩展，对文本中关注的**每一个词创建一个特征**     

2.使用一个**特征向量**表示每个文档，其中的每个元素和词表的一个单词相对应，使用一个二元值表示特征向量的每个元素

In [6]:
# A two-document corpus for trying out the bag-of-words model
# (a corpus is simply a collection of documents).
# It contains 10 words, 8 of them unique, so each document is represented by
# a feature vector with 8 elements; the element count is the vector's dimension.
corpus = [
    'UNC played Duke in basketball',
    'Duke lost the basketball game',
]


In [7]:
# CountVectorizer converts a collection of text documents into token-count vectors
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()  # this instance turns text into numeric vectors
# fit_transform() learns the vocabulary and returns a SciPy sparse matrix with one row per
# document; many scikit-learn text feature extractors return sparse matrices by default.
# Each column holds the number of times the word occurs in the document (no word repeats
# within a document here, so the values are only 0 or 1).
# toarray() converts the sparse result into a dense ndarray that is easier to read, as in the
# DictVectorizer example; todense() would return the discouraged numpy.matrix type instead.
print(vectorizer.fit_transform(corpus).toarray())
# vocabulary_ is a dict mapping every word seen in the corpus to its column index in the matrix
print(vectorizer.vocabulary_)

[[1 1 0 1 0 1 0 1]
 [1 1 1 0 1 0 1 0]]
{'unc': 7, 'played': 5, 'duke': 1, 'in': 3, 'basketball': 0, 'lost': 4, 'the': 6, 'game': 2}


In [8]:
# Add one more document to the corpus.
# The membership check keeps this cell idempotent: re-running it must not append
# the same document a second time and silently change every later result.
extra_document = 'I ate a sandwich'
if extra_document not in corpus:
    corpus.append(extra_document)
print(vectorizer.fit_transform(corpus).toarray())
# The vocabulary now holds ten unique words.
# 'I' and 'a' do not match the tokenizer's regular expression, so they are not extracted.
print(vectorizer.vocabulary_)

[[0 1 1 0 1 0 1 0 0 1]
 [0 1 1 1 0 1 0 0 1 0]
 [1 0 0 0 0 0 0 1 0 0]]
{'unc': 9, 'played': 6, 'duke': 2, 'in': 4, 'basketball': 1, 'lost': 5, 'the': 8, 'game': 3, 'ate': 0, 'sandwich': 7}


In [9]:
# Compare documents by the Euclidean (L2) distance between their feature vectors
from sklearn.metrics.pairwise import euclidean_distances

X = vectorizer.fit_transform(corpus)
first_doc, second_doc, third_doc = X[0], X[1], X[2]  # one row of X per document
# Distance between the first and the second document
print('Distance between 1st and 2nd documents:', euclidean_distances(first_doc, second_doc))
# Distance between the first and the third document
print('Distance between 1st and 3rd documents:', euclidean_distances(first_doc, third_doc))
# Distance between the second and the third document
print('Distance between 2nd and 3rd documents:', euclidean_distances(second_doc, third_doc))

Distance between 1st and 2nd documents: [[2.44948974]]
Distance between 1st and 3rd documents: [[2.64575131]]
Distance between 2nd and 3rd documents: [[2.64575131]]


In [10]:
# Alternative: compute the full pairwise L2 distance matrix in one call
distances = euclidean_distances(X)

# Print the distance for each pair of documents; (row, column) indexes the distance matrix
labelled_pairs = [
    ('Distance between 1st and 2nd documents:', 0, 1),
    ('Distance between 1st and 3rd documents:', 0, 2),
    ('Distance between 2nd and 3rd documents:', 1, 2),
]
for label, row, column in labelled_pairs:
    print(label, distances[row, column])

Distance between 1st and 2nd documents: 2.449489742783178
Distance between 1st and 3rd documents: 2.6457513110645907
Distance between 2nd and 3rd documents: 2.6457513110645907


### 停用词过滤
去除大部分文档中常见的单词，如：限定词'the''a''an',助动词'do''be''will',介词'on''around''beneath'等等


In [94]:
from sklearn.feature_extraction.text import CountVectorizer

corpus = [
    'UNC played Duke in basketball',
    'Duke lost the basketball game',
    'I ate a sandwich'
]
# stop_words accepts 'english', a list of stop words, or None.
# 'english' selects the built-in English stop-word list, which filters out determiners
# ('the', 'a', 'an'), auxiliary verbs ('do', 'be', 'will'), prepositions ('on', 'around',
# 'beneath') and similar common words.
vectorizer = CountVectorizer(stop_words='english')
# toarray() yields a plain ndarray; todense() would return the discouraged numpy.matrix type
print(vectorizer.fit_transform(corpus).toarray())
print(vectorizer.vocabulary_)  # word -> column index mapping

[[0 1 1 0 0 1 0 1]
 [0 1 1 1 1 0 0 0]
 [1 0 0 0 0 0 1 0]]
{'unc': 7, 'played': 5, 'duke': 2, 'basketball': 1, 'lost': 4, 'game': 3, 'ate': 0, 'sandwich': 6}


### 词干提取和词形还原

In [12]:
# Build a corpus of two documents in which 'gathering' is used once as a verb and once as a noun
from sklearn.feature_extraction.text import CountVectorizer

corpus = [
    'I am gathering ingredients for the sandwich.',
    'There were many wizards at the gathering.'
]
# binary=True records only presence/absence of a word; stop_words='english' filters stop words
vectorizer = CountVectorizer(binary=True, stop_words='english')
# toarray() yields a plain ndarray; todense() would return the discouraged numpy.matrix type
print(vectorizer.fit_transform(corpus).toarray())
print(vectorizer.vocabulary_)


[[1 1 1 0]
 [1 0 0 1]]
{'gathering': 0, 'ingredients': 1, 'sandwich': 2, 'wizards': 3}


In [13]:
# Lemmatization takes the part of speech of a word into account.
# It is a finer-grained process than stripping affixes: the word is reduced to its dictionary form.
from nltk.stem.wordnet import WordNetLemmatizer

# WordNetLemmatizer is backed by the WordNet database and lemmatizes a word according to
# its part of speech (noun, verb, adjective or adverb).
lemmatizer = WordNetLemmatizer()
# Look up the base form of 'gathering' first as a verb ('v'), then as a noun ('n')
for part_of_speech in ('v', 'n'):
    print(lemmatizer.lemmatize('gathering', part_of_speech))


gather
gathering


In [14]:
# Stemming (affix removal).
# The goal is to reduce a word to a base form by applying a set of affix-stripping rules,
# usually without considering its part of speech.
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
gathering_stem = stemmer.stem('gathering')
print(gathering_stem)

gather


对玩具语料库进行词干提取和词形还原

In [15]:
# Download the NLTK data packages used by the tokenizing, tagging and lemmatizing cells.
# A bare nltk.download() opens the interactive downloader, which blocks an unattended
# "Restart & Run All"; explicit resource ids keep this cell non-interactive.
import nltk

# Both the classic ids and the newer '*_tab' / '*_eng' ids are requested so that
# word_tokenize and pos_tag find their data on older and newer NLTK releases.
# NOTE(review): confirm which ids the installed NLTK version actually requires.
NLTK_RESOURCES = [
    'punkt',                           # tokenizer models used by word_tokenize
    'punkt_tab',
    'averaged_perceptron_tagger',      # tagger model used by pos_tag
    'averaged_perceptron_tagger_eng',
    'wordnet',                         # database used by WordNetLemmatizer
]
# Attempt every download before checking the results, so one failure does not skip the rest
download_results = [nltk.download(resource, quiet=True) for resource in NLTK_RESOURCES]
all(download_results)  # displayed as the cell output: True only if every download succeeded

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [75]:
# Stemming:      1. tokenize the corpus  2. stem every token
# Lemmatization: 1. tokenize the corpus  2. get a part-of-speech tag per token
#                3. lemmatize each token according to its tag

from nltk import word_tokenize  # tokenization: splits text into words (one of several NLTK tokenizers)
from nltk import pos_tag  # part-of-speech tagging: assigns a tag to every token
from nltk.stem import PorterStemmer  # stemming: reduces words to a base form
from nltk.stem.wordnet import WordNetLemmatizer  # lemmatization: reduces words to their dictionary form

corpus = [
    'He ate the sandwiches',
    'Every sandwich was eaten by him'
]

# Stemming
stemmer = PorterStemmer()
# First tokenize every document, then stem token by token;
# the result keeps one inner list per document.
tokenized_corpus = [word_tokenize(document) for document in corpus]
stemmed_documents = [[stemmer.stem(token) for token in tokens] for tokens in tokenized_corpus]
print('Stemmed:', stemmed_documents)
# Lemmatization
wordnet_tags = ['n', 'v']  # part-of-speech codes that get lemmatized: noun and verb
lemmatizer = WordNetLemmatizer()


def lemmatize(token, tag):
    """Lemmatize ``token`` when ``tag`` marks it as a noun or a verb.

    The first character of ``tag``, lowercased, is used as the part-of-speech
    code (a tag such as 'NN' gives 'n', 'VB' gives 'v').  When that code is
    listed in ``wordnet_tags`` the token is passed to ``lemmatizer.lemmatize``;
    otherwise the token is returned unchanged.
    """
    pos_code = tag[0].lower()
    if pos_code not in wordnet_tags:
        return token
    return lemmatizer.lemmatize(token, pos_code)


# Tokenize every document and attach a part-of-speech tag to each token
tagged_corpus = [pos_tag(word_tokenize(document)) for document in corpus]
# Lemmatize every (token, tag) pair, keeping one list per document
lemmatized_documents = []
for tagged_document in tagged_corpus:
    lemmatized_documents.append([lemmatize(token, tag) for token, tag in tagged_document])
print('Lemmatized:', lemmatized_documents)


Stemmed: [['he', 'ate', 'the', 'sandwich'], ['everi', 'sandwich', 'wa', 'eaten', 'by', 'him']]
Lemmatized: [['He', 'eat', 'the', 'sandwich'], ['Every', 'sandwich', 'be', 'eat', 'by', 'him']]


In [68]:
# The same processing written with explicit loops to improve readability
# Stemming
stemmer = PorterStemmer()
stemmed_corpus = []  # the processed corpus: one list of stems per document
for document in corpus:
    # Tokenize the document
    document_tokens = word_tokenize(document)
    document_stems = []  # the processed document
    for raw_token in document_tokens:
        # Stem the token and collect the result
        document_stems.append(stemmer.stem(raw_token))
    stemmed_corpus.append(document_stems)
print('stemmed:', stemmed_corpus)

# Lemmatization
wordnet_tag = ['n', 'v']  # part-of-speech codes that get lemmatized: noun and verb
lemmatizer = WordNetLemmatizer()


def lemmatize(token, tag):
    """Return the lemma of ``token`` for noun and verb tags, else ``token`` itself.

    The lowercased first character of ``tag`` is looked up in ``wordnet_tag``;
    on a match it is handed to ``lemmatizer.lemmatize`` as the part of speech.
    """
    pos_code = tag[0].lower()
    return lemmatizer.lemmatize(token, pos_code) if pos_code in wordnet_tag else token


# Tokenize, tag and lemmatize every document
lemmatized_corpus = []
for document in corpus:
    # Tokenize the document and attach a part-of-speech tag to every token.
    # (The former `tagged_tokens = []` initialisation was a dead store: it was
    # overwritten by this assignment before ever being read.)
    tagged_tokens = pos_tag(word_tokenize(document))

    lemmatized_document = []
    for token, tag in tagged_tokens:
        # Lemmatize the token according to its tag
        lemmatized_document.append(lemmatize(token, tag))
    lemmatized_corpus.append(lemmatized_document)

print('lemmatized:', lemmatized_corpus)


stemmed: [['he', 'ate', 'the', 'sandwich'], ['everi', 'sandwich', 'wa', 'eaten', 'by', 'him']]
lemmatized: [['He', 'eat', 'the', 'sandwich'], ['Every', 'sandwich', 'be', 'eat', 'by', 'him']]


### ti-idf 权重扩展包
1.使用一个整数来表示单词在文档中出现的次数，而不是使用一个二元值表示特征向量中的每个元素  
2.tf：单词的频数 idf：逆文档频率   tf-idf：tf * idf   
3.使用TfidfVectorizer类，其封装了CountVectorizer类和TfidfTransformer类



In [81]:
# Count how many times each word occurs in a single document
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

corpus = ['The dog ate a sandwich, the wozard transfigured a sandwich, and I ate a sandwich']
# binary defaults to False, so the real occurrence counts are returned rather than 0/1 flags
vectorizer = CountVectorizer(stop_words='english')
# The corpus holds a single document, so row 0 of the dense array is its frequency vector.
# toarray() already returns an ndarray, so no np.array(...todense()) round trip is needed.
frequencies = vectorizer.fit_transform(corpus).toarray()[0]
print(frequencies)
print('Token indices %s' % vectorizer.vocabulary_)
for token, index in vectorizer.vocabulary_.items():
    # vocabulary_ maps each token to its position in the frequency vector
    print('The token "%s" appears %s times' % (token, frequencies[index]))

[2 1 3 1 1]
Token indices {'dog': 1, 'ate': 0, 'sandwich': 2, 'wozard': 4, 'transfigured': 3}
The token "dog" appears 1 times
The token "ate" appears 2 times
The token "sandwich" appears 3 times
The token "wozard" appears 1 times
The token "transfigured" appears 1 times


In [96]:
# Normalize the word frequencies:
# compute the tf-idf weight of every word in every document
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    'The dog ate a sandwich and I ate a sandwich',
    'The wizard transfigured a sandwich'
]
vectorizer = TfidfVectorizer(stop_words='english')
# toarray() yields a plain ndarray; todense() would return the discouraged numpy.matrix type
print(vectorizer.fit_transform(corpus).toarray())

[[0.75458397 0.37729199 0.53689271 0.         0.        ]
 [0.         0.         0.44943642 0.6316672  0.6316672 ]]


### 空间有效特征向量化与哈希技巧 