# 特征提取

## 从类别变量中提取特征

In [15]:
# 使用类DicVectorizere 进行one -hot 编码转换
from sklearn.feature_extraction import DictVectorizer
onehot_encoder = DictVectorizer()


In [16]:
X = [
    {'city':'New York'},
    {'city':'San Franciso'},
    {'city':'Chapel Hill'}
]
print(onehot_encoder.fit_transform(X).toarray())  # toarray()方法将fit_transform后的结果转换为array数组


[[0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]]


## 特征标准化 
———— tandardScaler类、RobustScalar类、preprocessing类的scale方法

In [17]:
from sklearn import preprocessing
import numpy as np
X = np.array([
    [0., 0., 5., 13., 9., 1.],
    [0., 0., 13., 15., 10., 15.],
    [0., 3., 15., 2., 0., 11.]
])
print(preprocessing.scale(X))  # preprocessing类的scale（）函数可单独对任何轴进行标准化

[[ 0.         -0.70710678 -1.38873015  0.52489066  0.59299945 -1.35873244]
 [ 0.         -0.70710678  0.46291005  0.87481777  0.81537425  1.01904933]
 [ 0.          1.41421356  0.9258201  -1.39970842 -1.4083737   0.33968311]]


In [18]:
# RobustScalar类是StandardScaler类的另一个选择。
# StandardScaler类会在每个实例值上减去特征均值，然后除以特征值标准差。
# obustScalar类会减去中位数，然后除以四分位差。

## 从文本中提取特征

### 词袋模型
1.最常用的文本表示法，可看作是one-hot编码的一种扩展，对文本中关注的**每一个词创建一个特征**     

2.使用一个**特征向量**表示每个文档，其中的每个元素和词表的一个单词相对应

In [1]:
# 使用一个包含两个文档的语料库来检验词包模型（语料库是一个文档的集合）
corpus = [
    'UNC played Duke in basketball',
    'Duke lost the basketball game'
]  # 10个单词，包含8个独特单词,即由包含八个元素的特征向量进行表示，元素数量为向量维度


In [2]:
# CountVectorizer类
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer() # 创建实例,实例将用于将文本转换为数值向量。
print(vectorizer.fit_transform(corpus).todense()) # 如果某个词汇在文档中出现，对应的列值为1，否则为0。
# todense()方法将转换后的稀疏矩阵转换为密集矩阵（如果需要），这样更容易阅读
# 在scikit-learn中，许多文本处理和特征提取方法（如CountVectorizer）默认返回稀疏矩阵。
print(vectorizer.vocabulary_) #打印CountVectorizer的词汇表，这是一个包含所有在语料库中出现过的词汇的列表。词汇表的顺序与fit_transform方法返回的向量中的列顺序相对应。

[[1 1 0 1 0 1 0 1]
 [1 1 1 0 1 0 1 0]]
{'unc': 7, 'played': 5, 'duke': 1, 'in': 3, 'basketball': 0, 'lost': 4, 'the': 6, 'game': 2}


In [3]:
# 再增加一个文档
corpus.append('I ate a sandwich')
print(vectorizer.fit_transform(corpus).todense())
print(vectorizer.vocabulary_) # 发现共有十个独特的单词。
# 'I' 和 'a' 没有匹配正则表达式，因此没有被提取

[[0 1 1 0 1 0 1 0 0 1]
 [0 1 1 1 0 1 0 0 1 0]
 [1 0 0 0 0 0 0 1 0 0]]
{'unc': 9, 'played': 6, 'duke': 2, 'in': 4, 'basketball': 1, 'lost': 5, 'the': 8, 'game': 3, 'ate': 0, 'sandwich': 7}


In [4]:
# 通过L^2范数，计算文档之间的特征向量的相似度
from sklearn.metrics.pairwise import euclidean_distances
X = vectorizer.fit_transform(corpus)
# 计算第一个文档和第二个文档的L^2范数
print('Distance between 1st and 2nd documents:',euclidean_distances(X[0], X[1]))
# 计算第一个文档和第三个文档的L^2范数
print('Distance between 1st and 3rd documents:',euclidean_distances(X[0], X[2]))
# 计算第二个文档和第三个文档的L^2范数
print('Distance between 2nd and 3rd documents:',euclidean_distances(X[1], X[2]))

Distance between 1st and 2nd documents: [[2.44948974]]
Distance between 1st and 3rd documents: [[2.64575131]]
Distance between 2nd and 3rd documents: [[2.64575131]]


In [5]:
# 或者

# 计算文档之间的L^2范数距离
distances = euclidean_distances(X)

# 打印文档之间的距离
print('Distance between 1st and 2nd documents:', distances[0, 1])
print('Distance between 1st and 3rd documents:', distances[0, 2])
print('Distance between 2nd and 3rd documents:', distances[1, 2])

Distance between 1st and 2nd documents: 2.449489742783178
Distance between 1st and 3rd documents: 2.6457513110645907
Distance between 2nd and 3rd documents: 2.6457513110645907


### 停用词过滤
去除大部分文档中常见的单词，如：限定词'the''a''an',助动词'do''be''will',介词'on''around''beneath'等等


In [9]:
from sklearn.feature_extraction.text import CountVectorizer
corpus = [
    'UNC played Duke in basketball',
    'Duke lost the basketball game',
    'I ate a sandwich'
] 
vectorizer = CountVectorizer(stop_words= 'english')  # 用英语的默认停用词列表———停用限定词'the''a''an',助动词'do''be''will',介词'on''around''beneath'等等
print(vectorizer.fit_transform(corpus).todense())  
print(vectorizer.vocabulary_)  # 打印vocabulary_属性

[[0 1 1 0 0 1 0 1]
 [0 1 1 1 1 0 0 0]
 [1 0 0 0 0 0 1 0]]
{'unc': 7, 'played': 5, 'duke': 2, 'basketball': 1, 'lost': 4, 'game': 3, 'ate': 0, 'sandwich': 6}


### 词干提取和词形还原

In [1]:
# 创建一个由两个文档组成的语料库
from sklearn.feature_extraction.text import CountVectorizer
corpus = [
    'I am gathering ingredients for the sandwich.',
    'There were many wizards at the gathering.'
]
vectorizer = CountVectorizer(binary=True, stop_words='english')
print(vectorizer.fit_transform(corpus).todense())
print(vectorizer.vocabulary_)


[[1 1 1 0]
 [1 0 0 1]]
{'gathering': 0, 'ingredients': 1, 'sandwich': 2, 'wizards': 3}


In [4]:
# 词形还原（lemmatization）（考虑单词词性）
# 是一种更精细的过程，它不仅去除词缀，还考虑单词的词性，将单词还原到其词典形式。
from nltk.stem.wordnet import WordNetLemmatizer
# WordNetLemmatizer 是一个基于 WordNet 数据库的词元还原器，它能够根据单词的词性（名词、动词、形容词或副词）来还原单词。
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('gathering','v'))  # lemmatize 方法会查找 'gathering' 作为动词的基本形式
print(lemmatizer.lemmatize('gathering','n'))


gather
gathering


In [5]:
# 词干提取(删除词缀)
# 目标是将单词减少到一个基本形式，通常不考虑词性，而是基于一系列规则去除单词的词缀
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
print(stemmer.stem('gathering'))

gather


对玩具语料库进行词形还原

In [7]:
# 创建一个由两个文档组成的语料库
from sklearn.feature_extraction.text import CountVectorizer
corpus = [
    'I am gathering ingredients for the sandwich.',
    'There were many wizards at the gathering.'
]
vectorizer = CountVectorizer(binary=True, stop_words='english')
print(vectorizer.fit_transform(corpus).todense())
print(vectorizer.vocabulary_)

[[1 1 1 0]
 [1 0 0 1]]
{'gathering': 0, 'ingredients': 1, 'sandwich': 2, 'wizards': 3}


In [10]:
# 
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag

wordnet_tags = ['n', 'v']
corpus = [
    'He ate the sandwiches',
    'Every sandwich was eaten by him'
]
stemmer = PorterStemmer()
print('Stemmed:', [[stemmer.stem(token) for token in word_tokenize(document)] for document in corpus ])

def lemmatize(token, tag):
    """   """
    if tag[0].lower() in ['n', 'v']:
        return lemmatizer.lemmatize(token, tag[0].lower())
    return token

lemmatizer = WordNetLemmatizer()
tagged_corpus = [pos_tag(word_tokenize(document)) for document in corpus]
print('Lemmatized:', [[lemmatize(token, tag) for token ,tag in document] for  document in tagged_corpus])

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/english.pickle[0m

  Searched in:
    - 'C:\\Users\\L.Ling/nltk_data'
    - 'd:\\anaconda\\nltk_data'
    - 'd:\\anaconda\\share\\nltk_data'
    - 'd:\\anaconda\\lib\\nltk_data'
    - 'C:\\Users\\L.Ling\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************


In [16]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True