In [1]:
import numpy as np
import nltk

In [2]:
# 对文本进行分块，每一块相当于一个袋子
# 该函数和FireAI_037一样
def split(dataset,words_num):
    '''
    将dataset这一整段文本分割成N个小块，
    使得每个小块中含有单词的数目等于words_num'''
    words=dataset.split(' ') # 此处用空格来区分单词是否合适？
    
    rows=int(np.ceil(len(words)/words_num)) # 即行数
    result=[] # 预计里面装的元素是rows行words_num列，最后一行可能少于words_num，故不能用np.array

    # words是list，可以用切片的方式获取
    for row in range(rows):
        result.append(words[row*words_num:(row+1)*words_num])
    return result


In [3]:
# 数据集暂时用简·奥斯丁的《爱玛》中的文本
dataset=nltk.corpus.gutenberg.words('austen-emma.txt')
# print(len(dataset)) # 192427 代表读入正常
chunks=split(" ".join(dataset[:10000]), 2000) # 将前面的10000个单词分成五个词袋，每个袋子装2000个单词

# 构建一个文档-词矩阵，该矩阵记录了文档中每个单词出现的频次
# 用sk-learn的CountVectorizer函数来实现这种构建过程
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=4, max_df=.99)
# fit_transform函数需要输入一维数组，且数组元素是用空格连起来的文本
chunks=[" ".join(chunk) for chunk in chunks] # 故而需要转换一下
doc_term_matrix = vectorizer.fit_transform(chunks)
feature_names=vectorizer.get_feature_names() # 获取
print(len(feature_names)) 
print(doc_term_matrix.shape) 
# print(doc_term_matrix.T.toarray())

78
(5, 78)


In [4]:
# 打印看看doc_term_matrix这个文档-词矩阵里面的内容
print('Document Term Matrix------>>>>')
bag_names=['Bag_'+str(i) for i in range(5)] # 5个词袋
formatted_row='{:>12}'*(1+len(bag_names)) # 每一行第一列是单词，后面是每个词袋中的频率
print(formatted_row.format('Word', *bag_names))
for word, freq in zip(feature_names,doc_term_matrix.T.toarray()): # 需要装置矩阵
    # 此处的freq是csr_matrix数据结构
    output = [str(x) for x in freq.data]
    print(formatted_row.format(word,*output))

Document Term Matrix------>>>>
        Word       Bag_0       Bag_1       Bag_2       Bag_3       Bag_4
       about           3           4           0           1           1
       among           1           1           1           1           0
     because           1           1           0           1           1
     believe           0           1           1           1           3
    believed           0           1           1           1           2
        best           1           2           1           1           0
      better           0           3           1           1           2
      beyond           1           0           1           2           3
        care           0           1           2           2           1
   certainly           0           2           1           1           1
     chapter           1           1           0           1           1
        come           0           3           1           2           2
 comfortable        

In [5]:
chunks=['The brown dog is running', 
        'The black dog is in the black room.',
        'Running in the room is forbidden']
vectorizer = CountVectorizer(min_df=1, max_df=.90)
doc_term_matrix = vectorizer.fit_transform(chunks)
feature_names=vectorizer.get_feature_names() # 获取
print(len(feature_names)) 
print(feature_names)
print(doc_term_matrix.shape) 
print(doc_term_matrix.T[0])

for key,value in vectorizer.vocabulary_.items():
    print(key,value )
    
# 打印看看doc_term_matrix这个文档-词矩阵里面的内容
print('Document Term Matrix------>>>>')
bag_names=['Bag_'+str(i) for i in range(3)] # 3个词袋
formatted_row='{:>12}'*(1+len(bag_names)) # 每一行第一列是单词，后面是每个词袋中的频率
print(formatted_row.format('Word', *bag_names))
for word, freq in zip(feature_names,doc_term_matrix.T.toarray()): # 需要装置矩阵
    # 此处的freq是csr_matrix数据结构
    output = [str(x) for x in freq.data]
    print(formatted_row.format(word,*output))

7
['black', 'brown', 'dog', 'forbidden', 'in', 'room', 'running']
(3, 7)
  (0, 1)	2
brown 1
dog 2
running 6
black 0
in 4
room 5
forbidden 3
Document Term Matrix------>>>>
        Word       Bag_0       Bag_1       Bag_2
       black           0           2           0
       brown           1           0           0
         dog           1           1           0
   forbidden           0           0           1
          in           0           1           1
        room           0           1           1
     running           1           0           1


In [6]:
chunks=['The brown dog is running', 
        'The black dog is in the black room.',
        'Running in the room is forbidden'] 
# 通过将chunks 和上面的Document Term Matrix对比，发现black在Bag_0（第一句话）中出现0次，在Bag_1中出现2次，以此类推。

In [7]:
chunks=['这是【火炉炼AI】系列文章之一，作者炼丹老顽童', 
        '这个文章主要讲解怎么构建词袋模型',
        '文章的标题为【火炉炼AI】机器学习038-NLP创建词袋模型']
vectorizer = CountVectorizer(min_df=1, max_df=.90)
doc_term_matrix = vectorizer.fit_transform(chunks)
feature_names=vectorizer.get_feature_names() # 获取
print(len(feature_names)) 
print(feature_names)
print(doc_term_matrix.shape) 
print(doc_term_matrix.T[0])

for key,value in vectorizer.vocabulary_.items():
    print(key,value )
    
# 打印看看doc_term_matrix这个文档-词矩阵里面的内容
print('Document Term Matrix------>>>>')
bag_names=['Bag_'+str(i) for i in range(3)] # 3个词袋
formatted_row='{:>12}'*(1+len(bag_names)) # 每一行第一列是单词，后面是每个词袋中的频率
print(formatted_row.format('Word', *bag_names))
for word, freq in zip(feature_names,doc_term_matrix.T.toarray()): # 需要装置矩阵
    # 此处的freq是csr_matrix数据结构
    output = [str(x) for x in freq.data]
    print(formatted_row.format(word,*output))
    
# 这里的结果不好的原因是，我们没有用jieba对中文进行分词，而采用split函数中的空格作为分隔符，
# 故而要使用中文，split函数还需要为用jieba分词的形式

8
['nlp创建词袋模型', '作者炼丹老顽童', '文章的标题为', '机器学习038', '火炉炼ai', '系列文章之一', '这个文章主要讲解怎么构建词袋模型', '这是']
(3, 8)
  (0, 2)	1
这是 7
火炉炼ai 4
系列文章之一 5
作者炼丹老顽童 1
这个文章主要讲解怎么构建词袋模型 6
文章的标题为 2
机器学习038 3
nlp创建词袋模型 0
Document Term Matrix------>>>>
        Word       Bag_0       Bag_1       Bag_2
   nlp创建词袋模型           0           0           1
     作者炼丹老顽童           1           0           0
      文章的标题为           0           0           1
     机器学习038           0           0           1
       火炉炼ai           1           0           1
      系列文章之一           1           0           0
这个文章主要讲解怎么构建词袋模型           0           1           0
          这是           1           0           0


In [8]:

# 数据集暂时用简·奥斯丁的《爱玛》中的文本
dataset=nltk.corpus.gutenberg.words('austen-emma.txt')
# print(len(dataset)) # 192427 代表读入正常
chunks=split(" ".join(dataset[:10000]), 2000) # 将前面的10000个单词分成五个词袋，每个袋子装2000个单词

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
# fit_transform函数需要输入一维数组，且数组元素是用空格连起来的文本
chunks=[" ".join(chunk) for chunk in chunks] # 故而需要转换一下
doc_term_matrix = vectorizer.fit_transform(chunks)
feature_names=vectorizer.get_feature_names() # 获取
print(len(feature_names)) 
print(doc_term_matrix.shape) 


1635
(5, 1635)


In [9]:
# 打印看看doc_term_matrix这个文档-词矩阵里面的内容
print('Document Term Matrix------>>>>')
bag_names=['Bag_'+str(i) for i in range(5)] # 5个词袋
formatted_row='{:>12}'*(1+len(bag_names)) # 每一行第一列是单词，后面是每个词袋中的tf-idf权重
print(formatted_row.format('Word', *bag_names))
for word, freq in zip(feature_names,doc_term_matrix.T.toarray()): # 需要装置矩阵
    # 此处的freq是csr_matrix数据结构
    output = [str(x) for x in freq.data]
    print(formatted_row.format(word,*output))

Document Term Matrix------>>>>
        Word       Bag_0       Bag_1       Bag_2       Bag_3       Bag_4
        18160.012515402524101736         0.0         0.0         0.0         0.0
        23rd         0.0         0.0         0.0         0.00.012519437238422809
         8th         0.0         0.0         0.0         0.00.012519437238422809
      _half_         0.0         0.0         0.0         0.00.012519437238422809
       _her_         0.0         0.0         0.0         0.00.012519437238422809
    _little_         0.0         0.0         0.0         0.00.012519437238422809
       _one_         0.00.013909720439783415         0.0         0.0         0.0
       _she_         0.0         0.0         0.00.0097481847921927420.010100603135172559
     _small_         0.0         0.0         0.0         0.00.012519437238422809
      _that_         0.00.013909720439783415         0.0         0.0         0.0
      _them_0.010097347953062942         0.0         0.0         0.00.01010060

        june         0.0         0.0         0.0         0.00.012519437238422809
        just0.007050959472647545         0.00.012768340154919270.013614278961394070.014106465120634926
        keep0.012515402524101736         0.0         0.0         0.0         0.0
        kind0.0059636563607681290.0198841689552069520.0053996901610855220.0115148699464731620.011931157847520567
    kindness0.01676342186361384         0.0         0.00.0080918739670856780.008384413027010535
     kindred         0.0         0.00.011331856127054363         0.0         0.0
    kingston         0.0         0.0         0.0         0.00.012519437238422809
        knew         0.00.0093155018900903160.015178152232001310.03236749586834271         0.0
   knightley0.008381710931806920.11178602268108379         0.00.03236749586834271         0.0
        know0.0211528784179426320.047018963189348620.006384170077459635         0.00.035266162801587314
     knowing0.012515402524101736         0.0         0.0         0.0   

         shy         0.0         0.0         0.00.01208262378604184         0.0
        sigh0.008381710931806920.0093155018900903160.007589076116000655         0.0         0.0
       sight         0.0         0.0         0.00.0097481847921927420.010100603135172559
       silly         0.00.013909720439783415         0.0         0.0         0.0
  simplicity         0.0         0.0         0.00.0097481847921927420.010100603135172559
       since         0.00.0093155018900903160.007589076116000655         0.00.008384413027010535
        sing         0.0         0.0         0.0         0.00.025038874476845618
     singing         0.0         0.0         0.0         0.00.012519437238422809
      single         0.00.0156729877297828720.0063841700774596350.0068071394806970350.007053232560317463
        sink         0.0         0.0         0.0         0.00.012519437238422809
         sir         0.00.05563888175913366         0.0         0.0         0.0
      sister0.025030805048203472        