In [1]:
import nltk
import numpy as np
import sklearn.feature_extraction.text as ft
# Sample paragraph for the tokenization demos: three sentences ending in "?", "!" and ".".
# Adjacent literals are concatenated at compile time, so the value is one single string.
doc = (
    "Are you curious about tokenization? "
    "Let's see how it works! "
    "We need to analyze a couple of sentences with punctuations to see it in action."
)
doc

"Are you curious about tokenization? Let's see how it works! We need to analyze a couple of sentences with punctuations to see it in action."

In [2]:
"""
分句子
"""
# Split the document into sentences, then list them with 1-based numbering.
sents = nltk.sent_tokenize(doc)
for position, sentence in enumerate(sents):
    print(position + 1, ":", sentence)

1 : Are you curious about tokenization?
2 : Let's see how it works!
3 : We need to analyze a couple of sentences with punctuations to see it in action.


In [3]:
"""
分单词
"""
# Split the document into word-level tokens (punctuation marks come out as
# separate tokens), then list them with 1-based numbering.
words = nltk.word_tokenize(doc)
for position, token in enumerate(words):
    print(position + 1, ":", token)

1 : Are
2 : you
3 : curious
4 : about
5 : tokenization
6 : ?
7 : Let
8 : 's
9 : see
10 : how
11 : it
12 : works
13 : !
14 : We
15 : need
16 : to
17 : analyze
18 : a
19 : couple
20 : of
21 : sentences
22 : with
23 : punctuations
24 : to
25 : see
26 : it
27 : in
28 : action
29 : .


### 词袋模型

In [8]:
# Toy corpus for the bag-of-words demo: three short hotel reviews.
sents = ["This hotel is very bad",
        "The toilet in this hotel smells bad. ",
        "The environment of this hotel is very good."]
# `ft` (sklearn.feature_extraction.text) is imported together with the other
# dependencies in the notebook's first cell, so this cell no longer hides an import.
cv = ft.CountVectorizer()
# Learn the vocabulary from the corpus and count each term per sentence.
bow = cv.fit_transform(sents)
# The result is a sparse matrix: printing it lists one
# "(row, column)  count" entry per non-zero cell.
print(bow)

  (0, 9)	1
  (0, 3)	1
  (0, 5)	1
  (0, 11)	1
  (0, 0)	1
  (1, 9)	1
  (1, 3)	1
  (1, 0)	1
  (1, 8)	1
  (1, 10)	1
  (1, 4)	1
  (1, 7)	1
  (2, 9)	1
  (2, 3)	1
  (2, 5)	1
  (2, 11)	1
  (2, 8)	1
  (2, 1)	1
  (2, 6)	1
  (2, 2)	1


In [9]:
"""
获得原始矩阵
"""
# Expand the sparse counts into a dense array (one row per sentence,
# one column per vocabulary term) and print it.
bow_dense = bow.toarray()
print(bow_dense)

[[1 0 0 1 0 1 0 0 0 1 0 1]
 [1 0 0 1 1 0 0 1 1 1 1 0]
 [0 1 1 1 0 1 1 0 1 1 0 1]]


In [10]:
"""
获得特征名称
"""
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2, so prefer get_feature_names_out() and only fall back to the
# old method on releases that predate it.
# list(...) keeps the printed result a plain Python list on every version.
if hasattr(cv, "get_feature_names_out"):
    feature_names = list(cv.get_feature_names_out())
else:
    feature_names = list(cv.get_feature_names())
print(feature_names)

['bad', 'environment', 'good', 'hotel', 'in', 'is', 'of', 'smells', 'the', 'this', 'toilet', 'very']


### TF-IDF

In [12]:
# Re-weight the raw term counts in `bow` with TF-IDF.
# Fitting and transforming are done as two explicit steps on the same matrix.
tt = ft.TfidfTransformer().fit(bow)
tfidf = tt.transform(bow)
# Printing the sparse result lists only the non-zero weights.
print(tfidf)

  (0, 11)	0.4875913479575303
  (0, 9)	0.37865817843463756
  (0, 5)	0.4875913479575303
  (0, 3)	0.37865817843463756
  (0, 0)	0.4875913479575303
  (1, 10)	0.45386826657073503
  (1, 9)	0.2680619096684997
  (1, 8)	0.34517851538731575
  (1, 7)	0.45386826657073503
  (1, 4)	0.45386826657073503
  (1, 3)	0.2680619096684997
  (1, 0)	0.34517851538731575
  (2, 11)	0.32628713817645505
  (2, 9)	0.2533910700140413
  (2, 8)	0.32628713817645505
  (2, 6)	0.4290283757733418
  (2, 5)	0.32628713817645505
  (2, 3)	0.2533910700140413
  (2, 2)	0.4290283757733418
  (2, 1)	0.4290283757733418


In [15]:
# Show the TF-IDF weights as a dense array, rounded to two decimals for readability.
tfidf.toarray().round(2)

array([[0.49, 0.  , 0.  , 0.38, 0.  , 0.49, 0.  , 0.  , 0.  , 0.38, 0.  ,
        0.49],
       [0.35, 0.  , 0.  , 0.27, 0.45, 0.  , 0.  , 0.45, 0.35, 0.27, 0.45,
        0.  ],
       [0.  , 0.43, 0.43, 0.25, 0.  , 0.33, 0.43, 0.  , 0.33, 0.25, 0.  ,
        0.33]])

In [16]:
# Vocabulary terms in column order, for reading the matrices above.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
# list(...) keeps the displayed value a plain Python list on every version.
list(
    cv.get_feature_names_out()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)

['bad',
 'environment',
 'good',
 'hotel',
 'in',
 'is',
 'of',
 'smells',
 'the',
 'this',
 'toilet',
 'very']