In [1]:
import numpy as np
import nltk.tokenize as tk

In [2]:
# Sample text made of three sentences, ending in '?', '!' and '.' respectively.
doc = (
    "Are you curious about tokenization? "
    "Let's see how it works! "
    "We need to analyze a couple of sentences "
    "with punctuations to see it in action."
)
print(doc)

# Split the text into sentences and list each one with its position.
sents = tk.sent_tokenize(doc)
for idx, sentence in enumerate(sents):
    print(f"{idx} : {sentence}")

Are you curious about tokenization? Let's see how it works! We need to analyze a couple of sentences with punctuations to see it in action.
0 : Are you curious about tokenization?
1 : Let's see how it works!
2 : We need to analyze a couple of sentences with punctuations to see it in action.


In [3]:
# Word-level tokenization of the same text. As the output shows, punctuation
# marks become tokens of their own and "Let's" is split into "Let" and "'s".
words = tk.word_tokenize(doc)
for idx, token in enumerate(words):
    print(f"{idx} : {token}")

0 : Are
1 : you
2 : curious
3 : about
4 : tokenization
5 : ?
6 : Let
7 : 's
8 : see
9 : how
10 : it
11 : works
12 : !
13 : We
14 : need
15 : to
16 : analyze
17 : a
18 : couple
19 : of
20 : sentences
21 : with
22 : punctuations
23 : to
24 : see
25 : it
26 : in
27 : action
28 : .


In [4]:
# WordPunctTokenizer for comparison with word_tokenize: in the output below the
# apostrophe is split off as a separate token ("Let", "'", "s").
punctTokenizer = tk.WordPunctTokenizer()
word_list = punctTokenizer.tokenize(doc)
for idx, token in enumerate(word_list):
    print(f"{idx} : {token}")

0 : Are
1 : you
2 : curious
3 : about
4 : tokenization
5 : ?
6 : Let
7 : '
8 : s
9 : see
10 : how
11 : it
12 : works
13 : !
14 : We
15 : need
16 : to
17 : analyze
18 : a
19 : couple
20 : of
21 : sentences
22 : with
23 : punctuations
24 : to
25 : see
26 : it
27 : in
28 : action
29 : .


In [5]:
# Chinese text has no spaces between words. As the output shows, word_tokenize
# returns the whole sentence as a single token, i.e. it does not segment it.
words = tk.word_tokenize('前门到了请后门下车，欢迎下次光临。')
for idx, token in enumerate(words):
    print(f"{idx} : {token}")

0 : 前门到了请后门下车，欢迎下次光临。


In [6]:
# A small corpus of three one-sentence hotel reviews; `sents` holds one
# string per sentence.
doc = (
    'This hotel is very bad. '
    'The toilet in this hotel smells bad. '
    'The environment of this hotel is very good.'
)
sents = tk.sent_tokenize(doc)
sents

['This hotel is very bad.',
 'The toilet in this hotel smells bad.',
 'The environment of this hotel is very good.']

# 词袋模型

In [7]:
from sklearn.feature_extraction import text as ft

# Bag-of-words: learn the vocabulary from the sentences and count how often
# each vocabulary word occurs in each sentence (one row per sentence, one
# column per word).
cv = ft.CountVectorizer()
bow = cv.fit_transform(sents)

# The result is a sparse matrix; convert it to a dense array for printing.
print(bow.toarray())

[[1 0 0 1 0 1 0 0 0 1 0 1]
 [1 0 0 1 1 0 0 1 1 1 1 0]
 [0 1 1 1 0 1 1 0 1 1 0 1]]


In [8]:
# Vocabulary learned by the CountVectorizer, in column order of `bow`.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
# It returns an ndarray, so .tolist() keeps `words` a plain list of strings.
words = cv.get_feature_names_out().tolist()
words

['bad',
 'environment',
 'good',
 'hotel',
 'in',
 'is',
 'of',
 'smells',
 'the',
 'this',
 'toilet',
 'very']

# TF-IDF

In [11]:
# TF-IDF: re-weight the raw counts in `bow` so that words appearing in many
# sentences (e.g. "hotel", "this") weigh less than words specific to one.
tt = ft.TfidfTransformer()
tfidf = tt.fit_transform(bow)
print(np.round(tfidf.toarray(), 3))

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() (requires scikit-learn >= 1.0). It returns an
# ndarray, so convert to a list to keep the printed form unchanged.
print(cv.get_feature_names_out().tolist())

[[0.488 0.    0.    0.379 0.    0.488 0.    0.    0.    0.379 0.    0.488]
 [0.345 0.    0.    0.268 0.454 0.    0.    0.454 0.345 0.268 0.454 0.   ]
 [0.    0.429 0.429 0.253 0.    0.326 0.429 0.    0.326 0.253 0.    0.326]]
['bad', 'environment', 'good', 'hotel', 'in', 'is', 'of', 'smells', 'the', 'this', 'toilet', 'very']


In [12]:
import pandas as pd

# Show the TF-IDF matrix as a table whose columns are labelled with the
# vocabulary words. get_feature_names_out() replaces get_feature_names(),
# which was removed in scikit-learn 1.2 (requires scikit-learn >= 1.0).
pd.DataFrame(np.round(tfidf.toarray(), 3), columns=cv.get_feature_names_out())

Unnamed: 0,bad,environment,good,hotel,in,is,of,smells,the,this,toilet,very
0,0.488,0.0,0.0,0.379,0.0,0.488,0.0,0.0,0.0,0.379,0.0,0.488
1,0.345,0.0,0.0,0.268,0.454,0.0,0.0,0.454,0.345,0.268,0.454,0.0
2,0.0,0.429,0.429,0.253,0.0,0.326,0.429,0.0,0.326,0.253,0.0,0.326
