In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# 词袋

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: three short documents.
text_data = np.array(['I love Brazil. Brazil!',
                      'Sweden is best',
                      'Germany beats both'])

# Build the bag-of-words feature matrix: fit the vocabulary on the corpus and
# transform each document into a row of per-term counts.
count_vector = CountVectorizer()
bag_of_words = count_vector.fit_transform(text_data)
bag_of_words.shape  # the result is a sparse matrix; only its shape is shown here

(3, 8)

In [4]:
# Convert the sparse count matrix to a dense array so every entry is displayed.
bag_of_words.toarray()

array([[0, 0, 0, 2, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 1, 0, 1],
       [1, 0, 1, 0, 1, 0, 0, 0]])

In [6]:
# Get the vocabulary terms, in the column order of the bag-of-words matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# .tolist() keeps `feature_names` a plain list of str for the cells that use it.
feature_names = count_vector.get_feature_names_out().tolist()
feature_names

['beats', 'best', 'both', 'brazil', 'germany', 'is', 'love', 'sweden']

In [7]:
# Label each count column with its vocabulary term for easier reading.
pd.DataFrame(bag_of_words.toarray(), columns=feature_names)

Unnamed: 0,beats,best,both,brazil,germany,is,love,sweden
0,0,0,0,2,0,0,1,0
1,0,1,0,0,0,1,0,1
2,1,0,1,0,1,0,0,0


# TF-IDF

Term Frequency 词频
$$TF_w = \frac{\text{该文档中单词 } w \text{ 的出现次数}}{\text{该文档中所有的词条总数}}$$

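Inverse Document Frequency 逆文档频率
$$IDF_w = \log \frac{\text{语料库的文档总数}}{1 + \text{包含词条 } w \text{ 的文档数}}$$

最终权重为 $TFIDF_w = TF_w \times IDF_w$。注意：下面使用的 scikit-learn `TfidfVectorizer` 默认采用平滑变体 $IDF_w = \ln \frac{1 + n}{1 + df_w} + 1$（$n$ 为文档总数，$df_w$ 为包含词条 $w$ 的文档数），词频取原始出现次数，并对每个文档向量做 L2 归一化，因此其输出与按上式直接计算的结果并不完全相同。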

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer on the corpus and show the weighted matrix densely
# (one row per document, one column per vocabulary term).
tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(text_data)
feature_matrix.toarray()

array([[0.        , 0.        , 0.        , 0.89442719, 0.        ,
        0.        , 0.4472136 , 0.        ],
       [0.        , 0.57735027, 0.        , 0.        , 0.        ,
        0.57735027, 0.        , 0.57735027],
       [0.57735027, 0.        , 0.57735027, 0.        , 0.57735027,
        0.        , 0.        , 0.        ]])

In [10]:
# Vocabulary terms in column order. get_feature_names() was removed in
# scikit-learn 1.2; get_feature_names_out() replaces it, and .tolist() keeps
# the displayed value a plain list of str.
tfidf.get_feature_names_out().tolist()

['beats', 'best', 'both', 'brazil', 'germany', 'is', 'love', 'sweden']

In [11]:
# Show the TF-IDF matrix with one labelled column per vocabulary term.
# get_feature_names_out() replaces get_feature_names(), which was removed in
# scikit-learn 1.2.
pd.DataFrame(feature_matrix.toarray(), columns=tfidf.get_feature_names_out())

Unnamed: 0,beats,best,both,brazil,germany,is,love,sweden
0,0.0,0.0,0.0,0.894427,0.0,0.0,0.447214,0.0
1,0.0,0.57735,0.0,0.0,0.0,0.57735,0.0,0.57735
2,0.57735,0.0,0.57735,0.0,0.57735,0.0,0.0,0.0


# 解析 HTML

In [25]:
from bs4 import BeautifulSoup

# A small HTML snippet to parse
html = "<div class='full_name'><span style='font-weight:bold'>Masego</span> Azra</div>"
soup = BeautifulSoup(html, 'lxml')
# Locate the <div> whose class is "full_name" and show its text content
soup.find('div', class_='full_name').get_text()

'Masego Azra'

# 移除标点

In [14]:
# Sample Chinese passage; note that its punctuation is full-width, not ASCII.
# The literal opens with exactly three quotes: a fourth quote would become the
# first character of the string itself.
s = """
“你说，刘家要我三更死，我活不到五更？”
萧青帝对历刚微微抬手，而后，目光向刘雅城，带着似笑非笑之色。https://www.1kanshu.cc
小七也冷笑了一声，“愚昧的人何其之多，殊不知，就算是真正的阎王见到了王爷也要绕道走。”
“你你...”
刘雅城神色呆滞，整个人犹如傻眼了一般。
原本以为只要将萧青帝两人抓紧警司，那么，以他们刘家的人脉关系，想要弄死两人实在是太简单了。
"""

In [22]:
import string

# Example strings littered with ASCII punctuation
text_data = [
    'Hi!!!! I. Love. This. Song....',
    '10000% Agree!!!! #LoveIT',
    'Right?!?!',
]
text_data

['Hi!!!! I. Love. This. Song....', '10000% Agree!!!! #LoveIT', 'Right?!?!']

In [23]:
# Define a function that strips punctuation (string.punctuation by default)
def remove_punctuation(sentence: str, punctuation: str = string.punctuation) -> str:
    """Return ``sentence`` with every character listed in ``punctuation`` removed.

    Args:
        sentence: Text to clean.
        punctuation: Characters to delete. Defaults to ``string.punctuation``,
            which is ASCII-only; pass additional characters (for example
            ``'，。？“”'``) to strip full-width punctuation as well.

    Returns:
        A new string without the listed characters. All other characters,
        including whitespace, are kept unchanged.
    """
    # str.maketrans('', '', chars) builds a table that maps each char to None,
    # which str.translate interprets as "delete this character".
    return sentence.translate(str.maketrans('', '', punctuation))
# Apply the function to every string in text_data
[remove_punctuation(sentence) for sentence in text_data]

['Hi I Love This Song', '10000 Agree LoveIT', 'Right']

# 移除停用词

In [26]:
import nltk

# Download the NLTK 'stopwords' corpus that nltk.corpus.stopwords reads from.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ulysses/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [27]:
from nltk.corpus import stopwords

In [31]:
# Word tokens to filter
tokenized_words = ['i', 'am', 'going', 'to', 'go', 'to', 'the', 'store', 'and', 'park']

# Load the English stop words.
# The corpus reader is imported under its own alias: binding the word list to
# the same name as the reader would make this cell fail with AttributeError
# ('list' object has no attribute 'words') as soon as it is run a second time.
# `stopwords` remains the plain list that the following cells slice and search.
from nltk.corpus import stopwords as stopwords_corpus

stopwords = stopwords_corpus.words('english')

In [32]:
# Peek at the first five stop words
stopwords[:5]

['i', 'me', 'my', 'myself', 'we']

In [33]:
# Keep only the tokens that are not stop words
[word for word in tokenized_words if word not in stopwords]

['going', 'go', 'store', 'park']

# 替换字符

In [34]:
# 导入库
import re

# Sample sentences
text_data = [
    'Interrobang. By Aishwarya Henriette',
    'Parking And Going. By Karl Gautier',
    'Today Is The night. By Jarek Prakash',
]
# Strip every period from each sentence
remove_periods = [sentence.replace('.', '') for sentence in text_data]
remove_periods

['Interrobang By Aishwarya Henriette',
 'Parking And Going By Karl Gautier',
 'Today Is The night By Jarek Prakash']

In [35]:
# Define a function that masks letters
def replace_letters_with_X(s):
    """Return ``s`` with each character in a-z or A-Z replaced by ``'X'``.

    Characters outside those two ranges (digits, spaces, punctuation,
    non-ASCII letters) are left as they are.
    """
    ascii_letter = re.compile(r'[a-zA-Z]')
    return ascii_letter.sub('X', s)
# Apply the masking function to every string in text_data
[replace_letters_with_X(s) for s in text_data]

['XXXXXXXXXXX. XX XXXXXXXXX XXXXXXXXX',
 'XXXXXXX XXX XXXXX. XX XXXX XXXXXXX',
 'XXXXX XX XXX XXXXX. XX XXXXX XXXXXXX']

# 词干提取

In [37]:
from nltk.stem.porter import PorterStemmer

# Word tokens to stem
tokenized_words = ['i', 'am', 'humbled', 'by', 'this', 'traditional', 'meeting']

词干提取通过识别并去除词缀（例如动名词的 -ing 后缀），在保留词语基本含义的同时将其还原为词干。NLTK 的 `PorterStemmer` 实现了广泛使用的 Porter 词干提取算法。

In [38]:
# Stem every token with NLTK's PorterStemmer
porter = PorterStemmer()
[porter.stem(s) for s in tokenized_words]

['i', 'am', 'humbl', 'by', 'thi', 'tradit', 'meet']

# 词性标签

```
标签	词性
NNP	专有名词，单数
NN	名词，单数或不可数
RB	副词
VBD	动词，过去式
VBG	动词，动名词或现在分词
JJ	形容词
PRP	人称代词
```


In [42]:
import nltk
# word_tokenize needs the 'punkt' tokenizer models and pos_tag needs the
# 'averaged_perceptron_tagger' model; fetch both so the tagging cells can run
# on a fresh environment.
# NOTE(review): recent NLTK releases appear to look these resources up as
# 'punkt_tab' and 'averaged_perceptron_tagger_eng' — confirm against the
# installed NLTK version and adjust the names if a LookupError persists.
for resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /home/ulysses/nltk_data...


KeyboardInterrupt: 

In [39]:
from nltk import pos_tag, word_tokenize

# Sentence to tokenize and tag
text_data = "Chris loved outdoor running"

# Split the sentence into word tokens
word_tokenize(text_data)

LookupError: 
**********************************************************************
  Resource punkt not found.
  Please use the NLTK Downloader to obtain the resource:

  >>> import nltk
  >>> nltk.download('punkt')

  For more information see: https://www.nltk.org/data.html

  Attempted to load tokenizers/punkt/PY3/english.pickle

  Searched in:
    - '/home/ulysses/nltk_data'
    - '/home/ulysses/anaconda3/nltk_data'
    - '/home/ulysses/anaconda3/share/nltk_data'
    - '/home/ulysses/anaconda3/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************


In [40]:
# Tokenize the sentence and tag each token with its part of speech.
text_tagged = pos_tag(word_tokenize(text_data))
# End the cell with the value so the (token, tag) pairs are actually displayed.
text_tagged

LookupError: 
**********************************************************************
  Resource punkt not found.
  Please use the NLTK Downloader to obtain the resource:

  >>> import nltk
  >>> nltk.download('punkt')

  For more information see: https://www.nltk.org/data.html

  Attempted to load tokenizers/punkt/PY3/english.pickle

  Searched in:
    - '/home/ulysses/nltk_data'
    - '/home/ulysses/anaconda3/nltk_data'
    - '/home/ulysses/anaconda3/share/nltk_data'
    - '/home/ulysses/anaconda3/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************


# 中文分词

In [43]:
import jieba

- 精确模式：试图将句子最精确地切开，适合文本分析。
- 全模式：把句子中所有可以成词的词语都扫描出来，速度非常快，但是不能解决歧义。
- 搜索引擎模式：在精确模式的基础上，对长词再次切分，提高召回率，适合用于搜索引擎分词。

In [44]:
# Sample Chinese sentence to segment
sentence = "中文分词是文本处理不可或缺的一步!"

In [45]:
# Full mode (cut_all=True): emit every dictionary word found in the sentence.
# jieba.cut returns a one-shot generator, so a cell that joins it prints an
# empty result when re-run; jieba.lcut returns a reusable list instead.
sen_list = jieba.lcut(sentence, cut_all=True)

In [48]:
# Show the full-mode segments joined by "/ "
print( '全模式：', '/ '.join(sen_list))

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.781 seconds.
Prefix dict has been built successfully.
全模式： 中文/ 分词/ 是/ 文本/ 文本处理/ 本处/ 处理/ 不可/ 不可或缺/ 或缺/ 的/ 一步/ !


In [49]:
# Accurate mode (cut_all=False): segment the sentence and print the pieces joined by "/ "
seg_list = jieba.cut(sentence, cut_all= False)
print('默认精确模式：', '/ '.join(seg_list))

默认精确模式： 中文/ 分词/ 是/ 文本处理/ 不可或缺/ 的/ 一步/ !


In [50]:
# Search-engine mode: segment with cut_for_search and print the pieces joined by "/ "
seg_list = jieba.cut_for_search(sentence)
print( '搜索引擎模式', '/ '.join(seg_list))

搜索引擎模式 中文/ 分词/ 是/ 文本/ 本处/ 处理/ 文本处理/ 不可/ 或缺/ 不可或缺/ 的/ 一步/ !
