# 第2章 自然语言处理

## 1 文本分词

In [1]:
import spacy

In [15]:
# Load the "en_core_web_sm" pipeline and run it over the lower-cased sentence.
nlp = spacy.load("en_core_web_sm")
text = "Mary, don't slap the green witch"
lowered_doc = nlp(text.lower())
# Convert every token of the processed document to a plain string before printing.
print(list(map(str, lowered_doc)))

['mary', ',', 'do', "n't", 'slap', 'the', 'green', 'witch']


In [5]:
from nltk.tokenize import TweetTokenizer

In [6]:
# Tokenize a lower-cased tweet with NLTK's TweetTokenizer and print the tokens.
tweet = u"Snow White and the Seven Degrees #MakeAMovieClod@midnight:)"
tokenizer = TweetTokenizer()
tweet_tokens = tokenizer.tokenize(tweet.lower())
print(tweet_tokens)

['snow', 'white', 'and', 'the', 'seven', 'degrees', '#makeamovieclod', '@midnight', ':)']


## 2 从文本生成n元语法（n-gram）

In [7]:
def n_grams(text, n):
    """Return every contiguous window of ``n`` items taken from ``text``.

    Args:
        text: Any sliceable sequence with a length (for example a list of
            tokens, or a string for character n-grams).
        n: Window size; must be at least 1.

    Returns:
        A list of slices of ``text``, each of length ``n``, in left-to-right
        order. The list is empty when ``text`` has fewer than ``n`` items.

    Raises:
        ValueError: If ``n`` is smaller than 1. Without this guard, ``n == 0``
            would yield ``len(text) + 1`` empty slices and a negative ``n``
            would yield slices that are not n-grams at all.
    """
    if n < 1:
        raise ValueError("n must be a positive integer, got {!r}".format(n))
    # The last valid window starts at index len(text) - n.
    return [text[i:i + n] for i in range(len(text) - n + 1)]

In [8]:
# Build the 3-grams of a hand-written token list and print them.
cleaned = ['mary', ',', "n't", 'slap', 'green', 'witch', '.']
trigrams = n_grams(cleaned, 3)
print(trigrams)

[['mary', ',', "n't"], [',', "n't", 'slap'], ["n't", 'slap', 'green'], ['slap', 'green', 'witch'], ['green', 'witch', '.']]


## 3 词形还原：将单词还原为词根形式

In [14]:
import spacy
# Run the "en_core_web_sm" pipeline on a short sentence.
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"he was running late")
# One "token --> lemma" line per token, joined so a single print emits them all.
lemma_lines = ['{} --> {}'.format(token, token.lemma_) for token in doc]
print('\n'.join(lemma_lines))

he --> he
was --> be
running --> run
late --> late


## 4 单词分类：词性标注

In [16]:
import spacy
# Run the "en_core_web_sm" pipeline on a short sentence.
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"Mary slapped the green witch.")
# Print each token next to the value of its ``pos_`` attribute.
pos_template = '{} - {}'
for token in doc:
    tag = token.pos_
    print(pos_template.format(token, tag))

Mary - PROPN
slapped - VERB
the - DET
green - PROPN
witch - NOUN
. - PUNCT


## 5 片段（span）分类：名词短语（NP）分块

In [17]:
import spacy
# Run the "en_core_web_sm" pipeline on a short sentence.
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"Mary slapped the green witch.")
# One "chunk - label" line per entry of ``doc.noun_chunks``, printed together.
chunk_lines = ['{} - {}'.format(chunk, chunk.label_) for chunk in doc.noun_chunks]
print('\n'.join(chunk_lines))

Mary - NP
the green witch - NP
