# [spaCy](https://spacy.io/#example)介绍

In [2]:
# 导入工具包与英文模型
# 管理员身份打开CMD
# python -m spacy download en

import spacy

nlp = spacy.load('en_core_web_sm')

## 文本处理

In [3]:
doc = nlp('Weather is good, very windy and sunny. We have no class in the afternoon.')

In [4]:
# 分词

for token in doc:
    print(token)

Weather
is
good
,
very
windy
and
sunny
.
We
have
no
class
in
the
afternoon
.


In [5]:
# 分句

for sent in doc.sents:
    print(sent)

Weather is good, very windy and sunny.
We have no class in the afternoon.


## 词性

[参考](https://www.winwaed.com/blog/2011/11/08/part-of-speech-tags/)

In [7]:
for token in doc:
    print('{} - {}'.format(token, token.pos_))

Weather - NOUN
is - AUX
good - ADJ
, - PUNCT
very - ADV
windy - ADJ
and - CCONJ
sunny - ADJ
. - PUNCT
We - PRON
have - VERB
no - DET
class - NOUN
in - ADP
the - DET
afternoon - NOUN
. - PUNCT


## 命名实体识别

In [8]:
doc2 = nlp('I went to Paris to meet my old friend Jack from Uni.')

for ent in doc2.ents:
    print('{} - {}'.format(ent, ent.label_))

Paris - GPE
Jack - PERSON
Uni - GPE


In [10]:
from spacy import displacy

displacy.render(doc2, style='ent', jupyter=True)

### 找到书中所有人名字

In [11]:
def read_file(file_name):
    with open(file_name, 'r') as file:
        return file.read()

In [12]:
text = read_file('./text/jane-austen-pride-prejudice.txt')

processed_text = nlp(text)

In [13]:
sentences = [sent for sent in processed_text.sents]

len(sentences)

5732

In [14]:
sentences[0: 5]

[锘縏he Project Gutenberg eBook, Pride and Prejudice, by Jane Austen, Edited
 by R. W. (Robert William) Chapman
 
 
 This eBook is for the use of anyone anywhere at no cost and with
 almost no restrictions whatsoever.  ,
 You may copy it, give it away or
 re-use it under the terms of the Project Gutenberg License included
 with this eBook or online at www.gutenberg.org
 
 
 
 
 
 Title: Pride and Prejudice
 
 
 Author: Jane Austen
 
 Editor: R. W. (Robert William) Chapman
 
 Release Date: May 9, 2013  ,
 [eBook #42671]
 
 Language: English
 
 
 ***START OF THE PROJECT GUTENBERG EBOOK PRIDE AND PREJUDICE***
 
 
 E-text prepared by Greg Weeks, Jon Hurst, Mary Meehan, and the Online
 Distributed Proofreading Team (http://www.pgdp.net) from page images
 generously made available by Internet Archive (https://archive.org)
 
 
 ,
 Note: Project Gutenberg also has an HTML version of this
       file which includes the original illustrations.
       ,
 See 42671-h.htm or 42671-h.zip:
       ]

In [15]:
from collections import Counter

# 计算每个人名出现的次数
def find_person(doc):
    c = Counter()
    for ent in processed_text.ents:
        if ent.label_ == 'PERSON':
            c[ent.lemma_] += 1
            
    return c.most_common(10)

print(find_person(processed_text))

[('Elizabeth', 614), ('Darcy', 406), ('Jane', 280), ('Bennet', 241), ('Collins', 176), ('Bingley', 163), ('Wickham', 107), ('Gardiner', 94), ('Lizzy', 93), ('Lady Catherine', 75)]
