## Xử lí ngôn ngữ tự nhiên
 - Sentence tokenization
 - Word tokenization
 - Xóa stop words
 - Viết thường các kí tự
 - Sửa lại các từ viết tắt, viết sai
 - POS tagging
 - Phân tích cú pháp

### Chuẩn bị dữ liệu

In [None]:
# Local file the download is saved to, and the gutenberg.org address of the
# UTF-8 text that is fetched.
file_name = "sherlock.txt"
url = "http://www.gutenberg.org/ebooks/1661.txt.utf-8"

In [None]:
import urllib.request

# Fetch the raw bytes behind `url` and store them unchanged in `file_name`.
with urllib.request.urlopen(url) as response:
    with open(file_name, 'wb') as out_file:
        data = response.read()
        out_file.write(data)

# Read the saved copy back as UTF-8 text. A context manager is used so the
# file handle is closed deterministically instead of being left open.
with open(file_name, 'r', encoding='utf-8') as in_file:
    text = in_file.read()

##### Spacy for Tokenization 

In [None]:
import spacy

# Load the `en_core_web_sm` pipeline and run it over one example sentence.
nlp = spacy.load("en_core_web_sm")
sentence = "Manchester United isn't looking to sign a forward for $90 million"
doc = nlp(sentence)

# Print the text of every token, one per line.
for token in doc:
    print(token.text)

Manchester
United
is
n't
looking
to
sign
a
forward
for
$
90
million


##### Sentence Tokenization

In [9]:
import urllib.request

url = "http://www.gutenberg.org/ebooks/1661.txt.utf-8"
file_name = 'sherlock.txt'

# Fetch the raw bytes behind `url` and store them unchanged in `file_name`.
with urllib.request.urlopen(url) as response:
    with open(file_name, 'wb') as out_file:
        data = response.read()
        out_file.write(data)

# Read the saved copy back as UTF-8 text; the context manager closes the
# handle as soon as the content has been read.
with open(file_name, 'r', encoding='utf-8') as in_file:
    text = in_file.read()
doc = nlp(text)

# `doc.sents` yields the sentences of the document; materialise them so a
# slice can be taken.
sentences = list(doc.sents)
for sentence in sentences[15:18]:
    # First each token on its own line, then the whole sentence as a list.
    for word in sentence:
        print(word.text)
    print(list(sentence))

To
Sherlock
Holmes
she
is
always
_
the
_
woman
.
[To, Sherlock, Holmes, she, is, always, _, the, _, woman, .]
I
have
seldom
heard
him


mention
her
under
any
other
name
.
[I, have, seldom, heard, him, 
, mention, her, under, any, other, name, .]
In
his
eyes
she
eclipses
and


predominates
the
whole
of
her
sex
.
[In, his, eyes, she, eclipses, and, 
, predominates, the, whole, of, her, sex, .]


##### Xóa stop words, dấu câu

In [10]:
sentence_example = "the AI/AGI uprising cannot happen without the progress of NLP"

# Pair every token with its stop-word flag and its punctuation flag.
example_doc = nlp(sentence_example)
[(tok, tok.is_stop, tok.is_punct) for tok in example_doc]

[(the, True, False),
 (AI, False, False),
 (/, False, True),
 (AGI, False, False),
 (uprising, False, False),
 (can, True, False),
 (not, True, False),
 (happen, False, False),
 (without, True, False),
 (the, True, False),
 (progress, False, False),
 (of, True, False),
 (NLP, False, False)]

In [None]:
# Keep only the tokens that are not flagged as stop words.
list(filter(lambda tok: not tok.is_stop, nlp(sentence_example)))

[AI, /, AGI, uprising, happen, progress, NLP]

In [15]:
# Keep only the tokens that are not flagged as punctuation.
[
    tok
    for tok in nlp(sentence_example)
    if not tok.is_punct
]

[the, AI, AGI, uprising, can, not, happen, without, the, progress, of, NLP]

##### Stemming

- Porter Stemmer

In [17]:
# Import only the class that is used; a wildcard import would pull every
# public name of the module into the notebook namespace.
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

tokens = ['compute', 'computer', 'computed', 'computing']

# Print each word next to the stem the Porter stemmer produces for it.
for token in tokens:
    print(f'{token} --> {stemmer.stem(token)}')

compute --> comput
computer --> comput
computed --> comput
computing --> comput


- Snowball Stemmer

In [18]:
from nltk.stem.snowball import SnowballStemmer

# Snowball stemmer configured for English.
stemmer = SnowballStemmer('english')

tokens = ['compute', 'computer', 'computed', 'computing']

# Print each word next to the stem the Snowball stemmer produces for it.
for word in tokens:
    stem = stemmer.stem(word)
    print(word, '-->', stem)

compute --> comput
computer --> comput
computed --> comput
computing --> comput


##### Lemmatization

In [19]:
lemma_sentence_example = "Their apples & banana fruit salads are amazing. Would you like meeting me at the cafe?"

# For every token show the token, its lemma as a string (`lemma_`) and the
# integer value of the same attribute (`lemma`).
lemma_doc = nlp(lemma_sentence_example)
[(tok, tok.lemma_, tok.lemma) for tok in lemma_doc]

[(Their, 'their', 4244585616942201722),
 (apples, 'apple', 8566208034543834098),
 (&, '&', 15473034735919704609),
 (banana, 'banana', 2525716904149915114),
 (fruit, 'fruit', 17674554054627885835),
 (salads, 'salad', 16382906660984395826),
 (are, 'be', 10382539506755952630),
 (amazing, 'amazing', 12968186374132960503),
 (., '.', 12646065887601541794),
 (Would, 'would', 6992604926141104606),
 (you, 'you', 7624161793554793053),
 (like, 'like', 18194338103975822726),
 (meeting, 'meet', 6880656908171229526),
 (me, 'I', 4690420944186131903),
 (at, 'at', 11667289587015813222),
 (the, 'the', 7425985699627899538),
 (cafe, 'cafe', 10569699879655997926),
 (?, '?', 8205403955989537350)]

In [20]:
# Run the same four words used in the stemming cells through the pipeline
# and print each word followed by its lemma.
sentence = nlp("compute computer computed computing")
for token in sentence:
    print(f"{token.text} {token.lemma_}")

compute compute
computer computer
computed compute
computing computing


##### POS tagging

In [21]:
POS_sentence_example = "Their apples & banana fruit salads are amazing. Would you like meeting me at the cafe?"

# For every token show the token, its part-of-speech tag as a string
# (`pos_`) and the integer value of the same attribute (`pos`).
pos_doc = nlp(POS_sentence_example)
[(tok, tok.pos_, tok.pos) for tok in pos_doc]

[(Their, 'PRON', 95),
 (apples, 'NOUN', 92),
 (&, 'CCONJ', 89),
 (banana, 'NOUN', 92),
 (fruit, 'NOUN', 92),
 (salads, 'NOUN', 92),
 (are, 'AUX', 87),
 (amazing, 'ADJ', 84),
 (., 'PUNCT', 97),
 (Would, 'AUX', 87),
 (you, 'PRON', 95),
 (like, 'AUX', 87),
 (meeting, 'VERB', 100),
 (me, 'PRON', 95),
 (at, 'ADP', 85),
 (the, 'DET', 90),
 (cafe, 'NOUN', 92),
 (?, 'PUNCT', 97)]

In [22]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# One line per token: text, lemma, POS, tag, dependency label, shape, and
# the is_alpha / is_stop flags, separated by single spaces.
for tok in doc:
    columns = (
        tok.text,
        tok.lemma_,
        tok.pos_,
        tok.tag_,
        tok.dep_,
        tok.shape_,
        tok.is_alpha,
        tok.is_stop,
    )
    print(*columns)

Apple Apple PROPN NNP nsubj Xxxxx True False
is be AUX VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. U.K. PROPN NNP nsubj X.X. False False
startup startup VERB VBD ccomp xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


In [23]:
import spacy
from spacy import displacy

# Parse a short sentence and hand it to displaCy's dependency visualizer.
nlp = spacy.load("en_core_web_sm")
doc = nlp("This is a sentence")

# `serve` starts a web server for the visualisation (see the
# "Serving on ..." message this cell prints).
displacy.serve(doc, style="dep")




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


##### Sửa lại từ viết tắt

In [24]:
import contractions

text = "Their apples & banana fruit salads are amazing. I'd like meeting me at the cafe?"

# Expand contractions in the sentence (e.g. "I'd" becomes "I would").
expanded_text = contractions.fix(text)
expanded_text

'Their apples & banana fruit salads are amazing. I would like meeting me at the cafe?'

##### Sửa lại các từ viết sai

In [30]:
# The class is spelled `SpellChecker` and is provided by the `pyspellchecker`
# distribution (pip install pyspellchecker). The unrelated `spellchecker`
# distribution uses the same module name and fails on import with
# "No module named 'indexer'".
from spellchecker import SpellChecker

spell = SpellChecker()
text = "This bouks are nices"

# Split the sentence into words, then replace each word with the most likely
# correction suggested by the checker.
words = spell.split_words(text)
ans = [spell.correction(word) for word in words]
ans

ModuleNotFoundError: No module named 'indexer'

##### NER

In [27]:
text = "Madam Pomfrey, the nurse, was kept busy by a sudden spate of colds among the staff and students. Her Pepperup potion worked instantly, though it left the drinker smoking at the ears for several hours afterward. Ginny Weasley, who had been looking pale, was bullied into taking some by Percy."
doc1 = nlp(text)

# Print every named entity found in the text followed by its label in
# parentheses.
for ent in doc1.ents:
    print("{} ({})".format(ent.text, ent.label_))

Pomfrey (PERSON)
several hours (TIME)
Ginny Weasley (PERSON)
Percy (PERSON)


In [28]:
def redact_names(text):
    """Return `text` with every PERSON token replaced by "[REDACTED]".

    The text is run through the module-level `nlp` pipeline. Each token whose
    entity type is "PERSON" is swapped for the placeholder, all other tokens
    keep their text, and the pieces are joined with single spaces (so the
    original spacing around punctuation is not preserved).
    """
    pieces = [
        "[REDACTED]" if token.ent_type_ == "PERSON" else token.text
        for token in nlp(text)
    ]
    return " ".join(pieces)

redact_names(text)

'Madam [REDACTED] , the nurse , was kept busy by a sudden spate of colds among the staff and students . Her Pepperup potion worked instantly , though it left the drinker smoking at the ears for several hours afterward . [REDACTED] [REDACTED] , who had been looking pale , was bullied into taking some by [REDACTED] .'

In [29]:
# Render the named entities of `doc1` inline in the notebook.
displacy.render(
    doc1,
    style="ent",
    jupyter=True,
)