# 단어 토큰화

구두점(. , ? ; ! 등)과 같은 문자는 제외하고, 텍스트를 단어 단위의 토큰으로 나누는 것

입력 : Time is an illusion. Lunchtime double so! \
출력 : "Time", "is", "an", "illusion", "Lunchtime", "double", "so"


# 토큰화 중 생기는 문제
- 아포스트로피(')가 들어가있는 단어

ex) \
Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop.

**Don't 와 Jone's를 어떻게 토큰화 할 것인지**

**word_tokenize**

In [1]:
from nltk.tokenize import word_tokenize
# Tokenize a sentence containing apostrophes ("Don't", "Jone's") with NLTK's
# word_tokenize and print the resulting token list.
sentence = "Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop."
tokens = word_tokenize(sentence)
print(tokens)

['Do', "n't", 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', ',', 'Mr.', 'Jone', "'s", 'Orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop', '.']


**WordPunctTokenizer**

In [2]:
from nltk.tokenize import WordPunctTokenizer
# Run the same apostrophe-containing sentence through WordPunctTokenizer and
# print the tokens it produces.
sentence = "Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop."
punct_tokenizer = WordPunctTokenizer()
print(punct_tokenizer.tokenize(sentence))

['Don', "'", 't', 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', ',', 'Mr', '.', 'Jone', "'", 's', 'Orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop', '.']


**keras의 text_to_word_sequence**

In [3]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence
# Split the same sentence with Keras' text_to_word_sequence and print the
# resulting word list.
sentence = "Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop."
words = text_to_word_sequence(sentence)
print(words)

["don't", 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', 'mr', "jone's", 'orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop']


**Penn Treebank Tokenization**
- 규칙1. 하이픈으로 구성된 단어는 하나로 유지
- 규칙2. doesn't와 같이 아포스트로피로 '접어'가 함께하는 단어는 분리

In [7]:
from nltk.tokenize import TreebankWordTokenizer
# Sample text with a hyphenated word ("home-based") and a contraction
# ("doesn't") to exercise the two Penn Treebank rules listed above.
text = (
    "Starting a home-based restaurant may be an ideal. "
    "it doesn't have a food chain or restaurant of their own."
)
tokenizer = TreebankWordTokenizer()
treebank_tokens = tokenizer.tokenize(text)
print(treebank_tokens)

['Starting', 'a', 'home-based', 'restaurant', 'may', 'be', 'an', 'ideal.', 'it', 'does', "n't", 'have', 'a', 'food', 'chain', 'or', 'restaurant', 'of', 'their', 'own', '.']


# 문장 토큰화

- nltk는 단순히 온점(.)으로만 문장을 구분하지 않는다.

In [8]:
from nltk.tokenize import sent_tokenize
# Five-sentence sample paragraph; split it into sentences with NLTK's
# sent_tokenize and print the list of sentences.
text = (
    "His barber kept his word. "
    "But keeping such a huge secret to himself was driving him crazy. "
    "Finally, the barber went up a mountain and almost to the edge of a cliff. "
    "He dug a hole in the midst of some reeds. "
    "He looked about, to mae sure no one was near."
)
sentences = sent_tokenize(text)
print(sentences)

['His barber kept his word.', 'But keeping such a huge secret to himself was driving him crazy.', 'Finally, the barber went up a mountain and almost to the edge of a cliff.', 'He dug a hole in the midst of some reeds.', 'He looked about, to mae sure no one was near.']


In [9]:
from nltk.tokenize import sent_tokenize
# Sample text with periods inside an abbreviation ("Ph.D."), used to check
# how sent_tokenize places sentence boundaries.
text = (
    "I am actively looking for Ph.D. students. "
    "and you are a Ph.D student."
)
sentences = sent_tokenize(text)
print(sentences)

['I am actively looking for Ph.D. students.', 'and you are a Ph.D student.']


# 한국어 문장 토큰화

In [10]:
!pip install kss

Collecting kss
  Downloading kss-1.3.1.tar.gz (6.1 kB)
Building wheels for collected packages: kss
  Building wheel for kss (setup.py): started
  Building wheel for kss (setup.py): finished with status 'done'
  Created wheel for kss: filename=kss-1.3.1-cp38-cp38-win_amd64.whl size=35629 sha256=8c454bf1083a44920ae50b0b32a9d8f404719a1b8ab6ada74e7ed9b34da5e211
  Stored in directory: c:\users\admin\appdata\local\pip\cache\wheels\84\9d\86\2b2fef9e791536a718b27b4e3d9f252df07afaec3d53cb9ce7
Successfully built kss
Installing collected packages: kss
Successfully installed kss-1.3.1


In [12]:
import kss
# Split a short Korean paragraph into sentences with kss and print them.
text = "딥 러닝 자연어 처리가 재미있기는 합니다. 그런데 문제는 영어보다 한국어로 할 때 너무 어려워요. 농담아니에요. 이제 해보면 알걸요?"
korean_sentences = kss.split_sentences(text)
print(korean_sentences)

['딥 러닝 자연어 처리가 재미있기는 합니다.', '그런데 문제는 영어보다 한국어로 할 때 너무 어려워요.', '농담아니에요.', '이제 해보면 알걸요?']


# 실습

In [13]:
from nltk.tokenize import word_tokenize
# Practice text; `text` is kept as a notebook-level name because the
# POS-tagging cell below tokenizes it again.
text = (
    "I am actively looking for Ph.D. students. "
    "and you are a Ph.D. student."
)
word_tokens = word_tokenize(text)
print(word_tokens)

['I', 'am', 'actively', 'looking', 'for', 'Ph.D.', 'students', '.', 'and', 'you', 'are', 'a', 'Ph.D.', 'student', '.']


- PRP : 인칭 대명사\
- VBP : 동사(현재형, 3인칭 단수 제외)\
- RB : 부사\
- VBG : 현재분사\
- IN : 전치사\
- NNP : 고유 명사\
- NNS : 복수형 명사\
- CC : 접속사\
- DT : 관사

In [14]:
from nltk.tag import pos_tag
# Tokenize the practice text defined in the previous cell, then tag each
# token with its part of speech.
x = word_tokenize(text)
tagged_tokens = pos_tag(x)
# Bare expression so the notebook displays the (token, tag) pairs.
tagged_tokens

[('I', 'PRP'),
 ('am', 'VBP'),
 ('actively', 'RB'),
 ('looking', 'VBG'),
 ('for', 'IN'),
 ('Ph.D.', 'NNP'),
 ('students', 'NNS'),
 ('.', '.'),
 ('and', 'CC'),
 ('you', 'PRP'),
 ('are', 'VBP'),
 ('a', 'DT'),
 ('Ph.D.', 'NNP'),
 ('student', 'NN'),
 ('.', '.')]

**한국어 자연어 처리를 위해서는 KoNLPy 파이썬 패키지 사용**

# Okt
- morphs : 형태소 추출
- pos : 품사 태깅(태그를 다는 것)
- nouns : 명사 추출

In [15]:
from konlpy.tag import Okt
# Create the Okt analyzer (reused by the following cells) and print the
# morphemes it extracts from the sample sentence.
okt = Okt()
sentence = "열심히 코딩한 당신, 연휴에는 여행을 가봐요"
okt_morphemes = okt.morphs(sentence)
print(okt_morphemes)

['열심히', '코딩', '한', '당신', ',', '연휴', '에는', '여행', '을', '가봐요']


In [17]:
# Part-of-speech tagging with Okt: print (morpheme, tag) pairs.
sentence = "열심히 코딩한 당신, 연휴에는 여행을 가봐요"
okt_tagged = okt.pos(sentence)
print(okt_tagged)

[('열심히', 'Adverb'), ('코딩', 'Noun'), ('한', 'Josa'), ('당신', 'Noun'), (',', 'Punctuation'), ('연휴', 'Noun'), ('에는', 'Josa'), ('여행', 'Noun'), ('을', 'Josa'), ('가봐요', 'Verb')]


In [18]:
# Noun extraction with Okt: print only the nouns found in the sentence.
sentence = "열심히 코딩한 당신, 연휴에는 여행을 가봐요"
okt_nouns = okt.nouns(sentence)
print(okt_nouns)

['코딩', '당신', '연휴', '여행']


# Kkma

In [19]:
from konlpy.tag import Kkma
kkma = Kkma()
# Extract morphemes with the Kkma analyzer created on the previous line.
# (This cell previously called okt.morphs, so it only repeated the Okt result
# instead of demonstrating Kkma.)
print(kkma.morphs("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))

['열심히', '코딩', '하', 'ㄴ', '당신', ',', '연휴', '에', '는', '여행', '을', '가보', '아요']


In [20]:
# Part-of-speech tagging with Kkma: print (morpheme, tag) pairs.
sentence = "열심히 코딩한 당신, 연휴에는 여행을 가봐요"
kkma_tagged = kkma.pos(sentence)
print(kkma_tagged)

[('열심히', 'MAG'), ('코딩', 'NNG'), ('하', 'XSV'), ('ㄴ', 'ETD'), ('당신', 'NP'), (',', 'SP'), ('연휴', 'NNG'), ('에', 'JKM'), ('는', 'JX'), ('여행', 'NNG'), ('을', 'JKO'), ('가보', 'VV'), ('아요', 'EFN')]


In [21]:
# Noun extraction with Kkma: print only the nouns found in the sentence.
sentence = "열심히 코딩한 당신, 연휴에는 여행을 가봐요"
kkma_nouns = kkma.nouns(sentence)
print(kkma_nouns)

['코딩', '당신', '연휴', '여행']
