# Tokenizer

In [None]:
# Download NLTK's 'punkt' data package, then import the word-level tokenizer
# that the following cells call.
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hojae\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Sample sentence shared by the English tokenizer comparisons in the next cells.
text = "Hello, world! He isn't. New York is a city."
# Bare last expression: the token list is shown as the cell result.
word_tokenize(text)

['Hello',
 ',',
 'world',
 '!',
 'He',
 'is',
 "n't",
 '.',
 'New',
 'York',
 'is',
 'a',
 'city',
 '.']

Keras Tokenizer

In [None]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence

In [None]:
# Tokenize the same sample with the Keras helper; the recorded output below
# shows lowercased tokens with the punctuation removed.
text_to_word_sequence(text)

['hello', 'world', 'he', "isn't", 'new', 'york', 'is', 'a', 'city']

TextBlob

In [None]:
# Install step kept for reference (commented out):
# !pip install textblob
from textblob import TextBlob

# Wrap the sample sentence in a TextBlob and display its word list.
blob = TextBlob(text)
blob.words



WordList(['Hello', 'world', 'He', 'is', "n't", 'New', 'York', 'is', 'a', 'city'])

## 작업별로 유용한 토크나이저

Tweet

In [None]:
from nltk.tokenize import TweetTokenizer   # specialised for emoji
# `tweet` is reused by the emoticon example in the next cell.
tweet = TweetTokenizer()
tweet.tokenize(text)

['Hello',
 ',',
 'world',
 '!',
 'He',
 "isn't",
 '.',
 'New',
 'York',
 'is',
 'a',
 'city',
 '.']

In [None]:
# Run the tweet tokenizer on a smiley followed by a long run of ")" characters.
tweet.tokenize(":))))))))))))))")

[':)', ')', ')']

whitespace

In [None]:
from nltk.tokenize import WhitespaceTokenizer   # simply splits on whitespace
ws_tokenizer = WhitespaceTokenizer()
# Punctuation stays attached to the neighbouring word (see the recorded output).
ws_tokenizer.tokenize(text)

['Hello,', 'world!', 'He', "isn't.", 'New', 'York', 'is', 'a', 'city.']

wordpunct tokenizer

In [None]:
from nltk.tokenize import WordPunctTokenizer
wo_tokenizer = WordPunctTokenizer()
# The recorded output shows punctuation split off as separate tokens,
# including the apostrophe inside "isn't".
wo_tokenizer.tokenize(text)

['Hello',
 ',',
 'world',
 '!',
 'He',
 'isn',
 "'",
 't',
 '.',
 'New',
 'York',
 'is',
 'a',
 'city',
 '.']

mwe tokenizer: 원하는 방식으로 토크나이저를 수정, 첨가하는 토크나이저

In [None]:
from nltk.tokenize import MWETokenizer

# Sample sentence for the multi-word-expression (MWE) demo.
text = "In a little or a little bit or a lot in spite of"
# Alternative: register the expressions at construction time, one token tuple each,
# so that each is recognised as a single token.
# mwe_tokenizer = MWETokenizer([('a', 'little', 'bit'), ('a', 'lot')])

mwe_tokenizer = MWETokenizer()
# An MWE must be given as a sequence of tokens. ('a little bit') is just a
# parenthesised string, which would be registered character by character and
# never match, so pass a real tuple of words.
mwe_tokenizer.add_mwe(('a', 'little', 'bit'))
mwe_tokenizer.tokenize(text.split())   # expects a list of tokens, not a raw string

['In',
 'a',
 'little',
 'or',
 'a',
 'litttlie',
 'bit',
 'or',
 'a',
 'lot',
 'in',
 'spite',
 'of']

# 한국어 토크나이저

In [None]:
!pip install konlpy

Collecting konlpy
  Using cached konlpy-0.5.2-py2.py3-none-any.whl (19.4 MB)
Installing collected packages: konlpy
Successfully installed konlpy-0.5.2


In [None]:
from konlpy.tag import Hannanum, Okt, Kkma, Komoran

In [None]:
from konlpy.corpus import kolaw
# List the files available in the kolaw corpus.
kolaw.fileids()

['constitution.txt']

In [None]:
# Legal text provided by konlpy. Read it inside a context manager so the
# file handle returned by kolaw.open() is closed once the text is in memory.
with kolaw.open("constitution.txt") as constitution_file:
    ko_text = constitution_file.read()

In [None]:
# Create the Okt analyzer instance (reused by the noun-extraction cell below).
okt = Okt()

# Tokenize the first 20 characters of the corpus text into morphemes.
okt.morphs(ko_text[:20])

['대한민국', '헌법', '\n\n', '유구', '한', '역사', '와', '전통', '에']

In [None]:
# Same 20-character slice, analysed with Hannanum for comparison with Okt.
hannanum = Hannanum()
hannanum.morphs(ko_text[:20])

['대한민국헌법', '유구', '하', 'ㄴ', '역사', '와', '전통', '에']

In [None]:
# Noun extraction on the same 20-character slice.
okt.nouns(ko_text[:20])

['대한민국', '헌법', '유구', '역사', '전통']

## Soynlp

In [None]:
!pip install soynlp

Collecting soynlp
  Downloading soynlp-0.0.493-py3-none-any.whl (416 kB)
[?25l[K     |▉                               | 10 kB 18.7 MB/s eta 0:00:01[K     |█▋                              | 20 kB 25.2 MB/s eta 0:00:01[K     |██▍                             | 30 kB 12.1 MB/s eta 0:00:01[K     |███▏                            | 40 kB 9.4 MB/s eta 0:00:01[K     |████                            | 51 kB 5.4 MB/s eta 0:00:01[K     |████▊                           | 61 kB 5.9 MB/s eta 0:00:01[K     |█████▌                          | 71 kB 5.6 MB/s eta 0:00:01[K     |██████▎                         | 81 kB 6.3 MB/s eta 0:00:01[K     |███████                         | 92 kB 4.9 MB/s eta 0:00:01[K     |███████▉                        | 102 kB 5.2 MB/s eta 0:00:01[K     |████████▋                       | 112 kB 5.2 MB/s eta 0:00:01[K     |█████████▍                      | 122 kB 5.2 MB/s eta 0:00:01[K     |██████████▏                     | 133 kB 5.2 MB/s eta 0:00:01[K

In [None]:
from soynlp.tokenizer import LTokenizer
# LTokenizer built without any scores (i.e. before training).
l_tokenizer = LTokenizer()

# NOTE: this rebinds ko_text to its first 50 characters, so the full corpus
# text is no longer available under this name after the cell runs.
ko_text = ko_text[:50]
l_tokenizer.tokenize(ko_text)

['대한민국헌법',
 '유구한',
 '역사와',
 '전통에',
 '빛나는',
 '우리',
 '대한국민은',
 '3·1운동으로',
 '건립된',
 '대한민국임']

In [None]:
# Goal: train on a corpus that contains the names "아이오아이" and "트와이스",
# then run the tokenizer so that it recognises them.
sent = "트와이스 그리고 아이오아이 좋아요. tt가 저번에 1위 했었죠?"

# Tokenizer before training
l_tokenizer.tokenize(sent)   # split on spaces

['트와이스', '그리고', '아이오아이', '좋아요.', 'tt가', '저번에', '1위', '했었죠?']

In [None]:
# Colab-only step: mount Google Drive at /gdrive, forcing a remount if it is
# already mounted.
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

Mounted at /gdrive


In [None]:
from soynlp.word import WordExtractor

In [None]:
%%time
# (%%time must remain the first line of the cell.)
# Configure the soynlp word extractor with its filtering thresholds. This cell
# only constructs the object; no corpus is passed to it here.

word_extractor = WordExtractor(
    min_frequency=100,
    min_cohesion_forward=0.05,
    min_right_branching_entropy=0.0

)

CPU times: user 27 µs, sys: 2 µs, total: 29 µs
Wall time: 34.1 µs


In [None]:
# Map each extracted word to its forward cohesion score (marked as not on the exam).
# The attribute is spelled `cohesion_forward`; the earlier `cohesion_forwrd` was a typo.
# NOTE(review): `words` is not defined in the visible cells. It is presumably the result
# of training `word_extractor` on a corpus and then calling `word_extractor.extract()`;
# confirm and add that step before running this cell.
cohesion_score = {word: score.cohesion_forward for word, score in words.items()}

NameError: ignored

In [None]:
# Build an LTokenizer that uses the learned cohesion scores and tokenize the
# sample sentence with it (the earlier `toenizer` was a typo that raised NameError).
tokenizer = LTokenizer(scores=cohesion_score)
tokenizer.tokenize(sent)

NameError: ignored

# Sentence

In [None]:
# Setup for sentence-level tokenization: import sent_tokenize and download the
# 'punkt' data package. The download call is the last expression, so its
# return value is shown as the cell result.
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Download the 'brown' data package and keep a handle to the corpus reader.
# NOTE(review): `brown` is not used by any later cell visible here.
nltk.download('brown')
brown = nltk.corpus.brown

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [None]:
# Rebinds `text` to a three-sentence sample; the recorded output shows that
# the period in "Mr." is not treated as a sentence end.
text = "I am a boy. You are a girl? Mr. Kim is a doctor."
sent_tokenize(text)

['I am a boy.', 'You are a girl?', 'Mr. Kim is a doctor.']

## 한국어 sentence tokenizer: kss

In [None]:
!pip install kss

Collecting kss
  Downloading kss-3.2.0.tar.gz (42.4 MB)
[K     |████████████████████████████████| 42.4 MB 52 kB/s 
[?25hCollecting emoji
  Downloading emoji-1.4.2.tar.gz (184 kB)
[K     |████████████████████████████████| 184 kB 61.1 MB/s 
[?25hBuilding wheels for collected packages: kss, emoji
  Building wheel for kss (setup.py) ... [?25l[?25hdone
  Created wheel for kss: filename=kss-3.2.0-py3-none-any.whl size=42447995 sha256=22a369de430b6ed8e8e134e3a6e54d1d4ac371978f7d40f2b060139a75c6547e
  Stored in directory: /root/.cache/pip/wheels/a1/47/9c/a5f83b5ab6096e3c4a33643fc553b26098c23e72b6539b86f4
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-1.4.2-py3-none-any.whl size=186469 sha256=4ff857f5e68aa0a61b67cc33011998d729d83a390c5ef3230aef4019af304e95
  Stored in directory: /root/.cache/pip/wheels/e4/61/e7/2fc1ac8f306848fc66c6c013ab511f0a39ef4b1825b11363b2
Successfully built kss emoji
Installing collected packages: emoji, kss
Succ

In [None]:
import kss

# Split the Korean sample sentence `sent` (defined in the Soynlp section above)
# with the Korean sentence splitter.
kss.split_sentences(sent)

[Korean Sentence Splitter]: Initializing Kss...


'트와이스 그리고 아이오아이 좋아요. tt가 저번에 1위 했었죠?'