# 토큰화

Link : https://wikidocs.net/21694

토크나이저마다 아포스트로피(')를 어떻게 처리하는지 비교해보자.

In [1]:
from nltk.tokenize import word_tokenize
from nltk.tokenize import WordPunctTokenizer

In [2]:
# Sample sentence containing two apostrophes ("Don't", "Jone's") used to
# compare tokenizers. Adjacent literals are concatenated at compile time, so
# the value is a single line with no embedded newline.
sentence = (
    "Don't be fooled by the dark sounding name, "
    "Mr. Jone's Orphanage is as cheery as cheery goes for a pstry shop."
)

In [3]:
# Tokenize the sample sentence with word_tokenize and display the token list.
print(
    word_tokenize(sentence)
)

['Do', "n't", 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', ',', 'Mr.', 'Jone', "'s", 'Orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pstry', 'shop', '.']


여기서는 Don't가 Do와 n't로, Jone's가 Jone과 's로 분리된다.

In [7]:
# Tokenize the same sentence with WordPunctTokenizer for comparison with the
# word_tokenize result; the tokenize method is bound first, then applied.
split_on_punctuation = WordPunctTokenizer().tokenize
print(split_on_punctuation(sentence))

['Don', "'", 't', 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', ',', 'Mr', '.', 'Jone', "'", 's', 'Orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pstry', 'shop', '.']


여기서는 Don't가 Don, ', t 3개로 분리되고, Jone's도 Jone, ', s 3개로 분리된다.

## 표준 토큰화 : Penn TreebankWordTokenizer

* 하이픈으로 구성된 단어는 하나로 유지,
* don't 같은 단어는 do와 n't로 분리.

In [8]:
from nltk.tokenize import TreebankWordTokenizer

# Sample text with a hyphenated word ("home-based") and a contraction
# ("doesn't") to exercise the two Treebank rules listed above.
sentence = (
    "Starting a home-based restaurant may be an ideal. "
    "it doesn't have a food chain or restaurant of their own."
)

# Both sentences are passed as one string; in the recorded output the first
# sentence's closing period stays attached to its word ('ideal.').
tokenizer = TreebankWordTokenizer()
print(tokenizer.tokenize(sentence))

['Starting', 'a', 'home-based', 'restaurant', 'may', 'be', 'an', 'ideal.', 'it', 'does', "n't", 'have', 'a', 'food', 'chain', 'or', 'restaurant', 'of', 'their', 'own', '.']


## 문장과 문장을 나누는 sent_tokenize

In [9]:
from nltk.tokenize import sent_tokenize

# Five-sentence sample paragraph, one literal per sentence; adjacent literals
# are joined at compile time into a single-line string.
text = (
    "His barber kept his word. "
    "But keeping such a huge secret to himself was driving him crazy. "
    "Finally, the barber went up a mountain and almost to the edge of a cliff. "
    "He dug a hole in the midst of some reeds. "
    "He looked about, to make sure no one was near."
)

# Split the paragraph into sentence strings and display the resulting list.
print(sent_tokenize(text))

['His barber kept his word.', 'But keeping such a huge secret to himself was driving him crazy.', 'Finally, the barber went up a mountain and almost to the edge of a cliff.', 'He dug a hole in the midst of some reeds.', 'He looked about, to make sure no one was near.']


# Lemmatization (표제어 추출)

In [13]:
from nltk.stem import WordNetLemmatizer
# Lemmatizer instance used by the lemmatization examples in the cells below.
lemmatizer = WordNetLemmatizer()

단어를 토큰화한 후, 표제어 추출을 통해 단어의 형태를 어느 정도 통일할 수 있다.

In [14]:
# Lemmatize "has" as a verb: pos='v' supplies the part of speech explicitly.
# The call is left as a bare expression so the notebook echoes its value.
lemmatizer.lemmatize('has', pos='v')

'have'

위와 같이 품사를 명시하면 그 품사에 맞는 표제어가 추출된다.

문장의 경우에는 먼저 토큰화한 뒤, 각 토큰에 표제어 추출을 적용할 수 있다.

In [17]:
# Tokenize a sentence, then lemmatize every token.
tokenizer = TreebankWordTokenizer()
lemmatizer = WordNetLemmatizer()
sentence = "The quick Brown fox jumpes over the lazying dogs."

# `tokens` is kept as a module-level name because the stemming cell reuses it.
tokens = tokenizer.tokenize(sentence)

# No part of speech is passed here, so the lemmatizer falls back to its
# default (noun, per NLTK's documented signature - confirm for the installed
# version). In the recorded output only 'dogs' -> 'dog' changes.
print(list(map(lemmatizer.lemmatize, tokens)))

['The', 'quick', 'Brown', 'fox', 'jumpes', 'over', 'the', 'lazying', 'dog', '.']


# 어간 추출

In [21]:
from nltk.stem import LancasterStemmer, PorterStemmer

# Stem the `tokens` list produced by the lemmatization cell with two
# different algorithms and print one result list per stemmer for comparison.
porter_stemmer = PorterStemmer()
lancaster_stemmer = LancasterStemmer()
print(list(map(porter_stemmer.stem, tokens)))
print(list(map(lancaster_stemmer.stem, tokens)))

['the', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazi', 'dog', '.']
['the', 'quick', 'brown', 'fox', 'jump', 'ov', 'the', 'lazy', 'dog', '.']


# 불용어

In [22]:
from nltk.corpus import stopwords

In [24]:
# Load the English stop-word list from the NLTK corpus and preview the
# first ten entries.
stop_word_list = stopwords.words("english")
print(stop_word_list[0:10])

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


## 불용어 제거

불용어 제거는 토큰화를 먼저 진행한 뒤에 하자.

In [26]:
# Remove English stop words from a tokenized example sentence.
tokenizer = TreebankWordTokenizer()
example = "Family is not an important thing. It's everything you have."

# A set is built once so each membership test below is a hash lookup rather
# than a scan of the whole stop-word list.
stop_words = set(stopwords.words('english'))

word_tokens = tokenizer.tokenize(example)

# Keep only the tokens that are not stop words.
# NOTE(review): the match is exact and case-sensitive, which is why the
# recorded output keeps the capitalised 'It'. Compare `word.lower()` instead
# if case-insensitive removal is wanted.
result = [word for word in word_tokens if word not in stop_words]

In [28]:
# Show the tokens before and after stop-word removal, one list per line.
print(word_tokens, result, sep="\n")

['Family', 'is', 'not', 'an', 'important', 'thing.', 'It', "'s", 'everything', 'you', 'have', '.']
['Family', 'important', 'thing.', 'It', "'s", 'everything', '.']
