<a href="https://colab.research.google.com/github/PSLeon24/TextMining/blob/main/The_basis_of_Text_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 토큰화(Tokenization)

## 1. import dependencies

In [1]:
import nltk
nltk.download('punkt')
nltk.download('webtext')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package webtext to /root/nltk_data...
[nltk_data]   Unzipping corpora/webtext.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

## 2. Tokenization

### 2_1. Sentence Tokenization

In [2]:
para = "Hello everyone. It's good to see you. Let's start our text mining class!"

from nltk.tokenize import sent_tokenize

# 주어진 텍스트를 문장 단위로 토큰화. 주로 . ! ? 등을 이용
print(sent_tokenize(para))

['Hello everyone.', "It's good to see you.", "Let's start our text mining class!"]


In [3]:
para_kor = "안녕하세요, 여러분. 만나서 반갑습니다. 저는 동국대학교 고영민입니다!"
print(sent_tokenize(para_kor))

['안녕하세요, 여러분.', '만나서 반갑습니다.', '저는 동국대학교 고영민입니다!']


### 2_2. Word Tokenization

In [4]:
from nltk.tokenize import word_tokenize

print(word_tokenize(para))

['Hello', 'everyone', '.', 'It', "'s", 'good', 'to', 'see', 'you', '.', 'Let', "'s", 'start', 'our', 'text', 'mining', 'class', '!']


In [5]:
from nltk.tokenize import WordPunctTokenizer

print(WordPunctTokenizer().tokenize(para))

['Hello', 'everyone', '.', 'It', "'", 's', 'good', 'to', 'see', 'you', '.', 'Let', "'", 's', 'start', 'our', 'text', 'mining', 'class', '!']


In [6]:
print(word_tokenize(para_kor))

['안녕하세요', ',', '여러분', '.', '만나서', '반갑습니다', '.', '저는', '동국대학교', '고영민입니다', '!']


### 2_3. Tokenization using regex

In [7]:
import re
re.findall("[abc]", "How are you, boy?")

['a', 'b']

In [8]:
re.findall("[0123456789]", "3a7b5c9d")

['3', '7', '5', '9']

In [9]:
re.findall("[\w]", "3a 7b_ '.^&5c9d")

['3', 'a', '7', 'b', '_', '5', 'c', '9', 'd']

In [10]:
re.findall("[_]+", "a_b, c__d, e___f")

['_', '__', '___']

In [11]:
re.findall("[\w]+", "How are you, boy?")

['How', 'are', 'you', 'boy']

In [13]:
re.findall("[o]{2,4}", "oh, hoow are yoooou, boooooooy?")

['oo', 'oooo', 'oooo', 'ooo']

In [15]:
from nltk.tokenize import RegexpTokenizer

# regular expression(정규식)을 이용한 Tokenizer
# 단어 단위로 tokenize
# \w: 문자나 숫자를 의미. 즉 문자나 숫자 혹은 '가 반복되는 것을 찾아냄

tokenizer = RegexpTokenizer("[\w']+")

print(tokenizer.tokenize("Sorry, I can't go there."))

['Sorry', 'I', "can't", 'go', 'there']


### 2_4. noise & stopword removal

In [21]:
from nltk.corpus import stopwords

english_stops = set(stopwords.words('english'))

text1 = "Sorry, I couldn't go to movie yesterday."

tokenizer = RegexpTokenizer("[\w']+")
tokens = tokenizer.tokenize(text1.lower())

result = [word for word in tokens if word not in english_stops]
'''
result = []
for word in tokens:
  if word not in english_stops:
    result.append(word)
'''

print(result)

['sorry', 'go', 'movie', 'yesterday']


In [22]:
print(english_stops)

{"shan't", 'o', 'being', 'because', "don't", 'm', 'doesn', 'as', 'for', 'am', 'above', 'haven', 'are', "mustn't", 'ourselves', 'myself', 'all', "doesn't", 'hasn', 'don', "you'll", 'no', 'have', 'in', 'most', "wasn't", 'aren', 'your', "she's", 'now', 'some', "won't", 'what', "that'll", 'once', 'was', "didn't", 's', 'our', "it's", "needn't", 'any', 'i', 'isn', 'here', 'about', 'again', 'me', "weren't", 'y', 'just', 'themselves', 'their', 'didn', 'shan', 'under', 'ain', 'should', 'she', 'mustn', 'that', 'against', 'other', "aren't", 'does', 'than', 'its', 'the', 'off', "isn't", "mightn't", 'wouldn', "you'd", 'whom', 'so', 'through', 'few', 'ours', 'not', 'to', 'will', 'wasn', 'they', 'but', "hasn't", 'where', 'hadn', 'her', 'is', 'you', 'itself', 'him', 'own', "hadn't", 'during', 'when', 'won', 'nor', 'theirs', 'd', 'himself', 'then', 'weren', 'we', "should've", 'been', "haven't", "couldn't", 'each', 'a', 'while', 'it', 'between', 'he', 'after', 'having', 'has', 'at', 'down', 'both', 'doi

In [24]:
# 나만의 stopwords를 만들기
my_stopword = ['i', 'go', 'to']
result = [word for word in tokens if word not in my_stopword]
print(result)

['sorry', "couldn't", 'movie', 'yesterday']


## 3. Normalization

### 3_1. Stemming

In [25]:
# 1. Poter Stemmer
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
print(stemmer.stem('cooking'), stemmer.stem('cookery'), stemmer.stem('cookbooks'))

cook cookeri cookbook


In [27]:
# 2. The Lancaster Stemmer
from nltk.stem import LancasterStemmer
stemmer = LancasterStemmer()
print(stemmer.stem('cooking'), stemmer.stem('cookery'), stemmer.stem('cookbooks'))

cook cookery cookbook


### 3_2. Lemmatization

In [28]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('cooking'))
print(lemmatizer.lemmatize('cooking', pos='v'))
print(lemmatizer.lemmatize('cookery'))
print(lemmatizer.lemmatize('cookbooks'))

cooking
cook
cookery
cookbook


### 3_3. The difference between stemming and lemmatizing

In [29]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
print('stemming result:', stemmer.stem('believes'))
print('lemmatizing result:', lemmatizer.lemmatize('believes'))
print('lemmatizing result:', lemmatizer.lemmatize('believes', pos='v'))

stemming result: believ
lemmatizing result: belief
lemmatizing result: believe


## 4. POS-Tagging

In [30]:
import nltk
from nltk.tokenize import word_tokenize

tokens = word_tokenize("Hello everyone. It's good to see you. Let's start our text mining class!")
print(nltk.pos_tag(tokens))

[('Hello', 'NNP'), ('everyone', 'NN'), ('.', '.'), ('It', 'PRP'), ("'s", 'VBZ'), ('good', 'JJ'), ('to', 'TO'), ('see', 'VB'), ('you', 'PRP'), ('.', '.'), ('Let', 'VB'), ("'s", 'POS'), ('start', 'VB'), ('our', 'PRP$'), ('text', 'NN'), ('mining', 'NN'), ('class', 'NN'), ('!', '.')]


In [33]:
nltk.help.upenn_tagset('CC')

LookupError: 
**********************************************************************
  Resource [93mtagsets[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('tagsets')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mhelp/tagsets/PY3/upenn_tagset.pickle[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************


In [34]:
my_tag_set = ['NN', 'VB', 'JJ']
my_words = [word for word, tag in nltk.pos_tag(tokens) if tag in my_tag_set]
print(my_words)

['everyone', 'good', 'see', 'Let', 'start', 'text', 'mining', 'class']


In [35]:
words_with_tag = ['/'.join(item) for item in nltk.pos_tag(tokens)]
print(words_with_tag)

['Hello/NNP', 'everyone/NN', './.', 'It/PRP', "'s/VBZ", 'good/JJ', 'to/TO', 'see/VB', 'you/PRP', './.', 'Let/VB', "'s/POS", 'start/VB', 'our/PRP$', 'text/NN', 'mining/NN', 'class/NN', '!/.']


In [37]:
sentence = '''절망의 반대가 희망은 아니다.
어두운 밤하늘에 별이 빛나듯
희망은 절망 속에 싹트는 거지
만약에 우리가 희망함이 적다면
그 누가 세상을 비추어줄까.
정희성, 희망 공부'''

tokens = word_tokenize(sentence)
print(tokens)
print(nltk.pos_tag(tokens))

['절망의', '반대가', '희망은', '아니다', '.', '어두운', '밤하늘에', '별이', '빛나듯', '희망은', '절망', '속에', '싹트는', '거지', '만약에', '우리가', '희망함이', '적다면', '그', '누가', '세상을', '비추어줄까', '.', '정희성', ',', '희망', '공부']
[('절망의', 'JJ'), ('반대가', 'NNP'), ('희망은', 'NNP'), ('아니다', 'NNP'), ('.', '.'), ('어두운', 'VB'), ('밤하늘에', 'JJ'), ('별이', 'NNP'), ('빛나듯', 'NNP'), ('희망은', 'NNP'), ('절망', 'NNP'), ('속에', 'NNP'), ('싹트는', 'NNP'), ('거지', 'NNP'), ('만약에', 'NNP'), ('우리가', 'NNP'), ('희망함이', 'NNP'), ('적다면', 'NNP'), ('그', 'NNP'), ('누가', 'NNP'), ('세상을', 'NNP'), ('비추어줄까', 'NNP'), ('.', '.'), ('정희성', 'NN'), (',', ','), ('희망', 'NNP'), ('공부', 'NNP')]


In [39]:
!pip install konlpy

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m50.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting JPype1>=0.7.0 (from konlpy)
  Downloading JPype1-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (488 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m488.6/488.6 kB[0m [31m49.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.5.0 konlpy-0.6.0


In [40]:
from konlpy.tag import Okt
t = Okt()

In [41]:
print('형태소:', t.morphs(sentence))
print()
print('명사:', t.nouns(sentence))

형태소: ['절망', '의', '반대', '가', '희망', '은', '아니다', '.', '\n', '어', '두운', '밤하늘', '에', '별', '이', '빛나듯', '\n', '희망', '은', '절망', '속', '에', '싹트는', '거지', '\n', '만약', '에', '우리', '가', '희망', '함', '이', '적다면', '\n', '그', '누가', '세상', '을', '비추어줄까', '.', '\n', '정희성', ',', '희망', '공부']

명사: ['절망', '반대', '희망', '어', '두운', '밤하늘', '별', '희망', '절망', '속', '거지', '만약', '우리', '희망', '함', '그', '누가', '세상', '정희성', '희망', '공부']
