# 텍스트 전처리
---
 - 패키지 설치
   - NLTK: !pip install nltk
   - KoNLPy: !pip install konlpy

## [1] 토큰화(Tokenization)
---
 - 문장/문서를 의미를 지닌 작은 단위로 나누는 것
 - 나누어진 단어를 토큰(Token)이라고 함
 - 종류
   - 문장 토큰화
   - 단어 토큰화

In [1]:
import nltk

# NLTK Corpus 말뭉치 데이터셋 다운로드 받기
# nltk.download('all')

In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [3]:
raw_text1='''The Natural Language Toolkit (NLTK) is an open source Python library for Natural Language Processing. A free online book is available. (If you use the library for academic research, please cite the book.)

Steven Bird, Ewan Klein, and Edward Loper (2009). Natural Language Processing with Python. O’Reilly Media Inc.'''

raw_text2='''Tokenizers divide strings into lists of substrings. For example, tokenizers can be used to find the words and punctuation in a string.

This particular tokenizer requires the Punkt sentence tokenization models to be installed. NLTK also provides a simpler, regular-expression based tokenizer, which splits text on whitespace and punctuation.'''

In [4]:
# 단어 단위 토큰화
result1=word_tokenize(raw_text1)

In [5]:
result1

['The',
 'Natural',
 'Language',
 'Toolkit',
 '(',
 'NLTK',
 ')',
 'is',
 'an',
 'open',
 'source',
 'Python',
 'library',
 'for',
 'Natural',
 'Language',
 'Processing',
 '.',
 'A',
 'free',
 'online',
 'book',
 'is',
 'available',
 '.',
 '(',
 'If',
 'you',
 'use',
 'the',
 'library',
 'for',
 'academic',
 'research',
 ',',
 'please',
 'cite',
 'the',
 'book',
 '.',
 ')',
 'Steven',
 'Bird',
 ',',
 'Ewan',
 'Klein',
 ',',
 'and',
 'Edward',
 'Loper',
 '(',
 '2009',
 ')',
 '.',
 'Natural',
 'Language',
 'Processing',
 'with',
 'Python',
 '.',
 'O',
 '’',
 'Reilly',
 'Media',
 'Inc',
 '.']

In [6]:
sent_result=sent_tokenize(raw_text2)

In [7]:
sent_result, len(sent_result)

(['Tokenizers divide strings into lists of substrings.',
  'For example, tokenizers can be used to find the words and punctuation in a string.',
  'This particular tokenizer requires the Punkt sentence tokenization models to be installed.',
  'NLTK also provides a simpler, regular-expression based tokenizer, which splits text on whitespace and punctuation.'],
 4)

## 여러 문장에 토큰 추출
---

In [8]:
# 문장단위로 추출
raw_text=[raw_text1, raw_text2]
total_token=[]

# 문장 추출
for sent in raw_text:
    print(f'sent: {sent}', '-------', sep='\n')
    
    # 문장에서 추출한 토큰
    sentToken=word_tokenize(sent)
    print(sentToken, '-------', sep='\n')
    # 모든 문장의 토큰에 추가
    total_token.append(sentToken)
    
print(total_token)

sent: The Natural Language Toolkit (NLTK) is an open source Python library for Natural Language Processing. A free online book is available. (If you use the library for academic research, please cite the book.)

Steven Bird, Ewan Klein, and Edward Loper (2009). Natural Language Processing with Python. O’Reilly Media Inc.
-------
['The', 'Natural', 'Language', 'Toolkit', '(', 'NLTK', ')', 'is', 'an', 'open', 'source', 'Python', 'library', 'for', 'Natural', 'Language', 'Processing', '.', 'A', 'free', 'online', 'book', 'is', 'available', '.', '(', 'If', 'you', 'use', 'the', 'library', 'for', 'academic', 'research', ',', 'please', 'cite', 'the', 'book', '.', ')', 'Steven', 'Bird', ',', 'Ewan', 'Klein', ',', 'and', 'Edward', 'Loper', '(', '2009', ')', '.', 'Natural', 'Language', 'Processing', 'with', 'Python', '.', 'O', '’', 'Reilly', 'Media', 'Inc', '.']
-------
sent: Tokenizers divide strings into lists of substrings. For example, tokenizers can be used to find the words and punctuation i

In [9]:
# 문장단위로 추출
raw_text=[raw_text1, raw_text2]
total_token=[]

# 문장 추출
for sent in raw_text:
    print(f'sent: {sent}', '-------', sep='\n')
    sentResult=sent_tokenize(sent)
    
    for ele in sentResult:
        wordResult=word_tokenize(ele)
        print(wordResult, '-------', sep='\n')
        total_token.append(wordResult)
    
print(total_token)




sent: The Natural Language Toolkit (NLTK) is an open source Python library for Natural Language Processing. A free online book is available. (If you use the library for academic research, please cite the book.)

Steven Bird, Ewan Klein, and Edward Loper (2009). Natural Language Processing with Python. O’Reilly Media Inc.
-------
['The', 'Natural', 'Language', 'Toolkit', '(', 'NLTK', ')', 'is', 'an', 'open', 'source', 'Python', 'library', 'for', 'Natural', 'Language', 'Processing', '.']
-------
['A', 'free', 'online', 'book', 'is', 'available', '.']
-------
['(', 'If', 'you', 'use', 'the', 'library', 'for', 'academic', 'research', ',', 'please', 'cite', 'the', 'book', '.', ')']
-------
['Steven', 'Bird', ',', 'Ewan', 'Klein', ',', 'and', 'Edward', 'Loper', '(', '2009', ')', '.']
-------
['Natural', 'Language', 'Processing', 'with', 'Python', '.']
-------
['O', '’', 'Reilly', 'Media', 'Inc', '.']
-------
sent: Tokenizers divide strings into lists of substrings. For example, tokenizers ca

## 한글
---

In [10]:
from konlpy.tag import Okt

# 형태소 분리 객체
okt=Okt()

In [11]:
# 형태소 분리
result=okt.morphs('오늘은 월요일입니다.')
result

['오늘', '은', '월요일', '입니다', '.']

In [12]:
# 형태소 분리 후 태깅(Tagging) -> 품사
result2=okt.pos('오늘은 월요일입니다.')
result2

[('오늘', 'Noun'),
 ('은', 'Josa'),
 ('월요일', 'Noun'),
 ('입니다', 'Adjective'),
 ('.', 'Punctuation')]

In [13]:
result2=okt.pos('오늘은 월요일입니다.', stem=True)
result2

[('오늘', 'Noun'),
 ('은', 'Josa'),
 ('월요일', 'Noun'),
 ('이다', 'Adjective'),
 ('.', 'Punctuation')]

### [2] 정제 & 정규화
---
 - 불용어 제거 -> 노이즈 제거
 - 텍스트의 동일화
   - 대문자 또는 소문자로 통일
   - 문장의 길이

### [2-1] 불용어(Stopword)

In [14]:
en_stopwords=nltk.corpus.stopwords.words('english')
len(en_stopwords), en_stopwords[:10]

(179,
 ['i',
  'me',
  'my',
  'myself',
  'we',
  'our',
  'ours',
  'ourselves',
  'you',
  "you're"])

### [2-2] 어간 및 표제어 처리

In [15]:
from nltk.stem import LancasterStemmer

In [16]:
# 어간 추출
lstem=LancasterStemmer()

In [17]:
lstem.stem('working'),lstem.stem('works'),lstem.stem('worked')

('work', 'work', 'work')

In [18]:
lstem.stem('happy'), lstem.stem('happiness')

('happy', 'happy')

In [19]:
lstem.stem('amuse'), lstem.stem('amused')

('amus', 'amus')

In [20]:
# 표제어(사전에 등록된 단어 추출)
from nltk.stem import WordNetLemmatizer

In [21]:
wlemma=WordNetLemmatizer()

In [22]:
wlemma.lemmatize('working', 'v'), wlemma.lemmatize('worked', 'v')

('work', 'work')

In [23]:
wlemma.lemmatize('amusing', 'v'), wlemma.lemmatize('amused', 'v')

('amuse', 'amuse')

### [3] 텍스트 벡터화
---
 - 텍스트 -> 수치화
 - 희소벡터(OHE): BOW방식 -> Count기반, TF-IDF 기반
 - 밀집벡터: Embedding 방식, Word2Vect

In [24]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [25]:
corpus=[raw_text1, raw_text2]

In [26]:
ohe=CountVectorizer()

In [27]:
ohe.fit(corpus)
ret=ohe.transform(corpus)

In [28]:
print(ret)

  (0, 0)	1
  (0, 1)	1
  (0, 3)	1
  (0, 4)	1
  (0, 5)	1
  (0, 8)	1
  (0, 9)	2
  (0, 11)	1
  (0, 13)	1
  (0, 14)	1
  (0, 18)	2
  (0, 19)	1
  (0, 20)	1
  (0, 22)	1
  (0, 25)	2
  (0, 26)	1
  (0, 27)	3
  (0, 28)	2
  (0, 30)	1
  (0, 31)	1
  (0, 33)	3
  (0, 34)	1
  (0, 37)	1
  (0, 38)	1
  (0, 40)	1
  :	:
  (1, 35)	1
  (1, 36)	1
  (1, 39)	1
  (1, 42)	1
  (1, 43)	2
  (1, 44)	1
  (1, 46)	1
  (1, 48)	1
  (1, 50)	1
  (1, 51)	1
  (1, 53)	1
  (1, 55)	1
  (1, 56)	1
  (1, 57)	1
  (1, 58)	1
  (1, 59)	2
  (1, 60)	1
  (1, 61)	2
  (1, 62)	1
  (1, 63)	2
  (1, 64)	2
  (1, 67)	1
  (1, 68)	1
  (1, 69)	1
  (1, 71)	1


In [29]:
ret=ret.toarray()

In [30]:
print(ret.shape, ret)

(2, 73) [[1 1 0 1 1 1 0 0 1 2 0 1 0 1 1 0 0 0 2 1 1 0 1 0 0 2 1 3 2 0 1 1 0 3 1 0
  0 1 1 0 1 2 0 0 0 2 0 1 0 1 0 0 1 0 1 0 0 0 0 3 0 0 0 0 0 1 1 0 0 0 1 0
  1]
 [0 0 1 0 2 0 1 2 0 0 1 0 1 0 0 1 1 1 1 0 0 1 0 1 1 0 0 0 0 1 0 0 1 0 1 1
  1 0 0 1 0 0 1 2 1 0 1 0 1 0 1 1 0 1 0 1 1 1 1 2 1 2 1 2 2 0 0 1 1 1 0 1
  0]]


In [31]:
## TF-IDF 기반
tfidf=TfidfVectorizer()

In [32]:
tf_corpus=tfidf.fit_transform(corpus)

In [33]:
tf_corpus=tf_corpus.toarray()

In [34]:
print(tf_corpus)

[[0.11901923 0.11901923 0.         0.11901923 0.08468317 0.11901923
  0.         0.         0.11901923 0.23803847 0.         0.11901923
  0.         0.11901923 0.11901923 0.         0.         0.
  0.16936633 0.11901923 0.11901923 0.         0.11901923 0.
  0.         0.23803847 0.11901923 0.3570577  0.23803847 0.
  0.11901923 0.11901923 0.         0.3570577  0.08468317 0.
  0.         0.11901923 0.11901923 0.         0.11901923 0.23803847
  0.         0.         0.         0.23803847 0.         0.11901923
  0.         0.11901923 0.         0.         0.11901923 0.
  0.11901923 0.         0.         0.         0.         0.2540495
  0.         0.         0.         0.         0.         0.11901923
  0.11901923 0.         0.         0.         0.11901923 0.
  0.11901923]
 [0.         0.         0.13238075 0.         0.18837999 0.
  0.13238075 0.2647615  0.         0.         0.13238075 0.
  0.13238075 0.         0.         0.13238075 0.13238075 0.13238075
  0.09418999 0.         0.     

In [35]:
# 단어 단위 토큰화, 불용어 제거

sent='Wiki is in Ward is original description: The simplest online database that could possibly work.\
Wiki is a piece of server software that allows users to freely create and edit Web page content using any Web browser. Wiki supports hyperlinks and has a simple text syntax for creating new pages and crosslinks between internal pages on the fly.\
Wiki is unusual among group communication mechanisms in that it allows the organization of contributions to be edited in addition to the content itself.Like many simple concepts, "open editing" has some profound and subtle effects on Wiki usage. Allowing everyday users to create and edit any page in a Web site is exciting in that it encourages democratic use of the Web and promotes content composition by nontechnical users.'

In [36]:
word_result=word_tokenize(sent)

word_result

['Wiki',
 'is',
 'in',
 'Ward',
 'is',
 'original',
 'description',
 ':',
 'The',
 'simplest',
 'online',
 'database',
 'that',
 'could',
 'possibly',
 'work.Wiki',
 'is',
 'a',
 'piece',
 'of',
 'server',
 'software',
 'that',
 'allows',
 'users',
 'to',
 'freely',
 'create',
 'and',
 'edit',
 'Web',
 'page',
 'content',
 'using',
 'any',
 'Web',
 'browser',
 '.',
 'Wiki',
 'supports',
 'hyperlinks',
 'and',
 'has',
 'a',
 'simple',
 'text',
 'syntax',
 'for',
 'creating',
 'new',
 'pages',
 'and',
 'crosslinks',
 'between',
 'internal',
 'pages',
 'on',
 'the',
 'fly.Wiki',
 'is',
 'unusual',
 'among',
 'group',
 'communication',
 'mechanisms',
 'in',
 'that',
 'it',
 'allows',
 'the',
 'organization',
 'of',
 'contributions',
 'to',
 'be',
 'edited',
 'in',
 'addition',
 'to',
 'the',
 'content',
 'itself.Like',
 'many',
 'simple',
 'concepts',
 ',',
 '``',
 'open',
 'editing',
 "''",
 'has',
 'some',
 'profound',
 'and',
 'subtle',
 'effects',
 'on',
 'Wiki',
 'usage',
 '.',
 'Allo

In [37]:
en_stopwords=nltk.corpus.stopwords.words('english')

In [38]:
result=[]

for w in word_result:
    if w not in en_stopwords:
        result.append(w)

In [39]:
len(result)

85

In [40]:
result2=[word for word in word_result if word not in en_stopwords]
len(result2)

85

### Tokenizer 객체 생성
---

In [41]:
from keras.preprocessing.text import text_to_word_sequence, Tokenizer

In [42]:
raw_text=sent

In [43]:
# 토큰으로 나누기
tokens=text_to_word_sequence(raw_text)
tokens

['wiki',
 'is',
 'in',
 'ward',
 'is',
 'original',
 'description',
 'the',
 'simplest',
 'online',
 'database',
 'that',
 'could',
 'possibly',
 'work',
 'wiki',
 'is',
 'a',
 'piece',
 'of',
 'server',
 'software',
 'that',
 'allows',
 'users',
 'to',
 'freely',
 'create',
 'and',
 'edit',
 'web',
 'page',
 'content',
 'using',
 'any',
 'web',
 'browser',
 'wiki',
 'supports',
 'hyperlinks',
 'and',
 'has',
 'a',
 'simple',
 'text',
 'syntax',
 'for',
 'creating',
 'new',
 'pages',
 'and',
 'crosslinks',
 'between',
 'internal',
 'pages',
 'on',
 'the',
 'fly',
 'wiki',
 'is',
 'unusual',
 'among',
 'group',
 'communication',
 'mechanisms',
 'in',
 'that',
 'it',
 'allows',
 'the',
 'organization',
 'of',
 'contributions',
 'to',
 'be',
 'edited',
 'in',
 'addition',
 'to',
 'the',
 'content',
 'itself',
 'like',
 'many',
 'simple',
 'concepts',
 'open',
 'editing',
 'has',
 'some',
 'profound',
 'and',
 'subtle',
 'effects',
 'on',
 'wiki',
 'usage',
 'allowing',
 'everyday',
 'user

In [44]:
myToken=Tokenizer()

In [45]:
myToken.fit_on_texts(tokens)

In [46]:
print(myToken.word_index)

{'and': 1, 'wiki': 2, 'is': 3, 'in': 4, 'the': 5, 'that': 6, 'to': 7, 'web': 8, 'a': 9, 'of': 10, 'users': 11, 'content': 12, 'allows': 13, 'create': 14, 'edit': 15, 'page': 16, 'any': 17, 'has': 18, 'simple': 19, 'pages': 20, 'on': 21, 'it': 22, 'ward': 23, 'original': 24, 'description': 25, 'simplest': 26, 'online': 27, 'database': 28, 'could': 29, 'possibly': 30, 'work': 31, 'piece': 32, 'server': 33, 'software': 34, 'freely': 35, 'using': 36, 'browser': 37, 'supports': 38, 'hyperlinks': 39, 'text': 40, 'syntax': 41, 'for': 42, 'creating': 43, 'new': 44, 'crosslinks': 45, 'between': 46, 'internal': 47, 'fly': 48, 'unusual': 49, 'among': 50, 'group': 51, 'communication': 52, 'mechanisms': 53, 'organization': 54, 'contributions': 55, 'be': 56, 'edited': 57, 'addition': 58, 'itself': 59, 'like': 60, 'many': 61, 'concepts': 62, 'open': 63, 'editing': 64, 'some': 65, 'profound': 66, 'subtle': 67, 'effects': 68, 'usage': 69, 'allowing': 70, 'everyday': 71, 'site': 72, 'exciting': 73, 'enc

In [47]:
print(myToken.word_counts)

OrderedDict([('wiki', 5), ('is', 5), ('in', 5), ('ward', 1), ('original', 1), ('description', 1), ('the', 5), ('simplest', 1), ('online', 1), ('database', 1), ('that', 4), ('could', 1), ('possibly', 1), ('work', 1), ('a', 3), ('piece', 1), ('of', 3), ('server', 1), ('software', 1), ('allows', 2), ('users', 3), ('to', 4), ('freely', 1), ('create', 2), ('and', 6), ('edit', 2), ('web', 4), ('page', 2), ('content', 3), ('using', 1), ('any', 2), ('browser', 1), ('supports', 1), ('hyperlinks', 1), ('has', 2), ('simple', 2), ('text', 1), ('syntax', 1), ('for', 1), ('creating', 1), ('new', 1), ('pages', 2), ('crosslinks', 1), ('between', 1), ('internal', 1), ('on', 2), ('fly', 1), ('unusual', 1), ('among', 1), ('group', 1), ('communication', 1), ('mechanisms', 1), ('it', 2), ('organization', 1), ('contributions', 1), ('be', 1), ('edited', 1), ('addition', 1), ('itself', 1), ('like', 1), ('many', 1), ('concepts', 1), ('open', 1), ('editing', 1), ('some', 1), ('profound', 1), ('subtle', 1), ('ef

In [51]:
myToken.texts_to_sequences(['and','is'])

[[1], [3]]

---
 - 제공한 문서/문장에 대한 단어사전(vaca)
 - 단어사전(voca)에 존재하지 않는 단어 -> Out Of Voca: OOV

In [84]:
sentences = [
  'I love my dog',
  'I love my cat',
  'You love my dog!',
  'Do you think my dog is amazing?'
]

In [86]:
tokenizer=Tokenizer()
                    # num_words=:.texts_to_sequences에 표시할 빈도높은 단어갯수
                    # oov_token=: oov의 인덱스를 설정

# 단어 빈도수가 높은 순으로 낮은 정수 인덱스 부여
tokenizer.fit_on_texts(sentences)

In [87]:
tokenizer.word_index

{'my': 1,
 'love': 2,
 'dog': 3,
 'i': 4,
 'you': 5,
 'cat': 6,
 'do': 7,
 'think': 8,
 'is': 9,
 'amazing': 10}

In [88]:
# 단어 출력 갯수
tokenizer.word_counts

OrderedDict([('i', 2),
             ('love', 3),
             ('my', 4),
             ('dog', 3),
             ('cat', 1),
             ('you', 2),
             ('do', 1),
             ('think', 1),
             ('is', 1),
             ('amazing', 1)])

In [93]:
# 문장을 생성된 사전(vaca)을 기반으로 수치화
seq_voca=tokenizer.texts_to_sequences(sentences)

## One-Hot-Encording 변환
---
 - sklearn OneHotEncoder 객체생성
 - keras 함수

In [78]:
from keras.utils import to_categorical

In [99]:
len(seq_voca), seq_voca

(4, [[4, 2, 1, 3], [4, 2, 1, 6], [5, 2, 1, 3], [7, 5, 8, 1, 3, 9, 10]])

In [100]:
to_categorical(seq_voca[0])

array([[0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0.]], dtype=float32)

In [105]:
tokenizer.texts_to_matrix(sentences)

array([[0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0.],
       [0., 1., 1., 0., 1., 0., 1., 0., 0., 0., 0.],
       [0., 1., 1., 1., 0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 1., 0., 1., 0., 1., 1., 1., 1.]])

### 패딩(Padding)
---
 - 길이가 모두 다른 문장들을 동일 길이로 맞추기 위한 과정
 - 길이 기준 설정
 - 긴 경우 -> 앞/뒤 중 선택(값 제거 위치)
 - 짧은 경우 -> 앞/뒤 중 선택(값 채울 위치)
 - 값 -> 패딩에 들어갈 값
 - OHE 전에 하는듯 아마

In [106]:
from keras.utils import pad_sequences

In [107]:
result=tokenizer.texts_to_sequences(sentences)
result

[[4, 2, 1, 3], [4, 2, 1, 6], [5, 2, 1, 3], [7, 5, 8, 1, 3, 9, 10]]

In [109]:
encoding=pad_sequences(result)
encoding

array([[ 0,  0,  0,  4,  2,  1,  3],
       [ 0,  0,  0,  4,  2,  1,  6],
       [ 0,  0,  0,  5,  2,  1,  3],
       [ 7,  5,  8,  1,  3,  9, 10]])