### Vectorization of Sentences

In [14]:
import pandas as pd
# BOW(Bag of Word)
from sklearn.feature_extraction.text import CountVectorizer
# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Toy corpus shared by every vectorization example in this notebook.
sentences = [
    'I love my dog.',
    'I love my cat.',
    'I love my dog and love my cat.',
    'You love my dog!',
    'Do you think my dog is amazing?',
]

#### 1. Bag of Words (BoW)

In [12]:
## Tokenize the text and build bag-of-words features: one column per
## vocabulary word, holding how often that word occurs in each sentence.
count_vectorizer = CountVectorizer()
# fit_transform learns the vocabulary from `sentences` (fit) and returns the
# word-count matrix for those same sentences (transform) in a single call.
features = count_vectorizer.fit_transform(sentences)

## Vocabulary words, in the same order as the matrix columns.
feature_names = count_vectorizer.get_feature_names_out()
## Convert the features object to a NumPy array.
vectorized_sentences = features.toarray()

## One row per sentence, one column per word; the index is labelled 'sentence'.
df = pd.DataFrame(vectorized_sentences, columns=feature_names).rename_axis('sentence')
df

Unnamed: 0_level_0,amazing,and,cat,do,dog,is,love,my,think,you
sentence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,0,0,0,1,0,1,1,0,0
1,0,0,1,0,0,0,1,1,0,0
2,0,1,1,0,1,0,2,2,0,0
3,0,0,0,0,1,0,1,1,0,1
4,1,0,0,1,1,1,0,1,1,1


#### 2. TF-IDF(Term Frequency-Inverse Document Frequency)


In [18]:
## TF-IDF features: term counts re-weighted by how rare each word is across
## the sentences, so words that appear everywhere (e.g. 'my') score lower.
tfidf_vectorizer = TfidfVectorizer()
# Learn the vocabulary and IDF weights from `sentences`, then transform them.
tfidf_sentences = tfidf_vectorizer.fit_transform(sentences)

## Vocabulary words extracted by the vectorizer (matrix column labels).
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
## Convert the TF-IDF features object to a NumPy array.
tfidf_vect_sentences = tfidf_sentences.toarray()

## Normalized TF-IDF matrix as a DataFrame: rows are sentences, columns are words.
# NOTE: this rebinds `df`, replacing the bag-of-words frame built earlier.
df = pd.DataFrame(data=tfidf_vect_sentences, columns=tfidf_feature_names)
df

Unnamed: 0,amazing,and,cat,do,dog,is,love,my,think,you
0,0.0,0.0,0.0,0.0,0.606856,0.0,0.606856,0.513275,0.0,0.0
1,0.0,0.0,0.737922,0.0,0.0,0.0,0.51529,0.435829,0.0,0.0
2,0.0,0.491109,0.396224,0.0,0.276682,0.0,0.553364,0.468032,0.0,0.0
3,0.0,0.0,0.0,0.0,0.458054,0.0,0.458054,0.387419,0.0,0.655957
4,0.438724,0.0,0.0,0.438724,0.24717,0.438724,0.0,0.209054,0.438724,0.35396


#### 3. Keras word encoding

In [23]:
## keras API 이용
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

Tokenize

In [25]:
## Build the word-index vocabulary.
# num_words=100 limits encoding to the most frequent words (per the Keras docs,
# the top num_words - 1); words outside the vocabulary map to the '<OOV>' token.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=100)
# Tokenize the sentences and assign every distinct word a unique integer index.
tokenizer.fit_on_texts(sentences)

# Show the mapping in both directions: word -> index, then index -> word.
for mapping in (tokenizer.word_index, tokenizer.index_word):
    print(mapping)

{'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'cat': 6, 'you': 7, 'and': 8, 'do': 9, 'think': 10, 'is': 11, 'amazing': 12}
{1: '<OOV>', 2: 'my', 3: 'love', 4: 'dog', 5: 'i', 6: 'cat', 7: 'you', 8: 'and', 9: 'do', 10: 'think', 11: 'is', 12: 'amazing'}


text의 sequence 변환 및 padding
<br>
texts_to_sequences : text list 내의 각 text를 정수 인덱스 수열(sequence)로 변환
- 입력 : text(string) list
- 반환 : sequence list
<br>
pad_sequences : 모든 sequence가 동일한 길이가 되도록 0으로 패딩(zero padding)

In [26]:
## Encode each sentence as a list of word indices.
sequences = tokenizer.texts_to_sequences(sentences)
## Zero-pad at the end of each sequence ('post'); truncating='post' would also
## cut from the end, but only matters when a maximum length is enforced.
padded = pad_sequences(
    sequences,
    padding='post',
    truncating='post',
)
# Raw sequences, a blank line, then the padded matrix.
print(sequences, padded, sep='\n\n')

[[5, 3, 2, 4], [5, 3, 2, 6], [5, 3, 2, 4, 8, 3, 2, 6], [7, 3, 2, 4], [9, 7, 10, 2, 4, 11, 12]]

[[ 5  3  2  4  0  0  0  0]
 [ 5  3  2  6  0  0  0  0]
 [ 5  3  2  4  8  3  2  6]
 [ 7  3  2  4  0  0  0  0]
 [ 9  7 10  2  4 11 12  0]]


sequenced sentence를 word sentence로 환원

In [29]:
# Map every index back to its word and print one decoded sentence per line.
for encoded in sequences:
    print(' '.join(tokenizer.index_word[token_id] for token_id in encoded))

i love my dog
i love my cat
i love my dog and love my cat
you love my dog
do you think my dog is amazing


one-hot Encoding 표현

In [30]:
# One-hot encode every index in `padded`. No num_classes is passed, so the
# vector width follows the data (13 columns in the output below); padding
# zeros become a one-hot in column 0.
to_categorical(padded)

array([[[0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

    