In [3]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

target = "화산의 검은 매화를 흉내 내지 않는다. \
화산의 검은 매화를 피워낸다. 매화가 아니라 개화이다."

tokenizer = Tokenizer()
tokenizer.fit_on_texts([target])

for key, idx in tokenizer.word_index.items():
    print(f"{key} : {idx}")

화산의 : 1
검은 : 2
매화를 : 3
흉내 : 4
내지 : 5
않는다 : 6
피워낸다 : 7
매화가 : 8
아니라 : 9
개화이다 : 10


In [4]:
# %% One-hot encoding 구현
encoded = tokenizer.texts_to_sequences([target])[0]
print(encoded)

onehot_encoded = tf.keras.utils.to_categorical(encoded)
print(onehot_encoded)

[1, 2, 3, 4, 5, 6, 1, 2, 3, 7, 8, 9, 10]
[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]


In [6]:
# Bag-of-words 구현
document_1 = "화산의 검은 매화를 흉내 내지 않는다. \
화산의 검은 매화를 피워낸다. 매화가 아니라 개화이다."

document_2 = "매화의 화려함에 눈을 빼앗기고, \
    검의 날카로움에 영혼이 홀린 이들은 결코 화산 검학의 진수에 도달할 수 없다."
    
document_3 = "저에게 있어 화산은 그저 화산입니다."
#%%
from sklearn.feature_extraction.text import CountVectorizer

training_documents = [document_1, document_2, document_3]

bow_vectorizer = CountVectorizer()
bow_vectorizer.fit(training_documents)

word_index = bow_vectorizer.vocabulary_

for key, idx in sorted(word_index.items()):
    print(f"({key}: {idx})")

(개화이다: 0)
(검은: 1)
(검의: 2)
(검학의: 3)
(결코: 4)
(그저: 5)
(날카로움에: 6)
(내지: 7)
(눈을: 8)
(도달할: 9)
(매화가: 10)
(매화를: 11)
(매화의: 12)
(빼앗기고: 13)
(아니라: 14)
(않는다: 15)
(없다: 16)
(영혼이: 17)
(이들은: 18)
(있어: 19)
(저에게: 20)
(진수에: 21)
(피워낸다: 22)
(홀린: 23)
(화려함에: 24)
(화산: 25)
(화산은: 26)
(화산의: 27)
(화산입니다: 28)
(흉내: 29)


In [7]:
bow_vector_1 = bow_vectorizer.transform([document_1])
bow_vector_2 = bow_vectorizer.transform([document_2])
bow_vector_3 = bow_vectorizer.transform([document_3])

print(bow_vector_1.toarray())
print(bow_vector_2.toarray())
print(bow_vector_3.toarray())
# %%
import pandas as pd

result = []
vocab = list(word_index.keys())

for i in range(0, len(training_documents)):
    result.append([])
    d = training_documents[i]
    for j in range(0, len(vocab)):
        target = vocab[j]
        result[-1].append(d.count(target))
        
tf_ = pd.DataFrame(result, columns = vocab)
tf_

[[1 2 0 0 0 0 0 1 0 0 1 2 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 2 0 1]]
[[0 0 1 1 1 0 1 0 1 1 0 0 1 1 0 0 1 1 1 0 0 1 0 1 1 1 0 0 0 0]]
[[0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 0 1 0]]


Unnamed: 0,화산의,검은,매화를,흉내,내지,않는다,피워낸다,매화가,아니라,개화이다,...,화산,검학의,진수에,도달할,없다,저에게,있어,화산은,그저,화산입니다
0,2,2,2,1,1,1,1,1,1,1,...,2,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,2,0,0,0,0,1,1,1,1,1


In [8]:
# TF-IDF using sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

tfidfv = TfidfVectorizer().fit(training_documents)
sk_tf_idf = tfidfv.transform(training_documents).toarray()
print(sk_tf_idf)
print(tfidfv.vocabulary_)

[[0.22941573 0.45883147 0.         0.         0.         0.
  0.         0.22941573 0.         0.         0.22941573 0.45883147
  0.         0.         0.22941573 0.22941573 0.         0.
  0.         0.         0.         0.         0.22941573 0.
  0.         0.         0.         0.45883147 0.         0.22941573]
 [0.         0.         0.25819889 0.25819889 0.25819889 0.
  0.25819889 0.         0.25819889 0.25819889 0.         0.
  0.25819889 0.25819889 0.         0.         0.25819889 0.25819889
  0.25819889 0.         0.         0.25819889 0.         0.25819889
  0.25819889 0.25819889 0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.4472136
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.4472136  0.4472136  0.         0.         0.
  0.         0.         0.4472136  0.         0.4472136  0.        ]]
{'화산의': 27, '검은': 1, '매화를': 11, '흉내': 29,

In [9]:
from sklearn.metrics.pairwise import euclidean_distances

def jaccard_similarity(doc1, doc2):
    s1 = set(doc1)
    s2 = set(doc2)
    
    return float(len(s1.intersection(s2)) / len(s1.union(s2)))