<a href="https://colab.research.google.com/github/SeongwonTak/TIL_swtak/blob/master/0826_tfidf_with_similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TF-IDF를 통한 문장 유사도 확인

In [1]:
# 유사도는 코사인 유사도를 활용하고자 한다.
import numpy as np

def cos_similarity(x1, x2):
    """Return the cosine similarity between two 1-D vectors.

    The similarity is ``dot(x1, x2) / (||x1|| * ||x2||)`` using the
    Euclidean norm.

    Args:
        x1: 1-D array-like of numbers.
        x2: 1-D array-like of numbers with the same length as ``x1``.

    Returns:
        The cosine similarity as a float. If either vector has zero norm
        the similarity is undefined; 0.0 is returned instead of letting
        the 0/0 division produce NaN and a RuntimeWarning.
    """
    # Convert once to float arrays so that lists are accepted and integer
    # inputs cannot overflow while being squared.
    x1 = np.asarray(x1, dtype=float)
    x2 = np.asarray(x2, dtype=float)

    # np.linalg.norm does the reduction in native code; the builtin sum()
    # would iterate the array element by element in Python.
    norm_prod = np.linalg.norm(x1) * np.linalg.norm(x2)
    if norm_prod == 0.0:
        return 0.0

    return float(np.dot(x1, x2) / norm_prod)

In [2]:
# Sample sentences used as the corpus for the similarity analysis.
sentence_list = [
    'I hope that my dreams come true',
    'I hope that you can dream happy life',
    'Unless you fail, there may be dream',
    'That makes me disappointed after you fail that',
    'Take some red pills from there to be cured',
]

In [7]:
# stop words를 고려하지 않은 tfidf
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF on the corpus without removing stop words.
tfidf = TfidfVectorizer()
word_vector = tfidf.fit_transform(sentence_list)
# toarray() yields a plain ndarray; todense() would return the
# discouraged np.matrix type. The printed text is the same either way.
print(word_vector.toarray())

[[0.         0.         0.         0.4428322  0.         0.
  0.         0.4428322  0.         0.         0.         0.35727423
  0.         0.         0.         0.         0.4428322  0.
  0.         0.         0.         0.29656989 0.         0.
  0.4428322  0.         0.        ]
 [0.         0.         0.43857711 0.         0.         0.
  0.35384125 0.         0.         0.         0.43857711 0.35384125
  0.43857711 0.         0.         0.         0.         0.
  0.         0.         0.         0.2937202  0.         0.
  0.         0.         0.2937202 ]
 [0.         0.35894109 0.         0.         0.         0.
  0.35894109 0.         0.35894109 0.         0.         0.
  0.         0.         0.44489823 0.         0.         0.
  0.         0.         0.         0.         0.35894109 0.
  0.         0.44489823 0.29795353]
 [0.38087336 0.         0.         0.         0.         0.38087336
  0.         0.         0.30728623 0.         0.         0.
  0.         0.38087336 0.  

In [8]:
# Densify as a plain ndarray (not np.matrix) so each row is already a
# 1-D vector and needs no np.array(...).reshape(-1,) conversion.
word_matrix = word_vector.toarray()
# One row per sentence, in corpus order; unpacking requires exactly
# five rows.
vect1, vect2, vect3, vect4, vect5 = word_matrix

In [13]:
# Print the cosine similarity of the first vector against each of the
# others, one value per line.
for other in (
    vect2,  # similar sentence
    vect3,  # similar, but contains an antonym
    vect4,  # opposite sentence
    vect5,  # unrelated sentence
):
    print(cos_similarity(vect1, other))

0.21352692493230035
0.0
0.151295320939704
0.0


In [14]:
# Repeat the TF-IDF experiment with English stop words removed.
tfidf = TfidfVectorizer(stop_words='english')
word_vector = tfidf.fit_transform(sentence_list)

# Densify as a plain ndarray (not np.matrix): the printed text is the
# same, and each row is already a 1-D vector.
word_matrix = word_vector.toarray()
print(word_matrix)

# One row per sentence, in corpus order; unpacking requires exactly
# five rows.
vect1, vect2, vect3, vect4, vect5 = word_matrix

[[0.52335825 0.         0.         0.         0.52335825 0.
  0.         0.42224214 0.         0.         0.         0.
  0.52335825 0.        ]
 [0.         0.         0.         0.44400208 0.         0.
  0.55032913 0.44400208 0.55032913 0.         0.         0.
  0.         0.        ]
 [0.         0.         0.         0.53177225 0.         0.53177225
  0.         0.         0.         0.         0.         0.
  0.         0.659118  ]
 [0.         0.         0.61418897 0.         0.         0.49552379
  0.         0.         0.         0.61418897 0.         0.
  0.         0.        ]
 [0.         0.57735027 0.         0.         0.         0.
  0.         0.         0.         0.         0.57735027 0.57735027
  0.         0.        ]]


In [15]:
# Print the cosine similarity of the first vector against each of the
# others, one value per line.
for other in (
    vect2,  # similar sentence
    vect3,  # similar, but contains an antonym
    vect4,  # opposite sentence
    vect5,  # unrelated sentence
):
    print(cos_similarity(vect1, other))

0.18747638838165476
0.0
0.0
0.0


In [16]:
# CountVectorizer를 활용, stop_words 고려
from sklearn.feature_extraction.text import CountVectorizer

# Build raw term-count vectors with English stop words removed.
cnt_vect = CountVectorizer(stop_words='english')
word_vector = cnt_vect.fit_transform(sentence_list)
# toarray() yields a plain ndarray; todense() would return the
# discouraged np.matrix type. The printed text is the same either way.
print(word_vector.toarray())

[[1 0 0 0 1 0 0 1 0 0 0 0 1 0]
 [0 0 0 1 0 0 1 1 1 0 0 0 0 0]
 [0 0 0 1 0 1 0 0 0 0 0 0 0 1]
 [0 0 1 0 0 1 0 0 0 1 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0 1 1 0 0]]


In [18]:
# Densify as a plain ndarray (not np.matrix) so each row is already a
# 1-D vector and needs no np.array(...).reshape(-1,) conversion.
word_matrix = word_vector.toarray()
# One row per sentence, in corpus order; unpacking requires exactly
# five rows.
vect1, vect2, vect3, vect4, vect5 = word_matrix

# Print the cosine similarity of the first vector against each of the
# others, one value per line.
for other in (
    vect2,  # similar sentence
    vect3,  # similar, but contains an antonym
    vect4,  # opposite sentence
    vect5,  # unrelated sentence
):
    print(cos_similarity(vect1, other))

0.25
0.0
0.0
0.0
