In [1]:
import numpy as np

전처리: corpus, word_to_id, id_to_word 제작

In [2]:
def preprocessing(text):
    """Tokenise ``text`` and build the word/id lookup tables.

    The text is lower-cased, every '.' is detached from the preceding word so
    that it becomes its own token, and the result is split on whitespace.

    Args:
        text: Input string to tokenise.

    Returns:
        A tuple ``(corpus, word_to_id, id_to_word)`` where
        ``corpus`` is a 1-D integer ``np.ndarray`` holding the id of every
        token in order of appearance, ``word_to_id`` maps each distinct token
        to its id (ids are assigned from 0 in order of first appearance), and
        ``id_to_word`` is the inverse mapping.
    """
    text = text.lower()
    text = text.replace('.', ' .')
    # split() without an argument splits on any run of whitespace, so repeated
    # spaces, tabs, newlines and leading/trailing blanks never produce an
    # empty-string token that would pollute the vocabulary.
    words = text.split()

    # Assign word to id and id to word
    word_to_id = {}
    id_to_word = {}
    for word in words:
        if word not in word_to_id:
            new_id = len(word_to_id)
            word_to_id[word] = new_id
            id_to_word[new_id] = word

    # Explicit integer dtype keeps the corpus usable as an index array even
    # when the input contains no tokens (np.array([]) would default to float).
    corpus = np.array([word_to_id[w] for w in words], dtype=int)
    return corpus, word_to_id, id_to_word

In [3]:
# Run the preprocessing on a sample sentence and show each result with its size.
text = "You want to buy an icecream and I want to eat yours."
corpus, word_to_id, id_to_word = preprocessing(text)

print(f"{corpus} corpus size: {len(corpus)}")
print(f"{word_to_id} word_to_id size: {len(word_to_id)}")
print(f"{id_to_word} id_to_word size: {len(id_to_word)}")

[ 0  1  2  3  4  5  6  7  1  2  8  9 10] corpus size: 13
{'you': 0, 'want': 1, 'to': 2, 'buy': 3, 'an': 4, 'icecream': 5, 'and': 6, 'i': 7, 'eat': 8, 'yours': 9, '.': 10} word_to_id size: 11
{0: 'you', 1: 'want', 2: 'to', 3: 'buy', 4: 'an', 5: 'icecream', 6: 'and', 7: 'i', 8: 'eat', 9: 'yours', 10: '.'} id_to_word size: 11


co-occurrence matrix: 단어 부근에 어떤 단어가 오는지 체크 (대칭 행렬)

In [4]:
def create_co_matrix(corpus, vocab_size, window_size=1):
    """Build the word co-occurrence matrix for ``corpus``.

    For every position in the corpus, each word found at a distance of
    1..``window_size`` to the left or right is counted once as a context of
    the word at that position.

    Args:
        corpus: Sequence of integer word ids.
        vocab_size: Number of distinct word ids; the result has shape
            ``(vocab_size, vocab_size)``.
        window_size: How many neighbours on each side count as context.

    Returns:
        ``np.ndarray`` of dtype ``int32`` where entry ``[a, b]`` is the number
        of times word ``b`` appeared within the window around word ``a``.
    """
    corpus_size = len(corpus)
    co_matrix = np.zeros((vocab_size, vocab_size), dtype=np.int32)

    for idx, word_id in enumerate(corpus):
        for i in range(1, window_size + 1):
            # The offset must follow the loop variable; a fixed offset of 1
            # would recount the adjacent words and never reach farther ones.
            left_idx = idx - i
            right_idx = idx + i

            if left_idx >= 0:
                left_word_id = corpus[left_idx]
                co_matrix[word_id, left_word_id] += 1

            if right_idx < corpus_size:
                right_word_id = corpus[right_idx]
                co_matrix[word_id, right_word_id] += 1

    return co_matrix

In [5]:
# Co-occurrence counts for the sample corpus, using the default window of 1.
co_matrix = create_co_matrix(corpus, len(word_to_id))
print(co_matrix)

[[0 1 0 0 0 0 0 0 0 0 0]
 [1 0 2 0 0 0 0 1 0 0 0]
 [0 2 0 1 0 0 0 0 1 0 0]
 [0 0 1 0 1 0 0 0 0 0 0]
 [0 0 0 1 0 1 0 0 0 0 0]
 [0 0 0 0 1 0 1 0 0 0 0]
 [0 0 0 0 0 1 0 1 0 0 0]
 [0 1 0 0 0 0 1 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 1 0 1]
 [0 0 0 0 0 0 0 0 0 1 0]]


cos similarity: 코사인 유사도

In [6]:
def cos_similarity(x, y, eps=1e-8):
    """Return the cosine similarity of the vectors ``x`` and ``y``.

    Each vector is divided by ``sqrt(sum of squares + eps)`` and the dot
    product of the two scaled vectors is returned. ``eps`` keeps the divisor
    non-zero when a vector contains only zeros.
    """
    # Smoothed Euclidean lengths of the two vectors.
    length_x = np.sqrt(np.sum(x * x) + eps)
    length_y = np.sqrt(np.sum(y * y) + eps)

    # Scale both vectors to (approximately) unit length, then take the
    # inner product.
    unit_x = x / length_x
    unit_y = y / length_y
    return np.dot(unit_x, unit_y)

In [7]:
# Look up the (lower-cased) tokens and compare their co-occurrence rows.
x_idx, y_idx = word_to_id['i'], word_to_id['you']

# Row of the co-occurrence matrix = context-count vector of that word.
x, y = co_matrix[x_idx], co_matrix[y_idx]
print(cos_similarity(x, y))

0.7071067758832467
