# 자연어와 단어의 분산 표현

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#1.-통계-기반-기법" data-toc-modified-id="1.-통계-기반-기법-1">1. 통계 기반 기법</a></span><ul class="toc-item"><li><span><a href="#1.1.-말뭉치-전처리" data-toc-modified-id="1.1.-말뭉치-전처리-1.1">1.1. 말뭉치 전처리</a></span></li><li><span><a href="#1.2.-동시발생-행렬" data-toc-modified-id="1.2.-동시발생-행렬-1.2">1.2. 동시발생 행렬</a></span></li><li><span><a href="#2.3.5.-벡터-간-유사도" data-toc-modified-id="2.3.5.-벡터-간-유사도-1.3">2.3.5. 벡터 간 유사도</a></span></li></ul></li></ul></div>

## 1. 통계 기반 기법

### 1.1. 말뭉치 전처리

In [10]:
text = 'You say goodbye and I say hello'
text = text.lower()
text = text.replace('.', ' .')
words = text.split()
words

['you', 'say', 'goodbye', 'and', 'i', 'say', 'hello']

In [11]:
# use the regular expression
import re
text = 'You say goodby and I say hello'
p = re.compile('\w+')
p.findall(text.lower())

['you', 'say', 'goodby', 'and', 'i', 'say', 'hello']

In [12]:
word_to_id = {}
id_to_word = {}

for word in words:
    if word not in word_to_id:
        new_id = len(word_to_id)
        word_to_id[word] = new_id
        id_to_word[new_id] = word


In [13]:
word_to_id

{'you': 0, 'say': 1, 'goodbye': 2, 'and': 3, 'i': 4, 'hello': 5}

In [14]:
id_to_word

{0: 'you', 1: 'say', 2: 'goodbye', 3: 'and', 4: 'i', 5: 'hello'}

In [15]:
id_to_word[1]

'say'

In [17]:
word_to_id['and']

3

In [20]:
import numpy as np
corpus = [word_to_id[w] for w in words]
corpus = np.array(corpus)
corpus

array([0, 1, 2, 3, 4, 1, 5])

In [31]:
def preprocess(text):
    import re
    p = re.compile('\w+')
    text = text.lower()
    text = p.findall(text)
    word_to_id = {}
    id_to_word = {}
    for word in words:
        if word not in word_to_id:
            new_id = len(word_to_id)
            word_to_id[word] = new_id
            id_to_word[new_id] = word
    corpus = np.array([word_to_id[w] for w in words])
    
    return corpus, word_to_id, id_to_word

In [32]:
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)

In [33]:
corpus

array([0, 1, 2, 3, 4, 1, 5])

In [34]:
word_to_id

{'you': 0, 'say': 1, 'goodbye': 2, 'and': 3, 'i': 4, 'hello': 5}

In [35]:
id_to_word

{0: 'you', 1: 'say', 2: 'goodbye', 3: 'and', 4: 'i', 5: 'hello'}

### 1.2. 동시발생 행렬

In [36]:
import sys
sys.path.append('..')
from common.util import preprocess
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)

print(corpus)
print(id_to_word)

[0 1 2 3 4 1 5 6]
{0: 'you', 1: 'say', 2: 'goodbye', 3: 'and', 4: 'i', 5: 'hello', 6: '.'}


In [48]:
# 동시 발생 행렬 구현
C = np.array([
    [0, 1, 0, 0, 0, 0, 0],
    [1, 0, 1, 0, 1, 1, 0],
    [0, 1, 0, 1, 0, 0, 0],
    [0, 0, 1, 0, 1, 0, 0],
    [0, 1, 0, 0, 0, 0, 1],
    [0, 1, 0, 0, 0, 0, 1],
    [0, 0, 0, 0, 0, 1, 0]
], dtype=np.float32)

C

array([[0., 1., 0., 0., 0., 0., 0.],
       [1., 0., 1., 0., 1., 1., 0.],
       [0., 1., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1., 0.]], dtype=float32)

In [51]:
print(C[0])
print(C[word_to_id['say']])

[0. 1. 0. 0. 0. 0. 0.]
[1. 0. 1. 0. 1. 1. 0.]


In [52]:
# 동시 발생 행렬 자동화
def create_co_matrix(corpus, vocab_size, window_size=1):
    corpus_size = len(corpus)
    co_matrix = np.zeros((vocab_size, vocab_size), dtype=np.float32)
    
    for idx, word_id in enumerate(corpus):
        for i in range(1, window_size+1):
            left_idx = idx-i
            right_idx = idx+i
            
            if left_idx >= 0:
                left_word_id = corpus[left_idx]
                co_matrix[word_id, left_word_id] += 1
                
            if right_idx < corpus_size:
                right_word_id = corpus[right_idx]
                co_matrix[word_id, right_word_id] += 1
    return co_matrix

### 2.3.5. 벡터 간 유사도

In [59]:
# 코사인 유사도 구현
def cos_similarity(x, y, eps=1e-8):
    nx = x / np.sqrt(np.sum(x**2)+eps)
    ny = y / np.sqrt(np.sum(y**2)+eps)
    return np.dot(nx, ny)


In [60]:
x = np.array([-9, 7, 3, 1,1 ,1 ,1 ,1 ])
y = np.array([32, 2, -10, 100,1 ,1 ,1 ,1])

cos_similarity(x, y)

-0.1579654651891977

In [64]:
from common.util import preprocess, create_co_matrix, cos_similarity
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)
C = create_co_matrix(corpus, vocab_size, 1)

c0 = C[word_to_id['you']]
c1 = C[word_to_id['i']]
print(cos_similarity(c0, c1))

0.7071067691154799


In [79]:
from common.util import preprocess, create_co_matrix, most_similar

text = 'You say goodbye and I say hello'
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)
C = create_co_matrix(corpus, vocab_size)

most_similar('you', word_to_id, id_to_word, C, top=5)


[query] you
 hello: 0.9999999800000005
 goodbye: 0.7071067691154799
 i: 0.7071067691154799
 say: 0.0
 and: 0.0


array([ -9,  -4,  -1,   0,  -1,  -4,  -9, -16, -25, -36, -49, -64])