In [2]:
import numpy as np

c = np.array([[1, 0, 0, 0, 0, 0, 0]]) # one-hot input row vector, shape (1, 7)
W = np.random.randn(7, 3)          # randomly initialised weight matrix, shape (7, 3)
h = np.matmul(c, W)                # hidden node; the single 1 at index 0 picks out row 0 of W
print(h)

[[0.34881815 1.72459666 1.04010204]]


In [6]:
class MatMul:
    """Matrix-product layer built around a single weight matrix.

    Attributes:
        params: one-element list holding the weight matrix passed to __init__.
        grads: one-element list holding a zero-filled array shaped like the weight.
        x: input of the most recent forward() call (None until then).
    """

    def __init__(self, W):
        # The gradient buffer is allocated once and later overwritten in place.
        self.x = None
        self.params = [W]
        self.grads = [np.zeros_like(W)]

    def forward(self, x):
        """Return np.dot(x, W) and remember x for the backward pass."""
        (weight,) = self.params
        product = np.dot(x, weight)
        self.x = x
        return product

    def backward(self, dout):
        """Propagate the upstream gradient dout through the product.

        Stores the weight gradient into grads[0] and returns the gradient
        with respect to the forward input.
        """
        (weight,) = self.params
        grad_input = np.dot(dout, weight.T)
        grad_weight = np.dot(self.x.T, dout)
        # Write into the existing buffer ([...]) so that anyone holding a
        # reference to grads[0] sees the new values.
        self.grads[0][...] = grad_weight
        return grad_input

In [7]:
W = np.random.randn(2, 3)
new_W = [W]  # wrap the array in a one-element list, the same form as MatMul.params
new_W

[array([[-1.00454817,  0.81768279, -0.74224535],
        [ 0.098616  , -1.08278578, -1.30831411]])]

In [10]:
import sys
sys.path.append('..')
import numpy as np
from common.layers import MatMul

c = np.array([[1, 0, 0, 0, 0, 0, 0]]) # input (1,7)
W = np.random.randn(7, 3)             # weight (7, 3)
layer = MatMul(W) # instance attributes: (params, grads)
h = layer.forward(c)                  # output (1, 3)

print(h)

[[-0.59647672 -1.64404408 -1.72826619]]


In [39]:
import sys
sys.path.append('..')
import numpy as np
from common.layers import MatMul

# Sample context data: one-hot rows for word IDs 0 and 2
c0 = np.array([[1, 0, 0, 0, 0, 0, 0]])
c1 = np.array([[0, 0, 1, 0, 0, 0, 0]])

# Weight initialisation
W_in = np.random.randn(7, 3) # distributed representation
W_out = np.random.randn(3, 7)

# Layer creation (both input layers are built from the same W_in array)
in_layer0 = MatMul(W_in)
in_layer1 = MatMul(W_in)
out_layer = MatMul(W_out)

# Forward pass
h0 = in_layer0.forward(c0)   # 1. matmul
h1 = in_layer1.forward(c1)
h = 0.5 * (h0 + h1)          # 2. add the two hidden vectors
                             # 3. average them
s = out_layer.forward(h)     # 4. matmul

print(s)                     # score of each word (higher score, higher probability)

[[-0.07974227 -0.66596643 -0.178495    0.58824254 -0.50354894 -0.58708193
   0.62156087]]


In [40]:
import sys
sys.path.append('..')
from common.util import preprocess

text = 'You say goodbye and I say hello.'
# preprocess is imported from common.util; the three results are used below as
# the word-ID corpus and the word<->ID lookup tables.
corpus, word_to_id, id_to_word = preprocess(text)
print(corpus)
print(id_to_word)

[0 1 2 3 4 1 5 6]
{0: 'you', 1: 'say', 2: 'goodbye', 3: 'and', 4: 'i', 5: 'hello', 6: '.'}


In [41]:
def create_contexts_target(corpus, window_size=1):
    """Build (contexts, target) pairs from a corpus of word IDs.

    Every position that has ``window_size`` neighbours on both sides becomes a
    target; the ``2 * window_size`` surrounding IDs (left to right, the target
    itself excluded) form its context row.

    :param corpus: sequence of word IDs (list or 1-D NumPy array)
    :param window_size: number of neighbours taken on each side of a target
    :return: ``(contexts, target)`` as NumPy arrays, one context row per target
    """
    # Use an explicit end index: corpus[w:-w] collapses to an empty slice when
    # w == 0, which would leave target out of step with contexts.
    last = len(corpus) - window_size

    # 1. targets: everything except the window_size items at each end
    target = corpus[window_size:last]

    # 2. contexts: the neighbours of each target position
    contexts = []
    for idx in range(window_size, last):
        offsets = range(-window_size, window_size + 1)
        # offset 0 is the target itself, so it is skipped
        contexts.append([corpus[idx + t] for t in offsets if t != 0])

    return np.array(contexts), np.array(target)

In [8]:
# Display the corpus as a sequence of word IDs.
corpus

array([0, 1, 2, 3, 4, 1, 5, 6])

In [9]:
corpus[1:-1]  # drop one item from each end: the slice create_contexts_target takes for window_size=1

array([1, 2, 3, 4, 1, 5])

In [10]:
corpus[2:-2]  # drop two items from each end (window_size=2)

array([2, 3, 4, 1])

In [11]:
corpus[3:-3]  # drop three items from each end (window_size=3)

array([3, 4])

In [42]:
# Build the context / target word-ID arrays with one neighbour on each side.
contexts, target = create_contexts_target(corpus, window_size=1)

print(f'corpus : {corpus} \n')

print(f'contexts \n{contexts}')
print(f'\ntarget \n{target}')

corpus : [0 1 2 3 4 1 5 6] 

contexts 
[[0 2]
 [1 3]
 [2 4]
 [3 1]
 [4 5]
 [1 6]]

target 
[1 2 3 4 1 5]


In [27]:
import sys
sys.path.append('..')
from common.util import preprocess, create_contexts_target, convert_one_hot

text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)

# Word-ID arrays for the contexts and the targets.
contexts, target = create_contexts_target(corpus, window_size=1)

# Replace every word ID by a one-hot vector of length vocab_size.
# NOTE: target and contexts are rebound here from ID arrays to one-hot arrays.
vocab_size = len(word_to_id)
target = convert_one_hot(target, vocab_size)
contexts = convert_one_hot(contexts, vocab_size)

target

array([[0, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0],
       [0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0]], dtype=int32)

In [28]:
# Display the contexts after one-hot conversion.
contexts

array([[[1, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0]],

       [[0, 1, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0]],

       [[0, 0, 1, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0]],

       [[0, 0, 0, 1, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 0]],

       [[0, 0, 0, 0, 1, 0, 0],
        [0, 0, 0, 0, 0, 1, 0]],

       [[0, 1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 1]]], dtype=int32)

In [29]:
# Display the word-ID corpus returned by preprocess.
corpus

array([0, 1, 2, 3, 4, 1, 5, 6])

In [30]:
# Display the word -> ID lookup table.
word_to_id

{'you': 0, 'say': 1, 'goodbye': 2, 'and': 3, 'i': 4, 'hello': 5, '.': 6}

In [45]:
def convert_one_hot(corpus, vocab_size):
    '''Convert word IDs to one-hot vectors.

    :param corpus: word IDs of the target or the contexts
        (1-D or 2-D NumPy array of integers)
    :param vocab_size: vocabulary size, i.e. the fixed length of every one-hot vector
    :return: int32 NumPy array with one extra trailing axis of length
        ``vocab_size``: (N,) -> (N, vocab_size), (N, C) -> (N, C, vocab_size)
    :raises ValueError: if ``corpus`` is neither 1-D nor 2-D
    '''
    if corpus.ndim not in (1, 2):
        # Without this check the function would fail later with an
        # UnboundLocalError that says nothing about the real problem.
        raise ValueError(
            'corpus must be a 1-D or 2-D array, got ndim=%d' % corpus.ndim)

    one_hot = np.zeros(corpus.shape + (vocab_size,), dtype=np.int32)

    # ndenumerate yields (index_tuple, word_id) for every element, so the
    # target (1-D) and contexts (2-D) cases share a single loop:
    # index (i,) or (i, j) -> one_hot[i, word_id] or one_hot[i, j, word_id]
    for position, word_id in np.ndenumerate(corpus):
        one_hot[position + (word_id,)] = 1

    return one_hot

In [43]:
# Step-by-step version of the 2-D branch of convert_one_hot that prints each row of IDs.
# NOTE(review): this rebinds the notebook-level name `corpus` to `contexts`, and it
# assumes `contexts` still holds word IDs (not one-hot vectors) -- confirm the cell order.
corpus = contexts
N = corpus.shape[0]
C = corpus.shape[1]
one_hot = np.zeros((N, C, vocab_size), dtype=np.int32)
for idx_0, word_ids in enumerate(corpus): # one row of context IDs at a time
    print(f'corpus : {idx_0, word_ids}')
    for idx_1, word_id in enumerate(word_ids):
        one_hot[idx_0, idx_1, word_id] = 1

corpus : (0, array([0, 2]))
corpus : (1, array([1, 3]))
corpus : (2, array([2, 4]))
corpus : (3, array([3, 1]))
corpus : (4, array([4, 5]))
corpus : (5, array([1, 6]))


In [49]:
import sys
sys.path.append('..')
import numpy as np
from common.layers import MatMul, SoftmaxWithLoss

class SimpleCBOW:
    def __init__(self, vocab_size, hidden_size):
        '''Create the layers of a CBOW network with two context inputs.

        Instance attributes set here: the four layers, ``params``, ``grads``
        and ``word_vecs``.

        :param vocab_size: row count of the input weights / column count of the output weights
        :param hidden_size: column count of the input weights / row count of the output weights
        '''
        # 1. Weights: small random float32 values (input weights are drawn first).
        in_weight = 0.01 * np.random.randn(vocab_size, hidden_size).astype('f')
        out_weight = 0.01 * np.random.randn(hidden_size, vocab_size).astype('f')

        # 2. Layers: one input layer per context word, both built from the
        #    same in_weight array.
        self.in_layer0 = MatMul(in_weight)  # context 1
        self.in_layer1 = MatMul(in_weight)  # context 2
        self.out_layer = MatMul(out_weight)
        self.loss_layer = SoftmaxWithLoss()

        # 3. Gather the weights and gradients of all weighted layers into flat lists.
        weighted_layers = (self.in_layer0, self.in_layer1, self.out_layer)
        self.params = [param for layer in weighted_layers for param in layer.params]
        self.grads = [grad for layer in weighted_layers for grad in layer.grads]

        # 4. Keep the input weights as the distributed representation of the words.
        self.word_vecs = in_weight

In [51]:
# astype('f') converts the array produced by randn to float32 (see the dtype shown below).
mat = np.random.randn(2, 3).astype('f')
mat.dtype

dtype('float32')

In [52]:
def forward(self, contexts, target):
    '''Run the CBOW forward pass and return the loss.

    - contexts: indexed as contexts[:, k]; per the original notes a 3-D array
        axis 0 : mini-batch
        axis 1 : window size * 2 (only entries 0 and 1 are read here)
        axis 2 : one-hot vector (vocab_size)

    - target: per the original notes 2-D (batch, one-hot)
    '''
    # One MatMul.forward() per context word, first context first.
    hidden_first = self.in_layer0.forward(contexts[:, 0])
    hidden_second = self.in_layer1.forward(contexts[:, 1])

    # Average the two hidden vectors.
    hidden = (hidden_first + hidden_second) * 0.5

    return self.loss_layer.forward(self.out_layer.forward(hidden), target)

def backward(self, dout=1):
    '''Propagate dout back through the loss, output and input layers.

    Returns None; the call is made for the side effects of the layers' own
    backward() methods.
    '''
    grad_score = self.loss_layer.backward(dout)
    grad_hidden = self.out_layer.backward(grad_score)

    # forward() multiplied the summed hidden vectors by 0.5, so the gradient
    # is scaled by the same factor (in place).
    grad_hidden *= 0.5

    # The sum passes the gradient on unchanged to both input layers.
    for input_layer in (self.in_layer1, self.in_layer0):
        input_layer.backward(grad_hidden)
    return None

In [None]:
import sys
sys.path.append('..')
from common.trainer import Trainer
from common.optimizer import Adam
from simple_cbow import SimpleCBOW
from common.util import preprocess, create_contexts_target, convert_one_hot
import time
import matplotlib.pyplot as plt

# 1. hyperparameters
window_size = 1
hidden_size = 5
batch_size = 3
max_epoch = 1000

# 2. data preprocessing
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)

# 3. one-hot encoding
vocab_size = len(word_to_id)
contexts, target = create_contexts_target(corpus, window_size)
target = convert_one_hot(target, vocab_size)
contexts = convert_one_hot(contexts, vocab_size)

# 4. model and optimizer setup
model = SimpleCBOW(vocab_size, hidden_size)
optimizer = Adam()

# 5. training
trainer = Trainer(model, optimizer) # initialise
start = time.time()
trainer.fit(contexts, target, max_epoch, batch_size) # train
end = time.time()
print(f'train 시간 : {end - start}')
# NOTE(review): presumably selects a font with Hangul glyphs for the plot;
# the NanumGothic font has to be installed -- confirm.
plt.rcParams['font.family'] = 'NanumGothic'
trainer.plot()

In [62]:
word_vecs = model.word_vecs # each row is the distributed (dense) representation of one vocabulary word
for word_id, word in id_to_word.items():
    print(word, word_vecs[word_id])

you [ 1.1532531 -1.1418407  1.2303407  1.110218   1.2824609]
say [-1.166328   1.1573849 -1.1366129 -1.1592271 -1.1936855]
goodbye [ 0.7540316 -0.8852815  0.7349647  0.8277173  0.5871788]
and [-1.3707105  0.5840955 -1.3476586 -1.2760708 -1.0317676]
i [ 0.7664322  -0.8615656   0.72736514  0.79449564  0.6079713 ]
hello [ 1.1727468 -1.1495329  1.2120796  1.1009737  1.2720854]
. [-0.3369583   1.3809209  -0.24255209 -0.58218616 -1.0300624 ]


In [70]:
import sys
sys.path.append('..')
import numpy as np
from common.layers import MatMul, SoftmaxWithLoss

class SimpleSkipGram:
    """Skip-gram network: one input word scored against two context words."""

    def __init__(self, vocab_size, hidden_size):
        """Create the weights and layers.

        :param vocab_size: row count of the input weights / column count of the output weights
        :param hidden_size: column count of the input weights / row count of the output weights
        """
        # 1. Weights: small random float32 values (input weights are drawn first).
        in_weight = 0.01 * np.random.randn(vocab_size, hidden_size).astype('f')
        out_weight = 0.01 * np.random.randn(hidden_size, vocab_size).astype('f')

        # 2. Layers; one loss layer per context word.
        self.in_layer = MatMul(in_weight)
        self.out_layer = MatMul(out_weight)
        self.loss_layer1 = SoftmaxWithLoss()
        self.loss_layer2 = SoftmaxWithLoss()

        # 3. Gather the weights and gradients of the weighted layers into flat lists.
        weighted_layers = (self.in_layer, self.out_layer)
        self.params = [param for layer in weighted_layers for param in layer.params]
        self.grads = [grad for layer in weighted_layers for grad in layer.grads]

        # 4. Keep the input weights as the distributed representation of the words.
        self.word_vecs = in_weight

    def forward(self, contexts, target):
        """Return the summed loss of predicting both context words from target."""
        hidden = self.in_layer.forward(target)
        score = self.out_layer.forward(hidden)
        # With window_size 1 there are two context words; each gets its own
        # loss computed from the same score.
        loss_first = self.loss_layer1.forward(score, contexts[:, 0])
        loss_second = self.loss_layer2.forward(score, contexts[:, 1])
        return loss_first + loss_second

    def backward(self, dout=1):
        """Propagate dout back to the input layer; returns None."""
        # summed loss -> each loss layer -> output layer -> input layer
        grad_score = self.loss_layer1.backward(dout) + self.loss_layer2.backward(dout)
        grad_hidden = self.out_layer.backward(grad_score)
        self.in_layer.backward(grad_hidden)
        return None

In [65]:
contexts[:,0]  # entry 0 along axis 1 for every sample: what SimpleSkipGram.forward passes to loss_layer1

array([[1, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0],
       [0, 1, 0, 0, 0, 0, 0]], dtype=int32)

In [68]:
# Display the full one-hot contexts array for comparison with the slice above.
contexts

array([[[1, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0]],

       [[0, 1, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0]],

       [[0, 0, 1, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0]],

       [[0, 0, 0, 1, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 0]],

       [[0, 0, 0, 0, 1, 0, 0],
        [0, 0, 0, 0, 0, 1, 0]],

       [[0, 1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 1]]], dtype=int32)