<a href="https://colab.research.google.com/github/Shushukang/NLP/blob/main/%EC%8B%A4%EC%8A%B506.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
### 필요한 라이브러리 임포트
import tensorflow as tf
import numpy as np
# import random

In [None]:
### Set the random seed
# Fixing TensorFlow's global seed makes the random tensors sampled below repeatable.
tf.random.set_seed(seed=0)

# 단어 임베딩 행렬 생성

In [None]:
### Build toy data for the exercise
#
# 1. sentence = "I love you"
# 2. Tokenize --> [I, love, you]
# 3. Each token --> a 512-dimensional embedding vector --> a (3, 512) embedding matrix
# 4. Sample random reals from the standard normal distribution to fill
#    that (3, 512) embedding matrix

# Draw the (3, 512) embedding matrix from the standard normal distribution
embedding_shape = (3, 512)
embedding_matrix = tf.random.normal(embedding_shape)

# Inspect the result
print(embedding_matrix)

tf.Tensor(
[[ 1.5110626   0.42292204 -0.41969493 ... -1.1673577   0.777814
   0.58657396]
 [-0.13087033 -0.4497122   3.3774817  ...  0.2500961  -0.69026154
  -0.8148735 ]
 [ 0.40156764  0.3129424  -0.87114996 ... -0.3053926   0.18731174
  -1.6565207 ]], shape=(3, 512), dtype=float32)


# 가중치 행렬 생성

In [None]:
### Create the weight matrices W_q, W_k, W_v
#
# 1. Each of W_q, W_k, W_v has shape (512, 512)
# 2. Initial weights are random reals
# 3. Sample random reals from the standard normal distribution; the three
#    matrices are generated together as one fused (512, 512*3) tensor W

# Generate the fused (512, 512*3) weight matrix
weights_shape = (512, 3 * 512)
W = tf.random.normal(weights_shape)

# Inspect the result
print(W)

tf.Tensor(
[[ 1.0668802   0.19454929 -0.53082895 ...  0.28124705 -0.41999227
  -1.731296  ]
 [-0.88284314  0.07581621  0.36982587 ... -0.98544943 -0.07025491
  -0.35464865]
 [-1.4630128   1.008406    0.00408987 ...  2.3250976  -1.2281883
  -0.36647767]
 ...
 [ 0.10821439 -0.23559454 -0.17931989 ... -0.1931868   0.06187644
   0.02346488]
 [ 0.17702705  1.1367483   0.44510984 ...  0.46767908  2.3448489
  -0.61274064]
 [-0.32561806  0.1451457  -0.6740147  ...  0.5818147  -1.1355666
   0.1629009 ]], shape=(512, 1536), dtype=float32)


In [None]:
### Extract the weight matrices W_q, W_k, W_v from the fused matrix W
# W has 512*3 columns laid out as [W_q | W_k | W_v], 512 columns each.
# The W_k slice must stop at column 512*2 (= 1024); an upper bound of 1204
# would yield a (512, 692) slice that overlaps the W_v columns.
W_q = W[:, 0:512]
W_k = W[:, 512:512*2]
W_v = W[:, 512*2:512*3]

# Inspect the result
print(W_q)
print('-'*80)
print(W_k)
print('-'*80)
print(W_v)

tf.Tensor(
[[ 1.0668802   0.19454929 -0.53082895 ...  1.245727    0.15457077
   0.8158888 ]
 [-0.88284314  0.07581621  0.36982587 ...  1.4192551   0.05542991
  -1.0262617 ]
 [-1.4630128   1.008406    0.00408987 ... -0.42333117 -1.147531
  -0.22865014]
 ...
 [ 0.10821439 -0.23559454 -0.17931989 ... -1.1973188  -1.721341
  -0.71973807]
 [ 0.17702705  1.1367483   0.44510984 ...  2.1946557   0.63261014
  -1.2325677 ]
 [-0.32561806  0.1451457  -0.6740147  ...  0.69355434 -0.3900291
   0.08229202]], shape=(512, 512), dtype=float32)
--------------------------------------------------------------------------------
tf.Tensor(
[[ 1.1781547  -1.0898786   0.98570585 ...  0.01896639 -0.53865725
   0.12272159]
 [-0.07226859  0.8808753   0.807439   ... -0.31135246 -0.33696178
   0.23726879]
 [ 0.8329731  -0.27327135 -0.8017407  ... -1.9315009   0.5699578
  -1.1604967 ]
 ...
 [ 0.8052585   0.28040078  1.6850793  ...  0.37750173 -0.21134175
   0.27176344]
 [ 1.9855129  -0.63976276 -0.3220173  ...  0.372

# q, k, v 행렬 생성

In [None]:
### Build q, k, v
#
# Matrix-multiplying the embedding matrix of all tokens by the weight matrix
# produces the q, k, v vectors of every token.

# Project all token embeddings through the fused weight matrix in one matmul
qkv = tf.linalg.matmul(embedding_matrix, W)

# Split the projection column-wise: [0, 512) is q, [512, 1024) is k, the rest is v
q = qkv[:, :512]
k = qkv[:, 512:1024]
v = qkv[:, 1024:]

# Inspect the results, separated by an 80-dash rule
print(qkv, q, k, v, sep='\n' + '-'*80 + '\n')

tf.Tensor(
[[ 52.855316   -3.3651943  31.226112  ...  19.497292  -23.665293
    7.7344437]
 [  9.105577  -18.511465  -21.972692  ...  63.532726   -0.5750847
    6.839353 ]
 [ -2.2933378  15.658791  -27.894466  ...  12.54751    38.962685
   -3.5825343]], shape=(3, 1536), dtype=float32)
--------------------------------------------------------------------------------
tf.Tensor(
[[ 52.855316   -3.3651943  31.226112  ... -30.888561   31.74374
   27.605654 ]
 [  9.105577  -18.511465  -21.972692  ... -36.2921    -13.293503
   18.735483 ]
 [ -2.2933378  15.658791  -27.894466  ...  20.479921   28.057518
   -6.0979385]], shape=(3, 512), dtype=float32)
--------------------------------------------------------------------------------
tf.Tensor(
[[ -2.94004  -34.09545  -10.575945 ...  20.819622  18.779625  29.647951]
 [-10.883896  32.65699   38.728813 ...  11.901602 -11.612812  43.71332 ]
 [ 50.216934 -23.480341  16.782907 ...  30.11206   52.02745  -30.785648]], shape=(3, 512), dtype=float32)
------

# scaled_dot_product_attention

## attention score

In [None]:
# Raw attention scores: q multiplied by the transpose of k, giving one score
# per (query row, key row) pair.
attention_score = tf.linalg.matmul(q, k, transpose_b=True)
print(f'attention_score : \n{attention_score}')

attention_score : 
[[-18987.393    6699.816   -9973.194 ]
 [ 20949.951   21679.52     1974.8209]
 [  3651.587   23628.8     -6038.9497]]


## scaling

In [None]:
# Size of k's last axis (k is 2-D here, so this is the same as shape[1])
print(k.shape[-1])

512


In [None]:
# Scale the raw scores by sqrt(d_k), where d_k is the size of k's second axis.
# The size is cast to float32 because tf.math.sqrt needs a floating-point input.
dk = tf.cast(k.shape[1], tf.float32)
scaled_attention_score = attention_score / tf.math.sqrt(dk)
# Label uses ':' like the other result printouts (it previously contained a stray '"').
print(f'scaled_attention_score : \n{scaled_attention_score}')

scaled_attention_score " 
[[-839.13214  296.09283 -440.75708]
 [ 925.8658   958.10846   87.27558]
 [ 161.37888 1044.2554  -266.88638]]


## attention weight

In [None]:
# Softmax along the last axis converts each row of scaled scores into
# attention weights that sum to 1.
attention_weight = tf.nn.softmax(scaled_attention_score, axis=-1)
print(f'attention_weight : \n{attention_weight}')

attention_weight : 
[[0.0000000e+00 1.0000000e+00 0.0000000e+00]
 [9.9353644e-15 1.0000000e+00 0.0000000e+00]
 [0.0000000e+00 1.0000000e+00 0.0000000e+00]]


## attention output : contextual embedding vector

In [None]:
# Attention output: multiply the attention weights by v, so each output row is
# a weighted combination of the value rows.
output = tf.linalg.matmul(attention_weight, v)
print(f'output : \n{output}')

output : 
[[-33.50708    27.014906   -2.748848  ...  63.532726   -0.5750847
    6.839353 ]
 [-33.50708    27.014906   -2.748848  ...  63.532726   -0.5750847
    6.839353 ]
 [-33.50708    27.014906   -2.748848  ...  63.532726   -0.5750847
    6.839353 ]]
