In [2]:
import os
import re
import yaml
import torch
import numpy as np
import pandas as pd
import tensorflow as tf

from konlpy.tag import Okt
from transformers import BertTokenizer, BertModel
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the pretrained SKT KoGPT2 language model and its tokenizer.
# Loading this checkpoint through PreTrainedTokenizerFast triggers a tokenizer
# class-mismatch warning (the checkpoint declares 'GPT2Tokenizer'). The special
# tokens are passed explicitly, as the KoGPT2 model card does, so that
# bos/eos/unk/pad/mask are defined on the tokenizer object.
model = GPT2LMHeadModel.from_pretrained("skt/kogpt2-base-v2")
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    "skt/kogpt2-base-v2",
    bos_token="</s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
    mask_token="<mask>",
)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [4]:
# Read the project configuration. The file is decoded as UTF-8 explicitly so the
# result does not depend on the platform's default locale encoding.
with open('./package.yaml', encoding='utf-8') as f:
    # NOTE(review): FullLoader is kept unchanged; yaml.safe_load would be enough if the
    # file only contains plain mappings and strings — confirm before switching.
    file = yaml.load(f, Loader=yaml.FullLoader)

# Only the parsed mapping is needed from here on, so the lookups run after the file is closed.
POSEDATAPATH = file["path"]["pose_keypoints"]       # storage path of the pose keypoints
FACEDATAPATH = file["path"]["faceMesh_keypoints"]   # storage path of the FaceMesh keypoints
LABELPATH = file["path"]["labelPATH"]               # path of the metadata file

In [5]:
# Read the dataset metadata sheet, then order its rows by the "번호" column.
# sort_values() sorts ascending by default, i.e. the smallest number comes first.
metadata = pd.read_excel(LABELPATH)
metadata = metadata.sort_values(by="번호")
metadata.head(10)  # display the first 10 rows

Unnamed: 0,번호,언어 제공자 ID,취득연도,방향,타입(단어/문장),파일명,한국어,Unnamed: 7
10460,1,1,2017,정면,단어,KETI_SL_0000000001.MOV,0,
10440,2,1,2017,정면,단어,KETI_SL_0000000002.MOV,1,
10420,3,1,2017,정면,단어,KETI_SL_0000000003.MOV,2,
10399,4,1,2017,정면,단어,KETI_SL_0000000004.MOV,3,
10379,5,1,2017,정면,단어,KETI_SL_0000000005.MOV,4,
10359,6,1,2017,정면,단어,KETI_SL_0000000006.MOV,5,
10339,7,1,2017,정면,단어,KETI_SL_0000000007.MOV,6,
10319,8,1,2017,정면,단어,KETI_SL_0000000008.MOV,7,
10299,9,1,2017,정면,단어,KETI_SL_0000000009.MOV,8,
10279,10,1,2017,정면,단어,KETI_SL_0000000010.MOV,9,


In [6]:
# Keep the rows whose "방향" value is "정면" and restrict the frame to the four listed columns.
front_rows = metadata["방향"] == "정면"
metadata = metadata.loc[front_rows, ["번호", "방향", "파일명", "한국어"]]
metadata

Unnamed: 0,번호,방향,파일명,한국어
10460,1,정면,KETI_SL_0000000001.MOV,0
10440,2,정면,KETI_SL_0000000002.MOV,1
10420,3,정면,KETI_SL_0000000003.MOV,2
10399,4,정면,KETI_SL_0000000004.MOV,3
10379,5,정면,KETI_SL_0000000005.MOV,4
...,...,...,...,...
318,10371,정면,KETI_SL_0000010371.MOV,허리가 아파서 일어날 수 없어요
3309,10372,정면,KETI_SL_0000010372.MOV,어떤 사람이 칼에 찔려서 피를 많이 흘리고 있어요
3898,10373,정면,KETI_SL_0000010373.MOV,아이가 말벌에 쏘여서 기절했어요
5838,10374,정면,KETI_SL_0000010374.MOV,무릎 인대를 다친 것 같아요


In [8]:
def remove_josa(tokens):
    """Return ``tokens`` without the entries that are exactly one of the listed particles.

    A token is dropped only when the whole token equals one of
    이, 가, 은, 는, 을, 를, 에, 의, 와, 과. The comparison looks at the surface
    string only; no part-of-speech information is consulted.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        list: The remaining tokens, in their original order.
    """
    particle_re = re.compile(r'이|가|은|는|을|를|에|의|와|과')     # particles to filter out
    kept = []
    for tok in tokens:
        if particle_re.fullmatch(tok) is None:
            kept.append(tok)
    return kept

def _shared_okt():
    """Return a single Okt tagger for the session, creating it on first use."""
    if not hasattr(_shared_okt, "_tagger"):
        _shared_okt._tagger = Okt()
    return _shared_okt._tagger


def kor2word_okt(string, verbose=True):
    """Split a Korean string into normalized, stemmed morphemes with particles removed.

    The Okt tagger is created once and reused across calls instead of being
    constructed on every call.

    Args:
        string: Text to analyse.
        verbose: When True (the default, matching the previous behaviour) the
            resulting token list is printed. Pass False to keep loops over
            many rows from flooding the cell output.

    Returns:
        list: Morphemes from ``Okt.morphs(norm=True, stem=True)`` after
        ``remove_josa`` has filtered them.
    """
    tokens = _shared_okt().morphs(string, norm=True, stem=True)
    process_tokens = remove_josa(tokens)
    if verbose:
        print(process_tokens)
    return process_tokens

In [32]:
# Inspect the Korean label column. It has dtype object: the first entries are
# numeric labels (0, 1, 2, ...) while later entries are full sentences.
metadata['한국어']

10460                              0
10440                              1
10420                              2
10399                              3
10379                              4
                    ...             
318                허리가 아파서 일어날 수 없어요
3309     어떤 사람이 칼에 찔려서 피를 많이 흘리고 있어요
3898               아이가 말벌에 쏘여서 기절했어요
5838                 무릎 인대를 다친 것 같아요
3129             엄마가 신장병에 있는데 쓰러지셨어요
Name: 한국어, Length: 5240, dtype: object

In [38]:
# Tokenize every label and flatten the per-row token lists into one list.
# Each value is cast with str() because the tokenizer expects string input.
test_list = [
    token
    for data in metadata["한국어"]
    for token in kor2word_okt(str(data))
]

['0']
['1']
['2']
['3']
['4']
['5']
['6']
['7']
['8']
['9']
['10']
['11']
['12']
['13']
['14']
['15']
['16']
['17']
['18']
['19']
['20']
['21']
['22']
['23']
['24']
['25']
['26']
['27']
['28']
['29']
['30']
['31']
['32']
['33']
['34']
['35']
['36']
['37']
['38']
['39']
['40']
['41']
['42']
['43']
['44']
['45']
['46']
['47']
['48']
['49']
['50']
['51']
['52']
['53']
['54']
['55']
['56']
['57']
['58']
['59']
['60']
['61']
['62']
['63']
['64']
['65']
['66']
['67']
['68']
['69']
['70']
['71']
['72']
['73']
['74']
['75']
['76']
['77']
['78']
['79']
['80']
['81']
['82']
['83']
['84']
['85']
['86']
['87']
['88']
['89']
['90']
['91']
['92']
['93']
['94']
['95']
['96']
['97']
['98']
['99']
['100']
['112']
['119']
['1000']
['10000']
['가렵다']
['가스']
['가슴']
['가시']
['각목']
['갇히다']
['감금']
['감전']
['강']
['강남구']
['강동구']
['강북구']
['강서구']
['강풍']
['개']
['거실']
['걸리다']
['결박']
['경운기']
['경찰']
['경찰차']
['계곡']
['계단']
['고속도로']
['고압', '전선']
['고열']
['고장']
['부러지다']
['골절']
['탈골']
['곰']
['공', '사장']
['공원']
['놀이터']
['공장']


In [34]:
# Put the GPT-2 model in evaluation mode so its dropout layers are inactive while
# embeddings are extracted. As the last expression of the cell, the call also
# displays the module tree.
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(51200, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=51200, bias=False)
)

In [39]:
def get_embeddings(text):
    """Embed ``text`` as the mean of the language model's last-layer hidden states.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        text: Text to embed.

    Returns:
        numpy.ndarray: Mean-pooled hidden states. For a single string the
        shape is (1, hidden_size); the leading axis is the batch dimension.

    Raises:
        ValueError: If ``text`` produces no tokens, because a mean over zero
            positions is undefined.
    """
    inputs = tokenizer(text, return_tensors="pt")
    if inputs["input_ids"].shape[-1] == 0:
        raise ValueError(f"Cannot embed text that yields no tokens: {text!r}")
    # Run on whichever device the model lives on instead of assuming CPU.
    inputs = inputs.to(model.device)
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
    # hidden_states[-1] is the final layer; average it over the token axis.
    embeddings = outputs.hidden_states[-1].mean(dim=1)
    # .cpu() is a no-op for CPU tensors and makes .numpy() valid for CUDA tensors.
    return embeddings.cpu().numpy()

In [40]:
# test_list contains many repeated tokens (thousands of rows, a few hundred distinct
# labels), so each distinct token is embedded once and its vector reused. Assuming the
# language model is in eval mode, get_embeddings is deterministic, so the resulting
# array is identical to embedding every occurrence separately.
unique_embeddings = {word: get_embeddings(word) for word in dict.fromkeys(test_list)}
embeddings = np.array([unique_embeddings[word] for word in test_list])

In [31]:
# Expected layout: (number of tokens, 1, hidden size). get_embeddings returns a
# (1, hidden) array per token, so stacking them keeps a singleton middle axis.
embeddings.shape

(9630, 1, 768)

In [14]:
# Collect every value of the "한국어" column as a string. Iterating the column directly
# avoids DataFrame.iterrows(), which builds a full Series per row just to read one field.
str_kor = [str(label) for label in metadata['한국어']]

print(f"Total class : {metadata['한국어'].nunique()}")

Total class : 419


In [22]:
class embedding_dnn_model(tf.keras.Model):
    """Dense regressor that maps an integer id to a fixed-shape tensor.

    Pipeline: Embedding -> Flatten -> Dense(1024, relu) -> Dense(2048, relu)
    -> Dense(4096, relu) -> linear Dense -> Reshape(target_shape).

    Args:
        input_dim: Size of the embedding vocabulary; input ids must lie in
            ``[0, input_dim)``. NOTE(review): the default 527 is hard-coded,
            while the notebook reports 419 unique labels for the filtered
            metadata — confirm it matches the label encoder's class count.
        embedding_dim: Width of each embedding vector.
        target_shape: Per-sample output shape. NOTE(review): the default
            (210, 33, 3) is presumably frames x pose keypoints x coordinates —
            confirm against the keypoint data.
        **kwargs: Forwarded to ``tf.keras.Model`` (for example ``name``).
    """

    def __init__(self, input_dim=527, embedding_dim=512, target_shape=(210, 33, 3), **kwargs):
        super().__init__(**kwargs)
        target_shape = tuple(target_shape)
        # `input_length` is deprecated in Keras 3 and is not needed here: the
        # sequence length is taken from the input the model is built/called with.
        self.embedding = tf.keras.layers.Embedding(input_dim=input_dim, output_dim=embedding_dim)
        self.flatten = tf.keras.layers.Flatten()
        self.dense1 = tf.keras.layers.Dense(1024, activation='relu')
        self.dense2 = tf.keras.layers.Dense(2048, activation='relu')
        self.dense3 = tf.keras.layers.Dense(4096, activation='relu')
        # One linear unit per element of the target tensor, reshaped afterwards.
        self.output_dense = tf.keras.layers.Dense(int(np.prod(target_shape)), activation='linear')
        self.reshape = tf.keras.layers.Reshape(target_shape)

    def call(self, inputs):
        """Run the forward pass and return a tensor of shape (batch, *target_shape)."""
        x = self.embedding(inputs)
        x = self.flatten(x)
        x = self.dense1(x)
        x = self.dense2(x)
        x = self.dense3(x)
        x = self.output_dense(x)
        return self.reshape(x)

# Instantiate and configure the Keras regressor.
# NOTE(review): this rebinds the name `model`, which an earlier cell assigned to the
# KoGPT2 network and which get_embeddings() reads as a global; once this cell has
# run, get_embeddings() would call the Keras model instead.
model = embedding_dnn_model()
model.compile(
    optimizer='adam', 
    loss='mean_squared_error',
    metrics=['mse']  # same quantity as the loss, logged under the name "mse"
)

model.build(input_shape=(None, 1)) # specify the model input shape: one value per sample



In [10]:
# Train for 10 epochs in batches of 128, holding out 20% of the samples for validation.
# NOTE(review): processed_label and keypoints_data are not defined in the cells
# visible here — confirm that the cell creating them runs before this one.
history = model.fit(processed_label, keypoints_data, epochs=10, batch_size=128, validation_split=0.2)

Epoch 1/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 942ms/step - loss: 1.0927 - mse: 1.0927 - val_loss: 0.1617 - val_mse: 0.1617
Epoch 2/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 923ms/step - loss: 0.1664 - mse: 0.1664 - val_loss: 0.0863 - val_mse: 0.0863
Epoch 3/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 894ms/step - loss: 0.0795 - mse: 0.0795 - val_loss: 0.0761 - val_mse: 0.0761
Epoch 4/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 893ms/step - loss: 0.0642 - mse: 0.0642 - val_loss: 0.0561 - val_mse: 0.0561
Epoch 5/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 888ms/step - loss: 0.0583 - mse: 0.0583 - val_loss: 0.0650 - val_mse: 0.0650
Epoch 6/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 889ms/step - loss: 0.0577 - mse: 0.0577 - val_loss: 0.0556 - val_mse: 0.0556
Epoch 7/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 892ms/

In [35]:
# Encode a single label, shape it as an (n_samples, 1) column to match the model
# input, and report the shape of the predicted tensor.
# NOTE(review): label_encoder is not defined in the cells visible here — confirm it
# is fitted before this cell runs.
input_string = ["바다"]
encoded_string = label_encoder.transform(input_string)
array = np.reshape(encoded_string, (-1, 1))

predicted = model.predict(array)

print(f"Predicted_shape: {predicted.shape}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
Predicted_shape: (1, 210, 33, 3)
