# In [1]:
import numpy as np
from konlpy.tag import Kkma
from sklearn.preprocessing import LabelEncoder
import re

class SelfAttention:
    """Minimal NumPy multi-head self-attention with randomly initialised weights.

    The four projection matrices (query, key, value, output) are square
    ``(d_model, d_model)`` arrays drawn from a standard normal distribution.
    They are drawn from a private ``RandomState`` seeded with ``seed``, so
    constructing a layer is reproducible without reseeding NumPy's global
    random generator.
    """

    def __init__(self, d_model, num_heads, seed):
        """Create the layer.

        Args:
            d_model: Size of the last axis of the inputs and outputs.
            num_heads: Number of attention heads; must divide ``d_model``.
            seed: Seed for the weight initialisation.

        Raises:
            AssertionError: If ``d_model`` is not divisible by ``num_heads``.
        """
        self.d_model = d_model
        self.num_heads = num_heads
        assert d_model % num_heads == 0, "d_model / num_heads 는 나머지가 0이어야 합니다."
        self.depth = d_model // num_heads

        # A local generator yields the same numbers as np.random.seed(seed)
        # followed by np.random.randn(...), but leaves the global stream alone.
        rng = np.random.RandomState(seed)

        # Draw order (q, k, v, o) is kept so the weights match earlier runs.
        self.wq = rng.randn(d_model, d_model)  # query projection
        self.wk = rng.randn(d_model, d_model)  # key projection
        self.wv = rng.randn(d_model, d_model)  # value projection
        self.wo = rng.randn(d_model, d_model)  # output projection

    def split_heads(self, x):
        """Reshape ``(batch, seq, d_model)`` into ``(batch, num_heads, seq, depth)``."""
        batch_size = x.shape[0]
        x = x.reshape(batch_size, -1, self.num_heads, self.depth)
        return x.transpose(0, 2, 1, 3)

    def scaled_dot_product_attention(self, q, k, v):
        """Return ``(output, attention_weights)`` for 4-D per-head q, k, v.

        The logits ``q @ k^T`` are divided by ``sqrt(depth)`` (the size of the
        last axis of ``k``) and normalised with a softmax over the key axis.
        """
        matmul_qk = np.matmul(q, k.transpose(0, 1, 3, 2))
        dk = k.shape[-1]
        scaled_attention_logits = matmul_qk / np.sqrt(dk)
        attention_weights = self.softmax(scaled_attention_logits)
        output = np.matmul(attention_weights, v)
        return output, attention_weights

    def softmax(self, x):
        """Softmax over the last axis; the row maximum is subtracted for stability."""
        e_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
        return e_x / np.sum(e_x, axis=-1, keepdims=True)

    def call(self, x):
        """Apply self-attention to ``x``.

        Args:
            x: Array whose first axis is the batch and whose last axis has
                size ``d_model`` (it is multiplied by the square weights).

        Returns:
            A tuple ``(output, attention_weights)``. ``output`` has shape
            ``(batch, seq, d_model)``; ``attention_weights`` has shape
            ``(batch, num_heads, seq, seq)``.
        """
        batch_size = x.shape[0]
        q = np.matmul(x, self.wq)
        k = np.matmul(x, self.wk)
        v = np.matmul(x, self.wv)

        q = self.split_heads(q)
        k = self.split_heads(k)
        v = self.split_heads(v)

        attention_output, attention_weights = self.scaled_dot_product_attention(q, k, v)

        # Undo split_heads: move heads next to depth, then merge them back.
        attention_output = attention_output.transpose(0, 2, 1, 3).reshape(batch_size, -1, self.d_model)
        output = np.matmul(attention_output, self.wo)

        return output, attention_weights

def preprocess_and_embed(text, d_model):
    """Tokenise ``text`` with Kkma and build zero-padded one-hot embeddings.

    Intermediate results are printed with an ``@`` prefix for inspection.

    Args:
        text: Input text. Characters that are neither word characters nor
            whitespace are removed before tagging.
        d_model: Width of each returned embedding row. Must be at least the
            number of distinct tokens found in ``text``.

    Returns:
        A tuple ``(embeddings, tokens, pos_tags)``: ``embeddings`` holds one
        one-hot row per token, right-padded with zeros to ``d_model`` columns;
        ``tokens`` is the list of token strings in order; ``pos_tags`` is the
        list of ``(token, tag)`` pairs returned by ``Kkma.pos``.

    Raises:
        ValueError: If ``text`` contains more distinct tokens than ``d_model``.
    """
    kkma = Kkma()

    # Strip punctuation and other special characters.
    text = re.sub(r'[^\w\s]', '', text)

    # POS-tag the whole text in one pass.
    pos_tags = kkma.pos(text)
    print(f'@ pos_tags: {pos_tags}')
    # e.g. @ pos_tags: [('안녕', 'NNG'), ('하세', 'VV'), ('요', 'EFN'), ('오늘', 'NNG'), ...]

    tokens = [word for word, pos in pos_tags]
    unique_tokens = list(set(tokens))
    print(f'@ tokens: {tokens}')
    # e.g. @ tokens: ['안녕', '하세', '요', '오늘', '날씨', '가', '참', '좋', '네요', '어떻', '게', '지내', '세요']

    print(f'@ unique_tokens: {unique_tokens}')
    # e.g. @ unique_tokens: ['날씨', '요', '안녕', '참', '가', '하세', '좋', '오늘', '네요', '세요', '어떻', '게', '지내']

    # Map each token to an integer index.
    encoder = LabelEncoder()
    encoder.fit(unique_tokens)
    indexed_tokens = encoder.transform(tokens)
    print(f'@ indexed_tokens: {indexed_tokens}')
    # e.g. @ indexed_tokens: [ 8  5  1  7  0  4  3  6  9 10 11 12  2]

    # One-hot embedding (a simple stand-in for a learned embedding).
    embeddings = np.eye(len(unique_tokens))[indexed_tokens]
    print(f'@ embeddings: {embeddings}')
    # e.g. @ embeddings: [[0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.], ...]

    # Fail with a clear message instead of letting np.pad reject a negative
    # pad width when the vocabulary does not fit into d_model columns.
    vocab_size = embeddings.shape[1]
    if vocab_size > d_model:
        raise ValueError(
            f"text has {vocab_size} distinct tokens, which exceeds d_model={d_model}; "
            "one-hot rows cannot be padded to d_model"
        )

    # Right-pad every row with zeros so its width equals d_model.
    embeddings = np.pad(embeddings, ((0, 0), (0, d_model - vocab_size)), 'constant')
    print(f'@ embeddings2: {embeddings}')
    # e.g. @ embeddings2: [[0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. ... 0.], ...]

    return embeddings, tokens, pos_tags

# Importance computation and reporting
def calculate_importance(text):
    """Print the nouns and verbs of ``text`` ranked by averaged attention weight.

    The text is tokenised and embedded once, then passed through ten
    ``SelfAttention`` layers initialised with seeds 1..10. For every layer the
    attention weights are averaged over heads and query positions, giving one
    score per token (the mean attention that token receives). Scores are then
    averaged across seeds and across repeated occurrences of the same token.
    Only tokens tagged ``NNG`` or ``VV`` are reported, highest score first.

    Args:
        text: Text to analyse.

    Returns:
        None. The ranking is written to standard output.
    """
    d_model = 32  # embedding width
    num_heads = 4  # number of self-attention heads

    # Tokenisation and embedding do not depend on the seed, so do them once
    # instead of re-running the tagger on every iteration.
    embeddings, tokens, pos_tags = preprocess_and_embed(text, d_model)
    batch = embeddings[np.newaxis, ...]

    total_importance = {}
    for seed in range(1, 11):
        attention = SelfAttention(d_model, num_heads, seed)
        _, attention_weights = attention.call(batch)

        # Weights are (batch, heads, query, key). Reduce every axis except the
        # key axis so there is exactly one score per token. Flattening the
        # (query, key) matrix and zipping it with the tokens would only keep
        # the first query row.
        token_importance = attention_weights.mean(axis=(0, 1, 2))

        for token, importance in zip(tokens, token_importance):
            total_importance.setdefault(token, []).append(importance)

    # Mean score per distinct token over all seeds and occurrences.
    avg_importance = {token: np.mean(importances) for token, importances in total_importance.items()}

    # Keep only common nouns (NNG) and verbs (VV), using the POS tags.
    filtered_avg_importance = {token: avg_importance[token] for token, pos in pos_tags if pos in ('NNG', 'VV')}

    # Sort by descending score.
    sorted_word_importance = sorted(filtered_avg_importance.items(), key=lambda x: x[1], reverse=True)

    # Report the ranking.
    print("단어 중요도 순위:")
    for word, score in sorted_word_importance:
        print(f"단어: {word}, 중요도 점수: {score:.4f}")

# Example run. Guarded so that importing this module does not start the
# analysis; running it as a script or notebook cell behaves as before.
if __name__ == "__main__":
    text = "안녕하세요, 오늘 날씨가 참 좋네요. 어떻게 지내세요?"
    calculate_importance(text)


# Cell output: JVMNotFoundException: No JVM shared library file (libjvm.so) found. Try setting up the JAVA_HOME environment variable properly.