In [1]:
!pip install transformers torch

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [22]:
!pip install konlpy

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting JPype1>=0.7.0 (from konlpy)
  Downloading jpype1-1.6.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (5.0 kB)
Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m40.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jpype1-1.6.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (496 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m496.6/496.6 kB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: JPype1, konlpy
Successfully installed JPype1-1.6.0 konlpy-0.6.0


In [29]:
import torch
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast
from konlpy.tag import Okt

# Load the Okt morphological analyzer and the KoGPT2 model and tokenizer.
okt = Okt()  # morphological analyzer instance
model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')
# Register the special tokens explicitly, as the skt/kogpt2-base-v2 model card
# does. Loaded through the generic PreTrainedTokenizerFast class they may be
# left unset, in which case decode(..., skip_special_tokens=True) cannot strip
# '</s>' / '<pad>' from generated text.
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    'skt/kogpt2-base-v2',
    bos_token='</s>',
    eos_token='</s>',
    unk_token='<unk>',
    pad_token='<pad>',
    mask_token='<mask>',
)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [45]:
def predict_diverse_phrases(text, num_predictions=5):
    """
    Suggest up to ``num_predictions`` distinct next phrases (eojeol) for ``text``.

    Beam search is run on the notebook-level ``model`` with three times as many
    beams as requested predictions. From every returned continuation only the
    first whitespace-delimited phrase is kept, cut after its first sentence
    terminator ('.', '?' or '!') if it contains one. Candidates are then
    de-duplicated by a key: the first noun that ``okt.nouns`` reports for the
    phrase, or the whole phrase when no noun is found (so noun-less phrases
    are kept rather than discarded).

    Relies on the notebook-level ``tokenizer``, ``model`` and ``okt`` objects.

    Args:
        text: Prompt whose continuation should be predicted.
        num_predictions: Maximum number of phrases to return.

    Returns:
        A list of phrases in the order ``model.generate`` returned their
        sequences. It may hold fewer than ``num_predictions`` items when the
        candidates collapse onto the same keys, and is empty when
        ``num_predictions`` is less than 1.
    """
    # A beam count of zero or less is not a usable generation setting, and
    # "no predictions requested" has an obvious answer.
    if num_predictions < 1:
        return []

    input_ids = tokenizer.encode(text, return_tensors='pt')
    input_length = len(input_ids[0])
    # Over-generate so that enough candidates survive de-duplication.
    num_candidates = num_predictions * 3

    beam_outputs = model.generate(
        input_ids,
        max_length=input_length + 6,
        num_beams=num_candidates,
        no_repeat_ngram_size=2,
        num_return_sequences=num_candidates,
        early_stopping=True
    )

    final_predictions = []
    used_keys = set()

    for beam_output in beam_outputs:
        # Decode only the newly generated tokens, not the prompt.
        generated_sequence = tokenizer.decode(
            beam_output[input_length:], skip_special_tokens=True
        )

        # Split on any whitespace so a line break also ends the phrase;
        # splitting on ' ' alone and deleting '\n' afterwards would glue the
        # words on both sides of the line break together.
        words = generated_sequence.split()
        if not words:
            continue
        first_phrase = words[0]

        # Cut right after the earliest sentence terminator, whichever it is.
        terminator_positions = [
            position
            for position in (first_phrase.find(mark) for mark in '.?!')
            if position != -1
        ]
        if terminator_positions:
            first_phrase = first_phrase[:min(terminator_positions) + 1]

        # Do not drop noun-less phrases: fall back to the phrase itself as key.
        nouns = okt.nouns(first_phrase)
        core_key = nouns[0] if nouns else first_phrase

        # De-duplicate on the key computed above.
        if core_key not in used_keys:
            used_keys.add(core_key)
            final_predictions.append(first_phrase)

        if len(final_predictions) >= num_predictions:
            break

    return final_predictions

# 3. Try the improved function on two sample prompts.
sample_prompts = ["오늘 저녁은 뭘", "커피 한 잔을 마시고"]
sample_results = []

for prompt in sample_prompts:
    suggestions = predict_diverse_phrases(prompt)
    sample_results.append(suggestions)
    print(f"입력: '{prompt}'")
    print(f"추천 어절: {suggestions}")

# Expose the same notebook-level names the original cell defined.
input_text, input_text2 = sample_prompts
predicted_phrases, predicted_phrases2 = sample_results

입력: '오늘 저녁은 뭘'
추천 어절: ['먹어도', '먹을까요?', '먹었냐고', '먹었는지']
입력: '커피 한 잔을 마시고'
추천 어절: ['싶을', '싶다는', '싶었다.']
