# GPT

In [1]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

In [3]:
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')

Downloading:   0%|          | 0.00/718 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

In [6]:
# Korea -> seoul, USA -> washington.dc, France -> paris
prompt = "The capital of Korea is"

encodings = tokenizer(prompt, return_tensors='pt')
print(encodings)

{'input_ids': tensor([[ 464, 3139,  286, 4969,  318]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}


In [8]:
# model.generate -> input ids를 넣으면 다음 토큰을 생성
outputs = model.generate(max_length=20, **encodings)
print(outputs)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[  464,  3139,   286,  4969,   318, 22372,    11,   543,   318,  1363,
           284,   262,  1499,   338,  4387, 10368,   286,  3215,    12,  6286]])


In [13]:
tokenizer.decode(outputs[0])

"The capital of Korea is Seoul, which is home to the country's largest concentration of foreign-born"

In [None]:
country_list = ['korea', 'USA', 'France', 'Germany', 'Italia']

for i in range(len(country_list)):
    prompt = f'The capital of {country_list[i]} is'
    encodings = tokenizer(prompt, return_tensors='pt')
    outputs = model.generate(max_length=20, **encodings)
    print(tokenizer.decode(outputs[0]))

In [20]:
# prompt를 다음과 같이 기술하면 task description을 준 것으로 해석할 수 있음

prompt = """\
    Name the capital city for a given country.
    The capital of Korea is\
"""

In [26]:
# Zero-shot Learning
while True:
    nation = input("국가를 입력: ")
    if not nation.strip():
        break
    prompt = f"""\
    Name the capital city for a given country.
    The capital of {nation.strip()} is\
    """

    encodings = tokenizer(prompt, return_tensors='pt')
    outputs = model.generate(max_length=20, **encodings)
    print(tokenizer.decode(outputs[0]), flush=True)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 25, but ``max_length`` is set to 20. This can lead to unexpected behavior. You should consider increasing ``config.max_length`` or ``max_length``.


    Name the capital city for a given country.
    The capital of France is     Paris


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 25, but ``max_length`` is set to 20. This can lead to unexpected behavior. You should consider increasing ``config.max_length`` or ``max_length``.


    Name the capital city for a given country.
    The capital of Germany is     Berlin


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 26, but ``max_length`` is set to 20. This can lead to unexpected behavior. You should consider increasing ``config.max_length`` or ``max_length``.


    Name the capital city for a given country.
    The capital of Italia is     


In [27]:
# One-shot Learning
while True:
    nation = input("국가를 입력: ")
    if not nation.strip():
        break
    prompt = f"""\
    Name the capital city for a given country.
    The capital of England is London
    The capital of {nation.strip()} is\
    """

    encodings = tokenizer(prompt, return_tensors='pt')
    outputs = model.generate(max_length=20, **encodings)
    print(tokenizer.decode(outputs[0]), flush=True)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 35, but ``max_length`` is set to 20. This can lead to unexpected behavior. You should consider increasing ``config.max_length`` or ``max_length``.


    Name the capital city for a given country.
    The capital of England is London
    The capital of France is     Paris


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 35, but ``max_length`` is set to 20. This can lead to unexpected behavior. You should consider increasing ``config.max_length`` or ``max_length``.


    Name the capital city for a given country.
    The capital of England is London
    The capital of Germany is     Berlin


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 36, but ``max_length`` is set to 20. This can lead to unexpected behavior. You should consider increasing ``config.max_length`` or ``max_length``.


    Name the capital city for a given country.
    The capital of England is London
    The capital of Italia is     Naples


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 35, but ``max_length`` is set to 20. This can lead to unexpected behavior. You should consider increasing ``config.max_length`` or ``max_length``.


    Name the capital city for a given country.
    The capital of England is London
    The capital of Japan is     Tokyo


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 35, but ``max_length`` is set to 20. This can lead to unexpected behavior. You should consider increasing ``config.max_length`` or ``max_length``.


    Name the capital city for a given country.
    The capital of England is London
    The capital of China is     Shanghai


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 35, but ``max_length`` is set to 20. This can lead to unexpected behavior. You should consider increasing ``config.max_length`` or ``max_length``.


    Name the capital city for a given country.
    The capital of England is London
    The capital of USA is     New


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 38, but ``max_length`` is set to 20. This can lead to unexpected behavior. You should consider increasing ``config.max_length`` or ``max_length``.


    Name the capital city for a given country.
    The capital of England is London
    The capital of United States of America is     New


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 35, but ``max_length`` is set to 20. This can lead to unexpected behavior. You should consider increasing ``config.max_length`` or ``max_length``.


    Name the capital city for a given country.
    The capital of England is London
    The capital of America is     New


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 35, but ``max_length`` is set to 20. This can lead to unexpected behavior. You should consider increasing ``config.max_length`` or ``max_length``.


    Name the capital city for a given country.
    The capital of England is London
    The capital of Canada is     Toronto


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 36, but ``max_length`` is set to 20. This can lead to unexpected behavior. You should consider increasing ``config.max_length`` or ``max_length``.


    Name the capital city for a given country.
    The capital of England is London
    The capital of North Korea is     Pyongyang


In [None]:
# Few-shot Learning
while True:
    nation = input("국가를 입력: ")
    if not nation.strip():
        break
    prompt = f"""\
    Name the capital city for a given country.
    The capital of England is London
    The capital of Japan is Tokyo
    The capital of China is Beijing
    The capital of {nation.strip()} is\
    """

    encodings = tokenizer(prompt, return_tensors='pt')
    outputs = model.generate(max_length=20, **encodings)
    print(tokenizer.decode(outputs[0]), flush=True)

In [None]:
'''
Prompt Tuning

The capital of England is London
England : London
Endland -> London
등등 어떻게 연결할지 결정

"천천히 생각해보자"라고 주었을 때 성능이 좋아졌다... 등등 새로운 시도도 이어지는 중 

'''

In [40]:
# 학습 시에 데이터를 주지 않았어도 새로 등장하는 단어 또한 효과적으로 예측하고 있는 것을 확인할 수 있음

review = input("리뷰를 입력해주세요: ")
prompt = f"""\
    Classify the given review into positive or negative.
    I don't like this movie. This review is negative.
    I love this movie. This review is positive.
    {review.strip()}. This review is\
    """
encodings = tokenizer(prompt.strip(), return_tensors='pt')
outputs = model.generate(max_length=40, **encodings)
print(tokenizer.decode(outputs[0]), flush=True)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 58, but ``max_length`` is set to 40. This can lead to unexpected behavior. You should consider increasing ``config.max_length`` or ``max_length``.


Classify the given review into positive or negative.
    I don't like this movie. This review is negative.
    I love this movie. This review is positive.
    Disgusting movie! Worst movie of the Year. This review is negative
