In [2]:
!pip install transformers datasets pandas matplotlib

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 14.9 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.0.0-py3-none-any.whl (325 kB)
[K     |████████████████████████████████| 325 kB 80.3 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 65.7 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 74.0 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 85.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 

In [3]:
import os
import re
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn

from random import sample
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM



In [4]:
dataset = load_dataset("nsmc")

Downloading builder script:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/807 [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset nsmc/default (download: 18.62 MiB, generated: 20.90 MiB, post-processed: Unknown size, total: 39.52 MiB) to /root/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/6.33M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.12M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/150000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset nsmc downloaded and prepared to /root/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
tokenizer = AutoTokenizer.from_pretrained(
    'kakaobrain/kogpt', revision='KoGPT6B-ryan1.5b-float16',  # or float32 version: revision=KoGPT6B-ryan1.5b
     bos_token='[BOS]', eos_token='[EOS]', unk_token='[UNK]', pad_token='[PAD]', mask_token='[MASK]'
    )

Downloading:   0%|          | 0.00/252 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.40M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

In [6]:
# torch version
class FewshotClassifier(nn.Module):
    def __init__(self, tokenizer):
        super().__init__()
        self.gpt = AutoModelForCausalLM.from_pretrained(
            'kakaobrain/kogpt', revision='KoGPT6B-ryan1.5b-float16',  # or float32 version: revision=KoGPT6B-ryan1.5b
            pad_token_id=tokenizer.eos_token_id, 
            torch_dtype='auto', low_cpu_mem_usage=True).to(device='cuda', non_blocking=True)
    
    def forward(self, x):
        outputs = self.gpt(x)[0][:, -1, :]
        
        return outputs

model = FewshotClassifier(tokenizer).cuda()

Downloading:   0%|          | 0.00/839 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/11.5G [00:00<?, ?B/s]

In [7]:
tr_fewshot_data = []

for ex in dataset['train']:
    tr_sent, tr_label = ex['document'], ex['label']
    tokens = tokenizer(tr_sent)['input_ids']
    if len(tokens) <= 25:
        tr_fewshot_data.append((tr_sent, tr_label))

tr_sample_size = 1000
train_fewshot_samples = []

for _ in range(tr_sample_size):
    fewshot_examples = sample(tr_fewshot_data, 30)
    train_fewshot_samples.append(fewshot_examples)


In [8]:
# def build_prompt_text(sent):
#     # return "문장: " + sent + ' 감정: '
#     return "문장: " + sent + '\n감정: '


def build_prompt_text(sent):
    return '영화 댓글의 감정을 분류해주는 모형\n댓글: ' + sent + '\n감정: ' # 0.81


def clean_text(sent):
    sent_clean = re.sub("[^가-힣ㄱ-ㅎㅏ-ㅣ\\s]", "", sent)
    return sent_clean

In [9]:
from tqdm import tqdm

os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

real_labels = []
pred_tokens = []
tst_sample_size = 100

for i, ex in enumerate(tqdm(dataset['test'].select(list(range(tst_sample_size))))):
    tst_sent, tst_label = ex['document'], ex['label']    
    tokens = tokenizer('[BOS]')['input_ids']
    
    for ex in train_fewshot_samples[i]:
        example_text, example_label = ex
        cleaned_example_text = clean_text(example_text)
        appended_prompt_example_text = build_prompt_text(cleaned_example_text)
        # appended_prompt_example_text += '긍정\n' if example_label == 1 else '부정\n'
        appended_prompt_example_text += '긍정' if example_label == 1 else '부정' + '\n' # 0.76
        # appended_prompt_example_text += '긍정' if example_label == 1 else '부정' #0.67
        tokens += tokenizer(appended_prompt_example_text)['input_ids']
        tokens += tokenizer('########\n\n\n')['input_ids'] #0.76
        # tokens += tokenizer('########\n\n')['input_ids'] #0.71


    cleaned_sent = clean_text(tst_sent)
    appended_prompt_sent = build_prompt_text(cleaned_sent)
    
    test_tokens = tokenizer(appended_prompt_sent)['input_ids']
    
    tokens += test_tokens
    model.eval()
    with torch.no_grad():

        # x = tokenizer.encode(hf_tokens)
        outputs = model(torch.tensor([tokens]).cuda())
        pred = torch.argmax(outputs, axis=-1)

        pred_tokens.append(pred.cpu().item())

    pos = tokenizer(' 긍정')['input_ids']
    negative = tokenizer(' 부정')['input_ids']            
    
    label = pos if tst_label == 1 else negative      
    real_labels.append(label[0])
    

100%|██████████| 100/100 [00:30<00:00,  3.29it/s]


In [10]:
accuracy_match = [p == t for p, t in zip(pred_tokens, real_labels)]
accuracy = len([m for m in accuracy_match if m]) / len(real_labels)

print(accuracy)

0.86
