# 학습데이터, 테스트데이터 만들기

In [6]:
import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm
import random
import os

def reset_seeds(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [7]:
train_ft = pd.read_excel('train_tmp.xlsx')
test_ft = pd.read_excel('test_tmp.xlsx')

train_ft.shape,test_ft.shape

((4112, 8), (1028, 8))

In [8]:
train_price = train_ft.loc[:,['리뷰','가격.1_긍부정']].copy()
train_price = train_price[train_price['가격.1_긍부정'].isin([-1,1])]

test_price = test_ft.loc[:,['리뷰','가격.1_긍부정']].copy()
test_price = test_price[test_price['가격.1_긍부정'].isin([-1,1])]

train_price.shape,test_price.shape

((462, 2), (116, 2))

In [9]:
train_price['가격.1_긍부정'].value_counts() 
# 1의 비율이 훨씬 더 높은 수준임을 알 수 있음 

가격.1_긍부정
 1    410
-1     52
Name: count, dtype: int64

In [10]:
train_price.columns

Index(['리뷰', '가격.1_긍부정'], dtype='object')

In [11]:
train_arr = train_price['리뷰'].to_numpy()
test_arr = test_price['리뷰'].to_numpy()

train_arr.shape,test_arr.shape

((462,), (116,))

In [12]:
target = train_price['가격.1_긍부정'].to_numpy().reshape(-1,1)
target = np.where(target == -1,0,target)

target_test = test_price['가격.1_긍부정'].to_numpy().reshape(-1,1)
target_test = np.where(target_test == -1,0,target_test)

target = target.astype('int64')
target_test = target_test.astype('int64')

target.shape,target_test.shape

((462, 1), (116, 1))

In [13]:
target.mean()

0.8874458874458875

# 사전학습 모델 선정

In [8]:
model_name = 'kykim/bert-kor-base'

In [9]:
from transformers import AutoTokenizer, AutoModel

In [10]:
model = AutoModel.from_pretrained(model_name)

  return self.fget.__get__(instance, owner)()


In [11]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 사전학습모델을 바탕으로 학습 실행

In [12]:
class ReviewDataset(torch.utils.data.Dataset):
    def __init__(self, tokenizer, x, y=None):
        self.tokenizer = tokenizer
        self.x = x
        self.y = y
    def __len__(self):
        return len(self.x)
    def __getitem__(self, idx):
        item = {}
        item["x"] = self.get_tokenizer(self.x[idx])
        if self.y is not None:
            item["y"] = torch.Tensor(self.y[idx])
        return item
    def get_tokenizer(self, text):
        x = self.tokenizer(text, padding="max_length", truncation=True)
        for k, v in x.items():
            x[k] = torch.tensor(v)
        return x

In [13]:
dt = ReviewDataset(tokenizer, train_arr, target)
dl = torch.utils.data.DataLoader(dt, batch_size=2, shuffle=False)
batch = next(iter(dl))
batch

{'x': {'input_ids': tensor([[    2, 14668, 25103,  ...,     0,     0,     0],
         [    2, 20452,  5938,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])},
 'y': tensor([[1.],
         [1.]])}

In [14]:
class Net(torch.nn.Module):
    def __init__(self, model_name):
        super().__init__()
        self.pre_model = AutoModel.from_pretrained(model_name)
        self.fc_out = torch.nn.Linear( self.pre_model.config.hidden_size, 1)

    def forward(self, x):
        x = self.pre_model(**x)
        # x[0]: 모든 시점의 히든출력 batch, seq, features
        # x[1]: CLS 토큰의 히든출력 batch, features
        return self.fc_out(x[1])

In [15]:
model = Net(model_name)
model(batch["x"])

tensor([[0.4492],
        [0.5330]], grad_fn=<AddmmBackward0>)

In [16]:
def train_loop(dataloader, model, loss_fn, optimizer, device):
    epoch_loss = 0
    model.train() # 학습 모드
    for batch in tqdm(dataloader):
        pred = model( batch["x"].to(device) )
        loss = loss_fn( pred, batch["y"].to(device) )

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    epoch_loss /= len(dataloader)
    return epoch_loss

In [17]:
@torch.no_grad()
def test_loop(dataloader, model, loss_fn, device):
    epoch_loss = 0
    pred_list = []
    act_func = torch.nn.Sigmoid()
    model.eval() # 평가 모드
    for batch in tqdm(dataloader):
        pred = model( batch["x"].to(device) )
        if batch.get("y") is not None:
            loss = loss_fn( pred, batch["y"].to(device) )
            epoch_loss += loss.item()

        pred = act_func(pred) # logit 값을 확률로 변환
        pred = pred.to("cpu").numpy() # cpu 이동후 ndarray 로변환
        pred_list.append(pred)

    epoch_loss /= len(dataloader)
    pred = np.concatenate(pred_list)
    return epoch_loss, pred

In [18]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

n_splits = 5
cv = KFold(n_splits, shuffle=True, random_state=42)

batch_size = 8 # 배치 사이즈
loss_fn = torch.nn.BCEWithLogitsLoss() # 손실 객체
epochs = 100 # 최대 가능한 에폭수

In [19]:
is_holdout = False
reset_seeds(42) # 재현을 위해 시드고정
best_score_list = []
for i, (tri, vai) in enumerate( cv.split(train_arr) ):
    # 학습용 데이터로더 객체
    train_dt = ReviewDataset(tokenizer, train_arr[tri], target[tri])
    train_dl = torch.utils.data.DataLoader(train_dt, batch_size=batch_size, shuffle=True)

    # 검증용 데이터로더 객체
    valid_dt = ReviewDataset(tokenizer, train_arr[vai], target[vai])
    valid_dl = torch.utils.data.DataLoader(valid_dt, batch_size=batch_size, shuffle=False)

    # 모델 객체와 옵티마이저 객체 생성
    model = Net(model_name).to(device)
    optimizer = torch.optim.Adam( model.parameters(), lr=2e-5 )

    best_score = 0 # 현재 최고 점수
    patience = 0 # 조기 종료 조건을 주기 위한 변수
    for epoch in range(epochs):
        train_loss = train_loop(train_dl, model, loss_fn, optimizer, device)
        valid_loss, pred = test_loop(valid_dl, model, loss_fn, device)

        pred = (pred > 0.5).astype(int) # 이진분류 문제에서 클래스 번호 결정
        score = f1_score(target[vai], pred,average = 'macro')

        print(train_loss, valid_loss, score)
        if score > best_score:
            best_score = score # 최고 점수 업데이트
            patience = 0
            torch.save(model.state_dict(), f"price_{i}.pth") # 최고 점수 모델 가중치 저장

        patience += 1
        if patience == 5:
            break

    print(f"{i}번째 폴드 최고 정확도: {best_score}")
    best_score_list.append(best_score)

    if is_holdout:
        break

  return self.fget.__get__(instance, owner)()


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.3583801406178069 0.4334930572658777 0.8602150537634409


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.24363269585561245 0.31052067254980403 0.8924731182795699


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.10832024278475884 0.4496066488791257 0.8709677419354839


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.03125903453737339 0.7086109928883767 0.8602150537634409


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.02578498403779528 0.3970433760356779 0.9032258064516129


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.01807189047921132 0.43698957793336984 0.9139784946236559


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.007603922125665431 0.7441470178843398 0.8817204301075269


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.013333435948105885 0.37897009029984474 0.9139784946236559


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.03191562883303243 0.7899211985107589 0.8709677419354839


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.007862374493654104 0.6055071268686637 0.9032258064516129
0번째 폴드 최고 정확도: 0.9139784946236559


  return self.fget.__get__(instance, owner)()


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.34691262047024485 0.2991925496608019 0.9032258064516129


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.2124374587326608 0.26079009504367906 0.9139784946236559


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.07503627043494836 0.2409032644548764 0.946236559139785


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.019429641775786877 0.1795294237866377 0.967741935483871


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.004328062534926736 0.20643846970051527 0.967741935483871


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.0028127772743476833 0.22688825369308083 0.967741935483871


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.0020530249804575395 0.23820275360291512 0.956989247311828


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.0014655845205081587 0.24440486085950397 0.956989247311828
1번째 폴드 최고 정확도: 0.967741935483871


  return self.fget.__get__(instance, owner)()


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.38064850161367275 0.2807426080107689 0.9239130434782609


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.2693404570380424 0.1951873398696383 0.9456521739130435


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.15902134054835806 0.15659795763591924 0.9456521739130435


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.03964704455451128 0.13643201052521667 0.9565217391304348


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.012453526877699064 0.15312239772174507 0.9456521739130435


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.0064003929902026625 0.2182655846020983 0.9347826086956522


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.04615568788226773 0.1237656483038639 0.9565217391304348


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.018707671589119002 0.0907445468280154 0.967391304347826


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.016926391560465416 0.09493386368073213 0.967391304347826


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.0024154472110317427 0.09703882498433813 0.967391304347826


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.001578280688610245 0.08933311245345976 0.9782608695652174


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.001215714650436681 0.0883418628606402 0.9782608695652174


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.0009610133624556375 0.08904912400127311 0.9782608695652174


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.0008121120713730442 0.09050013492863702 0.9782608695652174


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.0007002080519475598 0.09263163553502334 0.9782608695652174
2번째 폴드 최고 정확도: 0.9782608695652174


  return self.fget.__get__(instance, owner)()


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.3759565977181526 0.3084781523793936 0.8913043478260869


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.2524085196250297 0.28472511827324826 0.9021739130434783


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.09730912824260428 0.3614267098406951 0.9021739130434783


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.02455599761587825 0.3632799667151024 0.9239130434782609


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.021350802923057306 0.417523779188438 0.9130434782608695


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.0038241137294693197 0.4363640025937154 0.9130434782608695


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.002593138193255885 0.4551824963709805 0.9130434782608695


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.001954581729691238 0.462141692158184 0.9130434782608695
3번째 폴드 최고 정확도: 0.9239130434782609


  return self.fget.__get__(instance, owner)()


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.36443577841558356 0.3936195522546768 0.8586956521739131


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.2541521997686396 0.26424612756818533 0.8913043478260869


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.1184738481298406 0.3653752865890662 0.8695652173913043


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.06916702510987191 0.1497174883261323 0.9347826086956522


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.009052752710363649 0.16042176343034953 0.9456521739130435


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.003405786931831786 0.18198169392417185 0.9456521739130435


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.0023793097303070605 0.1925067825941369 0.9347826086956522


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.0017117740378338605 0.2072198363894131 0.9456521739130435


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

0.0014029735763695962 0.22151225800917018 0.9456521739130435
4번째 폴드 최고 정확도: 0.9456521739130435


# 테스트 데이터 예측 

In [20]:
test_dt = ReviewDataset(tokenizer,test_arr)
test_dl = torch.utils.data.DataLoader(test_dt, batch_size=2, shuffle=False)

In [21]:
pred_list = []
for i in range(n_splits):
    model = Net(model_name).to(device)
    state_dict = torch.load(f"price_{i}.pth")
    model.load_state_dict(state_dict)

    _, pred = test_loop(test_dl, model, loss_fn, device)

    pred_list.append(pred)
    if is_holdout:
        break

  return self.fget.__get__(instance, owner)()


  0%|          | 0/58 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

In [22]:
pred = np.mean(pred_list, axis=0)
pred = (pred > 0.5).astype(int)

In [23]:
score = f1_score(target_test, pred,average = 'macro')
score

0.9396551724137931