In [None]:
#!pip install transformers
#!pip install datasets

In [None]:
# Import the required libraries
from datasets import load_dataset
from datasets import Dataset

import pandas as pd
import numpy as np
import datetime
from tqdm import tqdm

import torch
from torch.optim import AdamW
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, BertConfig
from transformers import pipeline
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

# 데이터 로드

In [None]:
!wget https://raw.githubusercontent.com/ukairia777/finance_sentiment_corpus/main/finance_data.csv

In [None]:
# Read the downloaded corpus into a DataFrame and report its row count.
df = pd.read_csv('finance_data.csv')
print('샘플의 개수 :', df.shape[0])

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Class balance of the sentiment labels. Series.value_counts() counts a single
# column directly; DataFrame.value_counts takes column labels as `subset`, not
# a Series.
df['labels'].value_counts()

In [None]:
# Encode the string sentiment labels as integer class ids. The dict keeps each
# label next to its id, and the explicit int64 cast pins the dtype instead of
# relying on replace() to downcast the object column; it also raises if an
# unexpected label string is left unmapped. Re-running the cell is a no-op.
label_to_id = {'neutral': 0, 'positive': 1, 'negative': 2}
df['labels'] = df['labels'].replace(label_to_id).astype('int64')
df.head()

In [None]:
# Convert the frame to a Hugging Face Dataset and hold out 20% of it as the
# test split (fixed seed so the split is repeatable).
dataset = Dataset.from_pandas(df)
split_dataset = dataset.train_test_split(test_size = 0.2, seed = 777)
train_cs, test_cs = split_dataset['train'], split_dataset['test']

In [None]:
# Display the training split summary.
train_cs

In [None]:
# Display the test split summary.
test_cs

In [None]:
# Carve the validation set out of the training portion of the first split.
# Splitting from split_dataset['train'] rather than from train_cs keeps this
# cell idempotent: train_cs is rebound below, so splitting train_cs itself
# would shrink the training set again on every re-run of the cell.
cs = split_dataset['train'].train_test_split(0.2, seed = 777)
train_cs = cs['train']
valid_cs = cs['test']

In [None]:
# Print the train / validation / test splits in that order.
for part in (train_cs, valid_cs, test_cs):
  print(part)

# 데이터 전처리

In [None]:
# Wrap every sentence in the [CLS] ... [SEP] markers before tokenization.
# str() guards against non-string cells in the 'kor_sentence' column.
train_sentence = ['[CLS] ' + str(text) + ' [SEP]' for text in train_cs['kor_sentence']]
valid_sentence = ['[CLS] ' + str(text) + ' [SEP]' for text in valid_cs['kor_sentence']]
test_sentence = ['[CLS] ' + str(text) + ' [SEP]' for text in test_cs['kor_sentence']]

In [None]:
# Pull the 'labels' column of each split, in train / validation / test order.
train_labels, valid_labels, test_labels = (
  part['labels'] for part in (train_cs, valid_cs, test_cs)
)

In [None]:
# Spot-check the first five wrapped training sentences.
train_sentence[:5]

In [None]:
# Spot-check the first five training labels.
train_labels[:5]

In [None]:
# Load the tokenizer published with the 'klue/bert-base' checkpoint.
tokenizer = BertTokenizer.from_pretrained('klue/bert-base')

In [None]:
max_len = 128

def data_to_tensor(sentences, labels, max_len):
  """Convert sentences and labels into fixed-length model-input tensors.

  Args:
    sentences: iterable of strings; this notebook passes text that is already
      wrapped as '[CLS] ... [SEP]'.
    labels: integer class ids, one per sentence (list-like, or a tensor when
      the calling cell is re-run).
    max_len: sequence length after truncation and padding.

  Returns:
    (tensor_inputs, tensor_labels, tensor_masks): token ids padded to max_len
    columns, the labels as a tensor, and a float mask that is 1.0 on real
    tokens and 0.0 on padding.
  """
  # Tokenize with the notebook-level BertTokenizer and map tokens to vocabulary ids.
  encoded = []
  for sent in sentences:
    ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sent))
    if len(ids) > max_len:
      # Plain tail truncation would cut off the closing token ([SEP] for the
      # wrapped sentences used here); keep it in the last slot instead.
      ids = ids[:max_len - 1] + ids[-1:]
    encoded.append(ids)

  # Use pad_sequences to right-pad every sequence with 0 up to max_len.
  input_ids = pad_sequences(encoded, maxlen = max_len, dtype = 'long', truncating = 'post', padding = 'post')

  # Build the attention mask from the true sequence lengths instead of testing
  # id > 0, so a real token whose id happens to be 0 is never masked out.
  lengths = np.array([len(ids) for ids in encoded], dtype = 'int64')
  attention_mask = (np.arange(max_len)[None, :] < lengths[:, None]).astype('float32')

  tensor_inputs = torch.tensor(input_ids)
  if torch.is_tensor(labels):
    # The calling cell rebinds its label names to tensors, so a re-run passes a tensor.
    tensor_labels = labels.clone().detach()
  else:
    # list() accepts plain lists as well as other iterable column containers.
    tensor_labels = torch.tensor(list(labels))
  tensor_masks = torch.tensor(attention_mask)

  return tensor_inputs, tensor_labels, tensor_masks

In [None]:
# Encode each split into (input ids, labels, attention mask) tensors.
# Note: train_labels / valid_labels / test_labels are rebound here from the raw
# label columns to tensors.
train_inputs, train_labels, train_masks = data_to_tensor(train_sentence, train_labels, max_len)
valid_inputs, valid_labels, valid_masks = data_to_tensor(valid_sentence, valid_labels, max_len)
test_inputs, test_labels, test_masks = data_to_tensor(test_sentence, test_labels, max_len)

In [None]:
# Inspect the first encoded test example: ids, decoded text, mask, length, label.
divider = '-' * 100
sample_ids = test_inputs[0]

print('정수인코딩결과:', sample_ids)
print(divider)
print('원본문장복원결과:', tokenizer.decode(sample_ids))
print(divider)
print('어텐션마스크:', test_masks[0])
print(divider)
print('샘플의길이:', len(sample_ids))
print(divider)
print('레이블:', test_labels[0])

In [None]:
batch_size = 32

# Bundle each split's (input ids, attention mask, label) tensors into a dataset.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
valid_data = TensorDataset(valid_inputs, valid_masks, valid_labels)
test_data = TensorDataset(test_inputs, test_masks, test_labels)

# Only the training split is visited in random order; validation and test
# batches are read sequentially.
train_sampler = RandomSampler(train_data)
valid_sampler = SequentialSampler(valid_data)
test_sampler = SequentialSampler(test_data)

train_dataloader = DataLoader(train_data, batch_size = batch_size, sampler = train_sampler)
valid_dataloader = DataLoader(valid_data, batch_size = batch_size, sampler = valid_sampler)
test_dataloader = DataLoader(test_data, batch_size = batch_size, sampler = test_sampler)

# GPU 확인

In [None]:
# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')
print(device)

# 모델 로드

In [None]:
num_labels = 3

# Seed torch before the model is built so that any randomly initialised
# weights and the later RandomSampler shuffling repeat across Restart & Run All.
# 777 matches the seed used for the dataset splits.
torch.manual_seed(777)

# Load the pretrained encoder with a 3-way sequence-classification head.
model = BertForSequenceClassification.from_pretrained('klue/bert-base', num_labels = num_labels)
model.to(device)

# 모델 학습

In [None]:
def metrics(predictions, labels):
  """Score predicted class ids against the true class ids.

  Args:
    predictions: predicted class ids.
    labels: true class ids, aligned with predictions.

  Returns:
    dict with 'accuracy', 'f1_macro', 'f1_micro' and 'f1_weighted'. The F1
    variants are computed with zero_division = 0.
  """
  # Result key -> value passed as f1_score's `average` argument.
  averaging_by_key = {'f1_macro' : 'macro',
                      'f1_micro' : 'micro',
                      'f1_weighted' : 'weighted'}

  scores = {'accuracy' : accuracy_score(labels, predictions)}
  for key, averaging in averaging_by_key.items():
    scores[key] = f1_score(labels, predictions, average = averaging, zero_division = 0)

  return scores

In [None]:
# Fine-tuning setup: number of passes over the training data, and AdamW over
# all model parameters with a 2e-5 learning rate.
epochs = 3
optimizer = AdamW(params = model.parameters(), lr = 2e-5)

In [None]:
def train_epoch(model, train_dataloader, optimizer, device):
  """Run one optimisation pass over train_dataloader.

  Args:
    model: model called with input ids, attention mask and labels; its output
      must expose `.loss`.
    train_dataloader: iterable of (input_ids, attention_mask, labels) batches.
    optimizer: optimizer stepped once per batch.
    device: device every batch tensor is moved to before the forward pass.

  Returns:
    Mean per-batch training loss, or NaN when the dataloader yields no batches.
  """
  model.train()

  total_loss = 0.0
  num_batches = 0

  # Wrap the dataloader itself: tqdm(enumerate(...)) would hand tqdm a
  # generator without a length, so the bar could not show a total.
  for batch in tqdm(train_dataloader, desc = 'Training Batch'):
    b_inputs_ids, b_inputs_mask, b_label = (t.to(device) for t in batch)

    optimizer.zero_grad()
    outputs = model(b_inputs_ids, token_type_ids = None, attention_mask = b_inputs_mask, labels = b_label)

    loss = outputs.loss
    loss.backward()
    optimizer.step()

    total_loss += loss.item()
    num_batches += 1

  # Avoid ZeroDivisionError on an empty dataloader.
  if num_batches == 0:
    return float('nan')

  return total_loss / num_batches

In [None]:
def evaluation(model, valid_dataloader, device):
  """Evaluate the model on a dataloader without updating its weights.

  Args:
    model: model called with input ids, attention mask and labels; its output
      must expose `.loss` and `.logits`.
    valid_dataloader: iterable of (input_ids, attention_mask, labels) batches.
    device: device every batch tensor is moved to before the forward pass.

  Returns:
    (mean per-batch loss, metrics dict computed over all predictions).
  """
  model.eval()

  running_loss = 0
  predicted_ids, gold_ids = [], []

  for batch in valid_dataloader:
    b_inputs_ids, b_inputs_mask, b_labels = (t.to(device) for t in batch)

    # Forward pass only; no gradients are tracked.
    with torch.no_grad():
      outputs = model(b_inputs_ids, token_type_ids = None, attention_mask = b_inputs_mask, labels = b_labels)

    batch_loss = outputs.loss
    if batch_loss is not None:
      running_loss += batch_loss.item()

    # The predicted class is the index of the largest logit in each row.
    batch_logits = outputs.logits.detach().cpu().numpy()
    predicted_ids.extend(np.argmax(batch_logits, axis = 1).flatten())
    gold_ids.extend(b_labels.to('cpu').numpy().flatten())

  eval_scores = metrics(predicted_ids, gold_ids)

  return running_loss / len(valid_dataloader), eval_scores

In [None]:
# Train for `epochs` passes, validate after each one, and checkpoint the
# weights whenever the validation loss improves on the best seen so far.
min_val_loss = float('inf')

for epoch in range(epochs):
  print('======== Epoch {:} / {:} ========'.format(epoch + 1, epochs))

  # Keep the mean training loss returned by train_epoch so it can be compared
  # with the validation loss.
  train_loss = train_epoch(model, train_dataloader, optimizer, device)
  print(" Training Loss: {0:.2f}".format(train_loss))

  print("\nRunning Validation...")

  val_loss, eval_metrics = evaluation(model, valid_dataloader, device)
  print(" Validation Loss: {0:.2f}".format(val_loss))
  print(" Accuracy: {0:.2f}".format(eval_metrics['accuracy']))
  print(" F1 Macro: {0:.2f}".format(eval_metrics['f1_macro']))
  print(" F1 Micro: {0:.2f}".format(eval_metrics['f1_micro']))
  print(" F1 Weighted: {0:.2f}".format(eval_metrics['f1_weighted']))

  if val_loss < min_val_loss:
    print(f"Validation loss decreased ({min_val_loss:.2f}--> {val_loss:.2f}). Saving model ...")
    torch.save(model.state_dict(), 'model_checkpoint.pt')
    min_val_loss = val_loss

In [None]:
# Restore the best checkpoint written by the training loop. map_location = device
# lets a checkpoint saved from a GPU run load on a CPU-only runtime as well.
model.load_state_dict(torch.load('model_checkpoint.pt', map_location = device))

# Score the restored model on the held-out test split. The names val_loss and
# eval_metrics are reused here and now hold test-set values.
val_loss, eval_metrics = evaluation(model, test_dataloader, device)
print(" Test Loss: {0:.2f}".format(val_loss))
print(" Accuracy: {0:.2f}".format(eval_metrics['accuracy']))
print(" F1 Macro: {0:.2f}".format(eval_metrics['f1_macro']))
print(" F1 Micro: {0:.2f}".format(eval_metrics['f1_micro']))
print(" F1 Weighted: {0:.2f}".format(eval_metrics['f1_weighted']))

# 추론하기

In [None]:
# Pipeline that reports the softmax score of every class. Use the `device`
# chosen earlier instead of a hard-coded GPU index 0, so the cell also runs on
# CPU-only runtimes.
# NOTE(review): return_all_scores is reportedly deprecated in newer transformers
# releases in favour of top_k = None — confirm before switching, the output nesting may differ.
pipe = pipeline('text-classification', model = model.to(device), tokenizer = tokenizer, device = device, max_length = 512, return_all_scores = True, function_to_apply = 'softmax')

In [None]:
# Run the pipeline on one example sentence and print the raw output.
sample_text = 'SK하이닉스가 매출이 급성장하였다'
result = pipe(sample_text)
print(result)

In [None]:
# Rebuild the pipeline without return_all_scores; this is the `pipe` that
# prediction() reads result[0]['label'] from. Use the `device` chosen earlier
# instead of a hard-coded GPU index 0, so the cell also runs on CPU-only runtimes.
pipe = pipeline('text-classification', model = model.to(device), tokenizer = tokenizer, device = device, max_length = 512, function_to_apply = 'softmax')
result = pipe('SK하이닉스가 매출이 급성장하였다')
print(result)

In [None]:
# Pipeline label name -> Korean display name (neutral / positive / negative).
label_dict = {'LABEL_0' : '중립', 'LABEL_1' : '긍정', 'LABEL_2' : '부정'}

def prediction(text):
  """Classify `text` with the notebook-level `pipe`.

  Returns a one-element list holding the display name of the first result's label.
  """
  first_result = pipe(text)[0]
  predicted_label = first_result['label']
  return [label_dict[predicted_label]]

In [None]:
# Ad-hoc inference example; the spacing inside the input string is as authored.
prediction('네 이 버가 매출이 급성장하였다')

In [None]:
# Ad-hoc inference example; the spacing inside the input string is as authored.
prediction('ChatGPT의 등 장으로 인공지능 스타트업들은 위기다')

In [None]:
# Ad-hoc inference example; the spacing inside the input string is as authored.
prediction('인 공 지능 기술의 발전으로 누군가는 기회를 얻을 것이고, 누군가는 얻지 못할 것이다')