In [None]:
import datetime
import torch
import pandas as pd
import numpy as np

from tqdm import tqdm
from torch.optim import AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, BertConfig
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

In [None]:
# Download finance_data.csv from the finance_sentiment_corpus repository into the working directory.
!wget https://raw.githubusercontent.com/ukairia777/finance_sentiment_corpus/main/finance_data.csv

In [None]:
# Load the downloaded CSV into a DataFrame and preview its first rows.
df = pd.read_csv('finance_data.csv')
df.head()

In [None]:
# Count how many rows carry each distinct value of the `labels` column.
df.labels.value_counts()

In [None]:
# Encode the string sentiment labels as integer class ids.
# An explicit mapping plus a cast avoids relying on the implicit object -> int
# downcasting of Series.replace, which pandas has deprecated. Values that are not
# keys of the mapping (e.g. ids that were already encoded when this cell is re-run)
# pass through unchanged, so the cell is safe to execute more than once; the final
# cast fails fast if anything other than an integer id is left in the column.
label_to_id = {'neutral': 0, 'positive': 1, 'negative': 2}
df['labels'] = df['labels'].map(lambda label: label_to_id.get(label, label)).astype('int64')

In [None]:
# Preview the first rows again, after the label encoding step.
df.head()

In [None]:
# Write the frame back to finance_data.csv without the index column, encoded as UTF-8 with BOM.
# NOTE(review): this overwrites the file that was read above — confirm replacing the raw download is intended.
df.to_csv('finance_data.csv', index=False, encoding='utf-8-sig')

In [None]:
# Wrap the DataFrame in a `datasets.Dataset` so its train_test_split helper can be used below.
all_data = Dataset.from_pandas(df)

In [None]:
# Hold out 20% of all rows as the test split; the fixed seed keeps the split repeatable.
cs = all_data.train_test_split(test_size = 0.2, seed = 777)
train, test = cs['train'], cs['test']

In [None]:
# Split the remaining training rows again: 20% of them become the validation split.
cs2 = train.train_test_split(test_size = 0.2, seed = 777)
train, valid = cs2['train'], cs2['test']

In [None]:
# Print the summary of each split (train, validation, test).
print(train)
print(valid)
print(test)

In [None]:
# Pull the `kor_sentence` column out of each split as a plain Python list.
train_sentences, valid_sentences, test_sentences = (
    list(split['kor_sentence']) for split in (train, valid, test)
)

In [None]:
# Pull the `labels` column out of each split.
train_labels, valid_labels, test_labels = (
    split['labels'] for split in (train, valid, test)
)

In [None]:
# Peek at the first five test sentences.
test_sentences[:5]

In [None]:
# Peek at the first five test labels.
test_labels[:5]

In [None]:
# Load the tokenizer published with the skt/kogpt2-base-v2 checkpoint.
tokenizer = AutoTokenizer.from_pretrained('skt/kogpt2-base-v2')

In [None]:
# Fixed per-sentence length (in token ids) passed to data_to_tensor for padding / truncation.
max_len = 128

def data_to_tensor(sentences, labels, max_len):
  """Tokenize sentences and pack them into fixed-length tensors.

  Uses the module-level `tokenizer`.

  Args:
    sentences: Iterable of raw text strings.
    labels: Label sequence accepted by torch.tensor, aligned with `sentences`.
    max_len: Number of token ids kept per sentence. Longer sequences are cut at
      the end; shorter ones are right-padded with the pad token id.

  Returns:
    A tuple (input ids, labels, attention mask). The mask is a float tensor
    with 1.0 on the positions holding real tokens and 0.0 on padding.
  """
  token_ids = [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sent)) for sent in sentences]

  pad_token = tokenizer.encode('<pad>')[0]
  input_ids = pad_sequences(token_ids, maxlen = max_len, value = pad_token, dtype = 'long', truncating = 'post', padding = 'post')

  # Derive the mask from how many real tokens each row kept instead of comparing
  # every id with the pad id: a genuine token that happens to share the pad id
  # stays attended, and the per-element Python loop is replaced by one broadcast.
  kept_lengths = torch.tensor([min(len(ids), max_len) for ids in token_ids], dtype = torch.long)
  tensor_masks = (torch.arange(max_len).unsqueeze(0) < kept_lengths.unsqueeze(1)).float()

  tensor_inputs = torch.tensor(input_ids)
  tensor_labels = torch.tensor(labels)

  return tensor_inputs, tensor_labels, tensor_masks

In [None]:
# Encode every split into (input ids, labels, attention mask) tensors.
# The *_labels names are rebound here from the raw label columns to label tensors.
train_inputs, train_labels, train_masks = data_to_tensor(train_sentences, train_labels, max_len)
valid_inputs, valid_labels, valid_masks = data_to_tensor(valid_sentences, valid_labels, max_len)
test_inputs, test_labels, test_masks = data_to_tensor(test_sentences, test_labels, max_len)

In [None]:
# Sanity-check the first encoded test sample: token ids, the text decoded back
# from those ids, the attention mask, the sequence length and the label.
print('정수인코딩결과:',test_inputs[0])
print('-'*100)
print('원본문장복원결과:', tokenizer.decode(test_inputs[0]))
print('-'*100)
print('어텐션마스크:', test_masks[0])
print('-'*100)
print('샘플의길이:', len(test_inputs[0]))
print('-'*100)
print('레이블:', test_labels[0])

In [None]:
batch_size = 32

# Training batches are drawn in random order.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_loader = DataLoader(train_data, sampler = train_sampler, batch_size = batch_size)

# Evaluation splits are read in order: shuffling brings nothing at evaluation
# time and only makes the batch composition differ from run to run.
valid_data = TensorDataset(valid_inputs, valid_masks, valid_labels)
valid_sampler = SequentialSampler(valid_data)
valid_loader = DataLoader(valid_data, sampler = valid_sampler, batch_size = batch_size)

# The test loader now uses the same sequential sampling as the validation loader.
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_loader = DataLoader(test_data, sampler = test_sampler, batch_size = batch_size)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Load the KoGPT2 checkpoint with a sequence-classification head that has
# `num_labels` outputs, then move the model to the selected device.
# NOTE(review): GPT-2 classification heads need config.pad_token_id to handle
# batches larger than one — confirm the checkpoint's config provides it.
num_labels = 3
model = AutoModelForSequenceClassification.from_pretrained('skt/kogpt2-base-v2', num_labels = num_labels)
model.to(device)

In [None]:
# AdamW over all model parameters with a learning rate of 2e-5.
optimizer = AdamW(model.parameters(), lr = 2e-5)

In [None]:
def metrics(prediction, labels):
  """Score predicted class ids against the reference labels.

  Args:
    prediction: Predicted class ids.
    labels: Ground-truth class ids, aligned with `prediction`.

  Returns:
    Dict with the keys 'accuracy', 'f1_macro', 'f1_micro' and 'f1_weight'.
    The F1 scores are computed with zero_division = 0.
  """
  scores = {'accuracy' : accuracy_score(labels, prediction)}

  # One F1 entry per averaging mode, inserted in the order macro, micro, weighted.
  for key, mode in (('f1_macro', 'macro'), ('f1_micro', 'micro'), ('f1_weight', 'weighted')):
    scores[key] = f1_score(labels, prediction, average = mode, zero_division = 0)

  return scores

In [None]:
def train_epoch(model, train_loader, optimizer, device):
  """Run one training pass over `train_loader` and return the mean batch loss.

  Args:
    model: Called as model(input_ids, attention_mask=..., token_type_ids=None,
      labels=...); the returned object must expose `.loss`.
    train_loader: Sized iterable of (input_ids, attention_mask, labels) batches.
    optimizer: Optimizer providing zero_grad() and step().
    device: Device every batch tensor is moved to before the forward pass.

  Returns:
    Sum of the per-batch losses divided by the number of batches.

  Raises:
    ZeroDivisionError: If `train_loader` is empty.
  """
  model.train()
  total_loss = 0

  # Wrap the loader itself rather than enumerate(loader): enumerate hides the
  # loader's length, which leaves the progress bar without a total or an ETA.
  # The step index was never used.
  for batch in tqdm(train_loader, desc = "Training Batch"):
    b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

    outputs = model(b_input_ids, attention_mask = b_input_mask, token_type_ids = None, labels = b_labels)
    loss = outputs.loss

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    total_loss += loss.item()

  return total_loss / len(train_loader)

In [None]:
def evaluation(model, valid_loader, device):
  """Evaluate `model` on every batch of `valid_loader` without tracking gradients.

  Args:
    model: Called as model(input_ids, attention_mask=..., token_type_ids=None,
      labels=...); the returned object must expose `.loss` and `.logits`.
    valid_loader: Sized iterable of (input_ids, attention_mask, labels) batches.
    device: Device every batch tensor is moved to before the forward pass.

  Returns:
    A tuple (mean batch loss, metrics dict). The metrics dict is whatever
    `metrics` returns for the arg-max predictions and the true labels.
  """
  model.eval()
  loss_sum = 0
  y_pred, y_true = [], []

  for batch in valid_loader:
    input_ids, input_mask, labels = (t.to(device) for t in batch)

    with torch.no_grad():
      outputs = model(input_ids, attention_mask = input_mask, token_type_ids = None, labels = labels)

    # Batches whose output carries no loss contribute nothing to the sum.
    if outputs.loss is not None:
      loss_sum += outputs.loss.item()

    batch_logits = outputs.logits.detach().cpu().numpy()

    # The predicted class is the index of the largest logit in each row.
    y_pred.extend(np.argmax(batch_logits, axis = 1).flatten())
    y_true.extend(labels.to('cpu').numpy().flatten())

  return loss_sum / len(valid_loader), metrics(y_pred, y_true)

In [None]:
epochs = 3
# Lowest validation loss seen so far; starts at infinity so the first epoch always saves.
min_val_loss = float('inf')

for epoch in range(0, epochs):
  print('======== Epoch {:} / {:} ========'.format(epoch + 1, epochs))

  # train_epoch returns the mean training loss; report it instead of discarding
  # it so it can be compared with the validation loss below.
  avg_train_loss = train_epoch(model, train_loader, optimizer, device)
  print(" Training Loss: {0:.2f}".format(avg_train_loss))

  print("\nRunning Validation...")
  avg_valid_loss, eval_metrics = evaluation(model, valid_loader, device)
  print(" Validation Loss: {0:.2f}".format(avg_valid_loss))
  print(" Accuracy: {0:.2f}".format(eval_metrics['accuracy']))
  print(" F1 Macro: {0:.2f}".format(eval_metrics['f1_macro']))
  print(" F1 Micro: {0:.2f}".format(eval_metrics['f1_micro']))
  print(" F1 Weighted: {0:.2f}".format(eval_metrics['f1_weight']))

  # Checkpoint only when the validation loss improves on the best value so far.
  if avg_valid_loss < min_val_loss:
    print(f"Validation loss decreased ({min_val_loss:.2f}--> {avg_valid_loss:.2f}). Saving model ...")
    torch.save(model.state_dict(), 'best_model.pt')
    min_val_loss = avg_valid_loss

In [None]:
# Restore the checkpoint saved by the training loop. map_location keeps the load
# working when the weights were saved from a different device than `device`.
model.load_state_dict(torch.load('best_model.pt', map_location = device))

# Report loss and classification metrics on the held-out test split.
avg_loss, eval_metrics = evaluation(model, test_loader, device)
print(" Test Loss: {0:.2f}".format(avg_loss))
print(" Accuracy: {0:.2f}".format(eval_metrics['accuracy']))
print(" F1 Macro: {0:.2f}".format(eval_metrics['f1_macro']))
print(" F1 Micro: {0:.2f}".format(eval_metrics['f1_micro']))
print(" F1 Weighted: {0:.2f}".format(eval_metrics['f1_weight']))

In [None]:
from transformers import pipeline

# Text-classification pipeline around the fine-tuned model that returns the
# softmax score of every label.
# `device = device` reuses the CUDA-or-CPU choice made earlier instead of the
# hard-coded GPU index 0, so the cell also runs on machines without a GPU.
pipe = pipeline('text-classification', model = model.to(device), tokenizer = tokenizer, device = device, max_length = 512, return_all_scores = True, function_to_apply = 'softmax')

In [None]:
# Text-classification pipeline around the fine-tuned model with softmax-normalised scores.
# `device = device` reuses the CUDA-or-CPU choice made earlier instead of the
# hard-coded GPU index 0, so the cell also runs on machines without a GPU.
pipe = pipeline('text-classification', model = model.to(device), tokenizer = tokenizer, device = device, max_length = 512, function_to_apply = 'softmax')

In [None]:
# Run the pipeline on one sample sentence and print its raw output.
result = pipe('SK하이닉스가 매출이 급성장하였다')
print(result)

In [None]:
# Map the pipeline's LABEL_n names to Korean sentiment names (neutral, positive, negative).
label_dict = {'LABEL_0' : '중립', 'LABEL_1' : '긍정', 'LABEL_2' : '부정'}

In [None]:
def prediction(text):
  """Classify `text` with the module-level `pipe` and return its sentiment name.

  Args:
    text: Input passed unchanged to `pipe`.

  Returns:
    A one-element list holding the `label_dict` entry for the label of the
    first result returned by `pipe`.
  """
  first_result = pipe(text)[0]
  sentiment = label_dict[first_result['label']]
  return [sentiment]

In [None]:
# Try the classifier on a sample sentence.
prediction('네이버가 매출이 급성장하였다')

In [None]:
# Try the classifier on a second sample sentence.
prediction('ChatGPT의 등장으로 인공지능 스타트업들은 위기다')

In [None]:
# Try the classifier on a third sample sentence.
prediction('인 공 지능 기술의 발전으로 누군가는 기회를 얻을 것이고, 누군가는 얻지 못할 것이다')