In [1]:
import os
from google.colab import drive
# Mount Google Drive into the Colab filesystem so the dataset and the saved
# model under /content/drive/ are reachable from later cells.
drive.mount(mountpoint='/content/drive/')

Mounted at /content/drive/


In [3]:
cd/content/drive/MyDrive/Colab Notebooks

/content/drive/MyDrive/Colab Notebooks


In [4]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 15.3 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 54.5 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 73.5 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [5]:
!pip install tensorflow_addons

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow_addons
  Downloading tensorflow_addons-0.19.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 14.6 MB/s 
Installing collected packages: tensorflow-addons
Successfully installed tensorflow-addons-0.19.0


In [6]:
import os
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
import urllib.request
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow_addons as tfa
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, \
                            roc_auc_score, confusion_matrix, classification_report, \
                            matthews_corrcoef, cohen_kappa_score, log_loss

In [7]:
# Load the many-to-one (sequence -> single label) classification model.
MODEL_NAME = "klue/bert-base"

# from_pt=True: the checkpoint is stored as PyTorch weights and is converted
# into the TensorFlow model on load. num_labels=3 sizes the classifier head,
# which is newly initialized (see the warning printed by this cell).
model = TFBertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=3,
    from_pt=True,
)

# Tokenizer that matches the checkpoint above.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/425 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading:   0%|          | 0.00/248k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/289 [00:00<?, ?B/s]

In [25]:
# Maximum length (in tokens) of every encoded input sentence.
MAX_SEQ_LEN = 64

# Convert raw sentences into the array inputs fed to BERT.
def convert_data(X_data):
    """Encode sentences as token-id, attention-mask and segment-id arrays.

    Each sentence is encoded with the module-level ``tokenizer``, truncated
    and padded to exactly ``MAX_SEQ_LEN`` token ids.

    Args:
        X_data: Iterable of sentences to encode (for example a pandas Series
            of strings).

    Returns:
        A list ``[tokens, masks, segments]`` of NumPy arrays with one row per
        sentence:
          * tokens   - token ids produced by ``tokenizer.encode``.
          * masks    - 1 where the token is real, 0 where it is padding.
          * segments - all zeros, because every input is a single sentence.
    """
    # Ask the tokenizer which id marks padding instead of hard-coding 0, and
    # build the mask position by position so it does not rely on counting
    # zeros or on padding being appended at the end.
    pad_id = tokenizer.pad_token_id

    tokens, masks, segments = [], [], []

    for X in tqdm(X_data):
        # token: ids of the tokenized sentence, exactly MAX_SEQ_LEN long.
        token = tokenizer.encode(X, truncation=True, padding='max_length', max_length=MAX_SEQ_LEN)

        # mask: 1 for non-padding positions, 0 for padding positions.
        mask = [0 if token_id == pad_id else 1 for token_id in token]

        # segment: sentence A/B marker; only one sentence, so all zeros.
        segment = [0] * MAX_SEQ_LEN

        tokens.append(token)
        masks.append(mask)
        segments.append(segment)

    # Hand the three inputs back as NumPy arrays.
    return [np.array(tokens), np.array(masks), np.array(segments)]

In [26]:
# Path of the news-with-stock-code CSV on the mounted Google Drive.
Dataset = (
    '/content/drive/MyDrive/Colab Notebooks/dataset/'
    'News_With_StockCode.csv'
)

In [27]:
# Read the news CSV into a DataFrame and preview its first five rows.
dataset = pd.read_csv(filepath_or_buffer=Dataset)
dataset.head(n=5)

Unnamed: 0,date,title,content,code
0,2020-02-25 10:08:05,세계 경제에 이미 '팬데믹 공포' 고조,세계 경제에 이미 '팬데믹 공포' 고조 신종 코로나바이러스 감염증(코로나19)이 중...,a066570
1,2020-02-26 09:05:26,"코스피, 또 2% 급락 출발…2,060대로 후퇴","코스피, 또 2% 급락 출발…2,060대로 후퇴 코스피 또 2 급락 출발2060대...",a066570
2,2020-02-26 09:17:35,"코스피, 또 급락 출발…외인 매도에 2,060대로 후퇴","코스피, 또 급락 출발…외인 매도에 2,060대로 후퇴 코스피 하락 (PG)신종 ...",a066570
3,2020-02-26 09:36:23,"코스피, 또 급락 출발…외국인 매도에 2,060대로 후퇴(종합)","코스피, 또 급락 출발…외국인 매도에 2,060대로 후퇴(종합) 코스피 또다시 급락...",a000660
4,2020-02-26 16:53:28,"카카오뱅크, 코로나19 확산에 본사 직원 한시 재택근무","카카오뱅크, 코로나19 확산에 본사 직원 한시 재택근무 카카오뱅크카카오뱅크가 신종 ...",a035720


In [28]:
# The news headlines are the model input; encode them into the
# [tokens, masks, segments] arrays produced by convert_data.
x_data = dataset['title']
x_data_converted = convert_data(X_data=x_data)

100%|██████████| 1725/1725 [00:00<00:00, 2249.48it/s]


In [29]:
# Symbolic Keras inputs for token ids, attention mask and segment ids; each
# example is an int32 vector of length MAX_SEQ_LEN.
token_inputs = tf.keras.layers.Input(
    shape=(MAX_SEQ_LEN,),
    dtype=tf.int32,
    name='input_word_ids',
)
mask_inputs = tf.keras.layers.Input(
    shape=(MAX_SEQ_LEN,),
    dtype=tf.int32,
    name='input_masks',
)
segment_inputs = tf.keras.layers.Input(
    shape=(MAX_SEQ_LEN,),
    dtype=tf.int32,
    name='input_segment',
)

# Feed the three inputs to BERT in the same order convert_data returns them,
# and keep the first element of the model output for the layers that follow.
bert_outputs = model([token_inputs, mask_inputs, segment_inputs])
bert_output = bert_outputs[0]

In [30]:
# Settings for the classification head.
DROPOUT_RATE = 0.5
NUM_CLASS = 3

# Dropout applied to the BERT output.
dropout = tf.keras.layers.Dropout(rate=DROPOUT_RATE)(bert_output)

# Multi-class classification, so the final activation is softmax.
sentiment_layer = tf.keras.layers.Dense(
    units=NUM_CLASS,
    activation='softmax',
    kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
)(dropout)

# End-to-end model: three BERT inputs in, class probabilities out.
sentiment_model = tf.keras.Model(
    inputs=[token_inputs, mask_inputs, segment_inputs],
    outputs=sentiment_layer,
)

In [31]:
# Hyperparameters for the Rectified Adam optimizer.
OPTIMIZER_NAME = 'RAdam'
LEARNING_RATE = 5e-5
MIN_LR = 1e-5
TOTAL_STEPS = 10000
WARMUP_PROPORTION = 0.1
EPSILON = 1e-8
CLIPNORM = 1.0

optimizer = tfa.optimizers.RectifiedAdam(
    learning_rate=LEARNING_RATE,
    min_lr=MIN_LR,
    total_steps=TOTAL_STEPS,
    warmup_proportion=WARMUP_PROPORTION,
    epsilon=EPSILON,
    clipnorm=CLIPNORM,
)

# Compile the sentiment classifier; labels are integer class ids, hence the
# sparse categorical cross-entropy loss.
sentiment_model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)

In [32]:
# Load the previously trained model saved on Drive. The BERT class is passed
# via custom_objects so Keras can rebuild that layer from the .h5 file.
BEST_MODEL_NAME = '/content/drive/MyDrive/Colab Notebooks/dataset/best_model.h5'
sentiment_model_best = tf.keras.models.load_model(
    BEST_MODEL_NAME,
    custom_objects={'TFBertForSequenceClassification': TFBertForSequenceClassification},
)

# Sentiment analysis on the news headlines
# (neutral: 0, positive: 1, negative: 2).
predicted_value = sentiment_model_best.predict(x_data_converted)
# Pick the highest-scoring class for each headline.
predicted_label = predicted_value.argmax(axis=1)



In [34]:
import sys
import numpy as np
# Raise NumPy's summarization threshold to the largest integer so arrays are
# printed in full instead of being abbreviated with "...".
# NOTE: set_printoptions changes the setting for the rest of the session.
np.set_printoptions(threshold=sys.maxsize)

# Print the predicted class index of every headline.
print(predicted_label)

[2 2 2 2 2 1 2 0 1 0 0 2 0 2 2 2 2 0 0 1 0 2 0 0 1 0 0 0 2 2 0 0 2 0 1 0 1
 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 1 1
 1 2 1 0 0 2 2 0 2 0 1 1 0 2 0 0 0 0 1 0 1 1 2 0 0 0 0 0 0 0 0 0 0 0 0 2 1
 0 0 1 0 0 0 2 0 0 0 0 0 0 1 1 1 0 2 1 0 0 0 0 1 1 0 0 0 0 0 0 2 0 0 0 0 1
 0 0 0 0 0 0 0 0 2 2 1 1 2 0 0 1 1 0 0 1 0 1 0 0 0 0 0 1 1 1 1 0 1 0 0 1 2
 0 2 2 0 1 0 0 0 0 1 1 1 0 0 0 1 0 0 0 1 2 1 0 0 2 2 1 0 1 0 1 1 1 0 0 1 1
 1 2 0 2 0 2 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 2 2 1 0 1 0 1 2 2 1 0 2
 0 2 0 1 2 2 0 0 0 0 2 1 2 0 0 2 0 1 1 0 0 0 0 2 1 0 0 1 0 1 0 1 1 0 0 0 0
 0 0 0 0 2 1 0 1 0 0 1 0 1 2 0 0 1 1 1 2 0 1 2 0 1 2 1 1 2 2 2 2 0 0 0 2 0
 1 1 0 1 2 1 2 1 0 0 2 1 1 1 1 1 0 0 1 1 1 1 0 2 0 1 0 2 1 1 1 2 0 0 0 0 1
 1 0 0 0 1 1 0 1 1 1 0 2 0 0 1 1 1 0 2 1 0 2 0 1 1 1 1 0 1 0 0 0 2 0 1 1 1
 0 2 1 0 0 0 0 1 1 0 0 2 2 0 1 1 0 0 2 1 0 0 0 1 1 0 0 1 0 1 0 0 0 1 1 0 2
 1 1 1 2 2 1 1 1 2 1 2 2 0 1 0 1 2 1 0 0 0 1 0 2 2 1 1 0 1 1 0 1 1 2 0 1 2
 1 0 1 0 0 1 1 1 1 1 1 1 