In [3]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras import layers

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import os
import json

from tqdm import tqdm

In [4]:
# 시각화 함수
def plot_graphs(history, string):
    plt.plot(history.history[string])
    plt.plot(history.history['val_'+string], '')
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, 'val_'+string])
    plt.show()

In [5]:
# 경로 정의
DATA_IN_PATH = 'D:/PBL 스터디/PBL 실습/pbl_data_in/'
DATA_OUT_PATH = 'D:/PBL 스터디/PBL 실습/pbl_data_out/'
INPUT_TRAIN_DATA = 'real_train_input_38.npy'
LABEL_TRAIN_DATA = 'real_train_label_38.npy'
DATA_CONFIGS = 'real_data_configs_38.json'

# 랜덤 시드 고정
SEED_NUM = 1234
tf.random.set_seed(SEED_NUM)

train_input = np.load(open(DATA_IN_PATH + INPUT_TRAIN_DATA, 'rb'))
train_input = pad_sequences(train_input, maxlen=train_input.shape[1])
train_label = np.load(open(DATA_IN_PATH + LABEL_TRAIN_DATA, 'rb'),allow_pickle = True)
#prepro_configs = json.load(open(DATA_IN_PATH + DATA_CONFIGS, 'r'))
prepro_configs = json.load(open(DATA_IN_PATH + DATA_CONFIGS, 'rt', encoding='UTF8'))

In [6]:
# 모델 하이퍼파라미터 정의 
model_name = 'cnn_classifier_kr'
BATCH_SIZE = 512
NUM_EPOCHS = 10 #10
VALID_SPLIT = 0.1 #0.1
MAX_LEN = train_input.shape[1]

kargs = {'model_name': model_name,
        'vocab_size': prepro_configs['vocab_size'],
        'embedding_size': 32,
        'num_filters': 100, #100
        'dropout_rate': 0.5,
        'hidden_dimension': 250,
        'output_dimension':1}

In [7]:
prepro_configs['vocab_size']

23091

In [8]:
# 모델 선언 및 컴파일 
class CNNClassifier(tf.keras.Model):
    
    def __init__(self, **kargs):
        super(CNNClassifier, self).__init__(name=kargs['model_name'])
        self.embedding = layers.Embedding(input_dim=kargs['vocab_size'],
                                     output_dim=kargs['embedding_size'])
        self.conv_list = [layers.Conv1D(filters=kargs['num_filters'],
                                   kernel_size=kernel_size,
                                   padding='valid',
                                   activation=tf.keras.activations.relu,
                                   kernel_constraint=tf.keras.constraints.MaxNorm(max_value=3.))
                     for kernel_size in [3,4,5]]
        self.pooling = layers.GlobalMaxPooling1D()
        self.dropout = layers.Dropout(kargs['dropout_rate'])
        self.fc1 = layers.Dense(units=kargs['hidden_dimension'],
                           activation=tf.keras.activations.relu,
                           kernel_constraint=tf.keras.constraints.MaxNorm(max_value=3.))
        self.pooling = layers.GlobalMaxPooling1D()
        self.dropout = layers.Dropout(kargs['dropout_rate'])
        self.fc2 = layers.Dense(units=kargs['hidden_dimension'],
                           activation=tf.keras.activations.relu,
                           kernel_constraint=tf.keras.constraints.MaxNorm(max_value=3.))
        self.pooling = layers.GlobalMaxPooling1D()
        self.dropout = layers.Dropout(kargs['dropout_rate'])
        self.fc3 = layers.Dense(units=kargs['output_dimension'],
                           activation=tf.keras.activations.sigmoid,
                           kernel_constraint=tf.keras.constraints.MaxNorm(max_value=3.))
    
    def call(self, x):
        x = self.embedding(x)
        x = self.dropout(x)
        x = tf.concat([self.pooling(conv(x)) for conv in self.conv_list], axis=-1)
        x = self.fc1(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.dropout(x)
        x = self.fc3(x)
        
        return x

In [9]:
model = CNNClassifier(**kargs)

model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy')])

In [10]:
# overfitting을 막기 위한 ealrystop 추가
earlystop_callback = EarlyStopping(monitor='val_accuracy', min_delta=0.0001,patience=2)
checkpoint_path = DATA_OUT_PATH + model_name + '/weights.h5'
checkpoint_dir = os.path.dirname(checkpoint_path)

if os.path.exists(checkpoint_dir):
    print("{} -- Folder already exists \n".format(checkpoint_dir))
else:
    os.makedirs(checkpoint_dir, exist_ok=True)
    print("{} -- Folder create complete \n".format(checkpoint_dir))
    

cp_callback = ModelCheckpoint(
    checkpoint_path, monitor='val_accuracy', verbose=1, save_best_only=True, save_weights_only=True)

D:/PBL 스터디/PBL 실습/pbl_data_out/cnn_classifier_kr -- Folder already exists 



In [11]:
history = model.fit(train_input, train_label, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS,
                    validation_split=VALID_SPLIT, callbacks=[earlystop_callback, cp_callback])

Epoch 1/10

Epoch 00001: val_accuracy improved from -inf to 0.09400, saving model to D:/PBL 스터디/PBL 실습/pbl_data_out/cnn_classifier_kr\weights.h5
Epoch 2/10

Epoch 00002: val_accuracy improved from 0.09400 to 0.71250, saving model to D:/PBL 스터디/PBL 실습/pbl_data_out/cnn_classifier_kr\weights.h5
Epoch 3/10

Epoch 00003: val_accuracy did not improve from 0.71250
Epoch 4/10

Epoch 00004: val_accuracy improved from 0.71250 to 0.75750, saving model to D:/PBL 스터디/PBL 실습/pbl_data_out/cnn_classifier_kr\weights.h5
Epoch 5/10

Epoch 00005: val_accuracy improved from 0.75750 to 0.76850, saving model to D:/PBL 스터디/PBL 실습/pbl_data_out/cnn_classifier_kr\weights.h5
Epoch 6/10

Epoch 00006: val_accuracy improved from 0.76850 to 0.78850, saving model to D:/PBL 스터디/PBL 실습/pbl_data_out/cnn_classifier_kr\weights.h5
Epoch 7/10

Epoch 00007: val_accuracy did not improve from 0.78850
Epoch 8/10

Epoch 00008: val_accuracy did not improve from 0.78850


In [12]:
# 결과 플롯
plot_graphs(history, 'loss')

NameError: name 'plt' is not defined

In [None]:
plot_graphs(history, 'accuracy')

In [13]:
# 결과 평가하기 
DATA_IN_PATH = 'D:/PBL 스터디/PBL 실습/pbl_data_in/'
DATA_OUT_PATH = 'D:/PBL 스터디/PBL 실습/pbl_data_out/'
INPUT_TEST_DATA = 'real_test_input_38.npy'
LABEL_TEST_DATA = 'real_test_label_38.npy'
#SAVE_FILE_NM = 'custom_model.h5'
SAVE_WEIGHTS_FILE_NM = 'weights.h5' #저장된 best model 이름

test_input = np.load(open(DATA_IN_PATH + INPUT_TEST_DATA, 'rb'))
test_input = pad_sequences(test_input, maxlen=test_input.shape[1])
test_label_data = np.load(open(DATA_IN_PATH + LABEL_TEST_DATA, 'rb'))

In [14]:
#model.load_weights(os.path.join(DATA_OUT_PATH, model_name, SAVE_FILE_NM))
model.load_weights(os.path.join(DATA_OUT_PATH, model_name, SAVE_WEIGHTS_FILE_NM))

In [15]:
model.evaluate(test_input, test_label_data)



[1.2383135557174683, 0.6004999876022339]

In [16]:
ar_pred = model.predict(test_input)
ar_pred = np.ravel(ar_pred, order='C')

In [17]:
test_data = pd.read_csv(DATA_IN_PATH+'test_data.csv', header = 0)
pred_df = pd.DataFrame({'text':test_data['text'], 'label':test_data['label'], 'predict':ar_pred})
pred_df[478:500]

Unnamed: 0,text,label,predict
478,"""왜 그렇게사니? 불쌍하다..""",0,0.143483
479,"""어디서 이런게 또 굴러나오냐...""",0,0.926773
480,"""이애는 머야?""",0,0.096607
481,"""강성범이 서민교수 보다도 더 못생긴게 까불고 있네 홍어인거 티내느라고 그러냐 ...",0,0.925629
482,"""주디가 또 편가르네""",0,0.647666
483,"""윤석열이되야 나라가 정상적으로 돌아옵니다! 정신제발 차리세요 강성범씨 쯔쯔쯔""",0,0.759575
484,"""너나잘하셔요 이제는별게다 참""",0,0.202103
485,"""대체 어떻게 미치면 사람이 이렇게 되는 걸까""",0,0.479957
486,"""이 놈 개그나 하지 왠 갑자기 헛소리냐? 이죄명같은 범법자가 대통령되면 ...",0,0.431311
487,"""죄좀써결한몸\n누워 x뱉기\n안떵헤리ㄸㅁㅇ""",0,0.743631


In [18]:
INPUT2_TEST_DATA = 'clova_test_input_38.npy'

test_input2 = np.load(open(DATA_IN_PATH + INPUT2_TEST_DATA, 'rb'))
test_input2 = pad_sequences(test_input2, maxlen=test_input.shape[1])

In [19]:
results = model.predict(test_input2)
return_data = pd.read_csv(DATA_IN_PATH + 'clova.csv', header=0)
return_data['results'] = results
return_data

Unnamed: 0,sentence,start,end,results
0,이번 신제품 출시에 대한 시장의 반응은 어때. 미친 개 같아.,980,8590,0.937683
1,시장의 반응이 차가워. 아 그런 젓 같은 일이,9880,15740,0.961118
2,우린 이제 야근의 연속이겠다. 판매량이 지난번 제품보다는 좋다고는 하는데,17430,26290,0.043407
3,모르겠다. 뭐가 문제인 걸까. 우리가 엿 같이 한 탓이지.,26480,35740,0.645844
4,야 야 진정해. 다시 분석해 보자.,37180,40240,0.097138


In [20]:
return_data.to_csv('clova_result.csv', index=False)

In [21]:
model.save(DATA_OUT_PATH+model_name+'my_model.tf', save_format="tf")

INFO:tensorflow:Assets written to: D:/PBL 스터디/PBL 실습/pbl_data_out/cnn_classifier_krmy_model.tf\assets


In [22]:
model.summary()

Model: "cnn_classifier_kr"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  738912    
_________________________________________________________________
conv1d (Conv1D)              multiple                  9700      
_________________________________________________________________
conv1d_1 (Conv1D)            multiple                  12900     
_________________________________________________________________
conv1d_2 (Conv1D)            multiple                  16100     
_________________________________________________________________
dense (Dense)                multiple                  75250     
_________________________________________________________________
dense_1 (Dense)              multiple                  62750     
_________________________________________________________________
global_max_pooling1d_2 (Glob multiple            