In [2]:
import sys
import os
import numpy as np
import json

from sklearn.model_selection import train_test_split
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence

In [3]:
DATA_IN_PATH = './data_in/'
DATA_OUT_PATH = './data_out/'
TRAIN_INPUT_DATA = 'train_input.npy'
TRAIN_LABEL_DATA = 'train_label.npy'
TEST_INPUT_DATA = 'test_input.npy'
TEST_ID_DATA = 'test_id.npy'

DATA_CONFIGS = 'data_configs.json'

train_input_data = np.load(open(DATA_IN_PATH + TRAIN_INPUT_DATA, 'rb'))
train_label_data = np.load(open(DATA_IN_PATH + TRAIN_LABEL_DATA, 'rb'))
test_input_data = np.load(open(DATA_IN_PATH + TEST_INPUT_DATA, 'rb'))

with open(DATA_IN_PATH + DATA_CONFIGS, 'r') as f:
    prepro_configs = json.load(f)
    print(prepro_configs.keys())

dict_keys(['vocab', 'vocab_size'])


In [8]:
# 파라미터 변수
rng_seed = 1234
batch_size = 16
num_epochs = 3
vocab_size = prepro_configs['vocab_size'] + 1
emb_size = 128
valid_split = 0.2
train_input, eval_input, train_label, eval_label = train_test_split(train_input_data, train_label_data, test_size=valid_split, random_state=rng_seed)


# tf.data 세팅

In [20]:
def mapping_fn(X, Y=None):
    input, label = {'x': X}, Y
    return input, label

def train_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((train_input, train_label))
    dataset = dataset.shuffle(buffer_size=len(train_input))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(mapping_fn)
    dataset = dataset.repeat(count=num_epochs)
    
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

def eval_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((eval_input, eval_label))
    dataset = dataset.shuffle(buffer_size=len(eval_input))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(mapping_fn)
    
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

# 모델 세팅

In [23]:
# 모델에 대한 메인 부분입니다.


def model_fn(features, labels, mode):

    TRAIN = mode == tf.estimator.ModeKeys.TRAIN
    EVAL = mode == tf.estimator.ModeKeys.EVAL
    PREDICT = mode == tf.estimator.ModeKeys.PREDICT
    
    #embedding layer를 선언합니다.
    embedding_layer = keras.layers.Embedding(
                    vocab_size,
                    emb_size)(features['x'])
    
    # embedding layer에 대한 output에 대해 dropout을 취합니다.
    dropout_emb = keras.layers.Dropout(rate=0.5)(embedding_layer)

    ## filters = 128이고 kernel_size = 3,4,5입니다.
    ## 길이가 3,4,5인 128개의 다른 필터를 생성합니다. 3,4,5 gram의 효과처럼 다양한 각도에서 문장을 보는 효과가 있습니다.
    ## conv1d는 (배치사이즈, 길이, 채널)로 입력값을 받는데, 배치사이즈: 문장 숫자 | 길이: 각 문장의 단어의 개수 | 채널: 임베딩 출력 차원수임
    
    conv1 = keras.layers.Conv1D(
         filters=128,
         kernel_size=3,
        padding='valid',
         activation=tf.nn.relu)(dropout_emb)
    
    pool1 = keras.layers.GlobalMaxPool1D()(conv1)

    conv2 = keras.layers.Conv1D(
         filters=128,
         kernel_size=4,
        padding='valid',
         activation=tf.nn.relu)(dropout_emb)
    
    pool2 = keras.layers.GlobalMaxPool1D()(conv2)
    
    conv3 = keras.layers.Conv1D(
         filters=128,
         kernel_size=5,
        padding='valid',
         activation=tf.nn.relu)(dropout_emb)
    pool3 = keras.layers.GlobalMaxPool1D()(conv3)
    
    concat = keras.layers.concatenate([pool1, pool2, pool3]) #3,4,5gram이후 모아주기
    
    hidden = keras.layers.Dense(250, activation=tf.nn.relu)(concat)
    dropout_hidden = keras.layers.Dropout(rate=0.5)(hidden)
    logits = keras.layers.Dense(1, name='logits')(dropout_hidden)
    logits = tf.squeeze(logits, axis=-1)
    
    #최종적으로 학습, 평가, 테스트의 단계로 나누어 활용
    
    if PREDICT:
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions={
                'prob': tf.nn.sigmoid(logits)
            }
        )
        
    loss = tf.losses.sigmoid_cross_entropy(labels, logits)

    if EVAL:
        pred = tf.nn.sigmoid(logits)
        accuracy = tf.metrics.accuracy(labels, tf.round(pred))
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops={'acc': accuracy})
        
    if TRAIN:
        global_step = tf.train.get_global_step()
        train_op = tf.train.AdamOptimizer(0.001).minimize(loss, global_step)

        return tf.estimator.EstimatorSpec(mode=mode, train_op=train_op, loss = loss)

In [24]:
model_dir = os.path.join(os.getcwd(), "data_out/checkpoint/cnn/")
os.makedirs(model_dir, exist_ok=True)

config_tf = tf.estimator.RunConfig(save_checkpoints_steps=200, keep_checkpoint_max=2,
                                    log_step_count_steps=400)

 #에스티메이터 객체 생성
cnn_est = tf.estimator.Estimator(model_fn, model_dir=model_dir, config=config_tf)
cnn_est.train(train_input_fn) #학습하기
cnn_est.evaluate(eval_input_fn) #평가하기

INFO:tensorflow:Using config: {'_model_dir': '/Users/seungmoo/SM/Workspace/NLP/텐서플로와 머신러닝으로 시작하는 자연어 처리/study/data_out/checkpoint/cnn/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 200, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 2, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 400, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x13ead6ac8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSave

{'acc': 0.866, 'loss': 0.56281734, 'global_step': 3750}

In [27]:
test_input_data = np.load(open(DATA_IN_PATH + TEST_INPUT_DATA, 'rb')) 
ids = np.load(open(DATA_IN_PATH + TEST_ID_DATA, 'rb'))



In [29]:
predict_input_fn = tf.estimator.inputs.numpy_input_fn(x={"x":test_input_data}, shuffle=False)



In [None]:
predictions = np.array([p['prob'] for p in cnn_est.predict(input_fn=predict_input_fn)])

In [35]:
cnn_est.predict(input_fn=predict_input_fn)

<generator object Estimator.predict at 0x14a2a66d0>

In [43]:
predict_input_fn()

{'x': <tf.Tensor 'fifo_queue_DequeueUpTo_6:1' shape=(?, 174) dtype=int32>}