# Deep Learning 모델을 활용해 보자

앞서 만든 Word2Vec을 Deep Learning 모델로 학습시키면 어떨까요? 이 튜토리얼은 간단한 DNN을 활용하여 Sentiment Analysis를 해봅니다.

In [1]:
from prepro import data_prepro, review_to_wordlist, getAvgFeatureVecs
import pandas as pd
from IPython.display import display
from gensim.models import Word2Vec
import numpy as np

In [2]:
train = pd.read_csv( "../src/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3 )

## Data Preprocessing

데이터 전처리 작업을 합니다. 앞선 튜토리얼에서 활용한 Sentence Embedding 기법을 그대로 활용하여 Sentence Embedding Vector를 Deep learning Input으로 활용하려 합니다.

학습이 잘 되었는지 확인하기 위해 Train Dataset과 Test Dataset를 구분합니다.

In [3]:
dev_portion = 0.2
train_count = int(train.shape[0] * (1 - dev_portion))

train_input_data = data_prepro(train['review'][:train_count])
train_labels = np.array(train['sentiment'][:train_count], dtype=np.float32)
train_labels = np.expand_dims(train_labels, axis=-1)

dev_input_data = data_prepro(train['review'][train_count:])
dev_labels = np.array(train['sentiment'][train_count:], dtype=np.float32)
dev_labels = np.expand_dims(dev_labels, axis=-1)

In [4]:
def make_multi_class(labels):
    output = np.zeros([labels.shape[0], 2])
    for i, (label, o) in enumerate(zip(labels, output)):
        if label == 0:
            output[i,0] = 1
        else:
            output[i,1] = 1
    
    return output

In [5]:
train_labels = make_multi_class(train_labels)
dev_labels = make_multi_class(dev_labels)

In [6]:
model = Word2Vec.load("300features_40minwords_10context")

In [7]:
num_features = 300
trainDataVecs = getAvgFeatureVecs(train_input_data, model, num_features)
devDataVecs = getAvgFeatureVecs(dev_input_data, model, num_features)

Review 0 of 20000
Review 1000 of 20000
Review 2000 of 20000
Review 3000 of 20000
Review 4000 of 20000
Review 5000 of 20000
Review 6000 of 20000
Review 7000 of 20000
Review 8000 of 20000
Review 9000 of 20000
Review 10000 of 20000
Review 11000 of 20000
Review 12000 of 20000
Review 13000 of 20000
Review 14000 of 20000
Review 15000 of 20000
Review 16000 of 20000
Review 17000 of 20000
Review 18000 of 20000
Review 19000 of 20000
Review 0 of 5000
Review 1000 of 5000
Review 2000 of 5000
Review 3000 of 5000
Review 4000 of 5000


## DNN 생성

tensorflow 모델을 활용하여 간단한 2-layer DNN을 구현합니다.

In [8]:
import tensorflow as tf

In [29]:
## Model hyperparameter
learning_rate = 0.01
training_epochs = 50
batch_size = 100
total_batch = int(trainDataVecs.shape[0]/batch_size)

input_dim = 300
hidden_1_dim = 100
hidden_2_dim = 100
num_classes = 2

In [10]:
X = tf.placeholder(tf.float32, [None, input_dim])
Y = tf.placeholder(tf.float32, [None, num_classes])

W1 = tf.Variable(tf.random_normal([input_dim, hidden_1_dim], stddev=0.01))
b1 = tf.Variable(tf.constant(0.0))

W2 = tf.Variable(tf.random_normal([hidden_1_dim, hidden_2_dim], stddev=0.01))
b2 = tf.Variable(tf.constant(0.0))

W3 = tf.Variable(tf.random_normal([hidden_2_dim, num_classes], stddev=0.01))
b3 = tf.Variable(tf.constant(0.0))

In [11]:
layer_1 = tf.add(tf.matmul(X, W1), b1)
layer_1 = tf.nn.relu(layer_1)

layer_2 = tf.add(tf.matmul(layer_1, W2), b2)
layer_2 = tf.nn.relu(layer_2)

predict = tf.add(tf.matmul(layer_2, W3), b3)

In [12]:
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=predict, labels=Y))
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

In [23]:
correct_prediction = tf.equal(tf.argmax(predict, 1), tf.argmax(Y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

## 실험

In [31]:
init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    
    for epoch in range(training_epochs):
        print("epoch ", epoch)
        avg_cost = .0
        for step in range(total_batch):
            train_input = trainDataVecs[step*batch_size:step*batch_size+batch_size]
            train_label = train_labels[step*batch_size:step*batch_size+batch_size]
            _, c = sess.run([optimizer, cost], feed_dict={X: train_input, Y: train_label})
            avg_cost += c / total_batch
            if step % 100 == 0:
                acc = accuracy.eval({X: train_input, Y: train_label})
                print("Step:", '%04d' % (step+1), "cost=", \
                    "{:.9f}".format(avg_cost), ", Training Accuracy= " + \
                  "{:.5f}".format(acc))
    
    acc = accuracy.eval({X: devDataVecs, Y: dev_labels})
    print("Dev Accuracy= " + "{:.5f}".format(acc))

epoch  0
Step: 0001 cost= 0.003465736 , Training Accuracy= 0.55000
Step: 0101 cost= 0.233506184 , Training Accuracy= 0.89000
epoch  1
Step: 0001 cost= 0.001582294 , Training Accuracy= 0.85000
Step: 0101 cost= 0.162070979 , Training Accuracy= 0.89000
epoch  2
Step: 0001 cost= 0.001471018 , Training Accuracy= 0.88000
Step: 0101 cost= 0.156766032 , Training Accuracy= 0.89000
epoch  3
Step: 0001 cost= 0.001419145 , Training Accuracy= 0.88000
Step: 0101 cost= 0.153867975 , Training Accuracy= 0.89000
epoch  4
Step: 0001 cost= 0.001427625 , Training Accuracy= 0.88000
Step: 0101 cost= 0.151783273 , Training Accuracy= 0.90000
epoch  5
Step: 0001 cost= 0.001402225 , Training Accuracy= 0.87000
Step: 0101 cost= 0.149577450 , Training Accuracy= 0.90000
epoch  6
Step: 0001 cost= 0.001372442 , Training Accuracy= 0.88000
Step: 0101 cost= 0.147355777 , Training Accuracy= 0.88000
epoch  7
Step: 0001 cost= 0.001343118 , Training Accuracy= 0.87000
Step: 0101 cost= 0.144998531 , Training Accuracy= 0.88000
