## Data Preparation

### Train & Test Data split

In [1]:
import keras
import pandas as pd
from keras.datasets import reuters

# Load the Reuters newswire dataset, passing num_words=10000 to cap the vocabulary.
(train_data, train_labels), (test_data, test_labels) = reuters.load_data(num_words=10000)

  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


### Array to DataFrame

In [2]:
# Wrap the raw sample array and the raw label array in DataFrames.
train_data_df, train_labels_df = map(pd.DataFrame, (train_data, train_labels))

In [3]:
# Number of rows, i.e. training samples.
len(train_data_df)

8982

### Combine train data & labels into one DataFrame for easier inspection

In [4]:
# Work on an explicit copy. pd.DataFrame(existing_frame) does not copy by
# default, so renaming the columns and adding a column here could otherwise
# leak into train_data_df (pandas-version dependent) and make this cell fail
# when it is re-run.
reuters_df = train_data_df.copy()
reuters_df.columns = ['train_data']
# train_labels_df holds a single column; assign it as a Series so the values
# align on the shared row index.
reuters_df['train_labels'] = train_labels_df.iloc[:, 0]
reuters_df

Unnamed: 0,train_data,train_labels
0,"[1, 2, 2, 8, 43, 10, 447, 5, 25, 207, 270, 5, ...",3
1,"[1, 3267, 699, 3434, 2295, 56, 2, 7511, 9, 56,...",4
2,"[1, 53, 12, 284, 15, 14, 272, 26, 53, 959, 32,...",3
3,"[1, 4, 686, 867, 558, 4, 37, 38, 309, 2276, 46...",4
4,"[1, 8295, 111, 8, 25, 166, 40, 638, 10, 436, 2...",4
...,...,...
8977,"[1, 313, 262, 2529, 1426, 8, 130, 40, 129, 363...",19
8978,"[1, 4, 96, 5, 340, 3976, 23, 328, 6, 154, 7, 4...",19
8979,"[1, 141, 3890, 387, 81, 8, 16, 1629, 10, 340, ...",25
8980,"[1, 53, 46, 957, 26, 14, 74, 132, 26, 39, 46, ...",3


### Swap keys & values so the dictionary maps the integer indices in our training data back to words

In [5]:
# word_index is a dictionary mapping each word to its integer index
word_index = reuters.get_word_index()
# Invert it so that an integer index looks up its word
reverse_word_index = {idx: word for word, idx in word_index.items()}

# Decode one newswire.
# Indices 0, 1 and 2 are reserved for 'padding', 'start of sequence' and
# 'unknown', so every index is shifted down by 3 before the lookup
decoded_news = ' '.join(reverse_word_index.get(token - 3, '?') for token in train_data[11])

In [6]:
# Display the decoded text of training sample 11.
decoded_news

'? lt international thomson organisation ltd said it will report financial results in u s funds rather than sterling beginning from jan 1 1987 it said the change will not be applied retroactively to prior financial periods the company said as a result of recent investments most of its assets now are located in the united states reuter 3'

## Preprocessing

### Preprocessing for X (One-Hot encoding)

In [7]:
import numpy as np

def vectorize_Xsequences(sequences, dimension=10000):
    """Multi-hot encode integer sequences into a 2-D matrix.

    Args:
        sequences: Sized iterable with one sequence of integer indices per
            sample.
        dimension: Number of columns in the result (one per possible index).

    Returns:
        A NumPy array of shape ``(len(sequences), dimension)`` filled with
        zeros, except that row ``i`` holds ``1.`` at every index that appears
        in ``sequences[i]``. Repeated indices set the same cell once.
    """
    # Start from an all-zero matrix of shape (len(sequences), dimension)
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        # Fancy indexing sets every listed column of row i to 1 in one step
        results[i, sequence] = 1.
    return results

# Convert the training data to vectors
x_train = vectorize_Xsequences(train_data)
# Convert the test data to vectors
x_test = vectorize_Xsequences(test_data)





### Preprocessing for Y (One-Hot encoding)

In [8]:
def vectorize_Ysequences(sequences, dimension=46):
    """One-hot encode integer class labels into a 2-D matrix.

    Args:
        sequences: Sized iterable of integer class labels, one per sample.
        dimension: Number of columns in the result (one per class).

    Returns:
        A NumPy array of shape ``(len(sequences), dimension)`` filled with
        zeros, except that row ``i`` holds ``1.`` at column ``sequences[i]``.
    """
    # Start from an all-zero matrix of shape (len(sequences), dimension)
    results = np.zeros((len(sequences), dimension))
    for i, label in enumerate(sequences):
        # Mark the column that corresponds to this sample's class
        results[i, label] = 1.
    return results

# Convert the training labels to one-hot vectors
y_train = vectorize_Ysequences(train_labels)
# Convert the test labels to one-hot vectors
y_test = vectorize_Ysequences(test_labels)





## Modeling

### Model setting

In [17]:
from keras import models
from keras import layers
from keras import optimizers
from keras import losses
from keras import metrics
import tensorflow as tf

# One hidden Dense layer of 500 units with ELU activation over the
# 10000-wide input, followed by a 46-way softmax output layer.
model = models.Sequential([
    layers.Dense(500, activation=tf.keras.layers.ELU(), input_shape=(10000,)),
    layers.Dense(46, activation='softmax'),
])

### Compiling

In [18]:
# Categorical cross-entropy pairs with the one-hot encoded labels and the
# softmax output layer; accuracy is tracked as the reported metric.
model.compile(optimizer='Nadam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

### Train & Val split 

In [19]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the training set for validation; the fixed random_state
# keeps the split the same across runs.
partial_x_train, x_val, partial_y_train, y_val = train_test_split(x_train, y_train, test_size=0.3, random_state=111)

### Fitting while recording the training history

In [None]:
# Train on the partial training set and keep the returned object in
# `history`; the validation split is passed via validation_data.
history = model.fit(partial_x_train,
                    partial_y_train,
                    epochs=10,
                    batch_size=500,
                    validation_data=(x_val, y_val))

Epoch 1/10

In [None]:
# Keep a handle on the recorded metrics and list which series are available.
history_dict = history.history
history_dict.keys()

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Pull the metric series out of the training history.
acc, val_acc = history.history['accuracy'], history.history['val_accuracy']
loss, val_loss = history.history['loss'], history.history['val_loss']

# One x position per recorded epoch, numbered from 1.
epochs = range(1, 1 + len(acc))

plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
# 'bo' means blue dots
plt.plot(epochs, loss, 'bo', label='Training loss')
# 'b' means a solid blue line
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.legend()

plt.show()

In [None]:
plt.clf()   # reset the figure
acc, val_acc = history_dict['accuracy'], history_dict['val_accuracy']

plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
# Same markers as the loss plot: dots for training, solid line for validation
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.legend()

plt.show()

In [None]:
# Score the trained model on the vectorized test set.
model.evaluate(x_test, y_test)