In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [0]:
# Read the news corpus and cast every column to str so the later string
# concatenation of columns never meets a non-string value.
# NOTE(review): astype(str) also stringifies missing cells (NaN -> 'nan');
# confirm that is acceptable for the text columns.
data = (
    pd.read_csv('udn2yrsnews_monpa.csv', encoding='utf-8')
    .astype(str)
)
# Last expression of the cell: preview the first rows.
data.head()

# Tokenizing & Padding Sequence

In [0]:
import keras
from keras.preprocessing.sequence import pad_sequences

In [0]:
# Build the text fed to the tokenizer by joining each article's title and body
# with a single space. The frame is cast with .astype(str) when it is loaded,
# so plain Series concatenation yields the same values as a row-wise
# DataFrame.apply(axis=1) while avoiding a Python-level call per row.
data['merge'] = data['title'] + " " + data['document']

In [0]:
# Configure Tokenizer
MAX_NUM_WORDS = 100000      # vocabulary cap handed to the Tokenizer (num_words)
MAX_SEQUENCE_LENGTH = 240   # maxlen used when sequences are padded/truncated
SEED = 42                   # fixes the shuffle so folds and metrics are reproducible
tokenizer = keras.preprocessing.text.Tokenizer(num_words=MAX_NUM_WORDS)

# Shuffle the (category, merged text) rows once. A dedicated, seeded
# RandomState is used instead of the global NumPy RNG so that Restart & Run All
# always produces the same row order, and therefore the same KFold partition.
shuffled = np.random.RandomState(SEED).permutation(data[['category', 'merge']].values)
y = shuffled[:, 0]   # category label of each article
X = shuffled[:, 1]   # merged title + document text of each article

# Average len() of the merged texts. X holds Python strings, so this is a
# character count, not a token count.
lenX_train = [len(text) for text in X]
np.average(lenX_train)

In [0]:
from sklearn.model_selection import KFold,cross_validate

In [0]:
X_cross = []       # padded training sequences, one array per fold
Xtest_cross = []   # padded held-out sequences, one array per fold
kf = KFold(n_splits=10)

for train, test in kf.split(X):
    X_train = X[train]
    X_test = X[test]

    # Start from an unfitted tokenizer on every fold. fit_on_texts() adds to
    # the tokenizer's existing word counts, so reusing a single instance would
    # let texts that are held out in this fold (but were training texts in an
    # earlier fold) shape the vocabulary. After the loop `tokenizer` refers to
    # the tokenizer fitted on the last fold's training split.
    tokenizer = keras.preprocessing.text.Tokenizer(num_words=MAX_NUM_WORDS)
    tokenizer.fit_on_texts(X_train)

    # Encode both splits with the vocabulary learned from the training split
    # only, then pad/truncate to a fixed length.
    X_trainpad = pad_sequences(tokenizer.texts_to_sequences(X_train),
                               maxlen=MAX_SEQUENCE_LENGTH)
    X_testpad = pad_sequences(tokenizer.texts_to_sequences(X_test),
                              maxlen=MAX_SEQUENCE_LENGTH)

    X_cross.append(X_trainpad)
    Xtest_cross.append(X_testpad)

In [0]:
# Map every distinct category label to an integer id.
# - sorted(): iteration order of a set of strings can differ between kernel
#   restarts, which would silently change which id each category receives.
# - enumerate(): the number of ids follows the data instead of a hard-coded
#   count, so no label can be left out of the mapping.
y_to_num = {label: idx for idx, label in enumerate(sorted(set(y)))}
print(y_to_num)

# Integer-encoded labels, aligned element-for-element with y.
y_num = np.array([y_to_num[label] for label in y])

In [0]:
y_cross = []       # one-hot training labels, one array per fold
ytest_cross = []   # one-hot held-out labels, one array per fold
yeval_cross = []   # integer held-out labels, one array per fold (for sklearn metrics)

# Fix the one-hot width to the full label set. Without num_classes,
# to_categorical() sizes its output from the largest id present in the array
# it is given, so a split that happens to lack the highest id would come out
# one or more columns short.
num_classes = len(y_to_num)

# kf.split() is called on the same X as when the text folds were built, so the
# index sets line up with X_cross / Xtest_cross.
for train, test in kf.split(X):
    y_train = y_num[train]
    y_test = y_num[test]

    y_cross.append(keras.utils.to_categorical(y_train, num_classes=num_classes))
    ytest_cross.append(keras.utils.to_categorical(y_test, num_classes=num_classes))

    yeval_cross.append(y_test)

# Model

In [1]:
from keras import Input
from keras.layers import Embedding,MaxPooling1D, Conv1D,LSTM,Dense,BatchNormalization,Dropout,Lambda,TimeDistributed,Flatten
from keras.models import Model,Sequential
from keras.layers.merge import Concatenate
from keras import backend as K

from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


In [0]:
# --- Basic settings -------------------------------------------------------
# Number of target categories.
NUM_CLASSES = 17

# Vocabulary size: how many distinct words are kept from the corpus.
MAX_NUM_WORDS = 100000

# Maximum number of words per input sequence (the original note refers to the
# length of one title).
MAX_SEQUENCE_LENGTH = 240

# Dimensionality of each word vector.
NUM_EMBEDDING_DIM = 256

# Dimensionality of the LSTM output vector.
NUM_LSTM_UNITS = 128

# --- Convolution ----------------------------------------------------------
# (filters, kernel size) for the first and the second Conv1D layer.
filters1, kernel_size1 = 32, 5
filters2, kernel_size2 = 64, 3

pool_size = 2

# --- Training -------------------------------------------------------------
batch_size, epochs = 500, 3

In [0]:
def build_lstm_cnn():
    """Build and compile the Conv1D + LSTM text classifier.

    Layer order: Embedding -> (Conv1D -> MaxPooling1D) x 2 -> LSTM ->
    Dropout -> BatchNormalization -> Dense(300, relu) -> Dropout ->
    BatchNormalization -> Dense(NUM_CLASSES, softmax).

    Reads the hyper-parameter constants defined in the notebook
    (MAX_NUM_WORDS, NUM_EMBEDDING_DIM, MAX_SEQUENCE_LENGTH, filters1/2,
    kernel_size1/2, pool_size, NUM_LSTM_UNITS, NUM_CLASSES).

    Returns:
        A compiled keras Sequential model with freshly initialised weights.
    """
    net = Sequential()
    net.add(Embedding(MAX_NUM_WORDS, NUM_EMBEDDING_DIM,
                      input_length=MAX_SEQUENCE_LENGTH))

    # Two convolution + pooling stages applied to the embedded sequence.
    for n_filters, width in ((filters1, kernel_size1), (filters2, kernel_size2)):
        net.add(Conv1D(n_filters,
                       width,
                       padding='same',
                       activation='relu',
                       strides=1))
        net.add(MaxPooling1D(pool_size=pool_size))

    # The LSTM consumes the output of the second pooling layer. No input_shape
    # is passed: the argument only takes effect on the first layer of a
    # Sequential model, and the (MAX_SEQUENCE_LENGTH, NUM_EMBEDDING_DIM) value
    # previously given here did not describe what this layer actually receives.
    net.add(LSTM(NUM_LSTM_UNITS))

    net.add(Dropout(0.25))
    net.add(BatchNormalization())
    net.add(Dense(units=300, activation='relu'))

    net.add(Dropout(0.25))
    net.add(BatchNormalization())
    net.add(Dense(units=NUM_CLASSES, activation='softmax'))

    net.compile(loss='categorical_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])
    return net


model = build_lstm_cnn()

In [0]:
from sklearn import metrics
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,precision_score,recall_score,f1_score

In [0]:
expectedlstmcnn = []    # true integer labels, concatenated over all folds
predictedlstmcnn = []   # predicted integer labels, concatenated over all folds

# Iterate over however many folds were prepared rather than a fixed count.
for X_train, X_test, y_train, y_test in zip(X_cross, Xtest_cross,
                                            y_cross, yeval_cross):
    # Re-create the network with newly initialised weights for every fold.
    # Fitting one model object across folds would carry weights over, so each
    # fold's held-out articles would already have been trained on in earlier
    # folds and the scores would be inflated. clone_model() copies only the
    # architecture; the compile settings repeat those used where `model` is
    # first defined. After the loop `model` is the last fold's trained network.
    model = keras.models.clone_model(model)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    lstmcnn = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs)

    # argmax over the softmax outputs gives the predicted class id; unlike
    # Sequential.predict_classes() this also works on Keras releases where
    # that method no longer exists.
    ypred = np.argmax(model.predict(X_test), axis=1)

    expectedlstmcnn.extend(y_test)
    predictedlstmcnn.extend(ypred)

In [0]:
# Print precision, recall and F1 (in that order) over the pooled
# cross-validation predictions, once per averaging scheme.
report_rows = (
    ("Macro-average: {0:.2f},{1:.2f},{2:.2f}", 'macro'),
    ("Micro-average: {0:.2f},{1:.2f},{2:.2f}", 'micro'),
    ("Weighted-average: {0:.2f},{1:.2f},{2:.2f}", 'weighted'),
)
for template, averaging in report_rows:
    precision = metrics.precision_score(expectedlstmcnn, predictedlstmcnn, average=averaging)
    recall = metrics.recall_score(expectedlstmcnn, predictedlstmcnn, average=averaging)
    f1 = metrics.f1_score(expectedlstmcnn, predictedlstmcnn, average=averaging)
    print(template.format(precision, recall, f1))