In [39]:
# to be imported
from __future__ import print_function
from tensorflow.keras.preprocessing.text import text_to_word_sequence
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer


from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D

In [41]:
import numpy as np

# Names of the two CSV columns used by this notebook.
TEXT_COL = 'Review_bersih'      # cleaned review text
LABEL_COL = 'classified_Aspek'  # aspect label assigned to the review

# Read only the two columns we need. `usecols` does not define the column
# order of the resulting frame (pandas keeps the order found in the file),
# so the columns are selected by name below instead of by position.
d = pd.read_csv("../tokenizedsenakhir.csv",
                usecols=[TEXT_COL, LABEL_COL],
                dtype={TEXT_COL: object, LABEL_COL: object})

# Keep only rows that have both a review text and an aspect label.
d = d.dropna(subset=[TEXT_COL, LABEL_COL]).reset_index(drop=True)

x = d[TEXT_COL].values   # model input: review texts
y = d[LABEL_COL].values  # model target: aspect labels

# Class distribution of the aspect labels (label names, sample counts)
print(np.unique(y, return_counts=True))

(array(['fasilitas', 'lokasi', 'serbaneka', 'suasana'], dtype=object), array([ 5698, 11027, 22220,  6792], dtype=int64))


In [42]:
 # encode the text with word sequences - Preprocessing step 1
# Build the vocabulary from the review texts. Punctuation listed in `filters`
# is stripped, text is lower-cased and split on single spaces; `num_words`
# caps the vocabulary that `texts_to_sequences` will emit.
tk = Tokenizer(
    num_words=8000,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=" ",
)
tk.fit_on_texts(x)

# Turn every review into a list of word ids, then bring all of them to a
# common length of 1500 ids.
x = sequence.pad_sequences(tk.texts_to_sequences(x), maxlen=1500)

print(x)

[[   0    0    0 ...    4  152 5859]
 [   0    0    0 ... 1305  595    9]
 [   0    0    0 ...  238  344  283]
 ...
 [   0    0    0 ...    4  670 3960]
 [   0    0    0 ... 2688    7  896]
 [   0    0    0 ...  174   93  324]]


In [43]:
# Sanity check on the padded id matrix: (number of reviews, padded length)
x.shape

(45737, 1500)

In [44]:
# Encode the string aspect labels as integer class ids.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# The fitted encoder is kept around so ids can be mapped back to label names.
labelencoder_Y = LabelEncoder().fit(y)
y = labelencoder_Y.transform(y)
print(y)

# Class ids together with the number of samples in each class
print(np.unique(y, return_counts=True))

[1 0 0 ... 2 1 2]
(array([0, 1, 2, 3]), array([ 5698, 11027, 22220,  6792], dtype=int64))


In [45]:
# One-hot encode the integer class ids so they can be used with a softmax
# output and categorical cross-entropy.
from tensorflow.keras import utils as np_utils

# Take the number of classes from the fitted label encoder rather than
# hard-coding it, so the cell stays correct if the label set changes.
num_classes = len(labelencoder_Y.classes_)
y = np_utils.to_categorical(y, num_classes=num_classes)

print(y)

[[0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 ...
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]]


In [46]:
# Shuffle reviews and labels with one shared permutation so each review keeps
# its label. The fixed seed makes the permutation repeatable.
np.random.seed(200)
indices = np.arange(x.shape[0])
np.random.shuffle(indices)
x, y = x[indices], y[indices]

In [47]:
# Reserve the lowest ids for special tokens (0 = padding, 1 = start, 2 = OOV)
# by shifting every real word id up by `index_from`.
index_from = 3
start_char = 1

# `x` is already zero-padded at this point, so padding positions must stay 0:
# shifting them as well would turn every padding slot into id 3.
# NOTE(review): start_char is prepended in front of the left padding, and the
# later pad_sequences(..., maxlen=53) call truncates from the front by default,
# so the start marker probably never reaches the model - confirm if it matters.
if start_char is not None:
    x = [[start_char] + [w + index_from if w else 0 for w in x1] for x1 in x]
elif index_from:
    x = [[w + index_from if w else 0 for w in x1] for x1 in x]

In [None]:
# Optional vocabulary cap. None means "keep every word id".
num_words = None
if not num_words:
    # The filter below keeps ids strictly below `num_words`, so the bound has
    # to be one past the largest id; using the largest id itself would
    # silently replace that word with the OOV marker.
    num_words = max(max(x1) for x1 in x) + 1

# By convention 2 marks an out-of-vocabulary word; ids 0 (padding),
# 1 (start) and 2 (OOV) are the reserved ones.
oov_char = 2
skip_top = 0

if oov_char is not None:
    # Replace ids outside [skip_top, num_words) with the OOV marker
    x = [[w if skip_top <= w < num_words else oov_char for w in x1] for x1 in x]
else:
    # No OOV marker configured: drop ids outside [skip_top, num_words)
    x = [[w for w in x1 if skip_top <= w < num_words] for x1 in x]

# Split into train and test. The data was shuffled in an earlier cell, so a
# plain cut at 80% is used.
test_split = 0.2
idx = int(len(x) * (1 - test_split))
x_train, y_train = np.array(x[:idx]), np.array(y[:idx])
x_test, y_test = np.array(x[idx:]), np.array(y[idx:])

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)
print(y)

In [34]:
# Bring both splits to a fixed length of 53 ids per review.
x_train, x_test = (
    sequence.pad_sequences(part, maxlen=53) for part in (x_train, x_test)
)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

x_train shape: (33004, 53)
x_test shape: (8251, 53)


In [35]:
# Check the test labels: one one-hot row per test sample
print('y_test shape:', y_test.shape)

y_test shape: (8251, 4)


In [36]:
# Display the one-hot encoded test labels
y_test

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       ...,
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.]], dtype=float32)

In [37]:
# --- Hyper-parameters -------------------------------------------------------
# The Embedding layer needs input_dim > largest token id it will ever see.
# A fixed 2000 is too small here: upstream the Tokenizer emits ids up to
# num_words=8000 and they are shifted by index_from=3, so the size is derived
# from the data instead.
max_features = int(max(x_train.max(), x_test.max())) + 1
maxlen = x_train.shape[1]        # padded sequence length fed to the model
num_classes = y_train.shape[1]   # width of the one-hot label rows
embedding_dims = 50
filters = 64
kernel_size = 20
hidden_dims = 250


# CNN with max pooling implementation
print('Build model...')
model = Sequential()
# We start off with an embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))
model.add(Dropout(0.2))

# We add a Conv1D, which learns `filters` word-group filters,
# each spanning `kernel_size` consecutive tokens:
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='sigmoid',
                 strides=1))
# Keep only the strongest response of every filter over the sequence:
model.add(GlobalMaxPooling1D())

# We add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('sigmoid'))

# Output layer: one unit per class, turned into class probabilities by softmax
model.add(Dense(num_classes))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

# NOTE(review): the test split doubles as validation data here, so the
# per-epoch validation numbers are not an independent estimate.
model.fit(x_train, y_train,
          batch_size=32,
          epochs=50,
          validation_data=(x_test, y_test))


Build model...
Train on 33004 samples, validate on 8251 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x1ba1cb13b00>

In [None]:
# Accuracy of the trained model on the training split, as a percentage
_, acc = model.evaluate(x_train, y_train, verbose=1)
print("Accuracy: {:.2f}".format(acc * 100), "%")

In [None]:
# Accuracy of the trained model on the held-out test split, as a percentage
_, acc = model.evaluate(x_test, y_test, verbose=1)
print("Accuracy: {:.2f}".format(acc * 100), "%")