In [1]:
import pandas as pd
import numpy as np
import re

## Load saved sequences data

In [2]:
# Read the training CSV and pull its first three columns out positionally.
# Judging by the preview cell further down, column 0 holds product titles and
# column 2 the "root" titles; the meaning of column 1 (y) is not shown here.
df_seq = pd.read_csv('../data/cat661-root-train.csv')
X, y, X_roots = (df_seq.iloc[:, col].values for col in range(3))

## Processing data

In [3]:
# pre-processing
def pre_process(X):
    """Normalise an iterable of raw title strings.

    Each title is lower-cased and split on whitespace; parentheses and commas
    are stripped from every word; words that contain nothing but
    non-alphanumeric characters (including underscores) are dropped; the
    surviving words are re-joined with single spaces.

    Parameters
    ----------
    X : iterable of str
        Raw titles.

    Returns
    -------
    list of str
        One cleaned title per input element, in the same order.
    """
    # Compile once per call instead of once per word. Raw strings are used so
    # that `\W` reaches the regex engine without relying on Python passing
    # through an invalid string escape.
    bracket_comma_re = re.compile(r'[(),]+')
    non_word_re = re.compile(r'[\W_]+')

    X_p = []

    for name in X:
        words = name.lower().split()
        words = [bracket_comma_re.sub('', w) for w in words]
        # remove all words that only contain special characters
        # (a word is kept when something is left after stripping them)
        words = [w for w in words if non_word_re.sub('', w)]
        #name = ViTokenizer.tokenize(name)
        X_p.append(' '.join(words))

    return X_p

In [4]:
# Clean both text columns with the same routine. Note that this rebinds X and
# X_roots from the raw column arrays to lists of cleaned strings.
X, X_roots = pre_process(X), pre_process(X_roots)

In [5]:
# preview data: first five (cleaned title, cleaned root) pairs, one per line
for pair in zip(X[:5], X_roots[:5]):
    print(*pair, sep=' - ')

camera ahd hồng ngoại vantech vp-141ahdl - camera box vantech vp-141ahd hồng ngoại
camera ip hồng ngoại xoay zoom eview ptb04n13 - camera ip hồng ngoại eview ptb04n13
camera ahd hồng ngoại escort esc-603ahd 1.3 - camera ahd escort esc-603ahd
camera wv-cp300 - camera box panasonic wv-cp300 hồng ngoại
camera hikvision ds-2ce16f1t-it5 hd-tvi 3m - camera hikvision ds-2ce16f1t-it5


## Sequence model

### Embedding

In [6]:
# Number of title/root pairs that the tokenisation and array-building cells iterate over.
num_samples = len(X)

In [14]:
# Tokenised sequences, one list of words per sample.
input_texts, target_texts = [], []

# Vocabularies seen on the encoder (input) and decoder (target) side.
input_tokens, target_tokens = set(), set()

# Word -> occurrence count.
w2d = dict()

In [15]:
# word-level tokens
# Every container is rebuilt from scratch so this cell can be re-run safely:
# appending to lists created in another cell would duplicate samples, and the
# vocabulary names are rebound from set to sorted list below, which would make
# a second `.add` call fail.
input_texts = []
target_texts = []
w2d = {}  # word -> number of occurrences over inputs and targets combined

input_vocab = set()
target_vocab = set()

for i in range(num_samples):
    # cast into tokens
    source_words = X[i].split()
    # Raw strings keep the literal backslash without relying on the invalid
    # escape sequences '\S' and '\E'; the resulting values are unchanged.
    target_words = [r'\START_'] + X_roots[i].split() + [r'\END_']

    input_texts.append(source_words)
    target_texts.append(target_words)

    input_vocab.update(source_words)
    target_vocab.update(target_words)

    # Input words are counted before target words, so dictionary insertion
    # order stays "inputs first, then targets" for each sample.
    for word in source_words + target_words:
        w2d[word] = w2d.get(word, 0) + 1

# Sorted so that the token -> index mapping built later is deterministic.
input_tokens = sorted(input_vocab)
target_tokens = sorted(target_vocab)

num_encoder_tokens = len(input_tokens)
num_decoder_tokens = len(target_tokens)

max_encoder_seq_length = max(len(seq) for seq in input_texts)
max_decoder_seq_length = max(len(seq) for seq in target_texts)

print('Number of samples:', len(input_texts))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

Number of samples: 34735
Number of unique input tokens: 11646
Number of unique output tokens: 7881
Max sequence length for inputs: 49
Max sequence length for outputs: 15


In [34]:
# (word, count) pairs ordered by ascending count; `sorted` is stable, so words
# with equal counts keep the order in which they were first inserted into w2d.
w2d_sorted = sorted(w2d.items(), key=lambda item: item[1])

In [45]:
# Words must occur strictly more than this many times to be kept.
threshold = 20

print('Root tokens length:', len(w2d_sorted))

# apply threshold: keep only the (word, count) pairs above the cut-off
w2d_thres = [(word, count) for word, count in w2d_sorted if count > threshold]

print('Threshold tokens length:', len(w2d_thres))
# Show the 100 least frequent surviving words (the list is sorted ascending).
w2d_thres[:100]

Root tokens length: 12676
Threshold tokens length: 1339


[('sắt', 21),
 ('hds-2120iraw', 21),
 ('line', 21),
 ('server', 21),
 ('diện', 21),
 ('vt-3300l', 21),
 ('darkfighter', 21),
 ('vt-3325a', 21),
 ('vp-3601', 21),
 ('qtx-2611', 21),
 ('av-808', 21),
 ('lens', 21),
 ('vp-160b', 21),
 ('ipc-hdw1020sp', 21),
 ('thực', 21),
 ('vt-3326a', 21),
 ('dẫn', 21),
 ('đường', 21),
 ('vietmap', 21),
 ('vt-3324a', 21),
 ('dh-ipc-hfw4231sp', 21),
 ('kpc133-zdp', 21),
 ('vdt-333zip', 21),
 ('ipc-hdbw4231f-as', 21),
 ('qtx-1410', 21),
 ('vt-3224k', 21),
 ('balun', 21),
 ('12.0', 21),
 ('vp-4561', 21),
 ('5x', 21),
 ('sno-e6031rp', 21),
 ('vt3118a', 21),
 ('ca-fw181gp', 21),
 ('eco-9213aip', 21),
 ('dáng', 21),
 ('đáo', 21),
 ('120', 21),
 ('ds-2cd2522fwd-iws', 21),
 ('rắc', 21),
 ('vp-266hdi', 21),
 ('d1', 21),
 ('dh-ipc-eb5500p', 21),
 ('jt-hd3310', 21),
 ('vp-301ahdm', 21),
 ('nguồnpoe', 21),
 ('avn362zvp', 21),
 ('1megapixel', 21),
 ('vt-3226h', 21),
 ('4108', 21),
 ('qtx-4100b', 21),
 ('hds-2020irpw', 21),
 ('hik-16d6t-', 21),
 ('bton', 21),
 ('vp-57

In [46]:
# Map every token to its position in the corresponding sorted vocabulary list.
input_token_index = {token: position for position, token in enumerate(input_tokens)}
target_token_index = {token: position for position, token in enumerate(target_tokens)}

In [49]:
# building embedding for input and target data
# The encoder/decoder inputs hold integer token ids (they are filled from the
# token -> index dictionaries), so an integer dtype is used instead of NumPy's
# default float64.
encoder_input_data = np.zeros((num_samples, max_encoder_seq_length), dtype='int32')
decoder_input_data = np.zeros((num_samples, max_decoder_seq_length), dtype='int32')

# One-hot decoder targets. With the default float64 this tensor needs
# num_samples * max_decoder_seq_length * num_decoder_tokens * 8 bytes
# (about 33 GB for 34735 x 15 x 7881), which cannot be allocated on typical
# machines. The array only ever stores 0 or 1, so one byte per cell is enough
# and cuts the footprint by 8x (about 4.1 GB for the sizes above).
# NOTE(review): switching the loss to sparse_categorical_crossentropy with
# integer targets would avoid the one-hot tensor entirely, but that also
# requires changing the fill loop and model.compile.
decoder_target_data = np.zeros(
    (num_samples, max_decoder_seq_length, num_decoder_tokens),
    dtype='uint8',
)

MemoryError: 

In [119]:
# Encode each sample as token ids. The one-hot target at step t corresponds to
# the decoder input at step t + 1, so the first decoder token is never a target.
for row, (source_words, target_words) in enumerate(zip(input_texts, target_texts)):
    for col, token in enumerate(source_words):
        encoder_input_data[row, col] = input_token_index[token]

    for col, token in enumerate(target_words):
        token_id = target_token_index[token]
        decoder_input_data[row, col] = token_id

        if col == 0:
            # The first token only feeds the decoder; it has no target slot.
            continue
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_target_data[row, col - 1, token_id] = 1.
            

### Model

In [50]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding

In [51]:
# Training hyper-parameters.
batch_size = 128  # samples per gradient update
epochs = 1        # passes over the training data
latent_dim = 256  # used below as both the embedding width and the LSTM state size

In [53]:
# Encoder: variable-length sequence of token ids -> embedding -> LSTM.
encoder_inputs = Input(shape=(None,))
encoder_eb = Embedding(input_dim=num_encoder_tokens, output_dim=latent_dim)(encoder_inputs)
encoder = LSTM(units=latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_eb)
# Only the final hidden and cell states are handed to the decoder;
# `encoder_outputs` itself is not used.
encoder_states = [state_h, state_c]

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [54]:
# Decoder: starts from the encoder's final states and emits one probability
# distribution over the target vocabulary per timestep.
decoder_inputs = Input(shape=(None,))
decoder_eb = Embedding(input_dim=num_decoder_tokens, output_dim=latent_dim)(decoder_inputs)
# return_sequences gives an output at every timestep; return_state also exposes
# the LSTM states, which are ignored for training but are needed at inference.
decoder_lstm = LSTM(units=latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_eb, initial_state=encoder_states)
decoder_dense = Dense(units=num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [55]:
# Training model: (encoder token ids, decoder token ids) -> per-step softmax
# over the target vocabulary.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
model.summary()

Instructions for updating:
keep_dims is deprecated, use keepdims instead
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 256)    2981376     input_2[0][0]                    
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, None, 256)    2017536     input_3[0][0]                    
____________________________________

In [126]:
# Run training; 20% of the samples are held out for validation.
model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
)
# Save model
model.save('s2s_word.h5')

Train on 24967 samples, validate on 6242 samples
Epoch 1/1


  str(node.arguments) + '. They will not be included '
