In [0]:
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
  AutoNLP datasets.
"""

import os
import numpy as np

class AutoNLPDataset(object):
    """Reader for an AutoNLP dataset directory.

    The directory is expected to contain four files, all addressed relative to
    `dataset_dir`: `meta.json`, `train.data`, `train.solution` and `test.data`.
    """

    def __init__(self, dataset_dir):
        """Remember the dataset location and load its metadata.

        Only `meta.json` is read here. The attributes `train_dataset`,
        `train_label` and `test_dataset` do not exist until `read_dataset`
        has been called.

        :param dataset_dir: path of the directory holding the dataset files.
        """
        self.dataset_name_ = dataset_dir
        self.dataset_dir_ = dataset_dir
        self.metadata_ = self.read_metadata(os.path.join(dataset_dir, "meta.json"))

    def read_dataset(self):
        """Load the training text, training labels and test text into memory."""
        self.train_dataset = self._read_dataset(os.path.join(self.dataset_dir_, "train.data"))
        self.train_label = self.read_label(os.path.join(self.dataset_dir_, "train.solution"))
        self.test_dataset = self._read_dataset(os.path.join(self.dataset_dir_, "test.data"))

    def get_train(self):
        """Return `(train_dataset, train_label)`; requires a prior `read_dataset` call."""
        return self.train_dataset, self.train_label

    def get_test(self):
        """Return the test lines; requires a prior `read_dataset` call."""
        return self.test_dataset

    def get_metadata(self):
        """Return the parsed content of `meta.json`."""
        return self.metadata_

    def read_metadata(self, metadata_path):
        """Parse the JSON file at `metadata_path` and return the resulting object."""
        import json
        # Use a context manager so the handle is closed deterministically instead of
        # being left to the garbage collector.
        with open(metadata_path) as fin:
            return json.load(fin)

    def _read_dataset(self, dataset_path):
        """Return the lines of `dataset_path` as a list of str (line endings kept)."""
        with open(dataset_path) as fin:
            return fin.readlines()

    def read_label(self, label_path):
        """Load the label file with `np.loadtxt` and return the resulting array."""
        return np.loadtxt(label_path)

    def get_class_num(self):
        """ return the number of class """
        return self.metadata_["class_num"]

    def get_train_num(self):
        """ return the number of train instance """
        return self.metadata_["train_num"]

    def get_test_num(self):
        """ return the number of test instance """
        return self.metadata_["test_num"]

    def get_language(self):
        """ ZH or EN """
        return self.metadata_["language"]

In [39]:
# Mount Google Drive under /content/drive so the dataset files stored there are
# reachable through the local filesystem. `google.colab` is only importable inside Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Despite the name, this is used as a directory: it is passed to AutoNLPDataset as
# `dataset_dir`, which joins meta.json / train.data / train.solution / test.data onto it.
file_path = '/content/drive/My Drive/autonlp_starting_kit-master/offline_data/O1/O1.data'

In [0]:
# The constructor only parses meta.json; read_dataset() then loads the training text,
# the training labels and the test text into memory.
autoDaset = AutoNLPDataset(file_path)
autoDaset.read_dataset()

In [43]:
# -*- coding: utf-8 -*-

import pandas as pd
import os
import re
import argparse
import time
import jieba
import pickle
import tensorflow as tf
import numpy as np
import sys, getopt
from subprocess import check_output
from tensorflow.python.keras import models
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras.layers import Dropout
from tensorflow.python.keras.layers import Embedding
from tensorflow.python.keras.layers import SeparableConv1D
from tensorflow.python.keras.layers import SpatialDropout1D
from tensorflow.python.keras.layers import LSTM
from tensorflow.python.keras.layers import MaxPooling1D
from tensorflow.python.keras.layers import MaxPooling2D
from tensorflow.python.keras.layers import Flatten
from tensorflow.python.keras.layers import GlobalAveragePooling1D
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from tensorflow.python.keras.preprocessing import text
from tensorflow.python.keras.preprocessing import sequence

from keras.backend.tensorflow_backend import set_session
# Create a TF1-style session with custom GPU and logging options and register it via
# set_session.
# NOTE(review): set_session is imported from the standalone `keras` package, while the
# layers and models used in this notebook come from `tensorflow.python.keras` — confirm
# that this session is really the one those models run in.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True  # dynamically grow the memory used on the GPU
config.log_device_placement = True  # to log device placement (on which device the operation ran)
                                    # (nothing gets printed in Jupyter, only if you run it standalone)
sess = tf.Session(config=config)
set_session(sess)  # set this TensorFlow session as the default session for Keras


# Upper bound applied by sequentialize_data to the padded sequence length.
MAX_SEQ_LENGTH = 500
MAX_VOCAB_SIZE = 20000 # Limit on the number of features. We use the top 20K features

# Code adapted from https://towardsdatascience.com/multi-class-text-classification-with-lstm-1590bee1bd17
def clean_en_text(dat):
    """Normalize English sentences before tokenization.

    For every line: punctuation in the set ``" / ( ) { } [ ] | @ , ;`` becomes a
    space, every remaining character outside ``0-9 a-z A-Z space # + _`` is
    removed (this includes newlines), and surrounding whitespace is stripped.
    Letter case is preserved.

    Args:
        dat: iterable of str.

    Returns:
        list of str, one cleaned line per input line, in the same order.
    """
    # Raw string literals: the original non-raw literals contained invalid escape
    # sequences (`\[`, `\]`, `\|`), which Python reports as a deprecation/syntax
    # warning. The compiled patterns are unchanged.
    replace_by_space_re = re.compile(r'["/(){}\[\]\|@,;]')
    bad_symbols_re = re.compile(r'[^0-9a-zA-Z #+_]')

    return [
        bad_symbols_re.sub('', replace_by_space_re.sub(' ', line)).strip()
        for line in dat
    ]

def clean_zh_text(dat):
    """Normalize Chinese sentences before word segmentation.

    For every line, each full-width or ASCII punctuation mark listed in the
    pattern below is replaced by a single space and surrounding whitespace is
    stripped. All other characters are kept as they are.

    Args:
        dat: iterable of str.

    Returns:
        list of str, one cleaned line per input line, in the same order.
    """
    # Raw string literal: the original non-raw literal contained invalid escape
    # sequences (`\[`, `\]`, `\|`, `\.`). The compiled pattern is unchanged.
    replace_by_space_re = re.compile(r'[“”【】/（）：！～「」、|，；。"/(){}\[\]\|@,\.;]')

    return [replace_by_space_re.sub(' ', line).strip() for line in dat]


def sequentialize_data(train_contents, val_contents=None):
    """Turn texts into padded integer sequences with a Keras tokenizer.

    The tokenizer is fitted on the training texts only, limited to
    MAX_VOCAB_SIZE words. Sequences are padded to the longest training
    sequence, capped at MAX_SEQ_LENGTH.

    Args:
        train_contents: training texts.
        val_contents: optional validation texts; a falsy value (None or an
            empty list) means "no validation data".

    Returns:
        Without validation data:
            (x_train, word_index, num_features, tokenizer, max_length)
        With validation data:
            (x_train, x_val, word_index, num_features, tokenizer, max_length)
    """
    tokenizer = text.Tokenizer(num_words=MAX_VOCAB_SIZE)
    tokenizer.fit_on_texts(train_contents)
    train_seqs = tokenizer.texts_to_sequences(train_contents)

    has_val = bool(val_contents)
    val_seqs = tokenizer.texts_to_sequences(val_contents) if has_val else None

    # Longest training sequence, but never more than the global cap.
    max_length = min(max(len(seq) for seq in train_seqs), MAX_SEQ_LENGTH)

    x_train = sequence.pad_sequences(train_seqs, maxlen=max_length)
    x_val = sequence.pad_sequences(val_seqs, maxlen=max_length) if has_val else None

    word_index = tokenizer.word_index
    # +1 because index 0 is reserved for padding.
    num_features = min(len(word_index) + 1, MAX_VOCAB_SIZE)

    if not has_val:
        return x_train, word_index, num_features, tokenizer, max_length
    return x_train, x_val, word_index, num_features, tokenizer, max_length


def _get_last_layer_units_and_activation(num_classes):
    """Gets the # units and activation function for the last network layer.

    Args:
        num_classes: Number of classes.

    Returns:
        units, activation values.
    """
    if num_classes == 2:
        activation = 'sigmoid'
        units = 1
    else:
        activation = 'softmax'
        units = num_classes
    return units, activation


def sep_cnn_model(input_shape,
                  num_classes,
                  num_features,
                  blocks=1,
                  filters=64,
                  kernel_size=4,
                  dropout_rate=0.25):
    """Build the (uncompiled) sequence classifier.

    Despite the name, the network assembled here is
    Embedding -> SpatialDropout1D -> LSTM -> Dense; no separable convolution
    layers are added.

    Args:
        input_shape: length of the padded input sequences (passed to the
            embedding layer as `input_length`).
        num_classes: number of target classes; decides the output layer.
        num_features: vocabulary size (embedding `input_dim`).
        blocks, filters, kernel_size, dropout_rate: accepted for signature
            compatibility only; none of them is read by this function.

    Returns:
        A Keras Sequential model.
    """
    out_units, out_activation = _get_last_layer_units_and_activation(num_classes)

    network = models.Sequential()
    layer_stack = [
        Embedding(input_dim=num_features, output_dim=200, input_length=input_shape),
        SpatialDropout1D(0.2),
        LSTM(100, dropout=0.2, recurrent_dropout=0.2),
        Dense(out_units, activation=out_activation),
    ]
    for layer in layer_stack:
        network.add(layer)
    return network


def _tokenize_chinese_words(text):
    return ' '.join(jieba.cut(text, cut_all=False))

def vectorize_data(x_train, x_val=None):
    """Compute unigram TF-IDF features.

    The vectorizer is fitted on the training texts plus, when given, the
    validation texts, and is then used to transform each set separately.

    Args:
        x_train: list of training texts.
        x_val: optional list of validation texts; a falsy value means none.

    Returns:
        (train_vectorized, vectorizer) without validation data, otherwise
        (train_vectorized, val_vectorized, vectorizer).
    """
    vectorizer = TfidfVectorizer(ngram_range=(1, 1))
    corpus = x_train + x_val if x_val else x_train
    vectorizer.fit(corpus)

    train_vectorized = vectorizer.transform(x_train)
    if not x_val:
        return train_vectorized, vectorizer
    return train_vectorized, vectorizer.transform(x_val), vectorizer


# Convert one-hot encoded labels to category indices.
def ohe2cat(label):
    """Return, for every row of `label`, the column index holding the row maximum."""
    class_axis = 1
    categories = np.argmax(label, axis=class_axis)
    return categories


class Model(object):
    """Text classifier with `train`/`test` entry points that persists its artifacts.

    The network itself is built by `sep_cnn_model`. `train` writes the fitted
    Keras model (`model.h5`), the tokenizer (`tokenizer.pickle`) and the padding
    length (`model.config`) into `train_output_path`; `test` reads the same three
    files back from `test_input_path`.
    """

    def __init__(self, metadata, train_output_path="./", test_input_path="./"):
        """ Initialization for model
        :param metadata: a dict formed like:
            {"class_num": 10,
             "language": "ZH",
             "train_num": 10000,
             "test_num": 1000}
            Only "class_num" and "language" are read by this class; "ZH" selects
            the Chinese preprocessing, any other value the English one.
        :param train_output_path: directory the training artifacts are written to.
        :param test_input_path: directory the artifacts are loaded from in `test`.
        """
        self.done_training = False
        self.metadata = metadata
        self.train_output_path = train_output_path
        self.test_input_path = test_input_path

    @staticmethod
    def _artifact_path(directory, filename):
        """Join an artifact file name onto its directory, with or without a trailing separator."""
        return os.path.join(directory, filename)

    def _preprocess(self, texts):
        """Clean raw sentences; Chinese text is additionally segmented into words."""
        if self.metadata['language'] == 'ZH':
            return list(map(_tokenize_chinese_words, clean_zh_text(texts)))
        return clean_en_text(texts)

    def train(self, train_dataset, remaining_time_budget=None):
        """model training on train_dataset.

        Does nothing when training has already completed on this instance.

        :param train_dataset: tuple, (x_train, y_train)
            x_train: list of str, input training sentences.
            y_train: A `numpy.ndarray` matrix of shape (sample_count, class_num).
                     here `sample_count` is the number of examples in this dataset as train
                     set and `class_num` is the same as the class_num in metadata. The
                     values should be binary.
        :param remaining_time_budget: accepted for interface compatibility; not used.
        """
        if self.done_training:
            return
        x_train, y_train = train_dataset

        x_train = self._preprocess(x_train)
        x_train, word_index, num_features, tokenizer, max_length = sequentialize_data(x_train)
        num_classes = self.metadata['class_num']

        # initialize model
        model = sep_cnn_model(input_shape=x_train.shape[1],
                              num_classes=num_classes,
                              num_features=num_features,
                              blocks=2,
                              filters=64,
                              kernel_size=4,
                              dropout_rate=0.25)
        # Labels are fed as integer class indices (see ohe2cat below), hence the
        # sparse loss in the multi-class case.
        if num_classes == 2:
            loss = 'binary_crossentropy'
        else:
            loss = 'sparse_categorical_crossentropy'
        optimizer = tf.keras.optimizers.Adam(lr=0.0005)
        model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])
        callbacks = [tf.keras.callbacks.EarlyStopping(
            monitor='val_loss', patience=10)]
        model.fit(
            x_train,
            ohe2cat(y_train),
            epochs=100,
            callbacks=callbacks,
            validation_split=0.1,
            verbose=2,  # Logs once per epoch.
            batch_size=32,
            shuffle=True)
        print(str(type(x_train)) + " " + str(y_train.shape))

        # Persist everything `test` needs. Paths are joined rather than concatenated so
        # that a directory given without a trailing slash still receives the files.
        model.save(self._artifact_path(self.train_output_path, 'model.h5'))
        with open(self._artifact_path(self.train_output_path, 'tokenizer.pickle'), 'wb') as handle:
            pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
        with open(self._artifact_path(self.train_output_path, 'model.config'), 'wb') as f:
            f.write(str(max_length).encode())

        self.done_training = True

    def test(self, x_test, remaining_time_budget=None):
        """
        :param x_test: list of str, input test sentences.
        :param remaining_time_budget: accepted for interface compatibility; not used.
        :return: A `numpy.ndarray` matrix of shape (sample_count, class_num).
                 here `sample_count` is the number of sentences in `x_test` and
                 `class_num` is the same as the class_num in metadata. Each row is
                 one-hot: 1 in the predicted class column, 0 elsewhere.
        """
        model = models.load_model(self._artifact_path(self.test_input_path, 'model.h5'))
        # SECURITY: unpickling executes arbitrary code from the file; only load a
        # tokenizer that was written by `train`, never one from an untrusted source.
        with open(self._artifact_path(self.test_input_path, 'tokenizer.pickle'), 'rb') as handle:
            tokenizer = pickle.load(handle, encoding='iso-8859-1')
        with open(self._artifact_path(self.test_input_path, 'model.config'), 'r') as f:
            max_length = int(f.read().strip())

        class_num = self.metadata['class_num']

        x_test = self._preprocess(x_test)
        x_test = tokenizer.texts_to_sequences(x_test)
        x_test = sequence.pad_sequences(x_test, maxlen=max_length)
        result = model.predict_classes(x_test)

        # Predicted class indices -> one-hot rows. The output is sized from the actual
        # number of predictions, not from metadata['test_num'], so the method also works
        # on subsets such as a held-out validation split.
        predicted = np.asarray(result).reshape(-1).astype(int)
        y_test = np.zeros([len(predicted), class_num])
        y_test[np.arange(len(predicted)), predicted] = 1
        return y_test




Device mapping:
/job:localhost/replica:0/task:0/device:XLA_CPU:0 -> device: XLA_CPU device



In [0]:
# The same location is used both as the training output path and as the test-time
# input path, so test() loads the artifacts that train() wrote.
model_path = '/content/sample_data'
model = Model(autoDaset.get_metadata(),model_path,model_path)

In [0]:
# Training sentences and their label array, as loaded by read_dataset().
x_data, y_data = autoDaset.get_train()

In [0]:
# Positional hold-out split: the first 6000 examples are used for training and
# everything after them for validation. The cut-off is hard-coded and no shuffling
# is done here.
X_train, Y_train = x_data[:6000], y_data[:6000]
X_val, Y_val = x_data[6000:],y_data[6000:]

train_data = (X_train, Y_train)
val_data = (X_val, Y_val)

In [47]:
# Fit on the training split. train() additionally holds out 10% of what it is given
# (validation_split=0.1) for early stopping.
model.train(train_data)

Train on 5400 samples, validate on 600 samples
Epoch 1/100
5400/5400 - 25s - loss: 0.6658 - acc: 0.5857 - val_loss: 0.5993 - val_acc: 0.6867
Epoch 2/100
5400/5400 - 24s - loss: 0.4229 - acc: 0.8189 - val_loss: 0.4134 - val_acc: 0.8217
Epoch 3/100
5400/5400 - 24s - loss: 0.2111 - acc: 0.9187 - val_loss: 0.4099 - val_acc: 0.8217
Epoch 4/100
5400/5400 - 24s - loss: 0.1094 - acc: 0.9622 - val_loss: 0.5791 - val_acc: 0.8117
Epoch 5/100
5400/5400 - 24s - loss: 0.0615 - acc: 0.9806 - val_loss: 0.6654 - val_acc: 0.8117
Epoch 6/100
5400/5400 - 24s - loss: 0.0418 - acc: 0.9883 - val_loss: 0.5933 - val_acc: 0.8050
Epoch 7/100
5400/5400 - 24s - loss: 0.0284 - acc: 0.9924 - val_loss: 0.6252 - val_acc: 0.8167
Epoch 8/100
5400/5400 - 24s - loss: 0.0218 - acc: 0.9941 - val_loss: 0.7355 - val_acc: 0.8150
Epoch 9/100
5400/5400 - 24s - loss: 0.0141 - acc: 0.9961 - val_loss: 0.7643 - val_acc: 0.8100
Epoch 10/100
5400/5400 - 24s - loss: 0.0095 - acc: 0.9974 - val_loss: 0.8177 - val_acc: 0.8150
Epoch 11/100

In [0]:
# Run inference on the held-out sentences using the artifacts saved by train().
res = model.test(X_val)

In [27]:
# Display the held-out labels (one row per example, one column per class).
Y_val

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [1., 0.],
       [0., 1.],
       [0., 1.]])

In [0]:
# Collapse the prediction matrix to one class index per row.
y_predict = ohe2cat(res)

In [0]:
# Collapse the held-out label matrix to one class index per row.
y_true = ohe2cat(Y_val)

In [38]:
# Element-wise hit/miss mask. Predictions are truncated to the number of held-out
# labels (rather than a hard-coded 1792) so the cell keeps working when the split
# size changes.
predict_good = np.equal(y_predict[:len(y_true)], y_true)

# Fraction of correct predictions (accuracy); left as the last expression so the
# notebook displays it.
predict_good.mean()

0.7633928571428571