# Setup

In [1]:
# Directory that holds the topic's input CSV and the cache files
# (dictionary.txt, dictionary.vec, dictionary.pkl, data.pkl) used by later cells.
path = "data/join"
# Topic name; load_data() reads "<topic>-sentences.csv" from `path`.
topic = "立法方式保障"

In [2]:
from __future__ import division, print_function
import pandas as pd, numpy as np, matplotlib.pyplot as plt
import jieba
from keras.models import Sequential
from keras.layers import Embedding, Dense, Flatten, Dropout, BatchNormalization, Conv1D, ZeroPadding1D, MaxPooling1D
import os, math, re, pickle

# Have jieba build its prefix dictionary from data/dict.txt.big.
jieba.set_dictionary(os.path.join("data", "dict.txt.big"))

Using TensorFlow backend.


In [3]:
# Module-level cache so the CSV is parsed at most once per kernel session.
_data = None

def load_data():
    """Return the topic's sentences DataFrame, reading the CSV only on first use.

    The file read is ``<path>/<topic>-sentences.csv``. The parsed frame is stored
    in the module-level ``_data`` and the same object is returned on later calls.
    """
    global _data
    if _data is not None:
        return _data
    csv_file = os.path.join(path, topic + "-sentences.csv")
    _data = pd.read_csv(csv_file)
    return _data

Build dictionary for word embedding.

In [4]:
# Pickle cache holding [phrase -> row index mapping, embedding DataFrame];
# written by create_dictionary() and read by load_dictionary().
dictionary_path = os.path.join(path, "dictionary.pkl")

# Matches runs of ASCII space characters (other whitespace is left alone).
clean_phrase_re = re.compile(r"[ ]+")

def cleaned_phrase(phrases):
    """Lazily yield each phrase with its ASCII spaces removed.

    Phrases that become empty after the spaces are stripped are skipped.
    ``phrases`` may be any iterable of strings; it is consumed one item at a time.
    """
    stripped = (clean_phrase_re.sub("", raw) for raw in phrases)
    for token in stripped:
        if token:
            yield token

def create_dictionary(*data):
    """Build the phrase dictionary and its word vectors, then cache both on disk.

    Args:
        *data: one or more iterables of sentences. Each sentence is converted
            with ``str()`` and segmented with jieba.

    Side effects (all relative to ``path``):
        - writes ``dictionary.txt`` with one distinct cleaned phrase per line;
        - runs shell commands that link the fastText model into ``models/`` and
          call ``fasttext print-word-vectors`` to produce ``dictionary.vec``;
        - pickles ``[phrase -> row index, vector DataFrame]`` to ``dictionary_path``.

    Returns:
        None. The cached result is read back with load_dictionary().
    """
    # A dict with dummy values is used as a set of distinct phrases.
    phrases = {}
    for d in data:
        for sentence in d:
            for ph in cleaned_phrase(jieba.cut(str(sentence))):
                phrases[ph] = True
    with open(os.path.join(path, "dictionary.txt"), "w") as fh:
        fh.writelines([ ph + "\n" for ph in phrases.keys() ])
    # IPython shell escapes; `$path` is interpolated from the Python variable.
    # NOTE(review): assumes ../fasttext/wiki.zh.bin (relative to `path`) and a
    # ~/bin/fasttext executable exist on this machine - confirm before re-running.
    !cd $path; mkdir -p models; ln ../fasttext/wiki.zh.bin models/wiki.zh.bin
    !cd $path; ~/bin/fasttext print-word-vectors models/wiki.zh.bin < dictionary.txt > dictionary.vec
    # The .vec file is whitespace-delimited with no header; its first column
    # becomes the index of the frame.
    dictionary = pd.read_csv(os.path.join(path, "dictionary.vec"), 
                             delim_whitespace=True, engine="python", header=None, index_col=0)
    with open(dictionary_path, "wb") as fh:
        # Store the phrase -> row-position lookup alongside the vector table.
        pickle.dump([{ ph: i for i, ph in enumerate(dictionary.index) }, dictionary], fh)

def load_dictionary():
    """Read the dictionary cache at ``dictionary_path``.

    Returns:
        A ``(dict_index, dictionary)`` tuple taken from the two-element list
        stored in the pickle.

    Note: ``pickle.load`` executes arbitrary code from the file, so this should
    only ever be pointed at a cache file produced locally.
    """
    with open(dictionary_path, "rb") as fh:
        payload = pickle.load(fh)
    index_by_phrase, vectors = payload
    return index_by_phrase, vectors
    
# Build the on-disk dictionary cache only when it does not exist yet.
if not os.path.exists(dictionary_path):
    data = load_data()
    create_dictionary(data["sentence"])

dict_index, dictionary = load_dictionary()
# One row per phrase, one column per embedding dimension.
phrases_n, latent_n = dictionary.shape

Examine phrase length.

In [5]:
#data = load_data()
#stat = np.frompyfunc(lambda s: len(jieba.lcut(str(s))), 1, 1)(data.sentence.values)
#(stat.min(), stat.max(), stat.mean(), stat.std())

Encode labels and embed phrases.

In [6]:
# phrase-length (min, max, mean, std) = (1, 1096, 17.385437090122373, 20.549680891647231)
# Number of phrase slots kept per sentence: get_text() stops after this many
# phrases and leaves unused slots at zero.
input_length = 20

# Pickle cache of the encoded (train_x, train_y) and (valid_x, valid_y) arrays.
data_path = os.path.join(path, "data.pkl")

# Column of the one-hot label vector for each value of the "orid" column.
# NOTE(review): presumably the ORID categories (Objective, Reflective,
# Interpretive, Decisional) - confirm against the labelling scheme.
orid_index = { "O": 0, "R": 1, "I": 2, "D": 3 }

# Fixed seed so that rebuilding the cache reproduces the same train/validation split.
split_seed = 0
# Approximate share of labelled rows held out for validation.
valid_fraction = 0.1


def get_label(df):
    """One-hot encode the "orid" column of ``df``.

    Returns:
        A float array of shape ``(len(df), len(orid_index))`` with a single 1
        per row, placed in the column given by ``orid_index``.

    Raises:
        KeyError: if a label is not a key of ``orid_index``.
    """
    labels = np.zeros((len(df), len(orid_index)))
    for i, l in enumerate(df["orid"]):
        labels[i, orid_index[l]] = 1
    return labels


def get_text(df):
    """Encode the "sentence" column of ``df`` as rows of dictionary indices.

    Each sentence is segmented with jieba and cleaned with cleaned_phrase();
    only the first ``input_length`` phrases are kept.

    Returns:
        A float array of shape ``(len(df), input_length)``.

    Caveat: slots for padding and for phrases missing from ``dict_index`` stay
    at 0, which is also the index of the first dictionary entry.
    """
    texts = np.zeros((len(df), input_length))
    for i, text in enumerate(df["sentence"].values):
        for j, ph in enumerate(cleaned_phrase(jieba.lcut(str(text)))):
            if j >= input_length:
                break
            if ph in dict_index:
                texts[i, j] = dict_index[ph]
    return texts


# The helpers above are defined unconditionally so they remain available
# (e.g. for encoding new sentences) even when the cached arrays are loaded.
if not os.path.exists(data_path):
    data = load_data()
    # Keep only rows that carry a label.
    data = data[data["orid"].notnull()]
    # A dedicated, seeded generator keeps the split reproducible without
    # touching NumPy's global random state.
    rng = np.random.RandomState(split_seed)
    mask = rng.random_sample(len(data)) > valid_fraction
    train, valid = data[mask], data[~mask]
    train_x, train_y = get_text(train), get_label(train)
    valid_x, valid_y = get_text(valid), get_label(valid)

    with open(data_path, "wb") as fh:
        pickle.dump([(train_x, train_y), (valid_x, valid_y)], fh)
else:
    # NOTE: pickle.load runs arbitrary code; only load cache files created locally.
    with open(data_path, "rb") as fh:
        [(train_x, train_y), (valid_x, valid_y)] = pickle.load(fh)

Building prefix dict from /Users/pm5/src/trustableai/ggv-example/nbs/data/dict.txt.big ...
Loading model from cache /var/folders/sy/q12w5xyn4lngqxh_j63vwr4h0000gn/T/jieba.u6fcb61346ce78c4353b892e9bfaaea38.cache
Loading model cost 1.651 seconds.
Prefix dict has been built succesfully.


# Simple CNN

In [10]:
def simple_cnn_model():
    """Assemble the (uncompiled) single-convolution sentence classifier.

    The stack is: a frozen embedding initialised from ``dictionary``, batch
    normalisation and dropout, one width-3 convolution with 64 filters, batch
    normalisation and dropout, max pooling, a 100-unit dense layer with heavy
    dropout, and a 4-way softmax output.
    """
    stack = [
        # Embedding weights come from the dictionary table and start frozen.
        Embedding(phrases_n, latent_n, input_length=input_length, weights=[dictionary], trainable=False),
        BatchNormalization(),
        Dropout(0.2),
        Conv1D(64, 3, border_mode="same", activation="relu"),
        BatchNormalization(),
        Dropout(0.2),
        MaxPooling1D(),
        Flatten(),
        Dense(100, activation="relu"),
        Dropout(0.7),
        Dense(4, activation="softmax"),
    ]
    net = Sequential()
    for layer in stack:
        net.add(layer)
    return net
    
# Instantiate the network and configure it for 4-class training with Adam.
simple_cnn = simple_cnn_model()
simple_cnn.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
simple_cnn.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_1 (Embedding)          (None, 20, 300)       9825000     embedding_input_1[0][0]          
____________________________________________________________________________________________________
batchnormalization_1 (BatchNorma (None, 20, 300)       1200        embedding_1[0][0]                
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 20, 300)       0           batchnormalization_1[0][0]       
____________________________________________________________________________________________________
convolution1d_1 (Convolution1D)  (None, 20, 64)        57664       dropout_1[0][0]                  
___________________________________________________________________________________________

In [12]:
def train_simple_cnn(lr=None, epoch=1, full=False):
    """Train ``simple_cnn`` on the cached training arrays.

    Args:
        lr: learning rate to apply before fitting; ``None`` keeps the current one.
        epoch: number of epochs to run.
        full: when True, also unfreeze the embedding layer (``layers[0]``).

    Validation metrics are computed on ``(valid_x, valid_y)`` after every epoch.
    """
    # Local import keeps this cell runnable without editing the import cell.
    from keras import backend as K

    if full and not simple_cnn.layers[0].trainable:
        simple_cnn.layers[0].trainable = True
        # A change to `trainable` only takes effect after compiling again.
        # Reuse the existing optimizer instance and the same loss/metrics the
        # model was originally compiled with.
        simple_cnn.compile(simple_cnn.optimizer, loss="categorical_crossentropy", metrics=["accuracy"])
    if lr is not None:
        # Update the optimizer's learning-rate variable in place. Rebinding
        # `optimizer.lr` to a Python float is ignored once the training
        # function has been built.
        K.set_value(simple_cnn.optimizer.lr, lr)
    simple_cnn.fit(train_x, train_y, nb_epoch=epoch, validation_data=(valid_x, valid_y))
    
# Training schedule: one epoch at 1e-4, then 4 epochs at 1e-1, 16 at 1e-2 and
# 16 at 1e-3, and finally 2 epochs at 1e-4 with full=True (embedding layer
# marked trainable).
train_simple_cnn(1e-4)
train_simple_cnn(1e-1, 4)
train_simple_cnn(1e-2, 16)
train_simple_cnn(1e-3, 16)
train_simple_cnn(1e-4, 2, full=True)

Train on 55 samples, validate on 6 samples
Epoch 1/1
Train on 55 samples, validate on 6 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Train on 55 samples, validate on 6 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
Train on 55 samples, validate on 6 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
Train on 55 samples, validate on 6 samples
Epoch 1/2
Epoch 2/2
