# Setup

In [19]:
path = "data/join"
topic = "立法方式保障"

In [20]:
from __future__ import division, print_function
import pandas as pd, numpy as np, matplotlib.pyplot as plt
import jieba
from keras.models import Sequential
from keras.layers import Embedding, Dense, Flatten, Dropout, BatchNormalization, Conv1D, ZeroPadding1D
import os, math, re, pickle

jieba.set_dictionary(os.path.join("data", "dict.txt.big"))

In [21]:
_data = None

def load_data():
    global _data
    if _data is None:
        _data = pd.read_csv(os.path.join(path, topic + "-sentences.csv"))
    return _data

Build dictionary for word embedding.

In [35]:
dictionary_path = os.path.join(path, "dictionary.pkl")

clean_phrase_re = re.compile(r"[ ]+")

def cleaned_phrase(phrases):
    for ph in phrases:
        ph = clean_phrase_re.sub("", ph)
        if ph != "":
            yield ph

def create_dictionary(*data):
    phrases = {}
    for d in data:
        for sentence in d:
            for ph in cleaned_phrase(jieba.cut(str(sentence))):
                phrases[ph] = True
    with open(os.path.join(path, "dictionary.txt"), "w") as fh:
        fh.writelines([ ph + "\n" for ph in phrases.keys() ])
    !cd $path; mkdir -p models; ln ../fasttext/wiki.zh.bin models/wiki.zh.bin
    !cd $path; ~/bin/fasttext print-word-vectors models/wiki.zh.bin < dictionary.txt > dictionary.vec
    dictionary = pd.read_csv(os.path.join(path, "dictionary.vec"), 
                             delim_whitespace=True, engine="python", header=None, index_col=0)
    with open(dictionary_path, "wb") as fh:
        pickle.dump([{ ph: i for i, ph in enumerate(dictionary.index) }, dictionary], fh)

def load_dictionary():
    with open(dictionary_path, "rb") as fh:
        [ dict_index, dictionary ] = pickle.load(fh)
        return dict_index, dictionary
    
if not os.path.exists(dictionary_path):
    data = load_data()
    create_dictionary(data.sentence)

dict_index, dictionary = load_dictionary()
phrases_n = len(dictionary)
latent_n = len(dictionary.columns)

ln: models/wiki.zh.bin: File exists


Examine phrase length.

In [28]:
#data = load_data()
#stat = np.frompyfunc(lambda s: len(jieba.lcut(str(s))), 1, 1)(data.sentence.values)
#(stat.min(), stat.max(), stat.mean(), stat.std())

Encode lables and embed phrases.

In [41]:
# phrase-length (min, max, mean, std) = (1, 1096, 17.385437090122373, 20.549680891647231)
input_length = 20

data_path = os.path.join(path, "data.pkl")

orid_index = { "O": 0, "R": 1, "I": 2, "D": 3 }

if not os.path.exists(data_path):
    def get_label(df):
        labels = np.zeros((len(df), 4))
        for i, l in enumerate(df["orid"]):
            j = orid_index[l]
            labels[i, j] = 1
        return labels

    def get_text(df):
        texts = np.zeros((len(df), input_length))
        for i, text in enumerate(df["sentence"].values):
            for j, ph in enumerate(cleaned_phrase(jieba.lcut(str(text)))):
                if j >= input_length:
                    break
                if ph in dict_index:
                    texts[i, j] = dict_index[ph]
        return texts
    
    data = load_data()
    data = data[data["orid"].notnull()]
    mask = np.random.random(len(data)) > 0.1
    train, valid = data[mask], data[~mask]
    train_x, train_y = get_text(train), get_label(train)
    valid_x, valid_y = get_text(valid), get_label(valid)
    
    with open(data_path, "wb") as fh:
        pickle.dump([(train_x, train_y), (valid_x, valid_y)], fh)
else:
    with open(data_path, "rb") as fh:
        [(train_x, train_y), (valid_x, valid_y)] = pickle.load(fh)