* 針對 刑法320竊盜罪 且刑責與刑期都只有一項 (避免多人、多罪名)
* X = 犯罪事實 (起訴書)
* Y = 判決 (裁判書)
* 對 X 斷詞

In [4]:
%%time
import json, pickle, re, os
import tensorflow as tf
from ckiptagger import WS

# Only show TensorFlow errors; INFO/WARNING log lines are suppressed.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
# Expose only GPU 0 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
# Use the tf.compat.v1 aliases, like the logging call above: the bare
# tf.ConfigProto / tf.Session names do not exist in the TF 2.x top-level
# namespace, while the compat.v1 names work on both major versions.
config = tf.compat.v1.ConfigProto()
# Allocate GPU memory on demand instead of reserving all of it up front.
config.gpu_options.allow_growth = True
# NOTE(review): this session is only created here, it is not registered as
# the Keras backend session in this cell -- confirm that is intended.
sess = tf.compat.v1.Session(config=config)
# CKIP word segmenter, loaded from the model files in ./ckip/data.
# NOTE(review): ckiptagger's README describes WS(..., disable_cuda=True) as the
# default, which would make segmentation run on CPU despite the GPU setup
# above -- confirm and pass disable_cuda=False if GPU use is wanted.
ws = WS('./ckip/data')

# The JSON inputs contain Chinese text. Open them explicitly as UTF-8 (the
# JSON interchange encoding) so decoding does not depend on the platform's
# default locale encoding.
with open('facts.json', 'r', encoding='utf-8') as f:
    facts = json.load(f)  # indexed by case key below; a value may be None
with open('maintext_parsed.json', 'r', encoding='utf-8') as f:
    maintext = json.load(f)  # indexed by case key; entries are read for 'labels' etc.
# SECURITY: pickle.load executes arbitrary code from the file; only load
# pickles produced by this project.
with open('./pkl/criminal_320.pkl', 'rb') as f:
    criminal_320 = pickle.load(f)  # mapping whose values are lists of case keys

X = []      # whitespace-stripped fact text of every selected case
Y = []      # sentence label of every selected case, parallel to X
X_seg = []  # segmented facts; filled by the segmentation step below

# Compiled once instead of being re-parsed for every case. The raw string
# avoids the invalid "\s" escape sequence of a plain string literal.
whitespace = re.compile(r'\s')

### select the data
# Keep a case only when its judgment carries exactly one label and, for
# custodial sentences, exactly one term, so each sample has a single target.
for key_list in criminal_320.values():
    for key in key_list:
        if facts[key] is None:
            continue
        try:
            label = maintext[key]['labels']
        except (LookupError, TypeError):
            # No parsed judgment, or no 'labels' entry, for this case.
            continue
        if len(label) != 1:
            continue
        label = label[0]

        if label == '有期徒刑':
            keep = len(maintext[key]['imprisonment']) == 1
        elif label == '拘役':
            keep = len(maintext[key]['short_imprisonment']) == 1
        else:
            keep = label == '罰金' or label == '無罪'

        if keep:
            X.append(whitespace.sub('', str(facts[key])))
            Y.append(label)

### segmentation
# Segment each fact with the CKIP tagger and join the tokens with spaces.
# X, Y and X_seg must stay index-aligned because later steps pair X_seg[i]
# with Y[i]. A sample whose segmentation fails is therefore dropped from all
# three lists (and reported) instead of silently vanishing from X_seg only.
# The facts in X already had whitespace removed when they were selected, so
# they are passed to the tagger as they are.
kept_facts, kept_labels, X_seg = [], [], []
for i, (fact, label) in enumerate(zip(X, Y)):
    try:
        segmented = ' '.join(ws([fact])[0])
    except Exception as err:
        print('segmentation failed for sample %d, dropping it: %r' % (i, err))
        continue
    kept_facts.append(fact)
    kept_labels.append(label)
    X_seg.append(segmented)
X, Y = kept_facts, kept_labels
        
# Persist the raw facts, their labels and the segmented facts, one pickle each.
for pkl_path, payload in (
    ('./pkl/X.pkl', X),
    ('./pkl/Y.pkl', Y),
    ('./pkl/X_seg.pkl', X_seg),
):
    with open(pkl_path, 'wb') as fh:
        pickle.dump(payload, fh)

CPU times: user 1h 53min 30s, sys: 12min 26s, total: 2h 5min 57s
Wall time: 36min 57s


In [5]:
from collections import Counter
import pickle
# Load the saved labels and print the class distribution, most frequent first.
with open('./pkl/Y.pkl', 'rb') as f:
    Y = pickle.load(f)
print('Total', len(Y))

z = Counter(Y)
for label, count in z.most_common():
    print(label, count)

Totol 12298
拘役 6437
有期徒刑 3617
罰金 1996
無罪 248


* 建立 embedding_matrix for keras, word_idx for padding
* X : 將斷詞過的文本轉換成由數字組成的相同長度序列，每個數字代表一組詞彙
* Y_onehot : 將各種 判決 轉換成 數字 再轉換成 one-hot encoding
* 分割資料
* y_test_act : 把 one-hot 再轉回數字，最後的 confusion matrix 可以用

In [1]:
def max_seq_len(X_seg, percentage):
    """Return the smallest token count that covers more than ``percentage`` of documents.

    Each document is a string of space-separated tokens; its length is the
    number of pieces produced by ``doc.split(' ')``.

    Args:
        X_seg: Sized collection of space-joined token strings.
        percentage: Fraction of documents (e.g. ``0.9``) whose length must be
            less than or equal to the returned value.

    Returns:
        The smallest length ``k`` such that the share of documents with at
        most ``k`` tokens is strictly greater than ``percentage``. If no
        length exceeds the threshold (``percentage >= 1.0``) the longest
        document length is returned, and ``0`` is returned for empty input,
        so the result is always usable as a padding length.
    """
    # Imported locally so the function works even if no other cell has
    # imported Counter into the notebook namespace yet.
    from collections import Counter

    total = len(X_seg)
    if total == 0:
        return 0

    length_counts = Counter(len(doc.split(' ')) for doc in X_seg)
    covered = 0  # number of documents with length <= the current length
    length = 0
    for length in sorted(length_counts):
        covered += length_counts[length]
        if covered / total > percentage:
            return length
    # The threshold was never exceeded; fall back to the longest length.
    return length

In [9]:
import pickle
from collections import Counter
# Load the segmented facts and display the length that covers 90% of them.
with open('./pkl/X_seg.pkl', 'rb') as seg_file:
    X = pickle.load(seg_file)

max_seq_len(X, percentage=0.9)

233

In [2]:
%%time
import pickle
import numpy as np
from collections import Counter
from gensim.models import Word2Vec
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Load the segmented facts and their labels saved by the selection step.
with open('./pkl/X_seg.pkl', 'rb') as seg_file, open('./pkl/Y.pkl', 'rb') as label_file:
    X_seg = pickle.load(seg_file)
    Y = pickle.load(label_file)

# Sequence length used for padding: covers more than 90% of the documents.
max_sequence_len = max_seq_len(X_seg, percentage=0.9)
# Word2Vec model that supplies the vocabulary and the embedding vectors.
wv_model = Word2Vec.load('./data/fact_w2v.model')

# Pair every vocabulary word with its vector.
# NOTE(review): `wv.vocab` looks like the pre-4.0 gensim API -- confirm the
# installed gensim version before upgrading.
vocab_list = [(token, wv_model.wv[token]) for token in wv_model.wv.vocab]

# Row 0 of the embedding matrix stays all-zero and is reserved for "_PAD";
# vocabulary words occupy rows 1..len(vocab).
embedding_matrix = np.zeros((len(wv_model.wv.vocab) + 1, wv_model.vector_size))
word_idx = {"_PAD": 0}
for row, (token, vector) in enumerate(vocab_list, start=1):
    word_idx[token] = row
    embedding_matrix[row] = vector

# Turn each segmented document into a list of word indices. Words missing
# from the Word2Vec vocabulary fall back to index 0, the same as padding.
encoded_docs = []
for doc in X_seg:
    encoded_docs.append([word_idx.get(token, 0) for token in doc.split(' ')])
# Pad / truncate every document to max_sequence_len (pad_sequences defaults).
X = pad_sequences(encoded_docs, maxlen=max_sequence_len)

# Map the label strings to integer ids, then to one-hot rows.
le = LabelEncoder().fit(Y)
Y_le = le.transform(Y)
Y_onehot = to_categorical(Y_le)

# 70/30 train/test split, shuffled with a fixed seed so it is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y_onehot,
    test_size=0.3,
    shuffle=True,
    random_state=5566,
)
# Integer class id of every test sample (the column holding the 1 in its
# one-hot row); used later for the confusion matrix.
y_test_act = np.argmax(y_test, axis=1)

# Persist the train/test split, the integer test labels and the embedding
# matrix, one pickle file per object.
artifacts = (
    ('./pkl/x_train.pkl', x_train),
    ('./pkl/y_train.pkl', y_train),
    ('./pkl/x_test.pkl', x_test),
    ('./pkl/y_test.pkl', y_test),
    ('./pkl/y_test_act.pkl', y_test_act),
    ('./pkl/embedding_matrix.pkl', embedding_matrix),
)
for pkl_path, payload in artifacts:
    with open(pkl_path, 'wb') as fh:
        pickle.dump(payload, fh)

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


CPU times: user 5.72 s, sys: 32.5 s, total: 38.2 s
Wall time: 38 s
