# Join 平台眾開講

## Setup

- [x] Read messages from CSV.
- [x] Cut message content by Jieba.
- [ ] Build embeddings using fastText word vectors pre-trained on Wikipedia corpus.
- [ ] Build training and validation datasets.

In [214]:
path = "data/join"
topic = "立法方式保障"
# topic = "同性伴侣法"
# topic = "同性婚姻法"

In [349]:
from __future__ import division, print_function
import pandas as pd, numpy as np
import jieba
from keras.models import Sequential
from keras.layers import Embedding, Dense, Flatten
import os, math

Read in messages from CSV.

In [538]:
def get_messages_from_orig(topic):
    messages = pd.read_csv(os.path.join(path, topic + ".csv"), index_col=0)
    mask = messages.astype('str').applymap(lambda x: len(x.decode('utf-8'))).content > 20
    messages = messages[mask]
    messages.to_csv(os.path.join(path, topic + "-good.csv"))
    return messages

def get_labeled_messages(topic):
    return pd.read_csv(os.path.join(path, topic + "-good.csv"), index_col=0)

def labeled_only(messages):
    return messages[messages.ORID.notnull()]

all_messages = get_messages_from_orig(topic) if not os.path.exists(os.path.join(path, topic + "-good.csv")) \
                                         else get_labeled_messages(topic)
print("Total messages: {count}".format(count=len(all_messages)))
messages = labeled_only(all_messages)
print("Labeled messages: {count}".format(count=len(messages)))
messages.head()

Total messages: 10215
Labeled messages: 58


Unnamed: 0_level_0,createDate,authorName,content,ORID
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6,2015-10-31 15:53:48,黃道明,在台灣已經有同志收養小孩了，你的資訊是多落伍～,O
9,2015-10-31 15:52:09,黃道明,真奇怪，明明是歐美一個個陸續通過同婚，你是眼睛瞎了嗎？,O
10,2015-10-31 15:51:24,高守謙,我來回答吧四處約砲 無固定性伴侶 就算是性解放的一種,I
14,2015-10-31 15:38:13,了了,援交 毒品 賭博 全部比同性戀禍害更深 那麼怕的話別生了地球很危險的,I
16,2015-10-31 15:34:41,路過的，呵呵。,同性戀領養現在是合法的喔，因為我們未婚都是單身者，現在單身者是可以領養小朋友的。,O


Build dictionary of phrases and load word embeddings.

In [539]:
def write_dictionary(messages):
    contents = [ jieba.lcut(c) for c in messages.content ]
    all_phrases = set([ ph for c in contents for ph in c ])
    with open(os.path.join(path, "dictionary.txt"), "w") as fh:
        for ph in all_phrases:
            fh.write(ph.encode("utf-8") + "\n")
            
def read_dictionary():
    dictionary = pd.read_csv(os.path.join(path, "dictionary.vec"), 
                       delim_whitespace=True, engine="python", header=None, index_col=0)
    return dictionary

if not os.path.exists("dictionary.vec"):
    write_dictionary(all_messages)
    !cd data/join; ../../../bin/fasttext print-word-vectors models/wiki.zh.bin < dictionary.txt > dictionary.vec
dictionary = read_dictionary()
dictionary.shape

(44956, 300)

In [540]:
dict_index = { ph.decode("utf-8"): i for i, ph in enumerate(dictionary.index) }

In [541]:
dict_index[u"同性"], dictionary.loc["同性"]

(35096, 1     -0.602590
 2     -0.106030
 3      0.585770
 4     -0.032679
 5      0.335980
 6     -0.544030
 7     -0.095186
 8     -0.346140
 9     -0.173840
 10     0.378300
 11    -0.276490
 12     0.023498
 13    -0.262720
 14    -0.178130
 15     0.561330
 16    -0.051622
 17    -0.254940
 18     0.053725
 19     0.079208
 20     0.169360
 21     0.408680
 22     0.176830
 23     0.061802
 24    -0.233210
 25    -0.180860
 26     0.354090
 27    -0.435450
 28     0.332370
 29     0.060527
 30     0.247440
          ...   
 271   -0.779680
 272    0.266220
 273    0.804560
 274   -0.406260
 275    0.451250
 276    0.038507
 277    0.900590
 278    0.222490
 279   -0.131450
 280    0.289220
 281   -0.461580
 282    0.028088
 283   -0.051983
 284    0.761830
 285    0.721690
 286   -0.048678
 287   -1.467300
 288   -0.408780
 289   -0.224890
 290    0.023509
 291   -0.040479
 292    0.233910
 293    0.410750
 294    0.446700
 295   -0.314240
 296    0.448340
 297    0.660420
 298   

In [560]:
input_length = 50
batch_size = 64

In [561]:
def create_embeddings(dictionary, input_length=100):
    phrases, latents = dictionary.shape
    embedding = Embedding(phrases, latents, input_length=input_length, weights=[dictionary])
    return embedding

Separate the messages into training and validation datasets.

In [577]:
select = np.random.random(len(messages)) < 0.9
train = messages[select]
valid = messages[~select]

In [578]:
print("Training data: {count}".format(count=len(train)))
train.head()

Training data: 52


Unnamed: 0_level_0,createDate,authorName,content,ORID
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
9,2015-10-31 15:52:09,黃道明,真奇怪，明明是歐美一個個陸續通過同婚，你是眼睛瞎了嗎？,O
10,2015-10-31 15:51:24,高守謙,我來回答吧四處約砲 無固定性伴侶 就算是性解放的一種,I
14,2015-10-31 15:38:13,了了,援交 毒品 賭博 全部比同性戀禍害更深 那麼怕的話別生了地球很危險的,I
16,2015-10-31 15:34:41,路過的，呵呵。,同性戀領養現在是合法的喔，因為我們未婚都是單身者，現在單身者是可以領養小朋友的。,O
20,2015-10-31 15:32:02,大少爺,如果一直沒通過，哪來足夠的長時間和數據證明給你看？希望能結婚就等於自私嗎？異性戀自己能結婚，...,R


In [579]:
print("Validation data: {count}".format(count=len(valid)))
valid.head()

Validation data: 6


Unnamed: 0_level_0,createDate,authorName,content,ORID
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6,2015-10-31 15:53:48,黃道明,在台灣已經有同志收養小孩了，你的資訊是多落伍～,O
14091,2015-08-04 4:56:35,杜里昂,"同性戀要什麼權利還要你們""給""，你們是哪根蔥啊？人權是最基本的保障好嗎？但至少是第一步…… ...",R
14101,2015-08-04 4:00:00,RED,參考國外已上路的同性婚姻可見，在同性婚姻合法化並制定在原婚姻法基礎上，非專案法管理，結果是雙...,I
14126,2015-08-03 15:46:03,馮耿,"１。讚成同性也有婚姻的基本權利,但無須另設新法(改民法的用語定義不就好了@@?)。２。讚成伴...",D
14132,2015-08-03 10:14:57,黑桐喵,原來別人要不要結婚須要所有人一起投票決定。既然都說了「政府對全體人民的人權有履行義務且不應以...,R


Convert datasets to word embeddings.

In [580]:
def get_data(messages):
    jieba_cut = np.frompyfunc(lambda x: [ ph for ph in jieba.lcut(x.decode("utf-8")) if ph != u" " ][:input_length], 1, 1)
    word_embed = np.frompyfunc(lambda x: [ dict_index[ph] for ph in x ], 1, 1)
    fill_to_length = np.frompyfunc(lambda x: x + [0] * (input_length - len(x)), 1, 1)
    return np.stack(fill_to_length(word_embed(jieba_cut(messages.content.values))), axis=0)

answers = { "O": 0, "R": 1, "I": 2, "D": 3 }

def get_answer(messages):
    return np.array([ answers[x] for x in messages.ORID ])

train = (get_data(train), get_answer(train))
valid = (get_data(valid), get_answer(valid))

In [581]:
train[0][10], train[1][10], valid[0][0], valid[1][0]

(array([17293, 22897, 38608,  3570, 17341,  6481, 20914, 40590,  2743,
         4100,  6481, 28378, 19941, 10141, 10141, 10141, 10141, 10141,
         3570, 19941, 17363, 25141, 32542,  3069, 30973,  4387,  6481,
        20303,   679, 10141,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0]),
 1,
 array([43195, 11227, 21960, 17636, 38356, 29589, 19089, 17341, 20914,
        30973,  4387, 44296,  5882, 16474, 31682,  4569,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0]),
 0)

# Single hidden layer model

In [582]:
def linear_model():
    model = Sequential()
    model.add(create_embeddings(dictionary, input_length))
    model.add(Dense(512))
    model.add(Dense(4, activation="softmax"))
    return model

linear = linear_model()
linear.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_5 (Embedding)          (None, 50, 300)       13486800    embedding_input_5[0][0]          
____________________________________________________________________________________________________
dense_6 (Dense)                  (None, 50, 512)       154112      embedding_5[0][0]                
____________________________________________________________________________________________________
dense_7 (Dense)                  (None, 50, 4)         2052        dense_6[0][0]                    
Total params: 13,642,964
Trainable params: 13,642,964
Non-trainable params: 0
____________________________________________________________________________________________________


In [583]:
linear.compile("adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [584]:
linear.fit(train[0], train[1], nb_epoch=1, validation_data=valid)

ValueError: Error when checking model target: expected dense_7 to have 3 dimensions, but got array with shape (52, 1)