* https://www.kaggle.com/c/fake-news-pair-classification-challenge/data
* https://github.com/fxsjy/jieba

BERT
* https://colab.research.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb#scrollTo=Hcpfl4N2EdOk
* https://github.com/google-research/bert/blob/master/multilingual.md

Gluon-bert
* https://gluon-nlp.mxnet.io/examples/sentence_embedding/bert.html

Embedding options
* https://github.com/Embedding/Chinese-Word-Vectors (Embedding 1)
* https://github.com/facebookresearch/fastText/blob/master/docs/crawl-vectors.md (Embedding 2)

Performance
* Using Embedding 1: Performance 0.70640
* Using Embedding 2: Performance 0.68641
* Using BERT: Performance 0.849

In [1]:
cd /home/dmlab/sundong/competition/wsdm2019/fakenews/notebook/bert_repo

/home/dmlab/sundong/competition/wsdm2019/fakenews/notebook/bert_repo


In [2]:
import modeling
import optimization
import run_classifier
import tokenization

In [3]:
cd /home/dmlab/sundong/competition/wsdm2019/fakenews/notebook

/home/dmlab/sundong/competition/wsdm2019/fakenews/notebook


In [4]:
import glob
import pandas as pd
import numpy as np

import tensorflow as tf
import keras
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))

Using TensorFlow backend.


In [5]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 15870701569239746073
, name: "/device:XLA_GPU:0"
device_type: "XLA_GPU"
memory_limit: 17179869184
locality {
}
incarnation: 143523492730913191
physical_device_desc: "device: XLA_GPU device"
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 8615557650607129747
physical_device_desc: "device: XLA_CPU device"
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 14520152884
locality {
  bus_id: 1
  links {
  }
}
incarnation: 12604866486874891888
physical_device_desc: "device: 0, name: Tesla V100-SXM2-16GB, pci bus id: 0000:06:00.0, compute capability: 7.0"
]


In [6]:
TASK = 'Fake' 
assert TASK in ('Fake', 'CoLA'), 'Only (Fake, CoLA) are demonstrated here.'

TASK_DATA_DIR = '../data/FakeNews_BERT/FakeNews_BERT/'


# Available pretrained model checkpoints:
#   uncased_L-12_H-768_A-12: uncased BERT base model
#   uncased_L-24_H-1024_A-16: uncased BERT large model
#   cased_L-12_H-768_A-12: cased BERT large model

BERT_MODEL = 'chinese_L-12_H-768_A-12' 
BERT_PRETRAINED_DIR = '../data/' + BERT_MODEL
print('***** BERT pretrained directory: {} *****'.format(BERT_PRETRAINED_DIR))
! ls $BERT_PRETRAINED_DIR


OUTPUT_DIR = '/home/dmlab/sundong/competition/wsdm2019/fakenews/notebook/results'
print('***** Model output directory: {} *****'.format(OUTPUT_DIR))


***** BERT pretrained directory: ../data/chinese_L-12_H-768_A-12 *****
bert_config.json		     bert_model.ckpt.index  vocab.txt
bert_model.ckpt.data-00000-of-00001  bert_model.ckpt.meta
***** Model output directory: /home/dmlab/sundong/competition/wsdm2019/fakenews/notebook/results *****


In [7]:
# Model Hyper Parameters
TRAIN_BATCH_SIZE = 128 # 32
EVAL_BATCH_SIZE = 32  # 8    
TEST_BATCH_SIZE = 32  # 8    
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 3.0
WARMUP_PROPORTION = 0.1
MAX_SEQ_LENGTH = 43  # According to our dataset, length of 43 can cover 99.5% of the titles.

# Model configs
SAVE_CHECKPOINTS_STEPS = 1000
ITERATIONS_PER_LOOP = 1000
NUM_TPU_CORES = 8
VOCAB_FILE = os.path.join(BERT_PRETRAINED_DIR, 'vocab.txt')
CONFIG_FILE = os.path.join(BERT_PRETRAINED_DIR, 'bert_config.json')
INIT_CHECKPOINT = os.path.join(BERT_PRETRAINED_DIR, 'bert_model.ckpt')
DO_LOWER_CASE = BERT_MODEL.startswith('uncased')

In [8]:
processors = {
  "cola": run_classifier.ColaProcessor,
  "mnli": run_classifier.MnliProcessor,
  "mrpc": run_classifier.MrpcProcessor,
  "fake": run_classifier.FakeProcessor,
}
processor = processors[TASK.lower()]()
label_list = processor.get_labels()
tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_FILE, do_lower_case=DO_LOWER_CASE)

In [9]:
tpu_cluster_resolver = None

run_config = tf.contrib.tpu.RunConfig(
    cluster=tpu_cluster_resolver,
    model_dir=OUTPUT_DIR,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
    tpu_config=tf.contrib.tpu.TPUConfig(
        iterations_per_loop=ITERATIONS_PER_LOOP,
        num_shards=NUM_TPU_CORES,
        per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))

In [10]:
train_examples = processor.get_train_examples(TASK_DATA_DIR)
num_train_steps = int(
    len(train_examples) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

In [11]:
model_fn = run_classifier.model_fn_builder(
    bert_config=modeling.BertConfig.from_json_file(CONFIG_FILE),
    num_labels=len(label_list),
    init_checkpoint=INIT_CHECKPOINT,
    learning_rate=LEARNING_RATE,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    use_tpu=False,
    use_one_hot_embeddings=True)

In [12]:
estimator = tf.contrib.tpu.TPUEstimator(
    use_tpu=False,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=TRAIN_BATCH_SIZE,
    eval_batch_size=EVAL_BATCH_SIZE)

INFO:tensorflow:Using config: {'_model_dir': '/home/dmlab/sundong/competition/wsdm2019/fakenews/notebook/results', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7ff7b811a160>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=1000, num_shards=8, num_cores_per_replica=None, per_host_input_for_training=3, tpu

In [13]:
import datetime

In [14]:
# Train the model.
print('MRPC/CoLA on BERT base model normally takes about 2-3 minutes. Please wait...')
train_features = run_classifier.convert_examples_to_features(
    train_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
print('***** Started training at {} *****'.format(datetime.datetime.now()))
print('  Num examples = {}'.format(len(train_examples)))
print('  Batch size = {}'.format(TRAIN_BATCH_SIZE))
tf.logging.info("  Num steps = %d", num_train_steps)
train_input_fn = run_classifier.input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=True)
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
print('***** Finished training at {} *****'.format(datetime.datetime.now()))

MRPC/CoLA on BERT base model normally takes about 2-3 minutes. Please wait...
INFO:tensorflow:Writing example 0 of 256442
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: train-1
INFO:tensorflow:tokens: [CLS] 祛 湿 排 油 还 减 肥 ， 每 天 坚 持 做 ， 轻 松 瘦 大 肚 子 [SEP] 火 箭 阵 中 一 大 将 铁 定 成 赛 季 第 三 巨 头 ， 但 却 遭 [SEP]
INFO:tensorflow:input_ids: 101 4865 3969 2961 3779 6820 1121 5503 8024 3680 1921 1780 2898 976 8024 6768 3351 4607 1920 5496 2094 102 4125 5055 7347 704 671 1920 2199 7188 2137 2768 6612 2108 5018 676 2342 1928 8024 852 1316 6901 102
INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
INFO:tensorflow:label: unrelated (id = 0)
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: train-2
INFO:tensorflow:tokens: [CLS] [UNK] ： 12 岁 女 孩 称 遭 母 亲 家 暴 手 指 被 剪 断 身 体 被 [SEP] [UNK] ： 12 岁 女 孩 称 遭 母 亲 家 暴 手 指 被 剪 断 身 体 被 [

INFO:tensorflow:  name = bert/encoder/layer_1/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_1/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_1/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_1/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_1/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_1/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_1/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_1/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_1/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = 

INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_6/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_6/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_6/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_6/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_6/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FRO

INFO:tensorflow:  name = bert/encoder/layer_10/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_10/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_10/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  na

INFO:tensorflow:Loss for final step: 0.187461.
INFO:tensorflow:training_loop marked as finished
***** Finished training at 2018-12-07 00:26:51.204568 *****


In [16]:
# Eval the model.
eval_examples = processor.get_dev_examples(TASK_DATA_DIR)
eval_features = run_classifier.convert_examples_to_features(
    eval_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
print('***** Started evaluation at {} *****'.format(datetime.datetime.now()))
print('  Num examples = {}'.format(len(eval_examples)))
print('  Batch size = {}'.format(EVAL_BATCH_SIZE))

# Eval will be slightly WRONG on the TPU because it will truncate
# the last batch.
eval_steps = int(len(eval_examples) / EVAL_BATCH_SIZE)
eval_input_fn = run_classifier.input_fn_builder(
    features=eval_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=True)
result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

print('***** Finished evaluation at {} *****'.format(datetime.datetime.now()))
output_eval_file = os.path.join(OUTPUT_DIR, "eval_results.txt")
with tf.gfile.GFile(output_eval_file, "w") as writer:
    print("***** Eval results *****")
    for key in sorted(result.keys()):
        print('  {} = {}'.format(key, str(result[key])))
        writer.write("%s = %s\n" % (key, str(result[key])))

INFO:tensorflow:Writing example 0 of 64110
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: dev-1
INFO:tensorflow:tokens: [CLS] 怀 孕 后 在 电 脑 前 工 作 有 影 响 吗 ？ 电 脑 辐 射 对 胎 [SEP] 螃 蟹 ！ 补 钙 ！ 吹 风 机 ！ 防 辐 射 服 ！ 专 坑 孕 妇 的 [SEP]
INFO:tensorflow:input_ids: 101 2577 2097 1400 1762 4510 5554 1184 2339 868 3300 2512 1510 1408 8043 4510 5554 6781 2198 2190 5522 102 6083 6101 8013 6133 7159 8013 1430 7599 3322 8013 7344 6781 2198 3302 8013 683 1778 2097 1967 4638 102
INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
INFO:tensorflow:label: unrelated (id = 0)
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: dev-2
INFO:tensorflow:tokens: [CLS] 文 章 不 会 英 语 ， 被 骂 以 后 求 助 儿 子 来 翻 译 ， 结 [SEP] 2018 即 将 面 临 30 年 来 最 大 失 业 潮 ， 倒 闭 潮 的 危 机 [SEP]
INFO:tensorflow:input_ids: 101 3152 4995 679 833 5739 6427 8024 6158 7733 809 140

INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_2/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_2/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_2/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT

INFO:tensorflow:  name = bert/encoder/layer_6/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_6/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_6/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_6/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_6/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder

INFO:tensorflow:  name = bert/encoder/layer_11/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_11/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_11/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_11/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_11/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_11/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_11/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_11/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_11/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
INFO:

In [17]:
def input_fn_builder(features, seq_length, is_training, drop_remainder):
  """Creates an `input_fn` closure to be passed to TPUEstimator."""

  all_input_ids = []
  all_input_mask = []
  all_segment_ids = []
  all_label_ids = []

  for feature in features:
    all_input_ids.append(feature.input_ids)
    all_input_mask.append(feature.input_mask)
    all_segment_ids.append(feature.segment_ids)
    all_label_ids.append(feature.label_id)

    
  def input_fn(params):
    """The actual input function."""
    
    batch_size = 2

    num_examples = len(features)

    # This is for demo purposes and does NOT scale to large data sets. We do
    # not use Dataset.from_generator() because that uses tf.py_func which is
    # not TPU compatible. The right way to load data is with TFRecordReader.
    d = tf.data.Dataset.from_tensor_slices({
        "input_ids":
            tf.constant(
                all_input_ids, shape=[num_examples, seq_length],
                dtype=tf.int32),
        "input_mask":
            tf.constant(
                all_input_mask,
                shape=[num_examples, seq_length],
                dtype=tf.int32),
        "segment_ids":
            tf.constant(
                all_segment_ids,
                shape=[num_examples, seq_length],
                dtype=tf.int32),
        "label_ids":
            tf.constant(all_label_ids, shape=[num_examples], dtype=tf.int32),
    })

    if is_training:
      d = d.repeat()
      d = d.shuffle(buffer_size=100)

    d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder)
    return d

  return input_fn

In [18]:
# Test the model.
test_examples = processor.get_test_examples(TASK_DATA_DIR)
test_features = run_classifier.convert_examples_to_features(
    test_examples, label_list, MAX_SEQ_LENGTH, tokenizer)

# Eval will be slightly WRONG on the TPU because it will truncate
# the last batch.
test_input_fn = input_fn_builder(          # run_classifier.
    features=test_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=True)


result = estimator.predict(input_fn=test_input_fn)

INFO:tensorflow:Writing example 0 of 80126
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: test-1
INFO:tensorflow:tokens: [CLS] 萨 拉 赫 人 气 爆 棚 ! 埃 及 总 统 大 选 未 参 选 获 百 万 [SEP] 辟 谣 ！ 里 昂 官 方 否 认 费 基 尔 加 盟 利 物 浦 ， 难 道 [SEP]
INFO:tensorflow:input_ids: 101 5855 2861 6622 782 3698 4255 3476 106 1812 1350 2600 5320 1920 6848 3313 1346 6848 5815 4636 674 102 6792 6469 8013 7027 3203 2135 3175 1415 6371 6589 1825 2209 1217 4673 1164 4289 3855 8024 7410 6887 102
INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
INFO:tensorflow:label: unrelated (id = 0)
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: test-2
INFO:tensorflow:tokens: [CLS] 萨 达 姆 被 捕 后 告 诫 美 国 的 一 句 话 ， 发 人 深 思 [SEP] 10 大 最 让 美 国 人 相 信 的 荒 诞 谣 言 ， 如 蜥 蜴 人 掌 控 [SEP]
INFO:tensorflow:input_ids: 101 5855 6809 1990 6158 2936 1400 1440 6425 5401 1744 46

In [19]:
raw_result_list = []
result_list = []
for rr in result:
    raw_result_list.append(rr)
    result_list.append(np.argmax(rr))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Running infer on CPU
INFO:tensorflow:*** Features ***
INFO:tensorflow:  name = input_ids, shape = (2, 43)
INFO:tensorflow:  name = input_mask, shape = (2, 43)
INFO:tensorflow:  name = label_ids, shape = (2,)
INFO:tensorflow:  name = segment_ids, shape = (2, 43)
INFO:tensorflow:**** Trainable Variables ****
INFO:tensorflow:  name = bert/embeddings/word_embeddings:0, shape = (21128, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/embeddings/token_type_embeddings:0, shape = (2, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/embeddings/position_embeddings:0, shape = (512, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/embeddings/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/embeddings/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/laye

INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_4/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_4/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_4/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CK

INFO:tensorflow:  name = bert/encoder/layer_8/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_9/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_9/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:te

In [25]:
test_path = '../data/test.csv'
submission_path = '../data/sample_submission_sd_BERT.csv'
df_test = pd.read_csv(test_path)
df_test['pred_label'] = result_list
idx_label = {0:'unrelated', 1:'agreed', 2:'disagreed'}
df_test['pred_label'] = df_test['pred_label'].apply(lambda x: idx_label[x])

In [27]:
df_test[['id', 'pred_label']].rename(columns={'id':'Id','pred_label':'Category'}).to_csv(submission_path, index=False)

# old code

In [None]:
xnli_tsv = pd.read_csv('../data/XNLI-1.0/XNLI-1.0/xnli.dev.tsv', sep='\t')

In [None]:
xnli_tsv.tail(5)

In [None]:
tr_tsv = pd.read_csv('../data/FakeNews_BERT/FakeNews_BERT/train.tsv', sep='\t')
de_tsv = pd.read_csv('../data/FakeNews_BERT/FakeNews_BERT/dev.tsv', sep='\t')
te_tsv = pd.read_csv('../data/FakeNews_BERT/FakeNews_BERT/test.tsv', sep='\t')

In [None]:
tr_tsv.label.value_counts()

In [None]:
de_tsv.label.value_counts()

In [None]:
tr_tsv.head(10)

In [None]:
te_tsv

In [None]:
tr_tsv[tr_tsv.title1_zh == '掌控地球的6个外星人，空间站无意中拍摄到这画面']

In [None]:
tr_tsv[tr_tsv.title1_zh == '掌控地球的6个外星人，空间站无意中拍摄到这画面'].title2_zh.apply(lambda x: x.replace('\n', ','))

In [None]:
## Preparing dataset 

In [None]:
train_path = '../data/train.csv'
test_path = '../data/test.csv'
sample_submission_path = '../data/sample_submission.csv'
submission_path = '../data/sample_submission_sd_embed2.csv'

In [None]:
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)
df_samplesub = pd.read_csv(sample_submission_path)

In [None]:
df_train['label'].value_counts()

In [None]:
### Handling problematic cases, which include '\n' or '\t' 
df_train['title1_zh'] = df_train.title1_zh.apply(lambda x: str(x).replace('\n', ','))
df_train['title1_zh'] = df_train.title1_zh.apply(lambda x: str(x).replace('\t', ','))
df_train['title2_zh'] = df_train.title2_zh.apply(lambda x: str(x).replace('\n', ','))
df_train['title2_zh'] = df_train.title2_zh.apply(lambda x: str(x).replace('\t', ','))

df_test['title1_zh'] = df_test.title1_zh.apply(lambda x: str(x).replace('\n', ','))
df_test['title1_zh'] = df_test.title1_zh.apply(lambda x: str(x).replace('\t', ','))
df_test['title2_zh'] = df_test.title2_zh.apply(lambda x: str(x).replace('\n', ','))
df_test['title2_zh'] = df_test.title2_zh.apply(lambda x: str(x).replace('\t', ','))

In [None]:
np.percentile(df_test['title2_zh'].apply(len), 99.5)

In [None]:
idxs = list(df_train.index)
np.random.shuffle(idxs)
val_num = len(idxs) // 5 
train_idxs = idxs[:-val_num]
val_idxs = idxs[-val_num:]

In [None]:
df_tra = df_train.loc[train_idxs, :][['title1_zh', 'title2_zh', 'label']]
df_val = df_train.loc[val_idxs, :][['title1_zh', 'title2_zh', 'label']]
df_tes = df_test[['title1_zh', 'title2_zh']]

In [None]:
df_tra.to_csv('../data/FakeNews_BERT/FakeNews_BERT/train.tsv', sep='\t',header=True, index=False)
df_val.to_csv('../data/FakeNews_BERT/FakeNews_BERT/dev.tsv', sep='\t',header=True, index=False)
df_tes.to_csv('../data/FakeNews_BERT/FakeNews_BERT/test.tsv', sep='\t',header=True, index=False)

In [None]:
re

In [None]:
df_train['label'].value_counts()

In [None]:
#encoding=utf-8
import jieba

In [None]:
seg_list = jieba.cut("2017养老保险又新增两项，农村老人人人可申领，你领到了吗", cut_all=False)
print(list(seg_list))

In [None]:
df = df_train

In [None]:
df_train.shape, df_test.shape

In [None]:
dictt = {}

for df in [df_train, df_test]:
    for i in df[['tid1','title1_zh']].drop_duplicates().iterrows():
        dictt[i[1]['tid1']] = i[1]['title1_zh']
    for i in df[['tid2','title2_zh']].drop_duplicates().iterrows():
        dictt[i[1]['tid2']] = i[1]['title2_zh']

In [None]:
len(dictt)

In [None]:
dictt2 = {}
for i in dictt.keys():
    try:
        seg_list = jieba.cut(dictt[i], cut_all=False)
        dictt2[i] = list(seg_list)
    except AttributeError:
        dictt2[i] = [] 

In [None]:
import numpy as np
from gensim.models import KeyedVectors


tmp_file = '../data/cc.zh.300.vec'
model = KeyedVectors.load_word2vec_format(tmp_file)
model.get_vector('的')

In [None]:
aafa = []
for l in dictt2.values():
    aafa.append(len(l))
    
np.median(aafa), np.percentile(aafa, q=99.5), np.mean(aafa), np.max(aafa)

In [None]:
import tensorflow as tf
import keras
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "5"

from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))

In [None]:
word_embeddings = {i:j for i,j in zip(model.wv.index2word, list(model.wv.vectors))}
word_index = {j:i for i,j in enumerate(word_embeddings.keys())}

In [None]:
dictt3 = {}
word_thres = int(np.percentile(aafa, q=99.5))
for i,j in dictt2.items():
    if len(j) < word_thres:
        k = j + ['']*(word_thres-len(j))
    else:
        k = j[:word_thres]
    dictt3[i] = k

In [None]:
df_train['label'] = pd.factorize(df_train['label'])[0]

In [None]:
df_train.head(5)

In [None]:
class Prediction:
    def __init__(self):
        self.batch_size = 128
        self._load_word_embedding()
        self._train_val_split()
    
    
    def _load_word_embedding(self):
        self.word_embedding = word_embeddings

        
    def _simple_attention(self, target, reference):
        attention = keras.layers.Dense(1, activation=keras.activations.tanh)(reference)
        attention = keras.layers.Reshape((-1,))(attention)
        attention = keras.layers.Activation(keras.activations.softmax)(attention)
        return keras.layers.Dot((1,1))([target, attention])
    
        
    def _train_val_split(self):
        idxs = list(df_train.index)
        np.random.shuffle(idxs)
        val_num = len(idxs) // 5 
        self.train_idxs = idxs[:-val_num]
        self.val_idxs = idxs[-val_num:]
        self.training_data_size = len(self.train_idxs)
        self.validation_data_size = len(self.val_idxs)
        
        
    def _build_model(self):
        max_num_word = np.percentile(aafa, q=99.5)
        word_embedding_layer = keras.layers.Embedding(
            input_dim = len(self.word_embedding),
            output_dim = len(list(self.word_embedding.values())[0]),
            weights = [np.array(list(self.word_embedding.values()))],
            input_length = max_num_word,
            trainable = False
        )
        
        dropout = keras.layers.Dropout(0.2)
        
        word_input = keras.Input((max_num_word,))
        word_emb = word_embedding_layer(word_input)
        word_cnn = keras.layers.Conv1D(filters=200, kernel_size=5, padding='same', activation='relu', strides=1)(word_emb)
        word_att = self._simple_attention(word_cnn, word_cnn)
        

        word_input2 = keras.Input((max_num_word,))
        word_emb2 = word_embedding_layer(word_input2)
        word_cnn2 = keras.layers.Conv1D(filters=200, kernel_size=5, padding='same', activation='relu', strides=1)(word_emb2)
        word_att2 = self._simple_attention(word_cnn2, word_cnn2)
        
        word_output = keras.layers.Concatenate()([word_att, word_att2])
        
        
        logits = keras.layers.Dense(3, activation=keras.activations.softmax)(word_output)
        model = keras.Model([word_input, word_input2], logits)
        model.compile(
            optimizer = keras.optimizers.Adam(0.001),
            loss = keras.losses.categorical_crossentropy,
            metrics = [keras.metrics.categorical_accuracy]
        )
        return model

    
    def _training_data_generator(self):
        def __gen__():
            while True:
                for idx in self.train_idxs:
                    pair = df_train.iloc[idx]
                    text1_ind = [word_index.get(x, 893405) for x in dictt3[pair['tid1']]]
                    text2_ind = [word_index.get(x, 893405) for x in dictt3[pair['tid2']]]
                    yield text1_ind, text2_ind, pair['label']  
                    
        gen = __gen__()
        
        while True:
            batch = [np.stack(x) for x in zip(*(next(gen) for _ in range(self.batch_size)))]
            yield [batch[0], batch[1]], keras.utils.to_categorical(batch[-1], 3)
            
    
    
    def _validation_data_generator(self):
        def __gen__():
            while True:
                for idx in self.val_idxs:
                    pair = df_train.iloc[idx]
                    text1_ind = [word_index.get(x, 893405) for x in dictt3[pair['tid1']]]
                    text2_ind = [word_index.get(x, 893405) for x in dictt3[pair['tid2']]]
                    yield text1_ind, text2_ind, pair['label']  
                    
        gen = __gen__()
        
        while True:
            batch = [np.stack(x) for x in zip(*(next(gen) for _ in range(self.batch_size)))]
            yield [batch[0], batch[1]], keras.utils.to_categorical(batch[-1], 3)
                
    
    
    def _test_data_generator(self):
        def __gen__():
            while True:
                idxs = list(df_test.index)
                for idx in idxs:
                    pair = df_test.iloc[idx]
                    text1_ind = [word_index.get(x, 893405) for x in dictt3[pair['tid1']]]
                    text2_ind = [word_index.get(x, 893405) for x in dictt3[pair['tid2']]]
                    yield text1_ind, text2_ind, pair['label']  
                    
        gen = __gen__()
        
        while True:
            batch = [np.stack(x) for x in zip(*(next(gen) for _ in range(len(df_test))))]
            yield [batch[0], batch[1]]  
            
    
    def _test_batch_data(self):
        t1 = df_test['tid1'].apply(lambda x: [word_index.get(y, 893405) for y in dictt3[x]]).apply(np.array)
        t2 = df_test['tid2'].apply(lambda x: [word_index.get(y, 893405) for y in dictt3[x]]).apply(np.array)
        return [np.array(list(t1)), np.array(list(t2))]
   
    
    
    def train(self):
        batch_size = 20
        self.training_data = self._training_data_generator()
        self.validation_data = self._validation_data_generator()
        #         self.test_data = self._test_data_generator()
        self.test_data = self._test_batch_data()
        
        self.model = self._build_model()
        
        self.history = self.model.fit_generator(
            generator = self.training_data,
            validation_data = self.validation_data,
            steps_per_epoch = self.training_data_size // self.batch_size,
            validation_steps = self.validation_data_size // self.batch_size,
            class_weight = {0:1/16, 1:1/15, 2:1/5},
            epochs = 3
        )
      
        
        self.result = self.model.predict_on_batch(self.test_data)
           
#         self.result = self.model.predict_generator(
#             generator = self.test_data,
#             steps = 1
#         )
        

In [None]:
r = Prediction()
r.train()

In [None]:
## validation accuacy가 가장 높은 epoch의 모델로 test data를 평가하는 걸로 코드 전환 - 모델 save or stopping condition

In [None]:
df_test['pred_label'] = r.result.argmax(axis=1)
idx_label = {0:'unrelated', 1:'agreed', 2:'disagreed'}
df_test['pred_label'] = df_test['pred_label'].apply(lambda x: idx_label[x])
print(df_test['pred_label'].value_counts())


In [None]:
df_test[['id', 'pred_label']].rename(columns={'id':'Id','pred_label':'Category'}).to_csv(submission_path, index=False)

In [None]:
df_train['label'].value_counts()

In [None]:
min(df_train.tid1), max(df_train.tid1), min(df_test.tid1), max(df_test.tid1)

In [None]:
min(df_train.tid2), max(df_train.tid2), min(df_test.tid2), max(df_test.tid2)