# 特徴抽出器として用いる

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

  (fname, cnt))
  (fname, cnt))


## 事前学習したモデルの出力を取得

### ① ファイルを経由してコマンドラインで実行

#### 入力ファイル準備

In [2]:
train_df = pd.read_csv("/home/ubuntu/glue_data/ARD/train.tsv",delimiter="\t")

dev_df = pd.read_csv("/home/ubuntu/glue_data/ARD/dev.tsv",delimiter="\t")

test_df = pd.read_csv("/home/ubuntu/glue_data/ARD/test.tsv",delimiter="\t")

In [None]:
# trainだけで４万件
train_df["reviewText"].to_csv("/home/ubuntu/tmp/input.txt",index=None)

In [3]:
# dev版
dev_df["reviewText"].to_csv("/home/ubuntu/tmp/input.txt",index=None)

#### 出力

```zsh
export BERT_BASE_DIR=multilingual_L-12_H-768_A-12
python extract_features.py \
  --input_file=tmp/input.txt \
  --output_file=tmp/output.jsonl \
  --vocab_file=$BERT_BASE_DIR/vocab.txt \
  --bert_config_file=$BERT_BASE_DIR/bert_config.json \
  --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \
  --layers=-1 \
  --max_seq_length=128 \
  --batch_size=8
  ```
 ↑　この方法だと多大なストレージを消費する

### ② Notebook上で取得する

In [4]:
import tensorflow as tf
import extract_features
import modeling
import tokenization

  from ._conv import register_converters as _register_converters


In [6]:
flags = tf.flags
FLAGS = flags.FLAGS

tf.app.flags.DEFINE_string('f', '', 'kernel') ## jupyter上でのエラー回避

FLAGS.bert_config_file = '/home/ubuntu/bert_models/uncased_L-12_H-768_A-12/bert_config.json'

FLAGS.vocab_file = '/home/ubuntu/bert_models/uncased_L-12_H-768_A-12/vocab.txt'

FLAGS.init_checkpoint = '/home/ubuntu/bert_models/uncased_L-12_H-768_A-12/bert_model.ckpt'

FLAGS.layers = '-1'

In [7]:
is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
run_config = tf.contrib.tpu.RunConfig(
  master=FLAGS.master,
  tpu_config=tf.contrib.tpu.TPUConfig(
      num_shards=FLAGS.num_tpu_cores,
      per_host_input_for_training=is_per_host))

In [10]:
layer_indexes = [int(x) for x in FLAGS.layers.split(",")]

In [11]:
bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

In [12]:
tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

In [13]:
examples = extract_features.read_examples(input_file='/home/ubuntu/tmp/input.txt')

In [14]:
features = extract_features.convert_examples_to_features(
      examples=examples, seq_length=FLAGS.max_seq_length, tokenizer=tokenizer)

INFO:tensorflow:*** Example ***
INFO:tensorflow:unique_id: 0
INFO:tensorflow:tokens: [CLS] " i purchased the 13 " " mba a few weeks ago . it ' s my first mac . there are lots of cool things about the new mac ##book airs . but what stands out most to me is the performance gain from an ss ##d instead of a spinning magnetic hard drive . it boots up from cold to ready in under 10 seconds . seriously . it shut ##s down in about 4 - 5 seconds . of course , you don ' t need to shut it down and boot it up from cold every time - just shut the lid and it goes to sleep in 1 second . open it later and it wakes up just as you left [SEP]
INFO:tensorflow:input_ids: 101 1000 1045 4156 1996 2410 1000 1000 15038 1037 2261 3134 3283 1012 2009 1005 1055 2026 2034 6097 1012 2045 2024 7167 1997 4658 2477 2055 1996 2047 6097 8654 14369 1012 2021 2054 4832 2041 2087 2000 2033 2003 1996 2836 5114 2013 2019 7020 2094 2612 1997 1037 9419 8060 2524 3298 1012 2009 6879 2039 2013 3147 2000 3201 1999 2104 2184 3823 

INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


In [15]:
unique_id_to_feature = {}
for feature in features:
    unique_id_to_feature[feature.unique_id] = feature

In [16]:
model_fn = extract_features.model_fn_builder(
      bert_config=bert_config,
      init_checkpoint=FLAGS.init_checkpoint,
      layer_indexes=layer_indexes,
      use_tpu=FLAGS.use_tpu,
      use_one_hot_embeddings=FLAGS.use_one_hot_embeddings)

In [17]:
estimator = tf.contrib.tpu.TPUEstimator(
  use_tpu=FLAGS.use_tpu,
  model_fn=model_fn,
  config=run_config,
  predict_batch_size=FLAGS.batch_size)

input_fn = extract_features.input_fn_builder(
  features=features, seq_length=FLAGS.max_seq_length)


INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpjinq72zi', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f3628a20da0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=2, num_shards=8, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_

In [25]:
results = list(estimator.predict(input_fn, yield_single_examples=True))

INFO:tensorflow:Could not find trained model in model_dir: /tmp/tmpjinq72zi, running initialization to predict.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Running infer on CPU
INFO:tensorflow:**** Trainable Variables ****
INFO:tensorflow:  name = bert/embeddings/word_embeddings:0, shape = (30522, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/embeddings/token_type_embeddings:0, shape = (2, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/embeddings/position_embeddings:0, shape = (512, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/embeddings/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/embeddings/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/ke

INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_4/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_4/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_4/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_4/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tenso

INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_9/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_9/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_9/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT

## 学習器にかける

In [27]:
X_raw = [r['layer_output_0'] for r in results]

In [31]:
X = np.vstack([x.reshape(-1) for x in X_raw])

y = dev_df['helpful_rate'].values

In [35]:
from sklearn.model_selection import train_test_split

In [37]:
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [42]:
from sklearn.linear_model import LinearRegression

In [44]:
linear = LinearRegression()

In [45]:
linear.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [46]:
y_pred = linear.predict(X_test)

In [47]:
from sklearn.metrics import r2_score, mean_squared_error

In [48]:
mean_squared_error(y_true=y_test, y_pred=y_pred)  

29088.57949249909

In [49]:
r2_score(y_true=y_test, y_pred=y_pred)  

-617305.7307311621