# Estimatorで利用する
> tftで作成したデータで訓練して、予測時にその処理を当てはめられる様にEstimatorを書く。

## 手順
    1. TFRecordからDataset作成する関数を書く
    2. metadataからFeature columnsを作成する関数を書く
    3. Estimatorを訓練する
    4. transform fnを適用したServing input fnを作成し、estimatorをsaved modelとして保存
    

In [191]:
# import library
import tensorflow as tf
import tensorflow_transform as tft
import os

from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import dataset_schema

from tensorflow_transform.tf_metadata import metadata_io
from tensorflow_transform.beam.tft_beam_io import transform_fn_io

In [192]:
# 定数を定義
TRANSFORM_ARTEFACTS_DIR = 'transform_fn'
TARGET_FEATURE_NAME = 'fare_amount'

In [193]:
transformed_metadata = metadata_io.read_metadata(
        os.path.join(TRANSFORM_ARTEFACTS_DIR,"transformed_metadata"))


In [194]:
a = transformed_metadata.schema

In [195]:
a

Schema(feature {
  name: "dropofflat"
  type: INT
  int_domain {
    min: 0
    max: 5
    is_categorical: true
  }
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "dropofflon"
  type: INT
  int_domain {
    min: 0
    max: 5
    is_categorical: true
  }
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "fare_amount"
  type: FLOAT
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "key"
  type: FLOAT
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "passengers"
  type: FLOAT
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "pickuplat"
  type: INT
  int_domain {
    min: 0
    max: 5
    is_categorical: true
  }
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "pickuplon"
  type: INT
  int_domain {
    min: 0
    max: 4
    is_categorical: true
  }
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
)

In [196]:
column_schema = a.as_feature_spec()

In [197]:
column_schema

{'dropofflat': FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
 'dropofflon': FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
 'fare_amount': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'key': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'passengers': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'pickuplat': FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
 'pickuplon': FixedLenFeature(shape=[], dtype=tf.int64, default_value=None)}

In [198]:
column_schema['dropofflat']

FixedLenFeature(shape=[], dtype=tf.int64, default_value=None)

In [199]:
column_schema['dropofflat'].dtype

tf.int64

In [200]:
for i in column_schema:
    print(i)

dropofflat
dropofflon
fare_amount
key
passengers
pickuplat
pickuplon


In [201]:
column_info = a.domains()

In [202]:
column_info['dropofflat']

min: 0
max: 5
is_categorical: true

## TFRecordからDatasetを作成

In [203]:
def tfrecords_input_fn(files_name_pattern, transformed_metadata,
                       mode=tf.estimator.ModeKeys.EVAL,  
                       num_epochs=1, 
                       batch_size=500):
    def _divide_func(features):
        target = features.pop(TARGET_FEATURE_NAME)
        return features, target
    dataset = tf.contrib.data.make_batched_features_dataset(
        file_pattern=files_name_pattern,
        batch_size=batch_size,
        features=transformed_metadata.schema.as_feature_spec(),
        reader=tf.data.TFRecordDataset,
        num_epochs=num_epochs,
        shuffle=True if mode == tf.estimator.ModeKeys.TRAIN else False,
        shuffle_buffer_size=1+(batch_size*2),
        prefetch_buffer_size=1
    )
    dataset = dataset.map(_divide_func)
    return dataset
    

## Feature columnを作成する

In [204]:
def create_feature_columns(transformed_metadata):
    feature_columns = []
    
    column_schemas = transformed_metadata.schema.as_feature_spec()
    
    for feature_name in column_schemas:
        if feature_name==TARGET_FEATURE_NAME:
            continue
        column_schema = column_schemas[feature_name]
        
        if column_schema.dtype == tf.float32:
            feature_columns.append(tf.feature_column.numeric_column(feature_name))
        elif column_schema.dtype == tf.int64:

            feature_columns.append(
                tf.feature_column.categorical_column_with_identity(
                feature_name, num_buckets=5+1)
                                  )
    return feature_columns

#### 参考にしたコード
def create_feature_columns(transformed_metadata):
    feature_columns = []
    
    column_schemas = transformed_metadata.schema.column_schemas
    
    for feature_name in column_schemas:
        if feature_name==TARGET_FEATURE_NAME:
            cotinue
        column_schema = column_schemas[feature_name]
        
        if isinstance(column_schema._domain, dataset_schema.FloatDomain):
            feature_columns.append(tf.feature_column.numeric_column(feature_name))
        elif isinstance(column_schema._domain, dataset_schema.IntDomain):
            if column_schema._domain._is_categorical==True:
                feature_columns.append(
                    tf.feature_column.categorical_column_with_identity(
                    feature_name, num_buckets=column_schema._domain._max_value+1)
                                      )
            else:
                feature_columns.append(
                    tf.feature_column.numeric_columne(feature_name)
                )
    return feature_columns

## Estimator

In [205]:
def create_estimator(transformed_metadata, run_config):
    feature_cols = create_feature_columns(transformed_metadata)
    estimator = tf.estimator.LinearRegressor(feature_cols, config=run_config)
    return estimator

## Experiment

In [206]:
model_dir = 'tft_output_dir'
run_config = tf.estimator.RunConfig(
    tf_random_seed=1982983,
    model_dir=model_dir
)

In [207]:
train_data_files = './tfrecord/train_transformed-00000-of-00001'
eval_data_files = './tfrecord/test_transformed-00000-of-00001'

In [208]:
# TrainSpec
train_spec = tf.estimator.TrainSpec(
    input_fn=lambda: tfrecords_input_fn(
        train_data_files, transformed_metadata,
        mode=tf.estimator.ModeKeys.TRAIN,
        num_epochs=10,
        batch_size=256,
    ),
    max_steps=2000
)

# EvalSpec
eval_spec = tf.estimator.EvalSpec(
    input_fn=lambda: tfrecords_input_fn(
        eval_data_files, transformed_metadata
    ),
    steps=None,
    throttle_secs=10
)

In [209]:
from datetime import datetime

if tf.io.gfile.exists(model_dir):
    tf.io.gfile.rmtree(model_dir)

estimator = create_estimator(transformed_metadata, run_config)

tf.logging.set_verbosity(tf.logging.INFO)

time_start = datetime.utcnow() 
print("")
print("Experiment started at {}".format(time_start.strftime("%H:%M:%S")))
print(".......................................") 


tf.estimator.train_and_evaluate(
  estimator,
  train_spec,
  eval_spec
)


time_end = datetime.utcnow() 
print(".......................................")
print("Experiment finished at {}".format(time_end.strftime("%H:%M:%S")))
print("")
time_elapsed = time_end - time_start
print("Experiment elapsed time: {} seconds".format(time_elapsed.total_seconds()))

I0806 21:12:07.193453 4734801344 estimator.py:209] Using config: {'_model_dir': 'tft_output_dir', '_tf_random_seed': 1982983, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x13be83eb8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
I0806 21:12:07.197669 4734801344 estimator_training.py:186] Not using Distribute Coordinator.
I0806 2


Experiment started at 12:12:07
.......................................


I0806 21:12:09.330124 4734801344 estimator.py:1147] Done calling model_fn.
I0806 21:12:09.332545 4734801344 basic_session_run_hooks.py:541] Create CheckpointSaverHook.
I0806 21:12:09.621339 4734801344 monitored_session.py:240] Graph was finalized.
I0806 21:12:09.764506 4734801344 session_manager.py:500] Running local_init_op.
I0806 21:12:09.817604 4734801344 session_manager.py:502] Done running local_init_op.
I0806 21:12:11.483687 4734801344 basic_session_run_hooks.py:606] Saving checkpoints for 0 into tft_output_dir/model.ckpt.
I0806 21:12:12.358397 4734801344 basic_session_run_hooks.py:262] loss = 45861.266, step = 1
I0806 21:12:15.253170 4734801344 basic_session_run_hooks.py:692] global_step/sec: 34.494
I0806 21:12:15.254960 4734801344 basic_session_run_hooks.py:260] loss = 25182.527, step = 101 (2.897 sec)
I0806 21:12:15.867171 4734801344 basic_session_run_hooks.py:692] global_step/sec: 162.873
I0806 21:12:15.869142 4734801344 basic_session_run_hooks.py:260] loss = 22965.629, step 

.......................................
Experiment finished at 12:12:29

Experiment elapsed time: 22.331452 seconds


## Model Export
> tfma用にeval saved modelも作成する。

In [257]:
# Raw data metadata
NUMERIC_FEATURE_NAMES = ['pickuplon','pickuplat', 'dropofflon','dropofflat','passengers']
TARGET_FEATURE_NAME = 'fare_amount'
KEY_COLUMN = 'key'
def create_raw_metadata():  
    
    raw_data_schema = {}
    
    # key feature schema
    raw_data_schema[KEY_COLUMN]= dataset_schema.ColumnSchema(
        tf.float32, [], dataset_schema.FixedColumnRepresentation())
    
    # target feature schema
    raw_data_schema[TARGET_FEATURE_NAME]= dataset_schema.ColumnSchema(
        tf.float32, [], dataset_schema.FixedColumnRepresentation())
        
    # numerical features schema
    raw_data_schema.update({ column_name : dataset_schema.ColumnSchema(
        tf.float32, [], dataset_schema.FixedColumnRepresentation())
                            for column_name in NUMERIC_FEATURE_NAMES})
    
      # create dataset_metadata given raw_schema
    raw_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.Schema(raw_data_schema))
    
    return raw_metadata


In [211]:
raw_metaedata = create_raw_metadata()

In [212]:
a=raw_metaedata.schema

In [259]:
def serving_input_receiver_fn():
    from tensorflow_transform.saved import saved_transform_io
    
    raw_input_features = {
        name: tf.compat.v1.placeholder(tf.float32, [None]) for name in NUMERIC_FEATURE_NAMES + [KEY_COLUMN]
    }
    
    _, transformed_features = (
        saved_transform_io.partially_apply_saved_transform(
            os.path.join(TRANSFORM_ARTEFACTS_DIR, transform_fn_io.TRANSFORM_FN_DIR),
            raw_input_features
        )
    )
    return tf.estimator.export.ServingInputReceiver(transformed_features, raw_input_features)

In [260]:
export_dir = os.path.join(model_dir, 'export')

if tf.io.gfile.exists(export_dir):
    tf.io.gfile.rmtree(export_dir)

estimator.export_savedmodel(
    export_dir_base=export_dir,
    serving_input_receiver_fn=serving_input_receiver_fn
)

W0806 22:07:54.094624 4734801344 saved_transform_io.py:323] partially_apply_saved_transform is deprecated.  Use the transform_raw_features method of the TFTrandformOutput class instead.
I0806 22:07:54.173266 4734801344 saver.py:1499] Saver not created because there are no variables in the graph to restore
I0806 22:07:54.181468 4734801344 estimator.py:1145] Calling model_fn.
I0806 22:07:55.013038 4734801344 estimator.py:1147] Done calling model_fn.
I0806 22:07:55.015441 4734801344 export_utils.py:170] Signatures INCLUDED in export for Classify: None
I0806 22:07:55.019019 4734801344 export_utils.py:170] Signatures INCLUDED in export for Regress: None
I0806 22:07:55.022450 4734801344 export_utils.py:170] Signatures INCLUDED in export for Predict: ['predict']
I0806 22:07:55.023123 4734801344 export_utils.py:170] Signatures INCLUDED in export for Train: None
I0806 22:07:55.025068 4734801344 export_utils.py:170] Signatures INCLUDED in export for Eval: None
I0806 22:07:55.026631 4734801344 ex

b'tft_output_dir/export/1565096874'

In [265]:
saved_model_dir = 'tft_output_dir/export/1565096874'


def estimate_local(instance):
 
    predictor_fn = tf.contrib.predictor.from_saved_model(
        export_dir=saved_model_dir,
        signature_def_key="predict"
    )
    
    instance = dict((k, [v]) for k, v in instance.items())
    value = predictor_fn(instance)['predictions'][0][0]
    return value

instance = {
    'dropofflat': -73.987625,
    'dropofflon': 40.750617,
    'passengers': 0.0,
    'pickuplat': -73.971163,
    'pickuplon': -73.971163,
    'key': 5.0 
}

prediction = estimate_local(instance)
print(prediction)

I0806 22:09:06.668709 4734801344 saver.py:1280] Restoring parameters from tft_output_dir/export/1565096874/variables/variables


1.4761792


### Also export the EvalSavedModel



In [215]:
import tensorflow_model_analysis as tfma

In [276]:
def create_feature_eval_columns(transformed_metadata):
    feature_columns = []
    
    column_schemas = transformed_metadata.schema.as_feature_spec()
    
    for feature_name in column_schemas:
        column_schema = column_schemas[feature_name]
        
        if column_schema.dtype == tf.float32:
            feature_columns.append(tf.feature_column.numeric_column(feature_name))
        elif column_schema.dtype == tf.int64:
            feature_columns.append(
                tf.feature_column.categorical_column_with_identity(
                feature_name, num_buckets=5+1)
                                  )
    return feature_columns

In [277]:
feat_eval_col = create_feature_eval_columns(transformed_metadata)

In [278]:
feat_eval_col

[IdentityCategoricalColumn(key='dropofflat', number_buckets=6, default_value=None),
 IdentityCategoricalColumn(key='dropofflon', number_buckets=6, default_value=None),
 NumericColumn(key='fare_amount', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='key', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='passengers', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 IdentityCategoricalColumn(key='pickuplat', number_buckets=6, default_value=None),
 IdentityCategoricalColumn(key='pickuplon', number_buckets=6, default_value=None)]

In [279]:
# Create eval_input_fn

def eval_input_receiver_fn():
    serialized_tf_example = tf.compat.v1.placeholder(
      dtype=tf.string, shape=[None], name='input_example_placeholder')

    # This *must* be a dictionary containing a single key 'examples', which
    # points to the input placeholder.
    receiver_tensors = {'examples': serialized_tf_example}
    feat_eval_col = create_feature_eval_columns(transformed_metadata)
    feature_spec =  tf.feature_column.make_parse_example_spec(
      feat_eval_col)
    features = tf.io.parse_example(serialized_tf_example, feature_spec)

    return tfma.export.EvalInputReceiver(
    features=features,
    receiver_tensors=receiver_tensors,
    labels=features['fare_amount'])

In [291]:
# export the EvalSavedModel
eval_export_dir = os.path.join(model_dir, 'eval_export')

if tf.io.gfile.exists(eval_export_dir):
    tf.io.gfile.rmtree(eval_export_dir)

tfma.export.export_eval_savedmodel(
    estimator=estimator, export_dir_base=eval_export_dir, 
    eval_input_receiver_fn=eval_input_receiver_fn
)

I0807 18:47:43.959941 4734801344 estimator.py:1145] Calling model_fn.
I0807 18:47:45.255685 4734801344 estimator.py:1147] Done calling model_fn.
I0807 18:47:45.258794 4734801344 export_utils.py:170] Signatures INCLUDED in export for Classify: None
I0807 18:47:45.261008 4734801344 export_utils.py:170] Signatures INCLUDED in export for Regress: None
I0807 18:47:45.262856 4734801344 export_utils.py:170] Signatures INCLUDED in export for Predict: None
I0807 18:47:45.265178 4734801344 export_utils.py:170] Signatures INCLUDED in export for Train: None
I0807 18:47:45.266222 4734801344 export_utils.py:170] Signatures INCLUDED in export for Eval: ['eval']
W0807 18:47:45.268890 4734801344 export_utils.py:182] Export includes no default signature!
I0807 18:47:45.364635 4734801344 saver.py:1280] Restoring parameters from tft_output_dir/model.ckpt-304
I0807 18:47:45.443629 4734801344 builder_impl.py:661] Assets added to graph.
I0807 18:47:45.444389 4734801344 builder_impl.py:456] No assets to write

b'tft_output_dir/eval_export/1565171263'

## モデルを分析する

In [300]:
eval_shared_model = tfma.default_eval_shared_model(eval_saved_model_path='tft_output_dir/eval_export/1565171263'
)
eval_result = tfma.run_model_analysis(
    eval_shared_model=eval_shared_model,
    data_location='./tfrecord/test_transformed-00000-of-00001',
    file_format='tfrecords'
)
tf.logging.set_verbosity(tf.logging.INFO)
tfma.view.render_slicing_metrics(eval_result)

I0807 18:51:42.124160 4734801344 saver.py:1280] Restoring parameters from tft_output_dir/eval_export/1565171263/variables/variables


SlicingMetricsViewer(config={'weightedExamplesColumn': 'post_export_metrics/example_count'}, data=[{'slice': '…

何も出てこない