In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

# Report the interpreter and library versions this notebook was run with.
print(tf.__version__)
print(sys.version_info)
for lib in (mpl, np, pd, sklearn, tf, keras):
    print(lib.__name__, lib.__version__)

# Last expression of the cell, so its boolean result is shown as the cell output.
tf.test.is_gpu_available()

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


1.13.1
sys.version_info(major=3, minor=7, micro=4, releaselevel='final', serial=0)
matplotlib 3.1.2
numpy 1.18.1
pandas 0.25.3
sklearn 0.22.1
tensorflow 1.13.1
tensorflow._api.v1.keras 2.2.4-tf


False

In [3]:
# Read the Titanic train / eval CSV files with pandas.
train_file = "./data/titanic/train.csv"
eval_file = "./data/titanic/eval.csv"

train_df = pd.read_csv(train_file)
eval_df = pd.read_csv(eval_file)

# Preview the first rows of both frames, separated by a ruler line.
print(train_df.head(), "=" * 80, eval_df.head(), sep="\n")

   survived     sex   age  n_siblings_spouses  parch     fare  class     deck  \
0         0    male  22.0                   1      0   7.2500  Third  unknown   
1         1  female  38.0                   1      0  71.2833  First        C   
2         1  female  26.0                   0      0   7.9250  Third  unknown   
3         1  female  35.0                   1      0  53.1000  First        C   
4         0    male  28.0                   0      0   8.4583  Third  unknown   

   embark_town alone  
0  Southampton     n  
1    Cherbourg     n  
2  Southampton     y  
3  Southampton     n  
4   Queenstown     y  
   survived     sex   age  n_siblings_spouses  parch     fare   class  \
0         0    male  35.0                   0      0   8.0500   Third   
1         0    male  54.0                   0      0  51.8625   First   
2         1  female  58.0                   0      0  26.5500   First   
3         1  female  55.0                   0      0  16.0000  Second   
4         

In [4]:
# Split the "survived" label column off each frame.
# DataFrame.pop removes the column from the frame in place, so an unguarded
# pop makes this cell fail with KeyError when it is executed a second time.
# Guarding on the column's presence keeps the cell safely re-runnable: on a
# re-run the previously extracted y_train / y_eval are simply kept.
if "survived" in train_df.columns:
    y_train = train_df.pop("survived")
if "survived" in eval_df.columns:
    y_eval = eval_df.pop("survived")

# Preview features and labels for both splits, separated by ruler lines.
print(train_df.head())
print("="*80)
print(eval_df.head())
print("="*80)
print(y_train.head())
print("="*80)
print(y_eval.head())

      sex   age  n_siblings_spouses  parch     fare  class     deck  \
0    male  22.0                   1      0   7.2500  Third  unknown   
1  female  38.0                   1      0  71.2833  First        C   
2  female  26.0                   0      0   7.9250  Third  unknown   
3  female  35.0                   1      0  53.1000  First        C   
4    male  28.0                   0      0   8.4583  Third  unknown   

   embark_town alone  
0  Southampton     n  
1    Cherbourg     n  
2  Southampton     y  
3  Southampton     n  
4   Queenstown     y  
      sex   age  n_siblings_spouses  parch     fare   class     deck  \
0    male  35.0                   0      0   8.0500   Third  unknown   
1    male  54.0                   0      0  51.8625   First        E   
2  female  58.0                   0      0  26.5500   First        C   
3  female  55.0                   0      0  16.0000  Second  unknown   
4    male  34.0                   0      0  13.0000  Second        D   

  

In [5]:
# Categorical features: sex; number of siblings and spouses; 'parch': whether
# parents or children are aboard; 'class': upper / middle / lower cabin class;
# 'deck': in the hold or on deck; 'embark_town': port of departure;
# 'alone': whether the passenger travels alone.
categorical_columns = ['sex','n_siblings_spouses','parch','class','deck',
                       'embark_town','alone']

# Continuous features: 'age' and 'fare' (ticket price).
numeric_columns = ['age','fare']

feature_columns = []

# Categorical features: take the vocabulary from the distinct values seen in
# the training frame, print it, and wrap the vocabulary-list column in an
# indicator (one-hot) column.
for column_name in categorical_columns:
    vocab = train_df[column_name].unique()
    print(column_name, vocab)
    feature_columns.append(
        tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                column_name, vocab)))

# Continuous features: a numeric column only needs the feature key and dtype.
feature_columns.extend(
    tf.feature_column.numeric_column(column_name, dtype=tf.float32)
    for column_name in numeric_columns)

sex ['male' 'female']
n_siblings_spouses [1 0 3 4 2 5 8]
parch [0 1 2 5 3 4]
class ['Third' 'First' 'Second']
deck ['unknown' 'C' 'G' 'A' 'B' 'D' 'F' 'E']
embark_town ['Southampton' 'Cherbourg' 'Queenstown' 'unknown']
alone ['n' 'y']


In [7]:
# Helper that builds the input pipeline.
def make_dataset(data_df, label_df, epochs = 10,shuffle = True,batch_size = 32):
    """Build a tf.data pipeline from features and labels and return its next element.

    Args:
        data_df: feature table; it is converted with ``dict(data_df)`` before
            being sliced.
        label_df: labels, sliced together with the features.
        epochs: value passed to ``Dataset.repeat``.
        shuffle: when true, the dataset is shuffled with a buffer of 10000
            before repeating and batching.
        batch_size: value passed to ``Dataset.batch``.

    Returns:
        The result of ``get_next()`` on a one-shot iterator over the dataset.
    """
    features_and_labels = (dict(data_df), label_df)
    pipeline = tf.data.Dataset.from_tensor_slices(features_and_labels)
    if shuffle:
        pipeline = pipeline.shuffle(10000)
    pipeline = pipeline.repeat(epochs)
    pipeline = pipeline.batch(batch_size)
    iterator = pipeline.make_one_shot_iterator()
    return iterator.get_next()

In [11]:
# Directory that receives the estimator's checkpoints (used as model_dir).
output_dir = "customized_estimator"
# makedirs(exist_ok=True) replaces the exists()+mkdir() pair: it is a no-op when
# the directory is already there, has no check-then-create race, and also
# creates missing parent directories if output_dir is ever made a nested path.
os.makedirs(output_dir, exist_ok=True)

# Custom model_fn
def model_fn(features,labels,mode,params):
    """Build a dense feed-forward classifier for a custom tf.estimator.Estimator.

    Args:
        features: input features, fed through ``tf.feature_column.input_layer``.
        labels: target labels; only read when computing the loss and accuracy
            (i.e. not in PREDICT mode).
        mode: the run-time state of the model, one of
            ``tf.estimator.ModeKeys.{TRAIN, EVAL, PREDICT}``.
        params: dict with the keys ``"feature_columns"``, ``"hidden_units"``
            (an iterable of layer widths) and ``"n_classes"``.

    Returns:
        A ``tf.estimator.EstimatorSpec`` for the given mode:
        predictions for PREDICT, loss + eval metrics for EVAL, and
        loss + train_op for TRAIN. Any other mode falls off the end of the
        function and yields ``None``.
    """
    mode_keys = tf.estimator.ModeKeys

    # Input layer.
    net = tf.feature_column.input_layer(
        features, params["feature_columns"])
    # Hidden layers: one ReLU dense layer per entry in params["hidden_units"].
    for width in params["hidden_units"]:
        net = tf.layers.dense(net, units=width, activation=tf.nn.relu)
    # Output layer without activation.
    logits = tf.layers.dense(net, units=params["n_classes"], activation=None)
    predicted_classes = tf.argmax(logits,1)

    # A model_fn has to hand back a tf.estimator.EstimatorSpec object.
    if mode == mode_keys.PREDICT:
        # Prediction needs neither loss nor accuracy, only the predicted values.
        return tf.estimator.EstimatorSpec(
            mode,
            predictions={
                "class_ids": predicted_classes[:,tf.newaxis],
                "probabilities": tf.nn.softmax(logits),
                "logits": logits,
            })

    # Loss.
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
    # tf.metrics.accuracy accumulates results and averages them into an accuracy.
    accuracy = tf.metrics.accuracy(labels=labels,
                                   predictions=predicted_classes,
                                   name="acc_op")

    if mode == mode_keys.EVAL:
        return tf.estimator.EstimatorSpec(
            mode, loss=loss, eval_metric_ops={"accuracy": accuracy})

    # The train_op is only needed when training.
    train_op = tf.train.AdamOptimizer().minimize(
        loss, global_step=tf.train.get_global_step())
    if mode == mode_keys.TRAIN:
        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
    
# Wrap model_fn in an Estimator. The `params` dict supplies the keys that
# model_fn reads, and output_dir is used as the model directory.
estimator = tf.estimator.Estimator(
    model_fn=model_fn,
    model_dir=output_dir,
    params={
        "feature_columns": feature_columns,
        "hidden_units": [100, 100],
        "n_classes": 2,
    },
)
# Train on the training split for 100 epochs; as the last expression of the
# cell, the value returned by train() is shown as the cell output.
estimator.train(input_fn=lambda: make_dataset(train_df, y_train, epochs=100))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'customized_estimator', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x156de5390>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph w

<tensorflow_estimator.python.estimator.estimator.Estimator at 0x156ddb650>

In [12]:
# Evaluate on the eval split with a single pass over the data; the returned
# metrics dict is shown as the cell output.
estimator.evaluate(input_fn=lambda: make_dataset(eval_df, y_eval, epochs=1))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2020-01-09T09:10:39Z
INFO:tensorflow:Graph was finalized.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from customized_estimator/model.ckpt-1960
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2020-01-09-09:10:40
INFO:tensorflow:Saving dict for global step 1960: accuracy = 0.7878788, global_step = 1960, loss = 0.5543977
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 1960: customized_estimator/model.ckpt-1960


{'accuracy': 0.7878788, 'loss': 0.5543977, 'global_step': 1960}