In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

# Record the TensorFlow version, the interpreter version, and then the
# name/version of every library this notebook relies on.
print(tf.__version__, sys.version_info, sep="\n")
for module in (mpl, np, pd, sklearn, tf, keras):
    print(module.__name__, module.__version__)

2.6.2
sys.version_info(major=3, minor=6, micro=9, releaselevel='final', serial=0)
matplotlib 3.3.4
numpy 1.19.5
pandas 1.1.5
sklearn 0.24.2
tensorflow 2.6.2
keras.api._v2.keras 2.6.0


In [2]:
# Local copies of the Titanic CSVs. The originals are published at:
#   https://storage.googleapis.com/tf-datasets/titanic/train.csv
#   https://storage.googleapis.com/tf-datasets/titanic/eval.csv
train_file = "./data/titanic/train.csv"
eval_file = "./data/titanic/eval.csv"

# Read the training frame first, then the evaluation frame.
train_df, eval_df = (pd.read_csv(csv_path)
                     for csv_path in (train_file, eval_file))

In [3]:
# Split the 'survived' label off each feature frame.
# pop() removes the column from the frame in place, so running this cell a
# second time without a guard raises KeyError. With the guards, a re-run is a
# no-op that keeps the label Series extracted by the first run.
if 'survived' in train_df.columns:
    y_train = train_df.pop('survived')
if 'survived' in eval_df.columns:
    y_eval = eval_df.pop('survived')

In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric
# training columns.
train_df.describe()

Unnamed: 0,age,n_siblings_spouses,parch,fare
count,627.0,627.0,627.0,627.0
mean,29.631308,0.545455,0.379585,34.385399
std,12.511818,1.15109,0.792999,54.59773
min,0.75,0.0,0.0,0.0
25%,23.0,0.0,0.0,7.8958
50%,28.0,0.0,0.0,15.0458
75%,35.0,1.0,0.0,31.3875
max,80.0,8.0,5.0,512.3292


In [5]:
# Columns encoded as categories versus columns fed in as plain numbers.
categorical_columns = ['sex', 'n_siblings_spouses', 'parch', 'class',
                       'deck', 'embark_town', 'alone']
numeric_columns = ['age', 'fare']

feature_columns = []

# Categorical features: the vocabulary is the set of distinct values found in
# the training frame, wrapped in an indicator column.
for categorical_column in categorical_columns:
    vocab = train_df[categorical_column].unique()
    print(categorical_column, vocab)
    # categorical_column_with_vocabulary_list is described in the official
    # TensorFlow documentation.
    vocab_column = tf.feature_column.categorical_column_with_vocabulary_list(
        categorical_column, vocab)
    feature_columns.append(tf.feature_column.indicator_column(vocab_column))

# Numeric features: declared as float32 numeric columns.
for numeric_column in numeric_columns:
    print(numeric_column)
    feature_columns.append(
        tf.feature_column.numeric_column(numeric_column, dtype=tf.float32))

sex ['male' 'female']
n_siblings_spouses [1 0 3 4 2 5 8]
parch [0 1 2 5 3 4]
class ['Third' 'First' 'Second']
deck ['unknown' 'C' 'G' 'A' 'B' 'D' 'F' 'E']
embark_town ['Southampton' 'Cherbourg' 'Queenstown' 'unknown']
alone ['n' 'y']
age
fare


In [6]:
# Display the assembled list of feature column objects.
feature_columns

[IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='sex', vocabulary_list=('male', 'female'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='n_siblings_spouses', vocabulary_list=(1, 0, 3, 4, 2, 5, 8), dtype=tf.int64, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='parch', vocabulary_list=(0, 1, 2, 5, 3, 4), dtype=tf.int64, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='class', vocabulary_list=('Third', 'First', 'Second'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='deck', vocabulary_list=('unknown', 'C', 'G', 'A', 'B', 'D', 'F', 'E'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='embark_town',

In [7]:
# Confirm that the features are held in a pandas DataFrame.
type(train_df)

pandas.core.frame.DataFrame

In [8]:
# (rows, columns) of the training features after the label column was removed.
train_df.shape

(627, 9)

In [24]:
# (rows, columns) of the evaluation features after the label column was removed.
eval_df.shape

(264, 9)

In [9]:
# Confirm that the popped label column is a pandas Series.
type(y_train)

pandas.core.series.Series

In [10]:
# dict(train_df) maps each column name to its column, so every dataset element
# is a (features dict, label) pair with one scalar per column.
dataset = tf.data.Dataset.from_tensor_slices((dict(train_df), y_train))

# Print the first element to inspect its structure and dtypes.
for i in dataset.take(1):
    print(i)

({'sex': <tf.Tensor: shape=(), dtype=string, numpy=b'male'>, 'age': <tf.Tensor: shape=(), dtype=float64, numpy=22.0>, 'n_siblings_spouses': <tf.Tensor: shape=(), dtype=int64, numpy=1>, 'parch': <tf.Tensor: shape=(), dtype=int64, numpy=0>, 'fare': <tf.Tensor: shape=(), dtype=float64, numpy=7.25>, 'class': <tf.Tensor: shape=(), dtype=string, numpy=b'Third'>, 'deck': <tf.Tensor: shape=(), dtype=string, numpy=b'unknown'>, 'embark_town': <tf.Tensor: shape=(), dtype=string, numpy=b'Southampton'>, 'alone': <tf.Tensor: shape=(), dtype=string, numpy=b'n'>}, <tf.Tensor: shape=(), dtype=int64, numpy=0>)


In [11]:
def make_dataset(data_df, label_df, epochs = 10, shuffle = True,
                 batch_size = 32, shuffle_buffer_size = 10000):
    """Build a batched tf.data.Dataset of (features dict, label) pairs.

    Args:
        data_df: feature table; dict(data_df) must give a mapping from
            column name to column values (a pandas DataFrame in this notebook).
        label_df: labels aligned row-for-row with data_df (a pandas Series
            in this notebook).
        epochs: number of passes over the data in the returned dataset.
        shuffle: whether to shuffle the rows before repeating.
        batch_size: number of rows per batch.
        shuffle_buffer_size: buffer size passed to Dataset.shuffle. The
            default of 10000 matches the previously hard-coded value; a
            buffer smaller than the dataset only shuffles approximately.

    Returns:
        A tf.data.Dataset that yields (features dict, labels) batches.
    """
    dataset = tf.data.Dataset.from_tensor_slices(
        (dict(data_df), label_df))
    if shuffle:
        dataset = dataset.shuffle(shuffle_buffer_size)
    # Repeat first, then batch: batches are cut from the repeated stream, so a
    # batch may span the boundary between two epochs.
    dataset = dataset.repeat(epochs).batch(batch_size)
    return dataset

In [12]:
linear_output_dir = 'linear_model'
# A single makedirs(exist_ok=True) call replaces the exists()/mkdir() pair:
# there is no gap between the check and the creation, and it still works if
# the path is later given parent directories.
os.makedirs(linear_output_dir, exist_ok=True)

# Linear classifier model.
linear_estimator = tf.estimator.LinearClassifier(
    model_dir = linear_output_dir,
    n_classes = 2,
    # Pass in the feature_columns defined earlier.
    feature_columns = feature_columns)

# input_fn is a zero-argument callable, so the dataset is built when the
# estimator calls it. train() is the last expression of the cell, so the
# estimator it returns is displayed.
linear_estimator.train(input_fn = lambda : make_dataset(
    train_df, y_train, epochs = 100))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'linear_model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized 



Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 0...
INFO:tensorflow:Saving checkpoints for 0 into linear_model/model.ckpt.
INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 0...
INFO:tensorflow:loss = 0.6931472, step = 0
INFO:tensorflow:global_step/sec: 308.604
INFO:tensorflow:loss = 0.63881236, step = 100 (0.325 sec)
INFO:tensorflow:global_step/sec: 477.084
INFO:tensorflow:loss = 0.32474536, step = 200 (0.209 sec)
INFO:tensorflow:global_step/sec: 444.852
INFO:tensorflow:loss = 0.3958012, step = 300 (0.225 sec)
INFO:tensorflow:global_step/sec: 444.707
INFO:tensorflow:loss = 0.514357, step = 400 (0.224 sec)
INFO:tensorflow:global_

<tensorflow_estimator.python.estimator.canned.linear.LinearClassifierV2 at 0x7fc2cd475860>

In [13]:
# List the names of all variables held by the trained linear estimator
# (model weights plus the Ftrl optimizer's state).
linear_estimator.get_variable_names()

['global_step',
 'linear/linear_model/age/weights',
 'linear/linear_model/alone_indicator/weights',
 'linear/linear_model/bias_weights',
 'linear/linear_model/class_indicator/weights',
 'linear/linear_model/deck_indicator/weights',
 'linear/linear_model/embark_town_indicator/weights',
 'linear/linear_model/fare/weights',
 'linear/linear_model/n_siblings_spouses_indicator/weights',
 'linear/linear_model/parch_indicator/weights',
 'linear/linear_model/sex_indicator/weights',
 'training/Ftrl/beta',
 'training/Ftrl/decay',
 'training/Ftrl/l1_regularization_strength',
 'training/Ftrl/l2_regularization_strength',
 'training/Ftrl/learning_rate',
 'training/Ftrl/learning_rate_power',
 'training/Ftrl/linear/linear_model/age/weights/accumulator',
 'training/Ftrl/linear/linear_model/age/weights/linear',
 'training/Ftrl/linear/linear_model/alone_indicator/weights/accumulator',
 'training/Ftrl/linear/linear_model/alone_indicator/weights/linear',
 'training/Ftrl/linear/linear_model/bias_weights/accu

In [14]:
# Read the values stored for the six 'parch' vocabulary entries.
# NOTE(review): this name lives under 'training/Ftrl/', i.e. it is optimizer
# state; the model weight itself is listed as
# 'linear/linear_model/parch_indicator/weights' -- confirm which was intended.
linear_estimator.get_variable_value('training/Ftrl/linear/linear_model/parch_indicator/weights/linear')

array([[-1.5075307],
       [-3.0258067],
       [-1.3623335],
       [ 1.2499554],
       [-0.9221774],
       [ 2.1498895]], dtype=float32)

In [15]:
# Read the Ftrl 'accumulator' state kept for the 'sex' indicator weights
# (one row per vocabulary entry).
linear_estimator.get_variable_value('training/Ftrl/linear/linear_model/sex_indicator/weights/accumulator')

array([[7.6933193],
       [3.9346614]], dtype=float32)

In [16]:
# List the files (checkpoints, graph, event log) written to the model directory.
!ls -l linear_model

total 3028
-rw-rw-r-- 1 wp wp     130 May  4 21:35 checkpoint
-rw-rw-r-- 1 wp wp 1367038 May  4 21:35 events.out.tfevents.1651671296.ubuntu
-rw-rw-r-- 1 wp wp  949921 May  4 21:34 graph.pbtxt
-rw-rw-r-- 1 wp wp     452 May  4 21:34 model.ckpt-0.data-00000-of-00001
-rw-rw-r-- 1 wp wp    1749 May  4 21:34 model.ckpt-0.index
-rw-rw-r-- 1 wp wp  379665 May  4 21:34 model.ckpt-0.meta
-rw-rw-r-- 1 wp wp     452 May  4 21:35 model.ckpt-1960.data-00000-of-00001
-rw-rw-r-- 1 wp wp    1749 May  4 21:35 model.ckpt-1960.index
-rw-rw-r-- 1 wp wp  379665 May  4 21:35 model.ckpt-1960.meta


In [None]:
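# Optional cleanup, left disabled on purpose: uncomment to delete the
# linear_model directory and the checkpoints stored in it.
# !rm -rf linear_model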

In [17]:
# Score the linear model on the evaluation split: one pass over the data,
# rows kept in their original order.
linear_estimator.evaluate(
    input_fn = lambda : make_dataset(
        eval_df, y_eval, epochs = 1, shuffle = False))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2022-05-04T21:37:14
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from linear_model/model.ckpt-1960
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Inference Time : 0.71483s
INFO:tensorflow:Finished evaluation at 2022-05-04-21:37:15
INFO:tensorflow:Saving dict for global step 1960: accuracy = 0.7878788, accuracy_baseline = 0.625, auc = 0.83841443, auc_precision_recall = 0.7826154, average_loss = 0.46759558, global_step = 1960, label/mean = 0.375, loss = 0.45158345, precision = 0.7171717, prediction/mean = 0.38183612, recall = 0.7171717
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 1960: linear_model/model.ckpt-1960


{'accuracy': 0.7878788,
 'accuracy_baseline': 0.625,
 'auc': 0.83841443,
 'auc_precision_recall': 0.7826154,
 'average_loss': 0.46759558,
 'label/mean': 0.375,
 'loss': 0.45158345,
 'precision': 0.7171717,
 'prediction/mean': 0.38183612,
 'recall': 0.7171717,
 'global_step': 1960}

In [19]:
# List the contents of the working directory.
!ls

data			       tf02_premade_estimators.ipynb
linear_model		       tf03_premade_estimators-new_feature.ipynb
tf01_keras_to_estimator.ipynb


In [20]:
# From here on, use the DNN estimator.
dnn_output_dir = './dnn_model'
# A single makedirs(exist_ok=True) call replaces the exists()/mkdir() pair:
# there is no gap between the check and the creation.
os.makedirs(dnn_output_dir, exist_ok=True)

# Create the DNN estimator.
dnn_estimator = tf.estimator.DNNClassifier(
    model_dir = dnn_output_dir,
    n_classes = 2,
    feature_columns=feature_columns,
    # Three hidden layers of 128 units each (the earlier comment said two).
    hidden_units = [128, 128,128],
    # Activation function.
    activation_fn = tf.nn.relu,
    # LinearClassifier takes this argument as well; it was simply left at its
    # default there.
    optimizer = 'Adam')

# Start training. train() is the last expression of the cell, so the
# estimator it returns is displayed.
dnn_estimator.train(input_fn = lambda : make_dataset(
    train_df, y_train, epochs = 100))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './dnn_model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorfl

<tensorflow_estimator.python.estimator.canned.dnn.DNNClassifierV2 at 0x7fc3a8f657f0>

In [21]:
# List the names of all variables held by the trained DNN estimator
# (layer kernels/biases plus the Adam optimizer's state).
dnn_estimator.get_variable_names()

['dnn/hiddenlayer_0/bias',
 'dnn/hiddenlayer_0/kernel',
 'dnn/hiddenlayer_1/bias',
 'dnn/hiddenlayer_1/kernel',
 'dnn/hiddenlayer_2/bias',
 'dnn/hiddenlayer_2/kernel',
 'dnn/logits/bias',
 'dnn/logits/kernel',
 'global_step',
 'training/Adam/beta_1',
 'training/Adam/beta_2',
 'training/Adam/decay',
 'training/Adam/dnn/hiddenlayer_0/bias/m',
 'training/Adam/dnn/hiddenlayer_0/bias/v',
 'training/Adam/dnn/hiddenlayer_0/kernel/m',
 'training/Adam/dnn/hiddenlayer_0/kernel/v',
 'training/Adam/dnn/hiddenlayer_1/bias/m',
 'training/Adam/dnn/hiddenlayer_1/bias/v',
 'training/Adam/dnn/hiddenlayer_1/kernel/m',
 'training/Adam/dnn/hiddenlayer_1/kernel/v',
 'training/Adam/dnn/hiddenlayer_2/bias/m',
 'training/Adam/dnn/hiddenlayer_2/bias/v',
 'training/Adam/dnn/hiddenlayer_2/kernel/m',
 'training/Adam/dnn/hiddenlayer_2/kernel/v',
 'training/Adam/dnn/logits/bias/m',
 'training/Adam/dnn/logits/bias/v',
 'training/Adam/dnn/logits/kernel/m',
 'training/Adam/dnn/logits/kernel/v',
 'training/Adam/learning

In [22]:
# Shape of the Adam 'm' state kept for the first hidden layer's kernel.
# First dimension: input width (32 indicator slots from the categorical
# vocabularies + 2 numeric features); second dimension: the 128 hidden units.
dnn_estimator.get_variable_value('training/Adam/dnn/hiddenlayer_0/kernel/m').shape

(34, 128)

In [23]:
# Evaluate the DNN on the evaluation split: one pass over the data, rows kept
# in their original order.
dnn_estimator.evaluate(
    input_fn = lambda : make_dataset(
        eval_df, y_eval, epochs = 1, shuffle = False))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2022-05-04T21:39:50
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./dnn_model/model.ckpt-1960
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Inference Time : 0.92552s
INFO:tensorflow:Finished evaluation at 2022-05-04-21:39:51
INFO:tensorflow:Saving dict for global step 1960: accuracy = 0.79545456, accuracy_baseline = 0.625, auc = 0.8480869, auc_precision_recall = 0.8050849, average_loss = 0.5118347, global_step = 1960, label/mean = 0.375, loss = 0.48585257, precision = 0.7227723, prediction/mean = 0.376497, recall = 0.7373737
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 1960: ./dnn_model/model.ckpt-1960


{'accuracy': 0.79545456,
 'accuracy_baseline': 0.625,
 'auc': 0.8480869,
 'auc_precision_recall': 0.8050849,
 'average_loss': 0.5118347,
 'label/mean': 0.375,
 'loss': 0.48585257,
 'precision': 0.7227723,
 'prediction/mean': 0.376497,
 'recall': 0.7373737,
 'global_step': 1960}