In [4]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

# Record the interpreter and library versions this notebook was run with,
# so the outputs below can be reproduced.
print(tf.__version__, sys.version_info, sep="\n")
for lib in (mpl, np, pd, sklearn, tf, keras):
    print(lib.__name__, lib.__version__)

# Last expression of the cell: shows whether TensorFlow reports a usable GPU.
tf.test.is_gpu_available()

2.0.0-dev20191002
sys.version_info(major=3, minor=7, micro=4, releaselevel='final', serial=0)
matplotlib 3.1.1
numpy 1.17.2
pandas 0.25.1
sklearn 0.21.3
tensorflow 2.0.0-dev20191002
tensorflow_core.keras 2.2.4-tf


False

In [5]:
# Load the Titanic training and evaluation splits from CSV with pandas.
train_file = "./data/titanic/train.csv"
eval_file = "./data/titanic/eval.csv"

train_df = pd.read_csv(train_file)
eval_df = pd.read_csv(eval_file)

# Preview the first rows of each split, separated by a ruler line.
print(train_df.head(), "=" * 80, eval_df.head(), sep="\n")

   survived     sex   age  n_siblings_spouses  parch     fare  class     deck  \
0         0    male  22.0                   1      0   7.2500  Third  unknown   
1         1  female  38.0                   1      0  71.2833  First        C   
2         1  female  26.0                   0      0   7.9250  Third  unknown   
3         1  female  35.0                   1      0  53.1000  First        C   
4         0    male  28.0                   0      0   8.4583  Third  unknown   

   embark_town alone  
0  Southampton     n  
1    Cherbourg     n  
2  Southampton     y  
3  Southampton     n  
4   Queenstown     y  
   survived     sex   age  n_siblings_spouses  parch     fare   class  \
0         0    male  35.0                   0      0   8.0500   Third   
1         0    male  54.0                   0      0  51.8625   First   
2         1  female  58.0                   0      0  26.5500   First   
3         1  female  55.0                   0      0  16.0000  Second   
4         

In [6]:
# Split the "survived" label off each frame. pop() removes the column in
# place, so train_df / eval_df hold only feature columns afterwards (and
# re-running this cell on the same frames raises KeyError).
y_train = train_df.pop("survived")
y_eval = eval_df.pop("survived")

# Preview features and labels for both splits, separated by ruler lines.
ruler = "\n" + "=" * 80 + "\n"
print(train_df.head(), eval_df.head(), y_train.head(), y_eval.head(), sep=ruler)

      sex   age  n_siblings_spouses  parch     fare  class     deck  \
0    male  22.0                   1      0   7.2500  Third  unknown   
1  female  38.0                   1      0  71.2833  First        C   
2  female  26.0                   0      0   7.9250  Third  unknown   
3  female  35.0                   1      0  53.1000  First        C   
4    male  28.0                   0      0   8.4583  Third  unknown   

   embark_town alone  
0  Southampton     n  
1    Cherbourg     n  
2  Southampton     y  
3  Southampton     n  
4   Queenstown     y  
      sex   age  n_siblings_spouses  parch     fare   class     deck  \
0    male  35.0                   0      0   8.0500   Third  unknown   
1    male  54.0                   0      0  51.8625   First        E   
2  female  58.0                   0      0  26.5500   First        C   
3  female  55.0                   0      0  16.0000  Second  unknown   
4    male  34.0                   0      0  13.0000  Second        D   

  

In [7]:
# Categorical features (see the vocabularies printed by the loop below):
#   sex, n_siblings_spouses (number of siblings/spouses), parch (integer
#   count, presumably parents/children aboard), class (First/Second/Third),
#   deck (deck letter or 'unknown'), embark_town (port of embarkation),
#   alone ('y'/'n', whether the passenger travelled alone).
categorical_columns = ['sex', 'n_siblings_spouses', 'parch', 'class', 'deck',
                       'embark_town', 'alone']

# Continuous features: passenger age and ticket fare.
numeric_columns = ['age', 'fare']

feature_columns = []

# Categorical features: build a vocabulary from the values seen in the
# training split, then wrap the vocabulary column in an indicator column so
# it is fed to the model as a one-hot vector.
for column_name in categorical_columns:
    vocabulary = train_df[column_name].unique()
    print(column_name, vocabulary)  # show the vocabulary found for this column
    feature_columns.append(
        tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                column_name, vocabulary)))

# Continuous features: a numeric column only needs the feature key and dtype.
feature_columns += [
    tf.feature_column.numeric_column(column_name, dtype=tf.float32)
    for column_name in numeric_columns
]

# Crossed feature: pairs the values of several features into one feature, e.g.
#   age: [1,2,3,4,5], gender: [male,female]
#   age_x_gender: [(1,male),(2,male),...,(5,male),(1,female),...,(5,female)]
# hash_bucket_size bounds the dimensionality of the otherwise very sparse
# cross: each crossed value is hashed and taken modulo 100, so the one-hot
# vector has 100 slots no matter how many distinct (age, sex) pairs occur
# (distinct pairs may share a bucket).
# NOTE(review): 'age' is a float column; the original author's note assumes
# ages are effectively grouped into whole years before crossing -- confirm
# how crossed_column treats float inputs.
crossed_column = tf.feature_column.indicator_column(
    tf.feature_column.crossed_column(['age', 'sex'], hash_bucket_size=100))
feature_columns.append(crossed_column)

sex ['male' 'female']
n_siblings_spouses [1 0 3 4 2 5 8]
parch [0 1 2 5 3 4]
class ['Third' 'First' 'Second']
deck ['unknown' 'C' 'G' 'A' 'B' 'D' 'F' 'E']
embark_town ['Southampton' 'Cherbourg' 'Queenstown' 'unknown']
alone ['n' 'y']


In [None]:
# Build the tf.data input pipeline used for both training and evaluation.
def make_dataset(data_df, label_df, epochs = 10,shuffle = True,batch_size = 32,
                 shuffle_buffer_size = 10000):
    """Turn a feature frame and its labels into a batched tf.data.Dataset.

    Args:
        data_df: Feature table; converted with dict(data_df), so it is
            expected to be a pandas DataFrame (column name -> column values).
        label_df: Labels aligned row-by-row with data_df.
        epochs: Number of passes over the data (passed to Dataset.repeat).
        shuffle: Whether to shuffle the examples before repeating.
        batch_size: Number of examples per batch.
        shuffle_buffer_size: Size of the shuffle buffer. Defaults to the
            previously hard-coded 10000; use a value at least as large as the
            dataset for a full shuffle.

    Returns:
        A tf.data.Dataset yielding (features_dict, labels) batches.
    """
    dataset = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
    if shuffle:
        dataset = dataset.shuffle(shuffle_buffer_size)
    # repeat() comes before batch(), so a batch may straddle two passes.
    return dataset.repeat(epochs).batch(batch_size)

In [None]:
# Directory handed to the estimator as its model_dir.
output_dir = "baseline_model"
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it has no
# check-then-create race and also works if output_dir becomes a nested path.
os.makedirs(output_dir, exist_ok=True)

# Baseline estimator for binary classification (n_classes=2); it serves as
# the reference point for the feature-based models.
baseline_estimator = tf.estimator.BaselineClassifier(
    model_dir = output_dir,
    n_classes=2)

# Train on the training split; input_fn must be a zero-argument callable,
# hence the lambda around make_dataset.
baseline_estimator.train(input_fn = lambda : make_dataset(train_df,y_train,epochs=100))

In [None]:
# Evaluate the baseline on the evaluation split: one pass, no shuffling,
# batches of 20. The returned metrics are displayed as the cell's output.
baseline_estimator.evaluate(
    input_fn=lambda: make_dataset(
        data_df=eval_df,
        label_df=y_eval,
        epochs=1,
        shuffle=False,
        batch_size=20,
    )
)