In [12]:
from IPython.core.interactiveshell import InteractiveShell
# Render the value of every top-level expression in a cell, not only the last
# one. Later cells rely on this to show several results from a single cell.
InteractiveShell.ast_node_interactivity = "all" 
import numpy as np
import os
import gzip

import tensorflow as tf
from tensorflow import keras
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

In [None]:
# Locations of the separate training and validation CSV files.
train_file = './Titanic.train.csv'
eval_file = './Titanic.valid.csv'

# NOTE(review): train_test_split is not used in the cells visible here;
# kept because later cells may depend on it.
from sklearn.model_selection import train_test_split

# Read the training file first, then the validation file, into DataFrames.
train_df, eval_df = (
    pd.read_csv(csv_path) for csv_path in (train_file, eval_file))

In [None]:
# Remove the PassengerId and Name columns from both splits. DataFrame.pop
# mutates the frame in place and returns the removed column, so each of these
# four expressions is also rendered as cell output.
# NOTE: because the frames are mutated in place, re-running this cell without
# reloading the CSVs raises KeyError (the columns are already gone).
train_df.pop('PassengerId')
eval_df.pop('PassengerId')
train_df.pop('Name')
eval_df.pop('Name')
# Detach the 'Survived' column from each frame to use as the label series;
# after this the frames no longer contain it.
y_train = train_df.pop('Survived')
y_eval = eval_df.pop('Survived')

In [None]:
# Column names grouped by how they are encoded for the estimators.
categorical_columns = ['Sex', 'SibSp', 'Parch', 'Pclass', 'Cabin', 'Embarked', 'Ticket']
numeric_columns = ['Age', 'Fare']
feature_columns = []

# Discrete features: the vocabulary of each column is the set of distinct
# values observed in the training frame.
# tf.feature_column calls can be nested, similar to a pipeline: the vocabulary
# column is wrapped in an indicator (one-hot style) column.
for categorical_column in categorical_columns:
    vocab = train_df[categorical_column].unique()
    print(categorical_column, vocab)
    vocab_column = tf.feature_column.categorical_column_with_vocabulary_list(
        categorical_column, vocab)
    feature_columns.append(tf.feature_column.indicator_column(vocab_column))

# Continuous features are passed through as float32 numeric columns.
for numeric_column in numeric_columns:
    numeric_feature = tf.feature_column.numeric_column(
        numeric_column, dtype=tf.float32)
    feature_columns.append(numeric_feature)

# Cross feature: the Cartesian product of two discrete features.
#   age = [1, 2, 3, 4, 5]  gender = [male, female]
#   age_x_gender = [(1,male), (2,male), ..., (5,male), ..., (5,female)]
# hash_bucket_size = 100 hashes the high-dimensional, sparse cross down to
# 100 buckets (e.g. 100 ages x 100 genders = 100*100 combinations --> 100).
# A linear model can consume the crossed column directly; a DNN needs it
# converted, hence the indicator_column wrapper.
# NOTE(review): 'Age' is also treated as a continuous float feature above,
# while crossed_column is documented for string/integer-typed features.
# Confirm the cross behaves as intended (bucketizing Age first may be better).
age_x_sex = tf.feature_column.crossed_column(
    ['Age', 'Sex'], hash_bucket_size = 100)
feature_columns.append(tf.feature_column.indicator_column(age_x_sex))

In [None]:
# 生成dataset
def make_dataset(data_df, label_df, epochs=10, shuffle=True,batch_size=32):
    """Build a batched tf.data.Dataset of (features, label) pairs.

    Args:
        data_df: Feature table; it is converted with ``dict(data_df)`` so that
            each column becomes one named feature.
        label_df: Labels, sliced in step with the rows of ``data_df``.
        epochs: Number of times the data is repeated.
        shuffle: When true, shuffle the elements with a buffer of 10000
            before repeating.
        batch_size: Number of elements per emitted batch.

    Returns:
        The dataset after optional shuffling, repeating and batching.
    """
    # from_tensor_slices needs a dict of columns rather than a DataFrame.
    features = dict(data_df)
    ds = tf.data.Dataset.from_tensor_slices((features, label_df))
    if shuffle:
        ds = ds.shuffle(10000)
    ds = ds.repeat(epochs)
    return ds.batch(batch_size)

In [None]:
# Directory where the baseline estimator stores its checkpoints and summaries.
output_dir = './baseline_model_new_features'
if not os.path.exists(output_dir):
    os.mkdir(output_dir)


def baseline_train_input_fn():
    """Input function for baseline training: the training split, 100 epochs."""
    return make_dataset(train_df, y_train, epochs = 100)


# The baseline classifier is built without any feature columns; it serves as
# a reference point for the models trained later.
baseline_estimator = tf.estimator.BaselineClassifier(
    n_classes = 2, model_dir = output_dir)
baseline_estimator.train(input_fn = baseline_train_input_fn)

In [None]:
def baseline_eval_input_fn():
    """Input function for evaluation: one unshuffled pass in batches of 20."""
    return make_dataset(
        eval_df, y_eval, epochs=1, shuffle = False, batch_size = 20)


# Evaluate the baseline on the validation split; the metrics are the cell output.
baseline_estimator.evaluate(input_fn = baseline_eval_input_fn)

In [None]:
# Directory where the linear estimator stores its checkpoints and summaries.
linear_output_dir = './linear_model_new_features'
# Ensure this model's own directory exists.
if not os.path.exists(linear_output_dir):
    os.mkdir(linear_output_dir)

linear_estimator = tf.estimator.LinearClassifier(
    model_dir = linear_output_dir, n_classes = 2,
    feature_columns = feature_columns)
# No optimizer is passed, so the estimator's default is used (documented as
# FTRL for LinearClassifier). Training progress can be inspected in
# TensorBoard by pointing it at model_dir.
linear_estimator.train(input_fn = lambda : make_dataset(
    train_df, y_train, epochs = 100))

# Evaluate on the validation split: one unshuffled pass in batches of 20.
linear_estimator.evaluate(input_fn = lambda : make_dataset(
    eval_df, y_eval, epochs=1, shuffle = False, batch_size = 20))

In [None]:
# Directory where the DNN estimator stores its checkpoints and summaries.
# It must differ from the linear model's directory: an estimator restores
# from whatever checkpoint already exists in its model_dir, and the two
# models do not share a variable layout.
dnn_output_dir = './dnn_model_new_features'
# Ensure this model's own directory exists.
if not os.path.exists(dnn_output_dir):
    os.mkdir(dnn_output_dir)

# Two hidden layers of 128 units each with ReLU activations, trained with Adam.
dnn_estimator = tf.estimator.DNNClassifier(
    model_dir = dnn_output_dir, n_classes = 2,
    feature_columns = feature_columns,
    hidden_units = [128, 128],
    activation_fn = tf.nn.relu, 
    optimizer = 'Adam')

# Train on the training split for 100 epochs (shuffled, default batch size).
dnn_estimator.train(input_fn = lambda : make_dataset(
    train_df, y_train, epochs = 100))

# Evaluate on the validation split: one unshuffled pass in batches of 20.
dnn_estimator.evaluate(input_fn = lambda : make_dataset(
    eval_df, y_eval, epochs=1, shuffle = False, batch_size = 20))