### 导入工具包

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import tensorflow as tf

### 设定数据列名

In [12]:
# Name of the label column, kept as a one-element list so it can be
# concatenated with the feature-name lists below.
LABEL_COLUMN = ["clicked"]
# Continuous features are named I1 .. I13 (inclusive).
CONTINUOUS_COLUMNS = ["I%d" % idx for idx in range(1, 14)]
# Categorical features are named C1 .. C26 (inclusive).
CATEGORICAL_COLUMNS = ["C%d" % idx for idx in range(1, 27)]

# Full column order: label first, then continuous, then categorical features.
TRAIN_DATA_COLUMNS = LABEL_COLUMN + CONTINUOUS_COLUMNS + CATEGORICAL_COLUMNS

### 定义输入数据处理函数

In [13]:
BATCH_SIZE = 400


def generate_input_fn(filename, batch_size=BATCH_SIZE):
    """Build an input function that reads batches of CSV rows from a file.

    Args:
        filename: Path of the CSV file to read.
        batch_size: Maximum number of lines read from the file per batch.

    Returns:
        A zero-argument callable. When invoked it returns a tuple
        ``(features, labels)``: ``features`` is a dict mapping feature column
        names to tensors, and ``labels`` is the tensor of the label column.
    """
    def _input_fn():
        # Queue the single input file and read up to batch_size lines from it.
        file_queue = tf.train.string_input_producer([filename])
        line_reader = tf.TextLineReader()
        _, lines = line_reader.read_up_to(file_queue, num_records=batch_size)

        # Default values substituted for missing fields, in column order:
        # 1 int label (the first column), 13 ints, 26 strings.
        defaults = [[0]] + [[0] for _ in range(13)] + [[" "] for _ in range(26)]

        # decode_csv does NOT return a dict, so the decoded columns are
        # zipped with the header names to map column name -> tensor.
        parsed = tf.decode_csv(lines, record_defaults=defaults)
        features = dict(zip(TRAIN_DATA_COLUMNS, parsed))

        # Popping the label leaves only the feature columns in the dict.
        labels = features.pop(LABEL_COLUMN[0])

        # Sparse categorical features must be represented with an additional
        # dimension; continuous columns are used unaltered.
        # See the docs for tf.SparseTensor for more info.
        expanded = {
            column_name: tf.expand_dims(features[column_name], -1)
            for column_name in CATEGORICAL_COLUMNS
        }
        features.update(expanded)

        return features, labels

    return _input_fn


print('input function configured')

input function configured


### 建立输入到DNN_Linear模型的特征

In [14]:
# Sparse base columns: each categorical feature is hash-encoded into a
# sparse column. These columns are the features of the linear (wide) model.
wide_columns = [
    tf.contrib.layers.sparse_column_with_hash_bucket(
        column_name, hash_bucket_size=1000)
    for column_name in CATEGORICAL_COLUMNS
]

print('Wide/Sparse columns configured')

Wide/Sparse columns configured


In [15]:
# Continuous base columns: continuous features are used with their real
# values. These columns are the features of the DNN (deep) model.
deep_columns = [
    tf.contrib.layers.real_valued_column(column_name)
    for column_name in CONTINUOUS_COLUMNS
]

print('deep/continuous columns configured')

deep/continuous columns configured


In [16]:
# Embeddings for wide columns into deep columns: every sparse categorical
# column is embedded into 8 dimensions so it can be fed to the DNN model.
embedding_columns = [
    tf.contrib.layers.embedding_column(col, dimension=8)
    for col in wide_columns
]

# Rebuild the deep feature list (continuous columns + embeddings) instead of
# appending to the existing list. This keeps the cell idempotent: running it
# again does not add a second copy of every embedding column.
deep_columns = [
    tf.contrib.layers.real_valued_column(name) for name in CONTINUOUS_COLUMNS
] + embedding_columns

print('wide and deep columns configured')

wide and deep columns configured


### 建立模型
* **Wide & Deep**: Combined Linear and Deep Classifier

In [21]:
def create_model_dir(model_type):
    """Return a timestamped model directory path for ``model_type``.

    The result has the form ``models/model_<model_type>_<unix seconds>``,
    for example ``models/model_WIDE_AND_DEEP_1493043407``.
    """
    timestamp = str(int(time.time()))
    return '_'.join(['models/model', model_type, timestamp])

# Specify the desired model_dir
def get_model(model_type, model_dir):
    """Build the estimator selected by ``model_type``.

    Args:
        model_type: One of ``'WIDE'`` (linear classifier), ``'DEEP'`` (DNN
            classifier) or ``'WIDE_AND_DEEP'`` (combined linear + DNN
            classifier).
        model_dir: Directory passed to the estimator as its ``model_dir``.

    Returns:
        The constructed ``tf.contrib.learn`` estimator.

    Raises:
        ValueError: If ``model_type`` is not one of the supported values.
    """
    print("Model directory = %s" % model_dir)

    # Fail loudly on an unknown type instead of returning None and reporting
    # 'estimator built' for a model that was never created.
    supported_types = ('WIDE', 'DEEP', 'WIDE_AND_DEEP')
    if model_type not in supported_types:
        raise ValueError(
            "Unknown model_type %r; expected one of %s"
            % (model_type, ", ".join(supported_types)))

    # There are more options here than shown here.
    # We are using this to show additional checkpointing for illustrative purposes.
    # In a real system with far more samples, you would
    #     likely choose to save checkpoints less frequently.
    runconfig = tf.contrib.learn.RunConfig(
        save_checkpoints_secs=None,
        save_checkpoints_steps=100,
    )

    # The same run configuration is given to every estimator type so the
    # checkpointing behaviour does not depend on which model is selected.
    if model_type == 'WIDE':
        # Linear Classifier
        m = tf.contrib.learn.LinearClassifier(
            model_dir=model_dir,
            feature_columns=wide_columns,
            config=runconfig)
    elif model_type == 'DEEP':
        # Deep Neural Net Classifier
        m = tf.contrib.learn.DNNClassifier(
            model_dir=model_dir,
            feature_columns=deep_columns,
            hidden_units=[100, 50, 25],
            config=runconfig)
    else:
        # Combined Linear and Deep Classifier
        # NOTE(review): the results note at the end of the notebook cites
        # dnn_hidden_units=[100, 70, 50, 25]; confirm whether 700 is intended.
        m = tf.contrib.learn.DNNLinearCombinedClassifier(
            model_dir=model_dir,
            linear_feature_columns=wide_columns,
            dnn_feature_columns=deep_columns,
            dnn_hidden_units=[100,700,50,25],
            config=runconfig)

    print('estimator built')

    return m
    

# Select the estimator type, give it a fresh timestamped directory, and build it.
MODEL_TYPE = 'WIDE_AND_DEEP'
model_dir = create_model_dir(MODEL_TYPE)
m = get_model(MODEL_TYPE, model_dir)

Model directory = models/model_WIDE_AND_DEEP_1560670723
INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x00000146901C6240>, '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_train_distribute': None, '_device_fn': None, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_secs': None, '_log_step_count_steps': 100, '_session_config': None, '_save_checkpoints_steps': 100, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': 'models/model_WIDE_AND_DEEP_1560670723'}
estimator built


### 训练模型

In [22]:
# Locations of the training and evaluation CSV files.
train_file, eval_file = "data/train.csv", "data/eval.csv"

In [23]:
%%time

# Number of rows in the training file. This can be found with
# wc -l train.csv
train_sample_size = 800000
# Floor division keeps the step count an integer; with
# `from __future__ import division` a plain `/` would yield the float 2000.0.
train_steps = train_sample_size // BATCH_SIZE  # 800000 // 400 = 2000

m.fit(input_fn=generate_input_fn(train_file, BATCH_SIZE), steps=train_steps)

print('fit done')

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into models/model_WIDE_AND_DEEP_1560670723\model.ckpt.
INFO:tensorflow:loss = 4.3260183, step = 2
INFO:tensorflow:Saving checkpoints for 102 into models/model_WIDE_AND_DEEP_1560670723\model.ckpt.
INFO:tensorflow:global_step/sec: 7.59014
INFO:tensorflow:loss = 0.45413193, step = 202 (18.034 sec)
INFO:tensorflow:Saving checkpoints for 204 into models/model_WIDE_AND_DEEP_1560670723\model.ckpt.
INFO:tensorflow:global_step/sec: 9.93795
INFO:tensorflow:Saving checkpoints for 306 into models/model_WIDE_AND_DEEP_1560670723\model.ckpt.
INFO:tensorflow:global_step/sec: 15.9216
INFO:tensorflow:loss = 0.53203994, step = 402 (13.141 sec)
INFO:tensorflow:Saving checkpoints for 408 into models/model_WIDE_AND_DEEP_1560670723\model.ckpt.
INFO:tensorflow:global_step/sec: 15.719
INFO:tensorflow:Saving c

### 验证模型

In [24]:
%%time

# Number of rows used for evaluation.
eval_sample_size = 200000
# Floor division keeps the step count an integer; with
# `from __future__ import division` a plain `/` would yield the float 500.0.
eval_steps = eval_sample_size // BATCH_SIZE  # 200000 // 400 = 500

results = m.evaluate(input_fn=generate_input_fn(eval_file),
                     steps=eval_steps)
print('evaluate done')

print('Accuracy: %s' % results['accuracy'])
print(results)

INFO:tensorflow:Starting evaluation at 2019-06-16-07:45:13
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from models/model_WIDE_AND_DEEP_1560670723\model.ckpt-2002
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [50/500]
INFO:tensorflow:Evaluation [100/500]
INFO:tensorflow:Evaluation [150/500]
INFO:tensorflow:Evaluation [200/500]
INFO:tensorflow:Evaluation [250/500]
INFO:tensorflow:Evaluation [300/500]
INFO:tensorflow:Evaluation [350/500]
INFO:tensorflow:Evaluation [400/500]
INFO:tensorflow:Evaluation [450/500]
INFO:tensorflow:Evaluation [500/500]
INFO:tensorflow:Finished evaluation at 2019-06-16-07:45:27
INFO:tensorflow:Saving dict for global step 2002: accuracy = 0.7657, accuracy/baseline_label_mean = 0.251165, accuracy/threshold_0.500000_mean = 0.7657, auc = 0.72143906, auc_precision_recall = 0.47333822, global_step = 2002, labels/actual_label_mean = 0.251165, labels/prediction_mean = 0.2521450

DNN_Linear模型： dnn_hidden_units=[100, 70, 50, 25]得到的结果（注意：上文建立模型的代码中设置的是 dnn_hidden_units=[100,700,50,25]，两处不一致，请确认实际使用的配置）  
Accuracy:0.7657    
loss: 0.502732  