### 导入工具包

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import tensorflow as tf

### 加载训练数据

In [4]:
CONTINUOUS_COLUMNS =  ["I"+str(i) for i in range(1,14)] # 1-13 inclusive
CATEGORICAL_COLUMNS = ["C"+str(i) for i in range(1,27)] # 1-26 inclusive
LABEL_COLUMN = ["clicked"]

TRAIN_DATA_COLUMNS = LABEL_COLUMN + CONTINUOUS_COLUMNS + CATEGORICAL_COLUMNS

In [6]:
data=pd.read_csv("./data/train.csv",header=None,names=TRAIN_DATA_COLUMNS)
data.head()

Unnamed: 0,clicked,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,0,1,1,5,0,1382,4,15,2,181,...,e5ba7672,f54016b9,21ddcdc9,b1252a9d,07b5194c,0,3a171ecb,c5c50484,e8b83407,9727dd16
1,0,2,0,44,1,102,8,2,2,4,...,07c540c4,b04e4670,21ddcdc9,5840adea,60f6221e,0,3a171ecb,43f13e8b,e8b83407,731c3655
2,0,2,0,1,14,767,89,4,2,245,...,8efede7f,3412118d,0,0,e587c466,ad3062eb,3a171ecb,3b183c5c,0,0
3,0,0,893,0,0,4392,0,0,0,0,...,1e88c74f,74ef3502,0,0,6b3a5ca6,0,3a171ecb,9117a34a,0,0
4,0,3,-1,0,0,2,0,3,0,0,...,1e88c74f,26b3c7a7,0,0,21c9516a,0,32c7478e,b34f3128,0,0


In [8]:
data.shape

(800000, 40)

### 定义输入数据格式处理函数

In [14]:
BATCH_SIZE = 400

def generate_input_fn(filename, batch_size=BATCH_SIZE):
    def _input_fn():
        filename_queue = tf.train.string_input_producer([filename])
        reader = tf.TextLineReader()
        # Reads out batch_size number of lines
        key, value = reader.read_up_to(filename_queue, num_records=batch_size)
        
        # 1 int label, 13 ints, 26 strings
        cont_defaults = [ [0] for i in range(1,14) ]
        cate_defaults = [ [" "] for i in range(1,27) ]
        label_defaults = [ [0] ]
        column_headers = TRAIN_DATA_COLUMNS
        # The label is the first column of the data.
        # 如果有缺失值就用默认值代替
        record_defaults = label_defaults + cont_defaults + cate_defaults

        # Decode CSV data that was just read out. 
        # Note that this does NOT return a dict, 
        # so we will need to zip it up with our headers
        columns = tf.decode_csv(
            value, record_defaults=record_defaults)
        
        # all_columns is a dictionary that maps from column names to tensors of the data.
        all_columns = dict(zip(column_headers, columns))
        
        # Pop and save our labels 
        # dict.pop() returns the popped array of values; exactly what we need!
        labels = all_columns.pop(LABEL_COLUMN[0])
        
        # the remaining columns are our features
        features = all_columns 

        # Sparse categorical features must be represented with an additional dimension. 
        # There is no additional work needed for the Continuous columns; they are the unaltered columns.
        # See docs for tf.SparseTensor for more info
        for feature_name in CATEGORICAL_COLUMNS:
            features[feature_name] = tf.expand_dims(features[feature_name], -1)

        return features, labels

    return _input_fn

print('input function configured')

input function configured


### 建立特征，输入到Linear模型

In [9]:
# Sparse base columns：对于类别特征，用hash编码将其转化为稀疏列
wide_columns = []# 用于Line模型的特征
for name in CATEGORICAL_COLUMNS:
    wide_columns.append(tf.contrib.layers.sparse_column_with_hash_bucket(
            name, hash_bucket_size=1000))

print('Wide/Sparse columns configured')

Wide/Sparse columns configured


In [10]:
# Continuous base columns：对于连续性特征，直接使用真实值。
deep_columns = []# 用于DNN模型的特征
for name in CONTINUOUS_COLUMNS:
    deep_columns.append(tf.contrib.layers.real_valued_column(name))

print('deep/continuous columns configured')

deep/continuous columns configured


In [11]:
# Embeddings for wide columns into deep columns：再将类别特征进行词嵌入模型，送入DNN模型。
for col in wide_columns:
    deep_columns.append(tf.contrib.layers.embedding_column(col, 
                                                           dimension=8))

print('wide and deep columns configured')

wide and deep columns configured


### 建立模型

* **Wide**: Linear Classifier

In [12]:
def create_model_dir(model_type):
    # Returns something like models/model_WIDE_AND_DEEP_1493043407
    return 'models/model_' + model_type + '_' + str(int(time.time()))

# Specify the desired model_dir 
def get_model(model_type, model_dir):
    print("Model directory = %s" % model_dir)
    
    # There are more options here than shown here. 
    # We are using this to show additional checkpointing for illustrative purposes.
    # In a real system with far more samples, you would 
    #     likely choose to save checkpoints less frequently.
    runconfig = tf.contrib.learn.RunConfig(
        save_checkpoints_secs=None,
        save_checkpoints_steps = 100,
    )
    
    m = None
    
    # Linear Classifier
    if model_type == 'WIDE':
        m = tf.contrib.learn.LinearClassifier(
            model_dir=model_dir, 
            feature_columns=wide_columns)

    # Deep Neural Net Classifier
    if model_type == 'DEEP':
        m = tf.contrib.learn.DNNClassifier(
            model_dir=model_dir,
            feature_columns=deep_columns,
            hidden_units=[100, 50, 25])

    # Combined Linear and Deep Classifier
    if model_type == 'WIDE_AND_DEEP':
        m = tf.contrib.learn.DNNLinearCombinedClassifier(
            model_dir=model_dir,
            linear_feature_columns=wide_columns,
            dnn_feature_columns=deep_columns,
            dnn_hidden_units=[100, 70, 50, 25],
            config=runconfig)
        
    print('estimator built')
    
    return m
    

MODEL_TYPE = 'WIDE'
model_dir = create_model_dir(model_type=MODEL_TYPE)
m = get_model(model_type=MODEL_TYPE, model_dir=model_dir)

Model directory = models/model_WIDE_1560666827
Instructions for updating:
When switching to tf.estimator.Estimator, use tf.estimator.RunConfig instead.
Instructions for updating:
Please switch to tf.contrib.estimator.*_head.
Instructions for updating:
Please replace uses of any Estimator from tf.contrib.learn with an Estimator from tf.estimator.*
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001880DB01940>, '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_train_distribute': None, '_device_fn': None, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_log_step_count_steps': 100, '_session_config': None, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5

### 训练模型

In [13]:
train_file = "data/train.csv"
eval_file  = "data/eval.csv"

In [15]:
%%time

# This can be found with
# wc -l train.csv
train_sample_size = 800000
train_steps = train_sample_size/BATCH_SIZE # 8000/40 = 200

m.fit(input_fn=generate_input_fn(train_file, BATCH_SIZE), steps=train_steps)

print('fit done')

Instructions for updating:
Use the `axis` argument instead
Instructions for updating:
When switching to tf.estimator.Estimator, use tf.estimator.EstimatorSpec. You can use the `estimator_spec` method to create an equivalent one.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into models/model_WIDE_1560666827\model.ckpt.
INFO:tensorflow:loss = 0.6931474, step = 1
INFO:tensorflow:global_step/sec: 45.7418
INFO:tensorflow:loss = 0.45659465, step = 101 (2.187 sec)
INFO:tensorflow:global_step/sec: 134.775
INFO:tensorflow:loss = 0.53344035, step = 201 (0.741 sec)
INFO:tensorflow:global_step/sec: 134.943
INFO:tensorflow:loss = 0.47971246, step = 301 (0.742 sec)
INFO:tensorflow:global_step/sec: 122.133
INFO:tensorflow:loss = 0.5206535, step = 401 (0.819 sec)
INFO:tensorflow:global_step/sec: 129.044
INFO:tensorflow:loss = 0.56900674, step =

### 验证模型

In [16]:
%%time

eval_sample_size = 200000 
eval_steps = eval_sample_size/BATCH_SIZE 

results = m.evaluate(input_fn=generate_input_fn(eval_file), 
                     steps=eval_steps)
print('evaluate done')

print('Accuracy: %s' % results['accuracy'])
print(results)

INFO:tensorflow:Starting evaluation at 2019-06-16-06:40:47
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from models/model_WIDE_1560666827\model.ckpt-2000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [50/500]
INFO:tensorflow:Evaluation [100/500]
INFO:tensorflow:Evaluation [150/500]
INFO:tensorflow:Evaluation [200/500]
INFO:tensorflow:Evaluation [250/500]
INFO:tensorflow:Evaluation [300/500]
INFO:tensorflow:Evaluation [350/500]
INFO:tensorflow:Evaluation [400/500]
INFO:tensorflow:Evaluation [450/500]
INFO:tensorflow:Evaluation [500/500]
INFO:tensorflow:Finished evaluation at 2019-06-16-06:40:54
INFO:tensorflow:Saving dict for global step 2000: accuracy = 0.766125, accuracy/baseline_label_mean = 0.251165, accuracy/threshold_0.500000_mean = 0.766125, auc = 0.72278744, auc_precision_recall = 0.47572494, global_step = 2000, labels/actual_label_mean = 0.251165, labels/prediction_mean = 0.26990917, lo

只用了Linear模型得到的结果   
Accuracy: 0.766125   
loss: 0.50244325