In [1]:
import shutil
import numpy as np
import pandas as pd
import tensorflow as tf
print(tf.__version__)

1.8.0


### Load data and preprocessing

In [2]:
# Read the train / eval / test splits from CSV files in the working directory.
train_df, eval_df, test_df = (
    pd.read_csv(csv_path, sep=',')
    for csv_path in ('./train.csv', './eval.csv', './test.csv')
)

In [6]:
def process_date(df):
    """Normalize ``lastModified`` and derive a ``last_year_modified`` column.

    The frame is modified in place and also returned.

    * ``lastModified`` values that are not strings (e.g. missing cells), or
      whose text after the first two characters is entirely numeric, are
      replaced with ``'Unknown'``.
    * ``last_year_modified`` is the 4th space-separated token of
      ``lastModified`` (the year in ``"Tue, 29 Jan 2013 17:44:27 GMT"``), or
      ``'Unknown'`` when the value is ``'Unknown'`` or has fewer than four
      tokens.

    Args:
        df: DataFrame with a ``lastModified`` column.

    Returns:
        The same DataFrame, with ``lastModified`` normalized and
        ``last_year_modified`` added.
    """
    def _normalize(value):
        # Non-string cells cannot be sliced or split; treat them as unknown
        # instead of raising a TypeError.
        if not isinstance(value, str) or value[2:].isnumeric():
            return 'Unknown'
        return value

    def _extract_year(value):
        if value == 'Unknown':
            return value
        tokens = value.split(' ')
        # Guard against malformed dates that would otherwise raise IndexError.
        return str(tokens[3]) if len(tokens) > 3 else 'Unknown'

    df['lastModified'] = df['lastModified'].apply(_normalize)
    df['last_year_modified'] = df['lastModified'].apply(_extract_year)
    return df

In [8]:
def add_more_features(dataframe):
    """Return a copy of ``dataframe`` enriched with the engineered features.

    Adds ``url_length`` (the length of each ``url`` value) and then applies
    ``process_date`` to the copy; the input frame itself is not modified.
    """
    enriched = dataframe.copy()
    enriched['url_length'] = enriched['url'].apply(len)
    return process_date(enriched)

In [9]:
# Replace each split with its feature-engineered copy.
train_df, eval_df, test_df = (
    add_more_features(train_df),
    add_more_features(eval_df),
    add_more_features(test_df),
)

In [3]:
# Preview the first rows of the training frame.
# NOTE(review): the saved output below comes from an earlier execution (In [3]),
# so it does not show the derived url_length / last_year_modified columns.
train_df.head()

Unnamed: 0,url,compromissionType,isHiddenFraudulent,contentLength,serverType,poweredBy,contentType,lastModified
0,www.advinno.com/products/asic-prototyping,defacement,False,0,Apache,Unknown,text/html; charset=utf-8,"Tue, 29 Jan 2013 17:44:27 GMT"
1,www.escolareiarthur.com.br/index.php?option=co...,defacement,False,0,Unknown,Unknown,Unknown,Unknown
2,www.wxlottery.com/wfjs/dlt/index.html,defacement,False,13331,Microsoft-IIS/6.0,Unknown,text/html,"Wed, 07 Nov 2012 03:11:11 GMT"
3,520168.cn/n146c21.aspx,phishing,False,13671,Microsoft-IIS/6.0,ASP.NET,text/html; charset=gb2312,Unknown
4,9d345009-a-62cb3a1a-s-sites.googlegroups.com/s...,phishing,True,0,GSE,Unknown,text/html; charset=UTF-8,"Wed, 21 Oct 2009 12:30:55 GMT"


In [19]:
def get_vocabulary_list(category):
    """Return the sorted distinct values of column ``category``.

    The values are collected across the module-level ``train_df``,
    ``eval_df`` and ``test_df`` frames.
    """
    vocabulary = set()
    for frame in (train_df, eval_df, test_df):
        vocabulary.update(frame[category].unique())
    return sorted(vocabulary)

In [None]:
# Inspect the distinct last_year_modified values found across the three splits.
get_vocabulary_list('last_year_modified')

In [21]:
# Create pandas input function
def make_train_input_fn(df, batch_size, num_epochs):
    """Build a shuffled input function for training.

    Args:
        df: DataFrame passed whole as the features; its
            ``isHiddenFraudulent`` column is used as the label.
        batch_size: Number of rows per batch.
        num_epochs: Number of passes over ``df``.

    Returns:
        The input function produced by ``pandas_input_fn``.
    """
    labels = df['isHiddenFraudulent']
    train_input_fn = tf.estimator.inputs.pandas_input_fn(
        x=df,
        y=labels,
        batch_size=batch_size,
        num_epochs=num_epochs,
        shuffle=True,
        queue_capacity=1024,
        num_threads=1,
    )
    return train_input_fn

def make_eval_input_fn(df, batch_size=256):
    """Build a non-shuffled, single-epoch input function for evaluation.

    Args:
        df: DataFrame passed whole as the features; its
            ``isHiddenFraudulent`` column is used as the label.
        batch_size: Number of rows per batch. Defaults to 256, the value
            that was previously hard-coded regardless of this argument.

    Returns:
        The input function produced by ``pandas_input_fn``.
    """
    return tf.estimator.inputs.pandas_input_fn(
        x=df,
        y=df['isHiddenFraudulent'],
        batch_size=batch_size,  # honour the caller's value instead of a fixed 256
        num_epochs=1,
        shuffle=False,
        queue_capacity=1024,
        num_threads=1,
    )

def make_test_input_fn(df):
    """Build a label-free, non-shuffled, single-epoch input function.

    ``df`` is passed whole as the features and read in batches of 16 rows.
    """
    test_input_fn = tf.estimator.inputs.pandas_input_fn(
        x=df,
        batch_size=16,
        num_epochs=1,
        shuffle=False,
        queue_capacity=1024,
        num_threads=1,
    )
    return test_input_fn

#### Features to take into account

In [12]:
# Columns fed to the model as numeric features.
num_int_features = [
    'url_length',
    'contentLength',
]
# Columns fed to the model as vocabulary-based categorical features.
category_features = [
    'serverType',
    'poweredBy',
    'contentType',
    'last_year_modified',
]

In [22]:
def create_feature_cols():
    """Build the feature columns used by the estimators.

    Returns one numeric column per name in ``num_int_features`` followed by
    one vocabulary-list categorical column per name in ``category_features``
    (vocabulary taken from ``get_vocabulary_list``).
    """
    numeric_columns = [
        tf.feature_column.numeric_column(name)
        for name in num_int_features
    ]
    categorical_columns = [
        tf.feature_column.categorical_column_with_vocabulary_list(
            key=name,
            vocabulary_list=get_vocabulary_list(name))
        for name in category_features
    ]

    #try bucketize url_length 5
    return numeric_columns + categorical_columns

feature_cols = create_feature_cols()

In [23]:
def serving_input_fn():
    """Build the serving input receiver for the exported model.

    One int64 placeholder is created per name in ``num_int_features`` and
    one string placeholder per name in ``category_features``, each of shape
    ``[None]``. The same name -> placeholder mapping is used both as the
    features and as the receiver tensors.
    """
    receiver_tensors = {}
    # Numeric placeholders are created first, then the string ones.
    for name in num_int_features:
        receiver_tensors[name] = tf.placeholder(tf.int64, [None])
    for name in category_features:
        receiver_tensors[name] = tf.placeholder(tf.string, [None])

    return tf.estimator.export.ServingInputReceiver(receiver_tensors, receiver_tensors)

In [87]:
# def serving_input_fn():
    
#     def get_length(single_string):
#         x = tf.expand_dims(single_string,0)
#         splitted = tf.string_split(x, delimiter="").values
#         length = tf.size(splitted)
#         return length

#     def get_lengths(string_batch):
#         return tf.map_fn(get_length, string_batch, dtype=tf.int32)
    
#     json_feature_placeholders = {
#         'url': tf.placeholder(tf.string, None),
#     }
    
#     features = {
#         'url_length': get_lengths(json_feature_placeholders['url'])
#     }    

#     return tf.estimator.export.ServingInputReceiver(features, json_feature_placeholders)

### Model training and evaluation

In [16]:
# Create estimator train and evaluate function
NUM_EPOCH = 10
BATCH_SIZE = 256
SAVE_CKPT_STEPS = 200  # steps
EVAL_INTERVAL_SEC = 5  # sec
HIDDEN_UNITS = [32, 8, 2]
# Full batches per pass over train_df, times the number of epochs.
MAX_STEPS = (train_df.shape[0] // BATCH_SIZE) * NUM_EPOCH

def train_and_evaluate(output_dir, model='linear'):
    """Train, evaluate and export an estimator on the module-level data.

    Trains on ``train_df`` and evaluates on ``eval_df`` using the
    module-level ``feature_cols`` and hyper-parameter constants, exporting a
    SavedModel through ``serving_input_fn`` after evaluations.

    Args:
        output_dir: Directory used as the estimator's ``model_dir``.
        model: ``'linear'`` builds a ``LinearClassifier``; any other value
            builds a ``DNNClassifier`` with ``HIDDEN_UNITS``.

    Returns:
        The estimator after ``tf.estimator.train_and_evaluate`` has run.
    """
    run_config = tf.estimator.RunConfig(model_dir=output_dir,
                                        save_summary_steps=50,
                                        keep_checkpoint_max=10,
                                        save_checkpoints_steps=SAVE_CKPT_STEPS)

    if model == 'linear':
        estimator = tf.estimator.LinearClassifier(feature_columns=feature_cols,
                                                  config=run_config)
    else:
        # DNNClassifier only accepts dense columns, so the raw categorical
        # columns are one-hot encoded; columns that are not named after a
        # categorical feature are passed through unchanged.
        dense_cols = [tf.feature_column.indicator_column(col)
                      if col.name in category_features else col
                      for col in feature_cols]
        estimator = tf.estimator.DNNClassifier(feature_columns=dense_cols,
                                               hidden_units=HIDDEN_UNITS,
                                               config=run_config)

    train_spec = tf.estimator.TrainSpec(input_fn=make_train_input_fn(train_df, BATCH_SIZE, NUM_EPOCH),
                                        max_steps=MAX_STEPS)

    export_latest = tf.estimator.LatestExporter(name='exporter',
                                                serving_input_receiver_fn=serving_input_fn,
                                                exports_to_keep=None)

    eval_spec = tf.estimator.EvalSpec(input_fn=make_eval_input_fn(eval_df, BATCH_SIZE),
                                      steps=None,
                                      start_delay_secs=1,  # start evaluating after N seconds
                                      throttle_secs=EVAL_INTERVAL_SEC,  # evaluate every N seconds
                                      exporters=export_latest)

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

    return estimator

In [17]:
# Model directory for this run, passed to train_and_evaluate as output_dir.
outdir = '../trained_model/len_cl_st_pb_ct_ly_linear/'

In [18]:
# Remove any previous contents of outdir (errors are ignored, e.g. when it
# does not exist yet), then train the linear model into it.
shutil.rmtree(outdir, ignore_errors = True)
estimator = train_and_evaluate(outdir, model='linear')

INFO:tensorflow:Using config: {'_model_dir': '../trained_model/len_cl_st_pb_ct_ly_linear/', '_tf_random_seed': None, '_save_summary_steps': 50, '_save_checkpoints_steps': 200, '_save_checkpoints_secs': None, '_session_config': None, '_keep_checkpoint_max': 10, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x128eef278>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after 5 secs (eval_spec.throttle_secs) or training is finished.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was 

INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-03-13-18:54:24
INFO:tensorflow:Saving dict for global step 1671: accuracy = 0.822902, accuracy_baseline = 0.93881804, auc = 0.8079915, auc_precision_recall = 0.4465047, average_loss = 1.1110822, global_step = 1671, label/mean = 0.061181948, loss = 282.9419, precision = 0.22268616, prediction/mean = 0.23855875, recall = 0.7606973
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Signatures INCLUDED in export for Classify: None
INFO:tensorflow:Signatures INCLUDED in export for Regress: None
INFO:tensorflow:Signatures INCLUDED in export for Predict: ['predict']
INFO:tensorflow:Signatures EXCLUDED from export because they cannot be be served via TensorFlow Serving APIs:
INFO:tensorflow:'serving_default' : Classification input must be a single string Tensor; got {'url_length': <tf.Tensor 'Placeholder:0' shape=(?,) dtype=int64>,

INFO:tensorflow:Saving checkpoints for 2543 into ../trained_model/len_cl_st_pb_ct_ly_linear/model.ckpt.
INFO:tensorflow:loss = 782.6003, step = 2543
INFO:tensorflow:global_step/sec: 122.03
INFO:tensorflow:loss = 163.954, step = 2643 (0.820 sec)
INFO:tensorflow:Saving checkpoints for 2743 into ../trained_model/len_cl_st_pb_ct_ly_linear/model.ckpt.
INFO:tensorflow:global_step/sec: 225.529
INFO:tensorflow:loss = 1982.9365, step = 2743 (0.444 sec)
INFO:tensorflow:global_step/sec: 257.249
INFO:tensorflow:loss = 1036.1464, step = 2843 (0.389 sec)
INFO:tensorflow:Saving checkpoints for 2943 into ../trained_model/len_cl_st_pb_ct_ly_linear/model.ckpt.
INFO:tensorflow:global_step/sec: 228.895
INFO:tensorflow:loss = 4108.0674, step = 2943 (0.437 sec)
INFO:tensorflow:global_step/sec: 276.043
INFO:tensorflow:loss = 744.2431, step = 3043 (0.362 sec)
INFO:tensorflow:Saving checkpoints for 3143 into ../trained_model/len_cl_st_pb_ct_ly_linear/model.ckpt.
INFO:tensorflow:global_step/sec: 217.094
INFO:te

INFO:tensorflow:'classification' : Classification input must be a single string Tensor; got {'url_length': <tf.Tensor 'Placeholder:0' shape=(?,) dtype=int64>, 'contentLength': <tf.Tensor 'Placeholder_1:0' shape=(?,) dtype=int64>, 'serverType': <tf.Tensor 'Placeholder_2:0' shape=(?,) dtype=string>, 'poweredBy': <tf.Tensor 'Placeholder_3:0' shape=(?,) dtype=string>, 'contentType': <tf.Tensor 'Placeholder_4:0' shape=(?,) dtype=string>, 'last_year_modified': <tf.Tensor 'Placeholder_5:0' shape=(?,) dtype=string>}
INFO:tensorflow:'regression' : Regression input must be a single string Tensor; got {'url_length': <tf.Tensor 'Placeholder:0' shape=(?,) dtype=int64>, 'contentLength': <tf.Tensor 'Placeholder_1:0' shape=(?,) dtype=int64>, 'serverType': <tf.Tensor 'Placeholder_2:0' shape=(?,) dtype=string>, 'poweredBy': <tf.Tensor 'Placeholder_3:0' shape=(?,) dtype=string>, 'contentType': <tf.Tensor 'Placeholder_4:0' shape=(?,) dtype=string>, 'last_year_modified': <tf.Tensor 'Placeholder_5:0' shape=

INFO:tensorflow:Loss for final step: 911.75745.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-03-13-18:55:03
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ../trained_model/len_cl_st_pb_ct_ly_linear/model.ckpt-6082
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-03-13-18:55:04
INFO:tensorflow:Saving dict for global step 6082: accuracy = 0.82164156, accuracy_baseline = 0.93881804, auc = 0.88509536, auc_precision_recall = 0.46740603, average_loss = 2.8749988, global_step = 6082, label/mean = 0.061181948, loss = 732.13086, precision = 0.23993975, prediction/mean = 0.23464495, recall = 0.8835182
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Signatures INCLUDED in export for Classify: None
INFO:tensorflow:Signatures INCLUDED in export for Regress: None
INFO:tensorflow:S

INFO:tensorflow:Restoring parameters from ../trained_model/len_cl_st_pb_ct_ly_linear/model.ckpt-6983
INFO:tensorflow:Assets added to graph.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: b"../trained_model/len_cl_st_pb_ct_ly_linear/export/exporter/temp-b'1552503313'/saved_model.pb"
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ../trained_model/len_cl_st_pb_ct_ly_linear/model.ckpt-6983
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 6984 into ../trained_model/len_cl_st_pb_ct_ly_linear/model.ckpt.
INFO:tensorflow:loss = 1172.0397, step = 6984
INFO:tensorflow:global_step/sec: 119.26
INFO:tensorflow:loss = 1156.1503, step = 7084 (0.839 sec)
INFO:tensorflow:Saving checkpoints for 7150 into ../trained_model/len_cl_st_pb_ct_ly_linear/model.ckpt.

### Serving 

In [99]:
# Example URL; its length is the value written to url.json for the prediction request.
url_ex = "https://github.com/warmspringwinds/tf-image-segmentation/issues/10"
len(url_ex)

66

In [100]:
%%writefile url.json
{"url_length": 66}

Overwriting url.json


In [None]:
%%bash

# Run a local prediction with the exported model in $location on the instances in url.json.
# NOTE(review): location is an absolute path on the author's machine and points at a
# specific export timestamp; adjust it to your own export directory before running.
location=/Users/paul/Desktop/Google_ASL/trained_model/url_len_dnn/export/exporter/1552430670
gcloud ml-engine local predict --model-dir=$location --json-instances=url.json --verbosity debug 

#### Load models

In [41]:
import tensorflow as tf

# Directory holding the checkpoints of the url_length-only linear model.
output_dir = '../trained_model/url_len_linear/'


# NOTE: this redefines create_feature_cols and feature_cols from earlier in
# the notebook; from here on they describe the url_length-only model.
def create_feature_cols():
    """Return the single numeric ``url_length`` feature column."""
    #try bucketize url_length 5
    return [tf.feature_column.numeric_column('url_length')]


feature_cols = create_feature_cols()

run_config = tf.estimator.RunConfig(
    model_dir=output_dir,
    save_checkpoints_steps=200,
    save_summary_steps=100,
    keep_checkpoint_max=10,
)

estimator = tf.estimator.LinearClassifier(
    feature_columns=feature_cols,
    config=run_config,
)

INFO:tensorflow:Using config: {'_model_dir': '../trained_model/url_len_linear/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 200, '_save_checkpoints_secs': None, '_session_config': None, '_keep_checkpoint_max': 10, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x12e068860>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [42]:
# Request predictions for test_df from the estimator configured with output_dir above.
predictions = estimator.predict(input_fn=make_test_input_fn(test_df))

In [43]:
# Materialize all predictions into a list and display the first 20.
list(predictions)[:20]

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ../trained_model/url_len_linear/model.ckpt-7000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


[{'logits': array([0.9270397], dtype=float32),
  'logistic': array([0.7164743], dtype=float32),
  'probabilities': array([0.28352568, 0.7164743 ], dtype=float32),
  'class_ids': array([1]),
  'classes': array([b'1'], dtype=object)},
 {'logits': array([-0.37803602], dtype=float32),
  'logistic': array([0.40660068], dtype=float32),
  'probabilities': array([0.59339935, 0.40660068], dtype=float32),
  'class_ids': array([0]),
  'classes': array([b'0'], dtype=object)},
 {'logits': array([-0.09155598], dtype=float32),
  'logistic': array([0.477127], dtype=float32),
  'probabilities': array([0.52287304, 0.477127  ], dtype=float32),
  'class_ids': array([0]),
  'classes': array([b'0'], dtype=object)},
 {'logits': array([-0.20296489], dtype=float32),
  'logistic': array([0.44943225], dtype=float32),
  'probabilities': array([0.55056775, 0.44943225], dtype=float32),
  'class_ids': array([0]),
  'classes': array([b'0'], dtype=object)},
 {'logits': array([0.9270397], dtype=float32),
  'logistic': 