In [1]:
import shutil
import numpy as np
import pandas as pd
import tensorflow as tf
print(tf.__version__)

1.8.0


### Load data and preprocessing

In [2]:
# Read the train / eval / test splits from CSV files in the working directory.
train_df, eval_df, test_df = (
    pd.read_csv(csv_path, sep=',')
    for csv_path in ('./train.csv', './eval.csv', './test.csv')
)

In [3]:
# Columns removed from every split before feature engineering.
DROPPED_COLUMNS = ['poweredBy', 'lastModified', 'contentType']

# errors='ignore' keeps this cell idempotent: the frames are rebound in place,
# so re-running the cell would otherwise raise KeyError because the columns
# are already gone.
train_df = train_df.drop(DROPPED_COLUMNS, axis=1, errors='ignore')
eval_df = eval_df.drop(DROPPED_COLUMNS, axis=1, errors='ignore')
test_df = test_df.drop(DROPPED_COLUMNS, axis=1, errors='ignore')

In [4]:
train_df.head()

Unnamed: 0,url,compromissionType,isHiddenFraudulent,contentLength,serverType
0,9d345009-a-62cb3a1a-s-sites.googlegroups.com/s...,phishing,True,0,GSE
1,ok.aaweb.asia/m=weblogin/index.htm,phishing,True,23060,Microsoft-IIS/6.0
2,www.smalbro.dk/sejeal.jpg,defacement,True,104772,Apache/2.2
3,bjgdhm.com/index.html,defacement,False,1463,Microsoft-IIS/6.0
4,commsure.in/index.php,phishing,False,0,Apache/2.2


In [5]:
# Distinct serverType values of each split, as plain Python lists.
train_server_type, eval_server_type, test_server_type = (
    list(split['serverType'].unique())
    for split in (train_df, eval_df, test_df)
)

In [6]:
# Every serverType value seen in at least one of the three splits.
servertypes = set().union(train_server_type, eval_server_type, test_server_type)

In [7]:
def add_more_features(dataframe):
    """Return a copy of `dataframe` with a derived `url_length` column.

    `url_length` is `len()` of each value in the `url` column. The input
    frame is left untouched.
    """
    enriched = dataframe.copy()
    enriched['url_length'] = enriched['url'].apply(len)
    return enriched

In [8]:
# Create pandas input function
def make_train_input_fn(df, batch_size, num_epochs):
    """Build a shuffled pandas input function for training.

    Features are `df` extended by `add_more_features`; labels are the
    `isHiddenFraudulent` column of `df`.
    """
    features = add_more_features(df)
    labels = df['isHiddenFraudulent']
    return tf.estimator.inputs.pandas_input_fn(
        x=features,
        y=labels,
        batch_size=batch_size,
        num_epochs=num_epochs,
        shuffle=True,
        queue_capacity=1024,
        num_threads=1,
    )

In [9]:
def make_eval_input_fn(df, batch_size):
    """Build a single-pass, unshuffled pandas input function for evaluation.

    Args:
        df: DataFrame holding the raw columns plus the `isHiddenFraudulent` label.
        batch_size: Number of rows per evaluation batch.

    Returns:
        The input function produced by `tf.estimator.inputs.pandas_input_fn`.
    """
    return tf.estimator.inputs.pandas_input_fn(
        x=add_more_features(df),
        y=df['isHiddenFraudulent'],
        # Honour the caller's batch size; it was previously ignored in favour
        # of a hard-coded 256.
        batch_size=batch_size,
        num_epochs=1,
        shuffle=False,
        queue_capacity=1024,
        num_threads=1,
    )

In [10]:
def make_test_input_fn(df, batch_size=16):
    """Build a single-pass, unshuffled, label-free input function for prediction.

    Args:
        df: DataFrame holding the raw columns; `add_more_features` adds the
            derived ones.
        batch_size: Number of rows per prediction batch. Defaults to 16, the
            value that used to be hard-coded.

    Returns:
        The input function produced by `tf.estimator.inputs.pandas_input_fn`.
    """
    return tf.estimator.inputs.pandas_input_fn(
        x=add_more_features(df),
        batch_size=batch_size,
        num_epochs=1,
        shuffle=False,
        queue_capacity=1024,
        num_threads=1,
    )

In [11]:
def create_feature_cols():
    """Build the feature columns consumed by the estimators.

    Returns:
        A list with numeric columns for `url_length` and `contentLength`, and
        a vocabulary-list categorical column for `serverType`.
    """
    # `servertypes` is a set, so its iteration order can differ between kernel
    # sessions. The vocabulary order decides which id each server type maps
    # to, so it is sorted to keep those ids stable for checkpoints and exports
    # reloaded in a later session. Non-string entries (presumably NaN from
    # missing values) are left out: they cannot be ordered against strings and
    # are not valid string vocabulary keys.
    server_type_vocabulary = sorted(
        value for value in servertypes if isinstance(value, str))

    cols = [
        tf.feature_column.numeric_column('url_length'),
        tf.feature_column.numeric_column('contentLength'),
        tf.feature_column.categorical_column_with_vocabulary_list(
            'serverType', vocabulary_list=server_type_vocabulary),
    ]
    # TODO: try bucketizing url_length (5 buckets?)
    return cols

feature_cols = create_feature_cols()

In [12]:
def serving_input_fn():
    """Describe the tensors the exported model accepts at serving time.

    One batch-shaped placeholder is created per input (`url_length`,
    `contentLength`, `serverType`); the same dict is used both as the
    receiver tensors and as the features handed to the model.
    """
    placeholder_dtypes = (
        ('url_length', tf.int64),
        ('contentLength', tf.int64),
        ('serverType', tf.string),
    )
    json_feature_placeholders = {
        name: tf.placeholder(dtype, [None]) for name, dtype in placeholder_dtypes
    }
    return tf.estimator.export.ServingInputReceiver(
        json_feature_placeholders, json_feature_placeholders)

In [35]:
# def serving_input_fn():
    
#     def get_length(single_string):
#         x = tf.expand_dims(single_string,0)
#         splitted = tf.string_split(x, delimiter="").values
#         length = tf.size(splitted)
#         return length

#     def get_lengths(string_batch):
#         return tf.map_fn(get_length, string_batch, dtype=tf.int32)
    
#     json_feature_placeholders = {
#         'url': tf.placeholder(tf.string, None),
#     }
    
#     features = {
#         'url_length': get_lengths(json_feature_placeholders['url'])
#     }    

#     return tf.estimator.export.ServingInputReceiver(features, json_feature_placeholders)

### Model training and evaluation

In [13]:
# Create estimator train and evaluate function
# Training schedule and model hyperparameters.
NUM_EPOCH = 10
BATCH_SIZE = 256
HIDDEN_UNITS = [32, 8, 2]   # layer sizes passed to DNNClassifier
SAVE_CKPT_STEPS = 200       # checkpoint interval, in training steps
EVAL_INTERVAL_SEC = 5       # used as EvalSpec throttle_secs
# Full batches per epoch, times the number of epochs.
MAX_STEPS = (len(train_df) // BATCH_SIZE) * NUM_EPOCH

def train_and_evaluate(output_dir, model='linear'):
    """Train a classifier with interleaved evaluation and return it.

    Args:
        output_dir: Directory used as the estimator's `model_dir`.
        model: 'linear' builds a LinearClassifier; any other value builds a
            DNNClassifier with HIDDEN_UNITS.

    Returns:
        The estimator after `tf.estimator.train_and_evaluate` has run.
    """
    config = tf.estimator.RunConfig(
        model_dir=output_dir,
        save_checkpoints_steps=SAVE_CKPT_STEPS,
        save_summary_steps=50,
        keep_checkpoint_max=10,
    )

    if model != 'linear':
        estimator = tf.estimator.DNNClassifier(
            hidden_units=HIDDEN_UNITS,
            feature_columns=feature_cols,
            config=config,
        )
    else:
        estimator = tf.estimator.LinearClassifier(
            feature_columns=feature_cols,
            config=config,
        )

    train_input_fn = make_train_input_fn(train_df, BATCH_SIZE, NUM_EPOCH)
    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, max_steps=MAX_STEPS)

    # exports_to_keep=None: no limit is passed for the number of kept exports.
    exporter = tf.estimator.LatestExporter(
        name='exporter',
        serving_input_receiver_fn=serving_input_fn,
        exports_to_keep=None,
    )

    eval_input_fn = make_eval_input_fn(eval_df, BATCH_SIZE)
    eval_spec = tf.estimator.EvalSpec(
        input_fn=eval_input_fn,
        steps=None,
        exporters=exporter,
        start_delay_secs=1,                # start evaluating after N seconds
        throttle_secs=EVAL_INTERVAL_SEC,   # evaluate every N seconds
    )

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
    return estimator

In [14]:
outdir = '../trained_model/url_len_contlen_st_linear/'

In [15]:
shutil.rmtree(outdir, ignore_errors = True)
estimator = train_and_evaluate(outdir, model='linear')

INFO:tensorflow:Using config: {'_model_dir': '../trained_model/url_len_contlen_st_linear/', '_tf_random_seed': None, '_save_summary_steps': 50, '_save_checkpoints_steps': 200, '_save_checkpoints_secs': None, '_session_config': None, '_keep_checkpoint_max': 10, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x12e211438>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after 5 secs (eval_spec.throttle_secs) or training is finished.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was 

INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ../trained_model/url_len_contlen_st_linear/model.ckpt-2248
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-03-13-10:51:04
INFO:tensorflow:Saving dict for global step 2248: accuracy = 0.71296227, accuracy_baseline = 0.5037596, auc = 0.71580374, auc_precision_recall = 0.7352486, average_loss = 1.9248013, global_step = 2248, label/mean = 0.4962404, loss = 490.2205, precision = 0.8199874, prediction/mean = 0.44756725, recall = 0.5401562
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Signatures INCLUDED in export for Classify: None
INFO:tensorflow:Signatures INCLUDED in export for Regress: None
INFO:tensorflow:Signatures INCLUDED in export for Predict: ['predict']
INFO:tensorflow:Signatures EXCLUDED from export because they cannot be be served via TensorFlow Serving APIs:
INFO:tensorflow:'servi

INFO:tensorflow:Loss for final step: 565.64856.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-03-13-10:51:19
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ../trained_model/url_len_contlen_st_linear/model.ckpt-3874
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-03-13-10:51:21
INFO:tensorflow:Saving dict for global step 3874: accuracy = 0.72438216, accuracy_baseline = 0.5037596, auc = 0.72878754, auc_precision_recall = 0.80741245, average_loss = 0.9496045, global_step = 3874, label/mean = 0.4962404, loss = 241.85123, precision = 0.9440141, prediction/mean = 0.39713618, recall = 0.47261727
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Signatures INCLUDED in export for Classify: None
INFO:tensorflow:Signatures INCLUDED in export for Regress: None
INFO:tensorflow:Sign

INFO:tensorflow:global_step/sec: 251.779
INFO:tensorflow:loss = 1790.1038, step = 5258 (0.399 sec)
INFO:tensorflow:Saving checkpoints for 5358 into ../trained_model/url_len_contlen_st_linear/model.ckpt.
INFO:tensorflow:global_step/sec: 237.158
INFO:tensorflow:loss = 756.9237, step = 5358 (0.420 sec)
INFO:tensorflow:global_step/sec: 262.713
INFO:tensorflow:loss = 400.04285, step = 5458 (0.380 sec)
INFO:tensorflow:Saving checkpoints for 5558 into ../trained_model/url_len_contlen_st_linear/model.ckpt.
INFO:tensorflow:global_step/sec: 220.695
INFO:tensorflow:loss = 268.39264, step = 5558 (0.453 sec)
INFO:tensorflow:global_step/sec: 300.001
INFO:tensorflow:loss = 772.3366, step = 5658 (0.337 sec)
INFO:tensorflow:Saving checkpoints for 5758 into ../trained_model/url_len_contlen_st_linear/model.ckpt.
INFO:tensorflow:global_step/sec: 269.244
INFO:tensorflow:loss = 220.45738, step = 5758 (0.368 sec)
INFO:tensorflow:global_step/sec: 311.497
INFO:tensorflow:loss = 521.0201, step = 5858 (0.321 sec

INFO:tensorflow:Restoring parameters from ../trained_model/url_len_contlen_st_linear/model.ckpt-7120
INFO:tensorflow:Assets added to graph.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: b"../trained_model/url_len_contlen_st_linear/export/exporter/temp-b'1552474305'/saved_model.pb"


### Serving 

In [99]:
url_ex = "https://github.com/warmspringwinds/tf-image-segmentation/issues/10"
len(url_ex)

66

In [100]:
%%writefile url.json
{"url_lenght": 66}

Overwriting url.json


In [None]:
%%bash

# Directory of the exported SavedModel. Set MODEL_DIR in the environment to
# point at a different export; the default is the original hard-coded path.
location="${MODEL_DIR:-/Users/paul/Desktop/Google_ASL/trained_model/url_len_dnn/export/exporter/1552430670}"
# Quote the path so directories containing spaces are passed as one argument.
gcloud ml-engine local predict --model-dir="$location" --json-instances=url.json --verbosity debug

#### Load models

In [41]:
import tensorflow as tf

# Directory holding the checkpoints of the url_length-only linear model.
output_dir = '../trained_model/url_len_linear/'


# NOTE(review): this cell rebinds create_feature_cols / feature_cols, which are
# already defined earlier in the notebook. train_and_evaluate reads the global
# feature_cols, so calling it after this cell uses the url_length-only columns.
def create_feature_cols():
    """Return the feature columns of the url_length-only model."""
    #try bucketize url_length 5
    return [tf.feature_column.numeric_column('url_length')]


feature_cols = create_feature_cols()

run_config = tf.estimator.RunConfig(
    model_dir=output_dir,
    save_checkpoints_steps=200,
    save_summary_steps=100,
    keep_checkpoint_max=10,
)

# Estimator bound to output_dir through run_config.
estimator = tf.estimator.LinearClassifier(
    config=run_config,
    feature_columns=feature_cols,
)

INFO:tensorflow:Using config: {'_model_dir': '../trained_model/url_len_linear/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 200, '_save_checkpoints_secs': None, '_session_config': None, '_keep_checkpoint_max': 10, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x12e068860>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [42]:
predictions = estimator.predict(input_fn=make_test_input_fn(test_df))

In [43]:
list(predictions)[:20]

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ../trained_model/url_len_linear/model.ckpt-7000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


[{'logits': array([0.9270397], dtype=float32),
  'logistic': array([0.7164743], dtype=float32),
  'probabilities': array([0.28352568, 0.7164743 ], dtype=float32),
  'class_ids': array([1]),
  'classes': array([b'1'], dtype=object)},
 {'logits': array([-0.37803602], dtype=float32),
  'logistic': array([0.40660068], dtype=float32),
  'probabilities': array([0.59339935, 0.40660068], dtype=float32),
  'class_ids': array([0]),
  'classes': array([b'0'], dtype=object)},
 {'logits': array([-0.09155598], dtype=float32),
  'logistic': array([0.477127], dtype=float32),
  'probabilities': array([0.52287304, 0.477127  ], dtype=float32),
  'class_ids': array([0]),
  'classes': array([b'0'], dtype=object)},
 {'logits': array([-0.20296489], dtype=float32),
  'logistic': array([0.44943225], dtype=float32),
  'probabilities': array([0.55056775, 0.44943225], dtype=float32),
  'class_ids': array([0]),
  'classes': array([b'0'], dtype=object)},
 {'logits': array([0.9270397], dtype=float32),
  'logistic': 

In [46]:
'1'.isalnum()

True