In [1]:
import shutil
import numpy as np
import pandas as pd
import tensorflow as tf
print(tf.__version__)

1.8.0


### Load data and preprocessing

In [2]:
# Read the train / eval / test splits from comma-separated files in the
# current working directory.
train_df, eval_df, test_df = [
    pd.read_csv(csv_path, sep=',')
    for csv_path in ('./train.csv', './eval.csv', './test.csv')
]

In [3]:
# Columns that none of the feature columns in this notebook read; they are
# removed from every split.
_UNUSED_COLUMNS = ['poweredBy', 'serverType', 'lastModified', 'contentType']

# errors='ignore' keeps this cell idempotent: because each frame is rebound to
# its own trimmed copy, re-running the cell would otherwise raise KeyError on
# the already-removed columns.
train_df = train_df.drop(_UNUSED_COLUMNS, axis=1, errors='ignore')
eval_df = eval_df.drop(_UNUSED_COLUMNS, axis=1, errors='ignore')
test_df = test_df.drop(_UNUSED_COLUMNS, axis=1, errors='ignore')

In [4]:
# Preview the first rows of the training frame after the column drop.
train_df.head()

Unnamed: 0,url,compromissionType,isHiddenFraudulent,contentLength
0,9d345009-a-62cb3a1a-s-sites.googlegroups.com/s...,phishing,True,0
1,ok.aaweb.asia/m=weblogin/index.htm,phishing,True,23060
2,www.smalbro.dk/sejeal.jpg,defacement,True,104772
3,bjgdhm.com/index.html,defacement,False,1463
4,commsure.in/index.php,phishing,False,0


In [5]:
def add_more_features(dataframe):
    """Return a copy of `dataframe` with a derived 'url_length' column.

    Args:
        dataframe: pandas DataFrame containing a 'url' column of strings.

    Returns:
        A new DataFrame (the input is not modified) where 'url_length' holds
        len() of each value in 'url'.
    """
    url_lengths = dataframe['url'].apply(len)
    return dataframe.assign(url_length=url_lengths)

In [6]:
# Build the pandas-backed input function used for training
def make_train_input_fn(df, batch_size, num_epochs):
    """Create a shuffled training input function from a DataFrame.

    Args:
        df: DataFrame with a 'url' column and the 'isHiddenFraudulent' label.
        batch_size: rows per batch.
        num_epochs: number of passes over `df`.

    Returns:
        The callable produced by tf.estimator.inputs.pandas_input_fn.
    """
    features = add_more_features(df)
    labels = df['isHiddenFraudulent']
    return tf.estimator.inputs.pandas_input_fn(
        x=features,
        y=labels,
        batch_size=batch_size,
        num_epochs=num_epochs,
        shuffle=True,
        queue_capacity=1024,
        num_threads=1,
    )

In [7]:
def make_eval_input_fn(df, batch_size):
    """Create a single-pass, unshuffled input function for evaluation.

    Args:
        df: DataFrame with a 'url' column and the 'isHiddenFraudulent' label.
        batch_size: rows per evaluation batch.

    Returns:
        The callable produced by tf.estimator.inputs.pandas_input_fn.
    """
    return tf.estimator.inputs.pandas_input_fn(
        x = add_more_features(df),
        y = df['isHiddenFraudulent'],
        # Use the caller-supplied batch size; this was hard-coded to 256,
        # which silently ignored the `batch_size` argument.
        batch_size = batch_size,
        num_epochs = 1,      # one pass over the evaluation data
        shuffle = False,
        queue_capacity = 1024,
        num_threads = 1
    )

In [8]:
def make_test_input_fn(df):
    """Create a label-free, single-pass input function for prediction.

    Args:
        df: DataFrame with a 'url' column; no label column is passed on.

    Returns:
        The callable produced by tf.estimator.inputs.pandas_input_fn,
        yielding unshuffled batches of 16 rows.
    """
    features = add_more_features(df)
    return tf.estimator.inputs.pandas_input_fn(
        x=features,
        batch_size=16,
        num_epochs=1,
        shuffle=False,
        queue_capacity=1024,
        num_threads=1,
    )

In [18]:
def create_feature_cols():
    """Return numeric feature columns for 'url_length' and 'contentLength'."""
    # TODO: try bucketizing url_length (original note mentions "5")
    numeric_features = ('url_length', 'contentLength')
    return [tf.feature_column.numeric_column(name) for name in numeric_features]

# Module-level list read by train_and_evaluate when it builds the estimator.
feature_cols = create_feature_cols()

In [19]:
def serving_input_fn():
    """Build the serving receiver: one int64 placeholder per model feature.

    The same dict is used both as the features handed to the model and as the
    receiver tensors, so requests supply 'url_length' and 'contentLength'
    directly.

    Returns:
        A tf.estimator.export.ServingInputReceiver.
    """
    # Creation order is kept (url_length first, then contentLength).
    receiver_tensors = {
        name: tf.placeholder(tf.int64, [None])
        for name in ('url_length', 'contentLength')
    }
    return tf.estimator.export.ServingInputReceiver(receiver_tensors, receiver_tensors)

In [10]:
# def serving_input_fn():
    
#     def get_length(single_string):
#         x = tf.expand_dims(single_string,0)
#         splitted = tf.string_split(x, delimiter="").values
#         length = tf.size(splitted)
#         return length

#     def get_lengths(string_batch):
#         return tf.map_fn(get_length, string_batch, dtype=tf.int32)
    
#     json_feature_placeholders = {
#         'url': tf.placeholder(tf.string, None),
#     }
    
#     features = {
#         'url_length': get_lengths(json_feature_placeholders['url'])
#     }    

#     return tf.estimator.export.ServingInputReceiver(features, json_feature_placeholders)

### Model training and evaluation

In [20]:
# Hyper-parameters for the estimator train-and-evaluate function
BATCH_SIZE = 256
NUM_EPOCH = 10
HIDDEN_UNITS = [32, 8, 2]   # passed as hidden_units to the DNN classifier
SAVE_CKPT_STEPS = 200       # steps between checkpoints
EVAL_INTERVAL_SEC = 5       # seconds; used as EvalSpec throttle_secs
# Whole batches per epoch, times the number of epochs.
MAX_STEPS = (len(train_df) // BATCH_SIZE) * NUM_EPOCH

def train_and_evaluate(output_dir, model='linear'):
    """Train an estimator with interleaved evaluation/export and return it.

    Args:
        output_dir: directory handed to RunConfig as model_dir.
        model: 'linear' builds a LinearClassifier; any other value builds a
            DNNClassifier using HIDDEN_UNITS.

    Returns:
        The estimator after tf.estimator.train_and_evaluate has run.
    """
    config = tf.estimator.RunConfig(
        model_dir=output_dir,
        save_checkpoints_steps=SAVE_CKPT_STEPS,
        keep_checkpoint_max=10,
        save_summary_steps=50,
    )

    # Both estimator types share the feature columns and run configuration.
    shared_kwargs = {'feature_columns': feature_cols, 'config': config}
    if model != 'linear':
        classifier = tf.estimator.DNNClassifier(hidden_units=HIDDEN_UNITS, **shared_kwargs)
    else:
        classifier = tf.estimator.LinearClassifier(**shared_kwargs)

    training = tf.estimator.TrainSpec(
        input_fn=make_train_input_fn(train_df, BATCH_SIZE, NUM_EPOCH),
        max_steps=MAX_STEPS,
    )

    exporter = tf.estimator.LatestExporter(
        name='exporter',
        serving_input_receiver_fn=serving_input_fn,
        exports_to_keep=None,
    )

    evaluation = tf.estimator.EvalSpec(
        input_fn=make_eval_input_fn(eval_df, BATCH_SIZE),
        steps=None,
        start_delay_secs=1,               # start evaluating after N seconds
        throttle_secs=EVAL_INTERVAL_SEC,  # evaluate every N seconds
        exporters=exporter,
    )

    tf.estimator.train_and_evaluate(classifier, training, evaluation)

    return classifier

In [21]:
# Output directory for the linear model trained on url_length + contentLength;
# passed to train_and_evaluate as the estimator's model_dir.
outdir = '../trained_model/url_len_contlen_linear/'

In [22]:
# Remove any previous contents of outdir (errors such as a missing directory
# are ignored), then train and evaluate the linear model into it.
shutil.rmtree(outdir, ignore_errors = True)
estimator = train_and_evaluate(outdir, model='linear')

INFO:tensorflow:Using config: {'_model_dir': '../trained_model/url_len_contlen_linear/', '_tf_random_seed': None, '_save_summary_steps': 50, '_save_checkpoints_steps': 200, '_save_checkpoints_secs': None, '_session_config': None, '_keep_checkpoint_max': 10, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x122f2cb00>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after 5 secs (eval_spec.throttle_secs) or training is finished.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was fin

INFO:tensorflow:global_step/sec: 397.706
INFO:tensorflow:loss = 2236.4492, step = 2836 (0.253 sec)
INFO:tensorflow:Saving checkpoints for 2936 into ../trained_model/url_len_contlen_linear/model.ckpt.
INFO:tensorflow:global_step/sec: 331.195
INFO:tensorflow:loss = 4163.83, step = 2936 (0.301 sec)
INFO:tensorflow:global_step/sec: 383.219
INFO:tensorflow:loss = 802.14233, step = 3036 (0.260 sec)
INFO:tensorflow:Saving checkpoints for 3069 into ../trained_model/url_len_contlen_linear/model.ckpt.
INFO:tensorflow:Loss for final step: 546.5383.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-03-12-23:07:19
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ../trained_model/url_len_contlen_linear/model.ckpt-3069
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-03-12-23:07:20
INFO:tensorflow:Saving dict for global step 306

INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: b"../trained_model/url_len_contlen_linear/export/exporter/temp-b'1552432047'/saved_model.pb"
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ../trained_model/url_len_contlen_linear/model.ckpt-4685
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 4686 into ../trained_model/url_len_contlen_linear/model.ckpt.
INFO:tensorflow:loss = 945.50653, step = 4686
INFO:tensorflow:global_step/sec: 220.994
INFO:tensorflow:loss = 369.11615, step = 4786 (0.455 sec)
INFO:tensorflow:Saving checkpoints for 4886 into ../trained_model/url_len_contlen_linear/model.ckpt.
INFO:tensorflow:global_step/sec: 330.58
INFO:tensorflow:loss = 742.90857, step = 4886 (0.301 sec)
INFO:tensorflow:global_step/sec: 409.165
INFO:tensor

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Signatures INCLUDED in export for Classify: None
INFO:tensorflow:Signatures INCLUDED in export for Regress: None
INFO:tensorflow:Signatures INCLUDED in export for Predict: ['predict']
INFO:tensorflow:Signatures EXCLUDED from export because they cannot be be served via TensorFlow Serving APIs:
INFO:tensorflow:'serving_default' : Classification input must be a single string Tensor; got {'url_length': <tf.Tensor 'Placeholder:0' shape=(?,) dtype=int64>, 'contentLength': <tf.Tensor 'Placeholder_1:0' shape=(?,) dtype=int64>}
INFO:tensorflow:'classification' : Classification input must be a single string Tensor; got {'url_length': <tf.Tensor 'Placeholder:0' shape=(?,) dtype=int64>, 'contentLength': <tf.Tensor 'Placeholder_1:0' shape=(?,) dtype=int64>}
INFO:tensorflow:'regression' : Regression input must be a single string Tensor; got {'url_length': <tf.Tensor 'Placeholder:0' shape=(?,) dtype=int64>, 'contentLength': <tf.Tensor 'Placeholde

### Serving 

In [99]:
# Example URL; its character count (66) is the value written to url.json
# for the local prediction request.
url_ex = "https://github.com/warmspringwinds/tf-image-segmentation/issues/10"
len(url_ex)

66

In [100]:
%%writefile url.json
{"url_length": 66}

Overwriting url.json


In [None]:
%%bash

# NOTE(review): absolute, machine-specific path to one exported SavedModel
# (timestamped export directory) — will not exist on other machines; consider
# deriving it from the notebook's relative ../trained_model directory.
location=/Users/paul/Desktop/Google_ASL/trained_model/url_len_dnn/export/exporter/1552430670
# Run a local prediction against that export using the instances in url.json.
gcloud ml-engine local predict --model-dir=$location --json-instances=url.json --verbosity debug 

#### Load models

In [41]:
import tensorflow as tf

# Directory of the previously trained linear model that uses url_length only.
output_dir = '../trained_model/url_len_linear/'

def create_feature_cols():
    """Return the single numeric feature column ('url_length') for this model."""
    # TODO: try bucketizing url_length (original note mentions "5")
    return [tf.feature_column.numeric_column('url_length')]

# NOTE: this rebinds the module-level create_feature_cols / feature_cols that
# train_and_evaluate reads, and rebinds `estimator` further down.
feature_cols = create_feature_cols()

run_config = tf.estimator.RunConfig(
    model_dir=output_dir,
    save_summary_steps=100,
    save_checkpoints_steps=200,
    keep_checkpoint_max=10,
)

# Estimator bound to output_dir through run_config.
estimator = tf.estimator.LinearClassifier(
    config=run_config,
    feature_columns=feature_cols,
)

INFO:tensorflow:Using config: {'_model_dir': '../trained_model/url_len_linear/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 200, '_save_checkpoints_secs': None, '_session_config': None, '_keep_checkpoint_max': 10, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x12e068860>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [42]:
# Set up prediction over the test split; the results are consumed in the next
# cell via list(predictions).
predictions = estimator.predict(input_fn=make_test_input_fn(test_df))

In [43]:
# Materialize every prediction, then display only the first 20.
list(predictions)[:20]

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ../trained_model/url_len_linear/model.ckpt-7000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


[{'logits': array([0.9270397], dtype=float32),
  'logistic': array([0.7164743], dtype=float32),
  'probabilities': array([0.28352568, 0.7164743 ], dtype=float32),
  'class_ids': array([1]),
  'classes': array([b'1'], dtype=object)},
 {'logits': array([-0.37803602], dtype=float32),
  'logistic': array([0.40660068], dtype=float32),
  'probabilities': array([0.59339935, 0.40660068], dtype=float32),
  'class_ids': array([0]),
  'classes': array([b'0'], dtype=object)},
 {'logits': array([-0.09155598], dtype=float32),
  'logistic': array([0.477127], dtype=float32),
  'probabilities': array([0.52287304, 0.477127  ], dtype=float32),
  'class_ids': array([0]),
  'classes': array([b'0'], dtype=object)},
 {'logits': array([-0.20296489], dtype=float32),
  'logistic': array([0.44943225], dtype=float32),
  'probabilities': array([0.55056775, 0.44943225], dtype=float32),
  'class_ids': array([0]),
  'classes': array([b'0'], dtype=object)},
 {'logits': array([0.9270397], dtype=float32),
  'logistic': 

In [46]:
# Scratch check: str.isalnum() on the one-character string '1' (evaluates to True).
'1'.isalnum()

True