In [1]:
import shutil
import numpy as np
import pandas as pd
import tensorflow as tf
print(tf.__version__)

1.8.0


In [21]:
train_df = pd.read_csv('./train.csv', sep=',')
eval_df = pd.read_csv('./eval.csv', sep=',')
test_df = pd.read_csv('./test.csv', sep=',')

In [22]:
train_df.groupby(['isHiddenFraudulent']).url.count()

isHiddenFraudulent
False    91720
True     90612
Name: url, dtype: int64

In [23]:
def add_more_features(dataframe):
    df = dataframe.copy()
    df['url_length'] = df['url'].apply(lambda x:len(x))    
    return df

In [26]:
# Create pandas input function
def make_train_input_fn(df, num_epochs):
    return tf.estimator.inputs.pandas_input_fn(
        x = add_more_features(df),
        y = df['isHiddenFraudulent'],
        batch_size = 256,
        num_epochs = num_epochs,
        shuffle = True,
        queue_capacity = 1024,
        num_threads = 1
    )

In [27]:
def make_valid_input_fn(df):
    return tf.estimator.inputs.pandas_input_fn(
        x = add_more_features(df),
        y = df['isHiddenFraudulent'], 
        batch_size = 256,
        num_epochs = 1,
        shuffle = False,
        queue_capacity = 1024,
        num_threads = 1
    )

In [28]:
def make_test_input_fn(df):
    return tf.estimator.inputs.pandas_input_fn(
        x = add_more_features(df),
        batch_size = 256,
        num_epochs = 1,
        shuffle = False,
        queue_capacity = 1024,
        num_threads = 1
    )

In [57]:
def create_feature_cols():
    cols = [tf.feature_column.numeric_column('url_length')]
    #try bucketize url_length 5 
    return cols 

feature_cols = create_feature_cols()

In [97]:
def serving_input_fn():
    
    def get_length(single_string):
        x = tf.expand_dims(single_string,0)
        splitted = tf.string_split(x, delimiter="").values
        length = tf.size(splitted)
        return length

    def get_lengths(string_batch):
        return tf.map_fn(get_length, string_batch, dtype=tf.int32)
    
    json_feature_placeholders = {
        'url': tf.placeholder(tf.string, None),
    }
    
    features = {
        'url_length': get_lengths(json_feature_placeholders['url'])
    }    

    return tf.estimator.export.ServingInputReceiver(features, json_feature_placeholders)

In [98]:
# Create estimator train and evaluate function
def train_and_evaluate(output_dir, num_train_steps):
    run_config = tf.estimator.RunConfig(model_dir=output_dir, 
                                      keep_checkpoint_max=10,
                                      save_summary_steps=100, 
                                      save_checkpoints_steps=200)

    estimator = tf.estimator.LinearClassifier(feature_columns=feature_cols, 
                                           config=run_config)

    #   estimator = tf.estimator.DNNRegressor(feature_columns=feature_cols, hidden_units=[32,8,2],
    #                                        config=run_config)

    train_spec = tf.estimator.TrainSpec(input_fn=make_train_input_fn(a, 10), 
                                      max_steps=num_train_steps)

    export_latest = tf.estimator.LatestExporter(name='linear', 
                                              serving_input_receiver_fn=serving_input_fn)

    eval_spec = tf.estimator.EvalSpec(input_fn=make_valid_input_fn(b), 
                                    steps=None,
                                    start_delay_secs = 1, # start evaluating after N seconds
                                    throttle_secs = 50,     # evaluate every N seconds
                                    exporters=export_latest
                                    )

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

In [99]:
a = train_df.drop(['poweredBy', 'serverType', 'lastModified', 'contentType'], axis=1)
b = eval_df.drop(['poweredBy', 'serverType', 'lastModified', 'contentType'], axis=1)

In [100]:
outdir = '../trained_model/url_len/'

In [101]:
shutil.rmtree(outdir, ignore_errors = True)
train_and_evaluate(outdir, 1000)

INFO:tensorflow:Using config: {'_model_dir': '../trained_model/url_len/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 200, '_save_checkpoints_secs': None, '_session_config': None, '_keep_checkpoint_max': 10, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x12dd5e898>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after 50 secs (eval_spec.throttle_secs) or training is finished.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:

In [102]:
%%writefile url.json
{"url": "https://github.com/warmspringwinds/tf-image-segmentation/issues/10"}
{"url": "https://github.com/warmspringwinds/tf-image-segmentation/issues/0"}

Overwriting url.json


In [103]:
%%bash

location=/Users/paul/Desktop/Google_ASL/trained_model/url_len/export/linear/1552408727

gcloud ml-engine local predict --model-dir=$location --json-instances=url.json --verbosity debug 

DEBUG: Running [gcloud.ml-engine.local.predict] with arguments: [--json-instances: "url.json", --model-dir: "/Users/paul/Desktop/Google_ASL/trained_model/url_len/export/linear/1552408727", --verbosity: "debug"]
DEBUG: (gcloud.ml-engine.local.predict) 2019-03-12 16:39:02.838389: I tensorflow/core/platform/cpu_feature_guard.cc:140] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
Traceback (most recent call last):
  File "/Users/paul/google-cloud-sdk/lib/googlecloudsdk/command_lib/ml_engine/local_predict.py", line 184, in <module>
    main()
  File "/Users/paul/google-cloud-sdk/lib/googlecloudsdk/command_lib/ml_engine/local_predict.py", line 179, in main
    signature_name=args.signature_name)
  File "/Users/paul/google-cloud-sdk/lib/third_party/ml_sdk/cloud/ml/prediction/prediction_lib.py", line 101, in local_predict
    predictions = model.predict(instances, signature_name=signature_name)
  File "/Users/paul/google-cloud-sdk/lib/third_party/m

CalledProcessError: Command 'b'\nlocation=/Users/paul/Desktop/Google_ASL/trained_model/url_len/export/linear/1552408727\n\ngcloud ml-engine local predict --model-dir=$location --json-instances=url.json --verbosity debug \n'' returned non-zero exit status 1.

In [66]:
!ls -a

[1m[36m.[m[m                                     dataset_no_dupli_no_http.csv
[1m[36m..[m[m                                    dataset_no_dupli_no_http_only_url.csv
[1m[36m.git[m[m                                  encoding.ipynb
.gitignore                            eval.csv
[1m[36m.ipynb_checkpoints[m[m                    playground.ipynb
[31mHiddenFraudulentURLs.csv[m[m              test.csv
README.md                             tokenization.ipynb
Untitled.ipynb                        train.csv
analysis_paul.ipynb                   transform_csv_file.ipynb
basic_linear_model.ipynb              url.json
data_visualisation.ipynb


In [178]:
tf.__version__

'1.8.0'