In [21]:
import pprint
# Shared pretty-printer used by later cells to display prediction results.
_PP_INDENT = 4
pp = pprint.PrettyPrinter(indent=_PP_INDENT)

In [22]:
import tempfile
import urllib

# Relative paths of the CSV inputs read by the data-loading cell below.
test_file = 'data/clean_testing_data.csv'
train_file = 'data/clean_training_data.csv'

In [23]:
import pandas as pd

# Column names applied to the training CSV, in order. They are passed to
# read_csv via `names=`, and `skiprows=1` drops the file's first line
# (presumably its own header row -- confirm against the data files).
COLUMNS = [
    'TIMESTAMP', 'FISHING_STATUS', 'CHECKSUM', 'slot_increment', 'sync_state',
    'true_heading', 'sog', 'slots_to_allocate', 'rot', 'nav_status',
    'repeat_indicator', 'raim', 'id', 'spare', 'keep_flag',
    'cog', 'timestamp', 'y', 'x', 'position_accuracy',
    'rot_over_range', 'mmsi', 'special_manoeuvre', 'slot_timeout', 'slot_offset',
    'slot_number', 'received_stations', 'utc_spare', 'utc_min', 'utc_hour',
    'unit_flag', 'spare2', 'mode_flag', 'm22_flag', 'commstate_cs_fill',
    'commstate_flag', 'display_flag', 'dsc_flag', 'band_flag', 'gnss',
    'fix_type', 'type_and_cargo', 'spare3', 'dim_d', 'assigned_mode',
    'dim_b', 'dim_c', 'dim_a', 'name', 'dte',
]
df_train = pd.read_csv(
    train_file,
    names=COLUMNS,
    skiprows=1,
    skipinitialspace=True,
)

# The testing CSV uses the same columns in the same order, except that it
# has no 'FISHING_STATUS' column.
COLUMNS_TEST = list(COLUMNS)
COLUMNS_TEST.remove('FISHING_STATUS')
df_test = pd.read_csv(
    test_file,
    names=COLUMNS_TEST,
    skiprows=1,
    skipinitialspace=True,
)


In [24]:
# Name of the DataFrame column that stores the integer class label; it is
# written on both df_train and df_test and read back by the input functions.
LABEL_COLUMN = "label"
def status_as_num(x):
    """Convert a fishing-status value into a binary label.

    Returns 1 for 'Fishing', 0 for 'Unknown' or 'Not_Fishing', and None
    for any other value.
    """
    if x in ('Unknown', "Not_Fishing"):
        return 0
    if x == 'Fishing':
        return 1
    # Unrecognised values fall through without a label.
    return None

def status_unknown(x):
    """Return the placeholder label 0 regardless of ``x``.

    The argument is accepted only so the function can be used with
    ``Series.apply``; its value is never inspected.
    """
    placeholder_label = 0
    return placeholder_label

# Training labels: translate each FISHING_STATUS value to 0/1, then store
# the result as an integer column.
_train_labels = df_train["FISHING_STATUS"].apply(status_as_num)
df_train[LABEL_COLUMN] = _train_labels.astype(int)

# The test frame has no FISHING_STATUS column, so every row receives the
# placeholder label 0. "TIMESTAMP" is used only to get one value per row.
_test_labels = df_test["TIMESTAMP"].apply(status_unknown)
df_test[LABEL_COLUMN] = _test_labels.astype(int)


In [25]:
# Columns handed to the model as dense, real-valued features.
CONTINUOUS_COLUMNS = [
    "sog",
    "cog",
    "x",
    "y",
]
# Columns that would be fed as sparse (categorical) features; none are used.
CATEGORICAL_COLUMNS = list()


In [26]:
import tensorflow as tf

def _build_input(df):
    """Turn a DataFrame into the ``(features, label)`` pair used as model input.

    Args:
        df: DataFrame containing every column named in CONTINUOUS_COLUMNS,
            CATEGORICAL_COLUMNS and LABEL_COLUMN.

    Returns:
        A tuple ``(feature_cols, label)`` where ``feature_cols`` maps each
        feature column name to a tensor and ``label`` is a constant tensor
        built from the LABEL_COLUMN values.
    """
    # Each continuous column becomes a constant Tensor holding its values.
    feature_cols = {k: tf.constant(df[k].values) for k in CONTINUOUS_COLUMNS}

    # Each categorical column becomes a one-value-per-row SparseTensor.
    # Adding entries to the existing dict (instead of concatenating
    # ``.items()`` results) keeps this working on Python 3, where dict views
    # cannot be added together. As before, a categorical entry replaces a
    # continuous one if both lists contain the same name.
    # NOTE(review): ``shape=`` is the keyword the original code passed to
    # tf.SparseTensor; confirm it matches the installed TensorFlow version.
    for k in CATEGORICAL_COLUMNS:
        feature_cols[k] = tf.SparseTensor(
            indices=[[i, 0] for i in range(df[k].size)],
            values=df[k].values,
            shape=[df[k].size, 1])

    # The label column becomes a constant Tensor.
    label = tf.constant(df[LABEL_COLUMN].values)

    return feature_cols, label


def train_fn():
    """Input function for training: features and labels built from ``df_train``."""
    return _build_input(df_train)


def predict_fn():
    """Input function for prediction: features and labels built from ``df_test``."""
    return _build_input(df_test)


In [27]:
# One real-valued feature column per continuous input, keyed by the same
# names that the input functions use for their feature dictionaries.
_real_valued = tf.contrib.layers.real_valued_column
feature_sog, feature_cog, feature_x, feature_y = map(
    _real_valued, ("sog", "cog", "x", "y"))

# feature_raim = tf.contrib.layers.sparse_column_with_hash_bucket("raim", hash_bucket_size=100)

In [28]:
# The model's files go into a freshly created temporary directory.
model_dir = tempfile.mkdtemp()

# Linear classifier over the four real-valued feature columns.
_linear_feature_columns = [feature_sog, feature_cog, feature_x, feature_y]
m = tf.contrib.learn.LinearClassifier(
    model_dir=model_dir,
    feature_columns=_linear_feature_columns)




In [29]:
# Number of training steps passed to fit().
TRAIN_STEPS = 5000
m.fit(steps=TRAIN_STEPS, input_fn=train_fn)




LinearClassifier()

In [30]:
# Per-row class probabilities for the test set.
prediction = m.predict_proba(input_fn=predict_fn)
pp.pprint(prediction)

# Write element 1 of every row (presumably the probability of label 1 --
# confirm against the classifier's class ordering), one value per line.
# The `with` block closes the file even if a write fails, and the explicit
# ASCII encode keeps the binary-mode write valid on both Python 2 and 3.
with open('data/predictions_c.txt', 'wb') as outputFile:
    for p in prediction:
        outputFile.write((str(float(p[1])) + '\n').encode('ascii'))

array([[  2.72093490e-02,   9.72790599e-01],
       [  9.39468503e-01,   6.05315641e-02],
       [  1.00000000e+00,   2.03838321e-10],
       ..., 
       [  9.97358859e-01,   2.64117937e-03],
       [  1.00000000e+00,   2.51744439e-08],
       [  2.76224494e-01,   7.23775506e-01]], dtype=float32)


In [31]:
# Predicted class for each test row.
prediction = m.predict(input_fn=predict_fn)
pp.pprint(prediction)

# Write each predicted class as a float, one value per line.
# The `with` block closes the file even if a write fails, and the explicit
# ASCII encode keeps the binary-mode write valid on both Python 2 and 3.
with open('data/predictions_d.txt', 'wb') as outputFile:
    for p in prediction:
        outputFile.write((str(float(p)) + '\n').encode('ascii'))

array([1, 0, 0, ..., 0, 0, 1])
