In [1]:
import pandas as pd
import pickle
import numpy as np

import tensorflow as tf

import six
import os
import numpy as np

import tensorflow_ranking as tfr
    

# Load the race-level and run-level tables.
races = pd.read_csv("Data/races.csv")
runs = pd.read_csv("Data/runs.csv")

# Impute missing categorical values with each column's most frequent value.
runs["horse_country"] = runs["horse_country"].fillna(
    runs["horse_country"].mode()[0])
runs["horse_type"] = runs["horse_type"].fillna(
    runs["horse_type"].mode()[0])

In [2]:
# Sorted unique category values; these serve as the vocabularies for the
# categorical feature columns defined further down.
COUNTRY_VOCAB = np.unique(runs["horse_country"].values)
TYPE_VOCAB = np.unique(runs["horse_type"].values)
GOING_VOCAB = np.unique(races["going"].values)

# Paths to the TFRecord files containing the training and test instances.
# NOTE(review): the two extensions differ (".tfrecord" vs ".tfrecords") --
# confirm both match the files actually on disk.
_TRAIN_DATA_PATH = "Data/train.tfrecord"
_TEST_DATA_PATH = "Data/test.tfrecords"

# Vocabulary path for query and document tokens. Nothing in the visible part
# of this notebook reads this constant.
_VOCAB_PATH = "/tmp/vocab.txt" # file does not exist

# The maximum number of examples (horses) per list (race) in the dataset.
# Example lists are padded or truncated to this size.
_LIST_SIZE = 15

# Name of the per-example feature used as the relevance label.
# NOTE(review): TF-Ranking losses and metrics treat larger labels as more
# relevant. If "horse_placing" is the raw finishing position (1 = winner),
# confirm the stored label was inverted before writing the TFRecords.
_LABEL_FEATURE = "horse_placing"

# Padding labels are set negative so that the corresponding examples can be
# ignored in loss and metrics.
_PADDING_LABEL = -1

# Learning rate for the Adagrad optimizer.
_LEARNING_RATE = 0.05

# Batch size for the input pipeline, followed by the scoring-network
# parameters. Hidden layer widths are stored as strings and converted with
# int() where the network is built.
_BATCH_SIZE = 32
_HIDDEN_LAYER_DIMS = ["64", "32", "16"]
_DROPOUT_RATE = 0.2
_GROUP_SIZE = 1  # Pointwise scoring.

# Location of the model directory and the number of training steps.
_MODEL_DIR = "Model/ranking_model_dir"
_NUM_TRAIN_STEPS = 15 * 1000

In [3]:
def context_feature_columns():
    """Build the per-race (context) feature columns.

    Returns:
        dict mapping feature name to feature column: numeric columns for
        "distance" and "surface", and an indicator (one-hot) column over
        GOING_VOCAB for "going".
    """
    columns = {
        name: tf.feature_column.numeric_column(key=name)
        for name in ("distance", "surface")
    }
    columns["going"] = tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            key="going", vocabulary_list=GOING_VOCAB))
    return columns

def example_feature_columns():
    """Build the per-horse (example) feature columns fed to the scorer.

    Returns:
        dict mapping feature name to feature column: a numeric column for
        "horse_age", and indicator (one-hot) columns over COUNTRY_VOCAB and
        TYPE_VOCAB for "horse_country" and "horse_type".
    """
    age_column = tf.feature_column.numeric_column(key="horse_age")

    country_column = tf.feature_column.categorical_column_with_vocabulary_list(
        key="horse_country",
        vocabulary_list=COUNTRY_VOCAB)
    country_one_hot_column = tf.feature_column.indicator_column(country_column)

    type_column = tf.feature_column.categorical_column_with_vocabulary_list(
        key="horse_type",
        vocabulary_list=TYPE_VOCAB)
    type_one_hot_column = tf.feature_column.indicator_column(type_column)

    # "horse_placing" is intentionally absent: it is the label, and input_fn
    # builds its own column for it when assembling the parsing spec.
    # NOTE(review): a "horse_rating" numeric column used to be constructed
    # here but was never returned, so it has never been a model input.
    # Confirm whether it should be added to the dict below.
    return {"horse_age": age_column,
            "horse_country": country_one_hot_column,
            "horse_type": type_one_hot_column}

def input_fn(path, num_epochs=None):
    """Read example-list-with-context records and return `(features, label)`.

    Args:
        path: File pattern of the TFRecord file(s) to read.
        num_epochs: Forwarded unchanged to `tfr.data.build_ranking_dataset`.

    Returns:
        A `(features, label)` pair. `features` is the dict yielded by the
        ranking dataset with the label entry removed; `label` is that entry
        squeezed on axis 2 and cast to float32.
    """
    # The label is parsed together with the example features and split off
    # from the feature dict below.
    label_column = tf.feature_column.numeric_column(
        _LABEL_FEATURE, dtype=tf.int64, default_value=_PADDING_LABEL)
    per_example_columns = [*example_feature_columns().values(), label_column]

    ranking_dataset = tfr.data.build_ranking_dataset(
        file_pattern=path,
        data_format='example_list_with_context',
        batch_size=_BATCH_SIZE,
        list_size=_LIST_SIZE,
        context_feature_spec=tf.feature_column.make_parse_example_spec(
            context_feature_columns().values()),
        example_feature_spec=tf.feature_column.make_parse_example_spec(
            per_example_columns),
        reader=tf.data.TFRecordDataset,
        shuffle=False,
        num_epochs=num_epochs)

    batch = tf.compat.v1.data.make_one_shot_iterator(ranking_dataset).get_next()
    raw_label = batch.pop(_LABEL_FEATURE)
    return batch, tf.cast(tf.squeeze(raw_label, axis=2), tf.float32)

def make_transform_fn():
    """Return the transform function that encodes raw features for scoring."""

    def _transform_fn(features, mode):
        """Encode `features` listwise with the context and example columns.

        Returns:
            A `(context_features, example_features)` pair.
        """
        encoded_context, encoded_examples = (
            tfr.feature.encode_listwise_features(
                features=features,
                context_feature_columns=context_feature_columns(),
                example_feature_columns=example_feature_columns(),
                mode=mode,
                scope="transform_layer"))
        return encoded_context, encoded_examples

    return _transform_fn

def make_score_fn():
    """Build the scoring function consumed by the groupwise ranking model."""

    def _score_fn(context_features, group_features, mode, params, config):
        """Score one group of documents with a feed-forward network.

        The flattened context and group features (taken in sorted key order)
        are concatenated and batch-normalised. Each width in
        _HIDDEN_LAYER_DIMS then adds dense -> batch norm -> ReLU -> dropout,
        and a final dense layer emits _GROUP_SIZE logits.
        """
        layers = tf.compat.v1.layers
        training = mode == tf.estimator.ModeKeys.TRAIN

        with tf.compat.v1.name_scope("input_layer"):
            flat_inputs = [
                layers.flatten(context_features[key])
                for key in sorted(context_feature_columns())
            ]
            flat_inputs += [
                layers.flatten(group_features[key])
                for key in sorted(example_feature_columns())
            ]
            net = tf.concat(flat_inputs, 1)

        net = layers.batch_normalization(
            net, training=training, momentum=0.99)

        # Widths are stored as strings in the config, hence the int() mapping.
        for width in map(int, _HIDDEN_LAYER_DIMS):
            net = layers.dense(net, units=width)
            net = layers.batch_normalization(
                net, training=training, momentum=0.99)
            net = layers.dropout(
                inputs=tf.nn.relu(net), rate=_DROPOUT_RATE, training=training)

        return layers.dense(net, units=_GROUP_SIZE)

    return _score_fn

def eval_metric_fns():
    """Return NDCG metric functions keyed "metric/ndcg@K" for K in 1, 3, 5, 10."""
    ndcg_key = tfr.metrics.RankingMetricKey.NDCG
    return {
        "metric/ndcg@%d" % cutoff: tfr.metrics.make_ranking_metric_fn(
            ndcg_key, topn=cutoff)
        for cutoff in (1, 3, 5, 10)
    }

# Single Adagrad optimizer instance shared by the train op below.
optimizer = tf.compat.v1.train.AdagradOptimizer(learning_rate=_LEARNING_RATE)


def _train_op_fn(loss):
    """Build the training op handed to the ranking head.

    Groups the optimizer's minimisation step with everything registered in
    the graph's UPDATE_OPS collection, so those updates run on each training
    step alongside the gradient update.
    """
    v1 = tf.compat.v1
    pending_updates = v1.get_collection(v1.GraphKeys.UPDATE_OPS)
    step_op = optimizer.minimize(
        loss=loss, global_step=v1.train.get_global_step())
    return tf.group([pending_updates, step_op])

def train_and_eval_fn():
    """Assemble the inputs for `tf.estimator.train_and_evaluate`.

    Reads the module-level `model_fn`, so that name must be defined before
    this function is called.

    Returns:
        A `(ranker, train_spec, eval_spec)` tuple.
    """
    def _train_input():
        return input_fn(_TRAIN_DATA_PATH)

    def _eval_input():
        return input_fn(_TEST_DATA_PATH, num_epochs=1)

    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        model_dir=_MODEL_DIR,
        config=tf.estimator.RunConfig(save_checkpoints_steps=1000))

    training = tf.estimator.TrainSpec(
        input_fn=_train_input, max_steps=_NUM_TRAIN_STEPS)
    evaluation = tf.estimator.EvalSpec(
        name="eval", input_fn=_eval_input, throttle_secs=15)

    return (estimator, training, evaluation)

In [4]:
# Ranking loss used for training, selected by its TF-Ranking loss key.
_LOSS = tfr.losses.RankingLossKey.APPROX_NDCG_LOSS
loss_fn = tfr.losses.make_loss_fn(_LOSS)

# Ranking head tying together the loss, the NDCG evaluation metrics and the
# function that builds the training op.
ranking_head = tfr.head.create_ranking_head(
    loss_fn=loss_fn,
    eval_metric_fns=eval_metric_fns(),
    train_op_fn=_train_op_fn)

In [5]:
# Estimator model function built from the scorer, the feature transform and
# the ranking head. train_and_eval_fn() looks this name up at call time, so
# this cell has to run before that function is called.
model_fn = tfr.model.make_groupwise_ranking_fn(
    group_score_fn=make_score_fn(),
    transform_fn=make_transform_fn(),
    group_size=_GROUP_SIZE,
    ranking_head=ranking_head)

In [None]:
# Clean up the model directory so the Estimator starts from scratch rather
# than picking up checkpoints left by an earlier run. The path comes from
# _MODEL_DIR (the same value the Estimator is given) instead of a second
# hard-coded copy, and the removal goes through TensorFlow's file API rather
# than a POSIX-only shell escape.
if tf.io.gfile.exists(_MODEL_DIR):
    tf.io.gfile.rmtree(_MODEL_DIR)

ranker, train_spec, eval_spec = train_and_eval_fn()
print("starting here")
tf.estimator.train_and_evaluate(ranker, train_spec, eval_spec)

W0429 12:24:28.377306 4445152704 model_fn.py:630] Estimator's model_fn (<function _make_model_fn.<locals>._model_fn at 0x136d332f0>) includes params argument, but params are not passed to Estimator.
W0429 12:24:28.389637 4445152704 deprecation.py:506] From /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow_core/python/ops/resource_variable_ops.py:1635: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
W0429 12:24:28.390735 4445152704 deprecation.py:323] From /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow_core/python/training/training_util.py:236: Variable.initialized_value (from tensorflow.python.ops.variables) is deprecated and will be removed in a future version.
Instructions for updating:
Use Variable.read_value. Var

Got here
Got here
Got here
Got here
Got here


W0429 12:24:29.779824 4445152704 deprecation.py:323] From /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow_core/python/feature_column/feature_column_v2.py:4267: IndicatorColumn._variable_shape (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed in a future version.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
W0429 12:24:29.780683 4445152704 deprecation.py:323] From /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow_core/python/feature_column/feature_column_v2.py:4322: VocabularyListCategoricalColumn._num_buckets (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed in a future version.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
W0429 12:24:29.985708 4445152704 deprecation.py