In [21]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import itertools

# Get the data
runs = pd.read_csv("Data/runs.csv")
races = pd.read_csv("Data/races.csv")
## quick fix some nan values
runs["horse_country"] = runs["horse_country"].replace(np.nan, runs["horse_country"].mode()[0])
runs["horse_type"] = runs["horse_type"].replace(np.nan, runs["horse_type"].mode()[0])

EXAMPLE_FEATURES = ['horse_age', 'horse_country', 'horse_type', 'horse_rating']
CONTEXT_FEATURES = ['surface', 'distance', 'going']

In [22]:
# https://gist.github.com/fabianp/2020955

from sklearn import svm, linear_model, cross_validation


def transform_pairwise(X, y):
    """Transforms data into pairs with balanced labels for ranking
    Transforms a n-class ranking problem into a two-class classification
    problem. Subclasses implementing particular strategies for choosing
    pairs should override this method.
    In this method, all pairs are choosen, except for those that have the
    same target value. The output is an array of balanced classes, i.e.
    there are the same number of -1 as +1
    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        The data
    y : array, shape (n_samples,) or (n_samples, 2)
        Target labels. If it's a 2D array, the second column represents
        the grouping of samples, i.e., samples with different groups will
        not be considered.
    Returns
    -------
    X_trans : array, shape (k, n_feaures)
        Data as pairs
    y_trans : array, shape (k,)
        Output class labels, where classes have values {-1, +1}
    """
    X_new = []
    y_new = []
    y = np.asarray(y)
    if y.ndim == 1:
        y = np.c_[y, np.ones(y.shape[0])]
    comb = itertools.combinations(range(X.shape[0]), 2)
    for k, (i, j) in enumerate(comb):
        if y[i, 0] == y[j, 0] or y[i, 1] != y[j, 1]:
            # skip if same target or different group
            continue
        X_new.append(X[i] - X[j])
        y_new.append(np.sign(y[i, 0] - y[j, 0]))
        # output balanced classes
        if y_new[-1] != (-1) ** k:
            y_new[-1] = - y_new[-1]
            X_new[-1] = - X_new[-1]
    return np.asarray(X_new), np.asarray(y_new).ravel()


class RankSVM(svm.LinearSVC):
    """Performs pairwise ranking with an underlying LinearSVC model
    Input should be a n-class ranking problem, this object will convert it
    into a two-class classification problem, a setting known as
    `pairwise ranking`.
    See object :ref:`svm.LinearSVC` for a full description of parameters.
    """

    def fit(self, X, y):
        """
        Fit a pairwise ranking model.
        Parameters
        ----------
        X : array, shape (n_samples, n_features)
        y : array, shape (n_samples,) or (n_samples, 2)
        Returns
        -------
        self
        """
        X_trans, y_trans = transform_pairwise(X, y)
        super(RankSVM, self).fit(X_trans, y_trans)
        return self

    def decision_function(self, X):
        return np.dot(X, self.coef_.ravel())

    def predict(self, X):
        """
        Predict an ordering on X. For a list of n samples, this method
        returns a list from 0 to n-1 with the relative order of the rows of X.
        The item is given such that items ranked on top have are
        predicted a higher ordering (i.e. 0 means is the last item
        and n_samples would be the item ranked on top).
        Parameters
        ----------
        X : array, shape (n_samples, n_features)
        Returns
        -------
        ord : array, shape (n_samples,)
            Returns a list of integers representing the relative order of
            the rows in X.
        """
        if hasattr(self, 'coef_'):
            return np.argsort(np.dot(X, self.coef_.ravel()))
        else:
            raise ValueError("Must call fit() prior to predict()")

    def score(self, X, y):
        """
        Because we transformed into a pairwise problem, chance level is at 0.5
        """
        X_trans, y_trans = transform_pairwise(X, y)
        return np.mean(super(RankSVM, self).predict(X_trans) == y_trans)    

In [63]:
# get my features and targets
features = []
labels = []

max_race_num = 500  # only take this many races
for ID in range(max_race_num):
    race = races.iloc[ID, :]
    raceRuns = runs.loc[runs["race_id"]==ID]
    for run_num in range(len(raceRuns)):
        run = raceRuns.iloc[run_num]
        tempFeatures = [run["horse_age"], run["horse_rating"], run["place_odds"], run["win_odds"]] 
        tempFeatures.extend(oneHotEncodeVal("horse_type", run))
        tempFeatures.extend(oneHotEncodeVal("horse_country", run))
        tempLabel = [run["result"], ID]
        
        features.append(tempFeatures)
        labels.append(tempLabel)
        
n_samples = len(features)        

cv = cross_validation.KFold(n_samples, 5)
train, test = iter(cv).__next__()

X = np.array(features)
Y = np.array(labels)

# print the performance of ranking
rank_svm = RankSVM().fit(X[train], Y[train])
print ('Performance of ranking ', rank_svm.score(X[test], Y[test]))

Performance of ranking  0.6755757748080751


In [42]:
runs.columns

Index(['race_id', 'horse_no', 'horse_id', 'result', 'won', 'lengths_behind',
       'horse_age', 'horse_country', 'horse_type', 'horse_rating',
       'horse_gear', 'declared_weight', 'actual_weight', 'draw',
       'position_sec1', 'position_sec2', 'position_sec3', 'position_sec4',
       'position_sec5', 'position_sec6', 'behind_sec1', 'behind_sec2',
       'behind_sec3', 'behind_sec4', 'behind_sec5', 'behind_sec6', 'time1',
       'time2', 'time3', 'time4', 'time5', 'time6', 'finish_time', 'win_odds',
       'place_odds', 'trainer_id', 'jockey_id'],
      dtype='object')

In [58]:
def oneHotEncodeVal(col, run):
    val = run[col]
    uniqueVals = np.unique(runs[col])
    output = np.zeros(len(uniqueVals))
    for index, uVal in enumerate(uniqueVals):
        if val == uVal:
            output[index] = 1
            return output

In [None]:
#### THIS FRAMEWORK WILL WORK. NEED TO RECOVER OLD FEATURE ENGINEERING STUFF TO TRY TO MAKE IT A LOT BETTER
### THEN NEED TO FIGURE OUT PARAMS FOR SVM
#### AFTER THAT, INCORPERATE BETTING FRAMEWORK INTO IT (LONG TERM ...)


## XGBOOST OBJECTIVE VALS ARE: rank:pairwise, rank:ndcg, rank:map