In [1]:
import numpy as np
import matplotlib.pyplot as plot
import pandas as pd
from helper import *

# Load the cleaned main table; later cells add rating columns to it and write
# it back to this same path.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
main = pd.read_pickle("../Data/clean_main.df")

In [2]:
# Plan for taking sectional times into account:
# add one speed rating for the first sectional (200 or 400) and one for the last sectional (400).

# Steps (shown for the first sectional; the last sectional repeats them):
#   1. Get the average first-sectional time for each type of race (venue / surface / distance)
#   2. Convert every horse's run into a raw rating based on that average
#   3. Get the class averages of this raw rating
#   4. Get the track variant
#   5. Get the final first-sectional rating (raw rating + track variant)

In [3]:
# Average first-sectional time ("time1") for every venue / surface / distance
# combination, using only runs dated before the split date. The result is a
# dict keyed by "<venue> <surface> <distance>" (plus a "samples" entry holding
# the number of runs used) and is saved to sec1_averages.npy.
date = pd.to_datetime("1998-07-19")
data = main[main.date < date]
model_data = main[main.date >= date]

samples = len(data)
dic = {"samples": samples}

# Group on all three keys at once so each average really is specific to the
# surface. The previous nested loops filtered the distance on the venue-level
# frame, which silently mixed every surface at that venue into one average.
# observed=True keeps only combinations that actually occur in the data.
for (v, s, d), dData in data.groupby(["venue", "surface", "distance"], observed=True):
    # Average first-sectional time for this venue / surface / distance
    avgTime = dData["time1"].mean()
    string = v + " " + str(s) + " " + str(d)
    dic[string] = avgTime
np.save('sec1_averages.npy', dic)

In [4]:
# Add speed ratings to each run in the model data
def addRawSpeedRatings_sec1(data):
    """Return a copy of ``data`` with a ``sec1_speed_rating_1`` column added.

    For every row, the ``time1``, ``distance``, ``venue`` and ``surface``
    values are passed to ``speed2rating_sec1`` and the returned rating is
    stored at the same row position. ``speed2rating_sec1`` and
    ``printProgress`` are not defined in this notebook (presumably they come
    from ``helper``).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``venue``, ``surface``, ``distance`` and
        ``time1``. It is copied, not modified.

    Returns
    -------
    pandas.DataFrame
        The copy, with the new ``sec1_speed_rating_1`` column.
    """
    data = data.copy()
    dataSize = len(data)
    ratings = np.zeros(dataSize)

    # Walk the four needed columns in lockstep instead of materialising a full
    # row Series with .iloc for every run; the per-row .iloc made this loop so
    # slow that the recorded run had to be interrupted.
    columns = [data[name].to_numpy() for name in ("venue", "surface", "distance", "time1")]
    for i, (venue, surface, distance, time) in enumerate(zip(*columns)):
        printProgress(i, dataSize)
        ratings[i] = speed2rating_sec1(time, distance, venue, surface)

    data["sec1_speed_rating_1"] = ratings

    return data

# Rate every run in the main table, then write the enlarged table back over
# the pickle it was loaded from.
main = main.pipe(addRawSpeedRatings_sec1)
main.to_pickle("../Data/clean_main.df")

0.00% Complete ...
1.31% Complete ...
2.63% Complete ...
3.94% Complete ...


KeyboardInterrupt: 

In [None]:
# DO THE SAME THING FOR CLASSES
# Mean raw first-sectional rating per race class, taken from runs that are
# both dated before the split date and run on fast going.
splitDate = pd.to_datetime("1998-07-19")

# Get only fast tracks
fast_tracks = ["GOOD", "GOOD TO FIRM", "FAST"]
data = main.loc[(main.date < splitDate) & main.going.isin(fast_tracks)]

# Get class avgs
# The class list comes from the full table, so a class with no qualifying
# runs still gets an entry (its mean is NaN).
classes = np.unique(main["race_class"])
classAvgs = {
    c: data.loc[data.race_class == c, "sec1_speed_rating_1"].mean()
    for c in classes
}

np.save('sec1_class_averages.npy', classAvgs)

In [None]:
# Now we create a track variant based on this
#
# For every meeting (one date at one venue) the track variant is the mean,
# over that meeting's races, of (class average rating - race average rating).
# Every run at the meeting receives the same variant.

# Load the class averages written by the class-average cell so this cell uses
# the freshly computed values, mirroring how the last-sectional cell loads its
# own averages.
SEC1_CLASS_AVGS = np.load("sec1_class_averages.npy", allow_pickle=True).item()

dataSize = len(main)
track_vars = np.zeros(dataSize)
dates = np.unique(main["date"])
venues = np.unique(main["venue"])

# Positional boolean masks: computed once per venue / once per date rather
# than once per (date, venue) pair.
venue_masks = {venue: (main.venue == venue).to_numpy() for venue in venues}

count = 0
for date in dates:
    printProgress(count, len(dates), jump=100)
    count += 1
    date_mask = (main.date == date).to_numpy()

    for venue in venues:
        meeting_mask = date_mask & venue_masks[venue]
        races = main.loc[meeting_mask]
        race_ids = np.unique(races.race_id)
        if len(race_ids)==0:
            continue
        diffs = 0.

        for race_id in race_ids:
            race = races.loc[races.race_id==race_id]
            # The class of the first run is taken as the class of the race
            c = race["race_class"].iloc[0]

            raceAvg = race["sec1_speed_rating_1"].mean()
            classAvg = SEC1_CLASS_AVGS[c]
            diffs += classAvg - raceAvg

            # Sanity check: every run in a race should share one class
            if race.loc[race.race_class==c].shape[0] != race.shape[0]:
                print("ISSUE")

        trackVar = diffs / len(race_ids)

        # Assign by position through the mask. Indexing the array with the
        # frame's index labels is only correct when the index is exactly
        # 0..n-1, which is not guaranteed for a cleaned/filtered table.
        track_vars[meeting_mask] = trackVar

main["sec1_track_variant"] = track_vars

main.to_pickle("../Data/clean_main.df")

In [None]:
# Final first-sectional rating = raw rating + track variant.
# Plain column addition gives the same values as a row-wise apply(axis=1)
# without running a Python lambda for every row.
main["sec1_speed_rating"] = main["sec1_speed_rating_1"] + main["sec1_track_variant"]

In [None]:
# Write the current state of the main table back over the pickle it was loaded from.
main.to_pickle("../Data/clean_main.df")

In [None]:
# Display the final first-sectional rating column as a quick visual check.
main["sec1_speed_rating"]

In [None]:
# DO THE SAME THING FOR THE LAST SECTION
# Average last-sectional time for every venue / surface / distance
# combination, using only runs dated before the split date. The result is a
# dict keyed by "<venue> <surface> <distance>" (plus a "samples" entry holding
# the number of runs used) and is saved to last_sec_averages.npy.
date = pd.to_datetime("1998-07-19")
data = main[main.date < date]
model_data = main[main.date >= date]

samples = len(data)
dic = {"samples": samples}

# Group on all three keys at once so each average really is specific to the
# surface. The previous nested loops filtered the distance on the venue-level
# frame, which silently mixed every surface at that venue into one average.
# observed=True keeps only combinations that actually occur in the data.
for (v, s, d), dData in data.groupby(["venue", "surface", "distance"], observed=True):
    string = v + " " + str(s) + " " + str(d)
    # SECTIONAL_DISTANCES is not defined in this notebook (presumably from
    # helper); the number of entries for this key selects the last "time<n>"
    # column.
    secDists = SECTIONAL_DISTANCES[string]
    n = len(secDists)

    # Average last-sectional time for this venue / surface / distance
    avgTime = dData["time"+str(n)].mean()

    dic[string] = avgTime
np.save('last_sec_averages.npy', dic)

# Add speed ratings to each run in the model data
def addRawSpeedRatings_last_sec(data):
    """Return a copy of ``data`` with a ``last_sec_speed_rating_1`` column added.

    For every row, the key ``"<venue> <surface> <distance>"`` is looked up in
    ``SECTIONAL_DISTANCES``; the number of entries ``n`` found there selects
    the row's ``time<n>`` column. That time, together with distance, venue and
    surface, is passed to ``speed2rating_last_sec`` and the returned rating is
    stored at the same row position. ``SECTIONAL_DISTANCES``,
    ``speed2rating_last_sec`` and ``printProgress`` are not defined in this
    notebook (presumably they come from ``helper``).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``venue``, ``surface``, ``distance`` and every
        ``time<n>`` column selected by the lookup. It is copied, not modified.

    Returns
    -------
    pandas.DataFrame
        The copy, with the new ``last_sec_speed_rating_1`` column.

    Raises
    ------
    KeyError
        If a row's key is missing from ``SECTIONAL_DISTANCES`` or the selected
        ``time<n>`` column does not exist.
    """
    data = data.copy()
    dataSize = len(data)
    ratings = np.zeros(dataSize)

    # Read the key columns once as arrays instead of materialising a full row
    # Series with .iloc for every run, which is very slow on a large table.
    venue_col = data["venue"].to_numpy()
    surface_col = data["surface"].to_numpy()
    distance_col = data["distance"].to_numpy()
    # "time<n>" columns are converted lazily, only for the n values that occur.
    time_cols = {}

    for i, (venue, surface, distance) in enumerate(zip(venue_col, surface_col, distance_col)):
        printProgress(i, dataSize)

        string = venue + " " + str(surface) + " " + str(distance)
        secDists = SECTIONAL_DISTANCES[string]
        n = len(secDists)

        column = "time"+str(n)
        if column not in time_cols:
            time_cols[column] = data[column].to_numpy()
        time = time_cols[column][i]

        ratings[i] = speed2rating_last_sec(time, distance, venue, surface)

    data["last_sec_speed_rating_1"] = ratings

    return data

# Rate every run in the main table for the last sectional.
main = addRawSpeedRatings_last_sec(main)
# Save to the same path the table is read from and written to everywhere else
# in this notebook; "../clean_main.df" dropped the Data/ directory and left a
# stray copy that no cell reads back.
main.to_pickle("../Data/clean_main.df")

# DO THE SAME THING FOR CLASSES
# Mean raw last-sectional rating per race class, taken from runs that are
# both dated before the split date and run on fast going.
splitDate = pd.to_datetime("1998-07-19")

# Get only fast tracks
fast_tracks = ["GOOD", "GOOD TO FIRM", "FAST"]
data = main.loc[(main.date < splitDate) & main.going.isin(fast_tracks)]

# Get class avgs
# The class list comes from the full table, so a class with no qualifying
# runs still gets an entry (its mean is NaN).
classes = np.unique(main["race_class"])
classAvgs = {
    c: data.loc[data.race_class == c, "last_sec_speed_rating_1"].mean()
    for c in classes
}

np.save('last_sec_class_averages.npy', classAvgs)

In [None]:
# Now we create a track variant based on this
#
# For every meeting (one date at one venue) the track variant is the mean,
# over that meeting's races, of (class average rating - race average rating),
# using the last-sectional ratings. Every run at the meeting receives the
# same variant.

# allow_pickle must be a real boolean; the string 'TRUE' only worked because
# any non-empty string is truthy.
LAST_SEC_CLASS_AVGS = np.load("last_sec_class_averages.npy", allow_pickle=True).item()
dataSize = len(main)
track_vars = np.zeros(dataSize)
dates = np.unique(main["date"])
venues = np.unique(main["venue"])

# Positional boolean masks: computed once per venue / once per date rather
# than once per (date, venue) pair.
venue_masks = {venue: (main.venue == venue).to_numpy() for venue in venues}

count = 0
for date in dates:
    printProgress(count, len(dates), jump=100)
    count += 1
    date_mask = (main.date == date).to_numpy()

    for venue in venues:
        meeting_mask = date_mask & venue_masks[venue]
        races = main.loc[meeting_mask]
        race_ids = np.unique(races.race_id)
        if len(race_ids)==0:
            continue
        diffs = 0.

        for race_id in race_ids:
            race = races.loc[races.race_id==race_id]
            # The class of the first run is taken as the class of the race
            c = race["race_class"].iloc[0]

            # Compare like with like: the last-sectional race average against
            # the last-sectional class average. This previously averaged the
            # first-sectional column (copy-paste from the sec1 cell).
            raceAvg = race["last_sec_speed_rating_1"].mean()
            classAvg = LAST_SEC_CLASS_AVGS[c]
            diffs += classAvg - raceAvg

            # Sanity check: every run in a race should share one class
            if race.loc[race.race_class==c].shape[0] != race.shape[0]:
                print("ISSUE")

        trackVar = diffs / len(race_ids)

        # Assign by position through the mask. Indexing the array with the
        # frame's index labels is only correct when the index is exactly
        # 0..n-1, which is not guaranteed for a cleaned/filtered table.
        track_vars[meeting_mask] = trackVar

main["last_sec_track_variant"] = track_vars

main.to_pickle("../Data/clean_main.df")

# Final last-sectional rating = raw rating + track variant.
# Plain column addition gives the same values as a row-wise apply(axis=1)
# without running a Python lambda for every row.
main["last_sec_speed_rating"] = main["last_sec_speed_rating_1"] + main["last_sec_track_variant"]

main.to_pickle("../Data/clean_main.df")

In [None]:
# Disabled: ratio of the first-sectional rating to the last-sectional rating.
#main["speed_rating_ratio"] = main.apply(lambda x: x.sec1_speed_rating / x.last_sec_speed_rating, axis=1)
# With the line above commented out, this cell only re-saves the table.
main.to_pickle("../Data/clean_main.df")