In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from utils import timestamp, YEAR
from utils.arrayview import ArrayView, TimeseriesView

from prediction.models.fit_model import TSModel
from prediction.models.model_parameters import ModelParameters
from prediction.models.factor_management import FactorList
from prediction.models.preprocessing import load_slices, print_factor_order

from prediction.models.prediction import factornames_trimmed
from prediction.models.parameters import factor_build_end
from prediction.tools.plotting import varinfo
from os import listdir

In [2]:
# Load the main ArrayView from its on-disk file, and the slices that are
# later handed to TSModel, from the same data directory.
av = ArrayView.from_file('../datadev/brain_final2cut.av.bcolz')
tsav = load_slices(path='../datadev/')

In [3]:
# Factors whose loaded values are replaced by their absolute value
# before being stored in `av` (see the "Add new factors" loop).
take_abs = [
    'DistanceLast',
    'Speed',
    'RacesSincePreviousTrainerWin',
    'DaysSinceLastRun',
    'Race2RunsAgoRaceClass',
    'Race1RunAgoRaceClass',
    'Race4RunsAgo',
    'Race5RunsAgo',
    'Race3RunsAgoRaceClass',
    'Race2RunsAgo',
    'Race5RunsAgoRaceClass',
    'Race3RunsAgo',
]

In [4]:
# Select new factors
# Every chunk file stores a list of candidate factor names. Keep the leading
# `pcnt` fraction of each chunk and concatenate the selections in chunk order.
# NOTE(review): the "ordered" in the file names suggests the names are ranked
# best-first, which is why the head of each list is kept -- confirm.
fpath = "../ukhr_factors/"
pcnt = 0.5
n_chunks = 8  # chunk files are numbered 1..n_chunks

newfactors = []
for chunk_no in range(1, n_chunks + 1):
    chunk_names = np.load(fpath + 'newfactors_ordered_chunk%d.npy' % chunk_no).tolist()
    # Number of names to keep from this chunk (same rounding as before).
    n_keep = int(round(pcnt * len(chunk_names)))
    newfactors += chunk_names[:n_keep]


In [5]:
# Factor names that are skipped when the new factors are added to `av`.
rem = [
    'NumberOfResults',
    'KouldsScore_Distance20pc_Sire',
    'LastTimeTrainerChange',
    'LastTimeDistanceChange',
    'RecentWins',
]

In [6]:
# Add new factors
# For each candidate factor (except those listed in `rem`):
#   1. read its values from '<fpath><name>.csv',
#   2. treat the codes 0 and 999 as missing (NaN),
#   3. take the absolute value if the factor is listed in `take_abs`,
#   4. store it in `av` and register its name in `factornames_trimmed`.
# The factor is rolled back again if, with it included, no position selected
# by `pars.build_mask` is free of NaNs across axis 0 of the factor matrix.
#
# Iterate over a snapshot of `newfactors`: the list is pruned inside the loop,
# and removing from the list being iterated would silently skip the element
# following every rolled-back factor.
for f in list(newfactors):
    if f in rem:
        continue
    # Cast to float so NaN can be assigned even when the CSV was parsed with
    # an integer dtype (assigning NaN into an int array raises ValueError).
    arr = pd.read_csv(fpath + f + '.csv', header=None).values.flatten().astype(float)
    arr[arr == 0] = np.nan
    arr[arr == 999] = np.nan
    if f in take_abs:
        arr = np.abs(arr)
    av[f] = arr
    # Guard against duplicate names (e.g. when this cell is re-run on the
    # same kernel); the list is mutated in place because later cells read it.
    if f not in factornames_trimmed:
        factornames_trimmed.append(f)
    # Rebuilt every iteration because `av` has just changed.
    pars = ModelParameters(av, oos_start=factor_build_end+YEAR, depth=3, lmbd=10, verbose=True)
    fl = FactorList(av, factornames_trimmed)
    factors = fl.asmatrix()
    missing = np.isnan(factors)
    allgood = ~np.any(missing, axis=0) & pars.build_mask
    # `not` instead of `~`: bitwise negation is only a logical "not" for
    # numpy booleans and would always be truthy on a plain Python bool.
    if not np.any(allgood):
        del av[f]
        factornames_trimmed.remove(f)
        newfactors.remove(f)

In [7]:
# Rebuild the model parameters and the factor list on the final set of
# factor names, preprocess the factors, and extract them as a matrix that
# is passed to TSModel in the next cell.
pars = ModelParameters(
    av,
    oos_start=factor_build_end + YEAR,
    depth=3,
    lmbd=10,
    verbose=True,
)
fl = FactorList(av, factornames_trimmed)
fl.preprocess(pars)
factors = fl.asmatrix()

INFO:models:Getting factors from av and rescaling...


.

 .

 .

 .

 .

 .

 .

 .

 .

 .

 100


.

 .

 .

 .

 .

 .

 .

 .

 .

 .

INFO:models:Filling in missing values...


INFO:models:Computing each factor as linear combination of all the others...


 200


  out.B = 1 - np.sum((y1 - x1.dot(out.b)) ** 2) / np.sum((y1 - np.mean(y1)) ** 2)
  out.t = out.b / np.sqrt(out.sb2)
  out.t = out.b / np.sqrt(out.sb2)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = (x >= self.b) & cond0


.

 .

 .

 .

 .

 .

 .

 .

 .

 .

 100


.

 .

 .

 .

 .

 .

 .

  out.B = 1 - np.sum((y1 - x1.dot(out.b)) ** 2) / np.sum((y1 - np.mean(y1)) ** 2)


 .

 .

 .

 200


INFO:models:Number of missing patterns: 171364


.

 .

 .

 .

 

.

 .

 .

 .

 .

 .

 10000


.

 .

 .

 .

 .

 .

 .

 .

 .

 .

 20000


.

 .

 .

 .

 .

 .

 .

 .

 .

 .

 30000


.

 .

 .

 .

 .

 .

 .

 .

 .

 .

 40000


.

 .

 .

 .

 .

 .

 .

 .

 .

 .

 50000


.

 .

 .

 .

 .

 .

 .

 .

 .

 .

 60000


.

 .

 .

 .

 .

 .

 .

 .

 .

 .

 70000


.

 .

 .

 .

 .

 .

 .

 .

 .

 .

 80000


.

 .

 .

 .

 .

 .

 .

 .

 .

 .

 90000


.

 .

 .

 .

 .

 .

 .

 .

 .

 .

 100000


.

 .

 .

 .

 .

 .

 .

 .

 .

 .

 110000


.

 .

 .

 .

 .

 .

 .

 .

 .

 .

 120000


.

 .

 .

 .

 .

 .

 .

 .

 .

 .

 130000


.

 .

 .

 .

 .

 .

 .

 .

 .

 .

 140000


.

 .

 .

 .

 .

 .

 .

 .

 .

 .

 150000


.

 .

 .

 .

 .

 .

 .

 .

 .

 .

 160000


.

 .

 .

 .

 .

 .

 .

 .

 .

 .

 170000


.




In [8]:
# Build the model from the preprocessed factor matrix, the loaded slices and
# the model parameters, then fit it slice by slice (the log below reports
# slices 0 through 9).
tsmod = TSModel(factors, tsav, pars)
tsmod.fit_slices()

INFO:models:Fitting slice 0


  stats, self.step2probs[sl] = self.concat_and_fit(strata, result, nonrunner, [fback, flay, np.log(probs)], ts_idx, valid2, verbose=False, step=2)


INFO:models:Fitting slice 1


INFO:models:Fitting slice 2


INFO:models:Fitting slice 3


INFO:models:Fitting slice 4


INFO:models:Fitting slice 5


INFO:models:Fitting slice 6


INFO:models:Fitting slice 7


INFO:models:Fitting slice 8


INFO:models:Fitting slice 9


In [9]:
# Print the factors together with their "t-score sum" computed from the
# fitted model's `stats1`, and keep the returned ordering.
# NOTE(review): the saved output lists 'RecentWins' first with a nan t-score
# sum. That name is in `rem`, so it was presumably already present in the
# imported `factornames_trimmed` -- confirm whether it should be dropped.
sorted_factors = print_factor_order(tsmod.stats1, factornames_trimmed)

  0:                                                              RecentWins    t-score sum:  nan
  1:                                                              z027f9f0f5    t-score sum: 97.48
  2:                                                              zec0c22a48    t-score sum: 89.73
  3:                                                              zac38414de    t-score sum: 67.43
  4:                                                              z6809c316d    t-score sum: 60.84
  5:                                                              z412893062    t-score sum: 58.68
  6:                                                              z245159235    t-score sum: 57.96
  7:                                                              ze4c91eac0    t-score sum: 57.69
  8:                                                              z5981b9f89    t-score sum: 52.99
  9:                                                              z77c9cc0a5    t-score sum: 51.97
 10:       

In [10]:
# Write the fitted model's simulation data to 'simdata.p' in the current
# working directory.
# NOTE(review): the '.p' suffix suggests a pickle file -- confirm, and only
# load it back from trusted locations.
tsmod.write_simdata('simdata.p')