In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from utils import timestamp, YEAR
from utils.arrayview import ArrayView, TimeseriesView

from prediction.models.fit_model import TSModel
from prediction.models.model_parameters import ModelParameters
from prediction.models.factor_management import FactorList
from prediction.models.preprocessing import load_slices, print_factor_order

from prediction.models.prediction import factornames_trimmed
from prediction.models.parameters import factor_build_end
from prediction.tools.plotting import varinfo
from os import listdir

In [2]:
# Load the two data containers used by the rest of the notebook.
# `av` is later extended with the new factor columns (`av[f] = arr`) and
# handed to ModelParameters / FactorList; `tsav` is handed to TSModel.
# NOTE(review): both paths are relative, so the notebook must be run from
# its own directory for '../datadev/' to resolve.
av = ArrayView.from_file('../datadev/brain_final2cut.av.bcolz')
tsav = load_slices(path='../datadev/')

In [3]:
# Names of the new factors whose loaded values are replaced by their
# absolute value (looked up by name when the factor files are read).
take_abs = [
    'Race4RunsAgoRaceClass',
    'DistanceLast',
    'Race5RunsAgoRaceType',
    'Speed',
    'RacesSincePreviousTrainerWin',
]

In [4]:
# Names to drop from the list of new factors before they are loaded.
# NOTE(review): 'Dam_Going_Distance_SR ' carries a trailing space -- confirm
# it matches the stored factor name, otherwise that factor is not excluded.
rem = [
    '7DaysWins',
    'LstRanking',
    'KouldsScore_RaceClass_Dam',
    'DistanceRegression',
    'Dam_GoingDistance_Win',
    'Dam_Going_Distance_SR ',
    'KouldsScore_GoingDistance_Dam',
]

In [5]:
# Assemble the candidate factor names from the three saved chunks and drop
# every name listed in `rem`.
fpath = "../ukhr_facs/"
newfactors_chunk1 = np.load(fpath + 'newfactors_chunk1.npy').tolist()
newfactors_chunk2 = np.load(fpath + 'newfactors_chunk2.npy').tolist()
newfactors_chunk3 = np.load(fpath + 'newfactors_chunk3.npy').tolist()

# Filter instead of calling list.remove(): remove() only deletes the first
# occurrence, so a name present in more than one chunk would survive.
# Names are compared with surrounding whitespace stripped so that an
# exclusion entry with a stray trailing space still matches its factor.
excluded_names = {name.strip() for name in rem}
newfactors = [
    name
    for name in newfactors_chunk1 + newfactors_chunk2 + newfactors_chunk3
    if name.strip() not in excluded_names
]

In [6]:
# Add new factors: read each factor's values from '<fpath><name>.csv',
# store them on `av` under the factor name and register the name in
# `factornames_trimmed`.
for f in newfactors:
    # Cast to float before inserting NaN: if read_csv infers an integer
    # column, assigning np.nan into the integer array raises ValueError.
    arr = pd.read_csv(fpath + f + '.csv', header=None).values.flatten().astype(float)
    # 0 and 999 are mapped to NaN, i.e. treated as missing values.
    arr[arr == 0] = np.nan
    arr[arr == 999] = np.nan
    if f in take_abs:
        arr = np.abs(arr)
    av[f] = arr
    # Guard the append so re-running this cell does not register the same
    # factor name twice in the (imported, shared) list.
    if f not in factornames_trimmed:
        factornames_trimmed.append(f)

In [7]:
# Model configuration; the out-of-sample start is set to
# factor_build_end + YEAR.
pars = ModelParameters(
    av,
    oos_start=factor_build_end + YEAR,
    depth=3,
    lmbd=10,
    verbose=True,
)

# Build the factor list over every registered factor name, preprocess it
# with the parameters above, and export it as the matrix used for fitting.
# The recorded log shows preprocess() rescaling the factors, filling in
# missing values and computing each factor as a linear combination of the
# others.
fl = FactorList(av, factornames_trimmed)
fl.preprocess(pars)
factors = fl.asmatrix()

INFO:models:Getting factors from av and rescaling...


.

 .

 .

 .

 .

 .

 .

 .

 .

INFO:models:Filling in missing values...


INFO:models:Computing each factor as linear combination of all the others...


  out.t = out.b / np.sqrt(out.sb2)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = (x >= self.b) & cond0


 .

 .

 .

 .

 .

 .

 .

 .

 .

INFO:models:Number of missing patterns: 248457


 .

 .

 .

 .

 .

 .

 .

 .

 .

 .

 10000


.

 .

 .

 .

 .

 .

 .

 .

 .

 .

 20000


.

 .

 .

 .

 .

 .

 .

 .

 .

 .

 30000


.

 .

 .

 .

 .

 .

 .

 .

 .

 .

 40000


.

 .

 .

 .

 .

 .

 .

 .

 .

 .

 50000


.

 .

 .

 .

 .

 .

 .

 .

 .

 .

 60000


.

 .

 .

 .

 .

 .

 .

 .

 .

 .

 70000


.

 .

 .

 .

 .

 .

 .

 .

 .

 .

 80000


.

 .

 .

 .

 .

 .

 .

 .

 .

 .

 90000


.

 .

 .

 .

 .

 .

 .

 .

 .

 .

 100000


.

 .

 .

 .

 .

 .

 .

 .

 .

 .

 110000


.

 .

 .

 .

 .

 .

 .

 .

 .

 .

 120000


.

 .

 .

 .

 .

 .

 .

 .

 .

 .

 130000


.

 .

 .

 .

 .

 .

 .

 .

 .

 .

 140000


.

 .

 .

 .

 .

 .

 .

 .

 .

 .

 150000


.

 .

 .

 .

 .

 .

 .

 .

 .

 .

 160000


.

 .

 .

 .

 .

 .

 .

 .

 .

 .

 170000


.

 .

 .

 .

 .

 .

 .

 .

 .

 .

 180000


.

 .

 .

 .

 .

 .

 .

 .

 .

 .

 190000


.

 .

 .

 .

 .

 .

 .

 .

 .

 .

 200000


.

 .

 .

 .

 .

 .

 .

 .

 .

 .

 210000


.

 .

 .

 .

 .

 .

 .

 .

 .

 .

 220000


.

 .

 .

 .

 .

 .

 .

 .

 .

 .

 230000


.

 .

 .

 .

 .

 .

 .

 .

 .

 .

 240000


.

 .

 .

 .

 .

 .

 .

 .




In [8]:
# Build the time-series model from the preprocessed factor matrix, the
# loaded slices and the model parameters, then fit it slice by slice
# (the recorded log shows slices 0 through 9 being fitted).
tsmod = TSModel(factors, tsav, pars)
tsmod.fit_slices()

INFO:models:Fitting slice 0


  stats, self.step2probs[sl] = self.concat_and_fit(strata, result, nonrunner, [fback, flay, np.log(probs)], ts_idx, valid2, verbose=False, step=2)


INFO:models:Fitting slice 1


INFO:models:Fitting slice 2


INFO:models:Fitting slice 3


INFO:models:Fitting slice 4


INFO:models:Fitting slice 5


INFO:models:Fitting slice 6


INFO:models:Fitting slice 7


INFO:models:Fitting slice 8


INFO:models:Fitting slice 9


In [9]:
# Display the `ll` array of the first stats object (presumably
# log-likelihoods per slice -- TODO confirm the meaning of the 3 columns).
# NOTE(review): the recorded output has 11 rows and the last one is all NaN,
# while only slices 0-9 were fitted -- confirm this is expected.
tsmod.stats1.ll

array([[-1828.70799792, -1828.70799792, -2085.07664627],
       [-1834.06939713, -1834.06939713, -2072.98080725],
       [-1839.1884764 , -1839.1884764 , -2058.84316501],
       [-1842.82566958, -1842.82566958, -2039.63052015],
       [-1860.3580681 , -1860.3580681 , -1995.0186356 ],
       [-1877.77651178, -1877.77651178, -1968.57109916],
       [-1913.6261206 , -1913.6261206 , -1922.29410662],
       [-1927.82737437, -1927.82737437, -1898.10700051],
       [-1930.30687733, -1930.30687733, -1894.90693772],
       [-1934.36713373, -1934.36713373, -1881.55572481],
       [           nan,            nan,            nan]])

In [10]:
# Display the `ll` array of the second stats object (presumably
# log-likelihoods per slice -- TODO confirm the meaning of the 3 columns).
# NOTE(review): the recorded output has 11 rows and the last one is all NaN,
# while only slices 0-9 were fitted -- confirm this is expected.
tsmod.stats2.ll

array([[-1817.67691656, -2101.6073103 , -2101.6073103 ],
       [-1823.07778118, -2088.90836444, -2088.90836444],
       [-1828.30980478, -2073.89053089, -2073.89053089],
       [-1831.63404125, -2053.0454477 , -2053.0454477 ],
       [-1849.56915012, -2002.50469715, -2002.50469715],
       [-1867.22892933, -1968.3025247 , -1968.3025247 ],
       [-1902.71533146, -1915.6966515 , -1915.6966515 ],
       [-1917.54851191, -1883.18971331, -1883.18971331],
       [-1920.30086556, -1879.7581569 , -1879.7581569 ],
       [-1924.88772399, -1865.96603558, -1865.96603558],
       [           nan,            nan,            nan]])

In [11]:
# Print the factors ranked by their "t-score sum" (the recorded output lists
# them in descending order) and keep the helper's return value.
# NOTE(review): presumably `sorted_factors` holds the names in that ranked
# order -- confirm against print_factor_order.
sorted_factors = print_factor_order(tsmod.stats1, factornames_trimmed)

  0:                                                              z027f9f0f5    t-score sum: 114.86
  1:                                                              zec0c22a48    t-score sum: 84.91
  2:                                                              z412893062    t-score sum: 64.61
  3:                                                              ze4c91eac0    t-score sum: 60.64
  4:                                                              z5981b9f89    t-score sum: 54.70
  5:                                                              z245159235    t-score sum: 54.04
  6:                                                              z34b78e584    t-score sum: 52.46
  7:                                                             RAdjRanking    t-score sum: 47.57
  8:                                                              z6809c316d    t-score sum: 46.74
  9:                                                              z77c9cc0a5    t-score sum: 45.77
 10:     

In [12]:
# Write the fitted model's simulation data to 'simdata.p'. The path is
# relative, so the file lands in the current working directory.
# NOTE(review): presumably a pickle file given the '.p' extension -- confirm.
tsmod.write_simdata('simdata.p')