# ThoroughBet Simulation


## Load necessary modules

In [1]:
import numpy as np

from utils import settings, timestamp, YEAR
from utils.arrayview import ArrayView, TimeseriesView


from prediction.models.preprocessing import Model
from prediction.models.prediction import factornames_trimmed
from prediction.models.parameters import factor_build_end
from prediction.tools.clustering import write_dic_to_simdata

## Load data

In [2]:
# Load the main ArrayView from 'brain_final2cut.av.bcolz', resolved through settings.paths.
av = ArrayView.from_file(settings.paths.join('brain_final2cut.av.bcolz'))

In [3]:
# Load the consecutively numbered slice files into a dict keyed by slice index (0, 1, 2, ...).
tsav = {}
sl = 0
while True:
    try:
        tsav[sl] = ArrayView.from_file(settings.paths.join('brain_final2_slice_%s.av.bcolz' % sl))
    except ValueError:
        # The first index whose load raises ValueError ends the scan.
        # NOTE(review): presumably from_file raises ValueError for a missing slice file -- confirm;
        # any other ValueError raised while loading would also stop the scan silently.
        break
    sl += 1

## Preprocessing

In [4]:
# Wrap av in a Model; oos_start is factor_build_end shifted forward by YEAR.
mod = Model(av, oos_start=factor_build_end+YEAR)

In [5]:
# Factor names forwarded as `high_kurtosis_factors` to mod._preprocess_factors.
# NOTE(review): how that method treats these factors is not visible in this notebook.
high_kurtosis_factors =  ['z64f5be67e', 'z90adc182a', 'z7081bf371', 'z34b808e99', 'z757be272e', 'z5a85cd6a9',
                         'zf991b634a', 'z62651f605', 'zd002b7067', 'z2ef7fedca', 'z6f11029f7', 'z412893062',
                          'z919b9585a', 'z89b0eda37', 'z31780b3f4', 'z6631693d3', 'z0b27f29ad', 'zd7cd94e4c', 
                          'zf5b2aef2a']

In [6]:
# Factor names forwarded as `price_factors` to mod._preprocess_factors.
# NOTE(review): how that method treats these factors is not visible in this notebook.
price_factors = ['zb392bb74a', 'z6809c316d', 'zd678f0538', 'z027f9f0f5', 'z88e79930c', 'z4a72dc02f',
                 'z1a3573928', 'z7b15df227']

In [7]:
# Preprocess the trimmed factor list (timed with the IPython %time magic), passing the
# high-kurtosis and price factor name lists through as keyword arguments.
%time factors = mod._preprocess_factors(factornames_trimmed, high_kurtosis_factors = high_kurtosis_factors,\
                                        price_factors = price_factors, verbose=True)

INFO:models:Getting factors from av and rescaling...


. . . . .

INFO:models:Filling in missing values...
INFO:models:Computing each factor as linear combination of all the others...


 . . . . .

INFO:models:Number of missing patterns: 7754


 . . . . . . .

INFO:models:Transforming factors by applying CL-model on their Taylor expansions...


 . . .

  coefse = np.sqrt(np.diag(information_matrix))


 . .CPU times: user 3min 9s, sys: 1.78 s, total: 3min 11s
Wall time: 55.4 s



In [8]:
# Show the transposed shape; per the recorded output, `factors` has one row per factor (57).
factors.T.shape

(1631851, 57)

In [9]:
# Union (OR) of the model's is1, is2 and oos masks; used below to select rows.
# NOTE(review): presumably in-sample part 1 / part 2 and out-of-sample -- confirm against Model.
predict_mask = mod.is1|mod.is2|mod.oos

In [10]:
def new_factors_array(X, predict_mask=predict_mask):
    """Scatter the rows of ``X`` into a zero-filled factor array.

    Parameters
    ----------
    X : 2-D array-like exposing ``.shape`` and ``.T``
        One row per position selected by ``predict_mask``, one column per factor.
    predict_mask : index array
        Selects the target columns of the result. Defaults to the module-level
        ``predict_mask`` as it was bound when this function was defined.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(X.shape[1], predict_mask.shape[0])`` holding ``X.T``
        in the columns selected by ``predict_mask`` and zeros everywhere else.
    """
    n_factors = X.shape[1]
    n_positions = predict_mask.shape[0]

    # Start from zeros so unselected columns stay zero.
    scattered = np.zeros((n_factors, n_positions))
    scattered[:, predict_mask] = X.T
    return scattered

In [11]:
def DF(mask, factors, av, factors_names, other_names):
    """Build a DataFrame of the masked factors plus extra columns taken from ``av``.

    Parameters
    ----------
    mask : index array
        Selects columns of ``factors`` and entries of each ``av[col]``.
    factors : 2-D array
        Indexed as ``factors[:, mask]``; the selection is transposed so every
        row of ``factors`` becomes one DataFrame column.
    av : mapping-like
        ``av[col][mask]`` supplies the values for each name in ``other_names``.
    factors_names : sequence of str
        Column labels for the factor columns.
    other_names : iterable of str
        Keys of ``av`` appended as additional columns, in the given order.

    Returns
    -------
    pandas.DataFrame
    """
    import pandas as pd

    selected = factors[:, mask].T
    frame = pd.DataFrame(data=selected, columns=factors_names)

    # Extra columns are appended one by one so they keep the order of other_names.
    for name in other_names:
        frame[name] = av[name][mask]

    return frame

In [12]:
import pandas as pd

pd.set_option('display.max_columns', 90)

# One 'f<i>' label per factor row. The count is taken from the array itself rather
# than hard-coded, so the labels always match what DF() receives as its columns.
col_names = ['f{}'.format(i) for i in range(1, factors.shape[0] + 1)]

# Factor values for the rows in predict_mask, plus identifying columns taken from av.
df = DF(predict_mask, factors, av, col_names, ['event_id', 'runner_id', 'result'])
# Carry the is1 / oos membership flags along, restricted to the same rows.
df['is1'] = mod.is1[predict_mask]
df['oos'] = mod.oos[predict_mask]
df.head()

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18,f19,f20,f21,f22,f23,f24,f25,f26,f27,f28,f29,f30,f31,f32,f33,f34,f35,f36,f37,f38,f39,f40,f41,f42,f43,f44,f45,f46,f47,f48,f49,f50,f51,f52,f53,f54,f55,f56,f57,event_id,runner_id,result,is1,oos
0,0.066411,1.812371,0.140033,0.007517,-0.000144,0.370123,0.8869,-0.000888,1.492165,-0.504111,0.174814,0.783295,0.47842,0.02652,1.220996,0.317993,-0.000908,-0.000375,0.004848,-0.017851,0.043987,-0.237318,1.049784,-0.046792,-0.006285,0.000628,-0.065017,0.001235,-0.018441,0.274973,-0.487135,0.084544,-0.116107,0.002079,-0.001362,-0.160866,0.091133,0.966224,2.384186e-07,2.49246,0.753668,-0.000403,0.234473,0.250251,0.023256,0.628183,0.0986,-0.014886,-0.089427,0.53691,-0.012463,-0.143313,0.943943,0.417261,-0.000921,-0.00078,0.1193,293661,360456,3,True,False
1,-0.154959,0.615217,-0.069783,-0.001667,0.001293,0.163312,-0.100933,-0.016348,0.145181,0.085654,0.057041,0.43098,0.132121,0.013572,0.413222,0.189882,-0.016198,-0.002962,0.014569,-0.015469,-0.026517,0.277365,-0.374422,0.029834,0.056561,-0.000221,0.116112,-0.002068,-0.024272,0.090061,-0.030274,-0.498561,0.035182,-0.01871,0.029094,0.031624,-0.061174,0.254454,2.384186e-07,1.648349,0.073634,-0.650115,0.177804,-0.052191,-0.034483,0.370725,-0.247407,-0.023127,0.550295,-0.121513,0.112166,0.040612,0.224962,0.057615,-0.016085,-0.01733,0.026977,293661,375590,5,True,False
2,0.066411,0.442863,0.069257,0.007517,-0.000144,0.207937,-0.100933,0.014814,-0.010327,-0.566212,0.355908,0.246179,-0.305634,0.031401,0.364949,0.144203,0.014812,0.017871,0.009206,0.01274,-0.01444,0.264392,0.339009,0.039109,-0.006285,0.000204,0.173482,0.001554,0.055569,-0.092425,-0.26665,0.043125,0.001112,0.002079,-0.011063,0.171808,0.014969,0.155725,2.384186e-07,0.941642,0.146075,-0.109316,0.106961,0.005249,0.083641,0.144267,0.0986,0.08141,-0.152771,-0.183572,-0.012463,0.159303,0.48867,0.036059,0.0148,0.014925,0.051179,293661,374610,7,True,False
3,-0.154959,0.765992,0.129201,-0.02548,-0.000144,0.112316,-0.257098,0.012801,0.304235,0.226926,-0.260937,-0.048408,-0.043558,0.019547,0.54989,-0.284258,0.012796,0.00347,0.009206,0.011642,-0.004117,-0.122674,-0.421721,0.048495,-0.006285,-3.1e-05,0.239139,0.001421,-0.018441,-0.13433,0.250868,0.486319,0.086636,0.002079,-0.009765,0.1904,0.130183,0.447011,2.384186e-07,0.498932,-0.041847,0.597555,0.001817,-0.037324,-0.019826,-0.163311,-0.247407,-0.014886,0.461782,0.289931,-0.012463,0.088135,-0.126984,-0.495465,0.012784,0.012911,-0.262583,293661,373638,1,True,False
4,0.066411,0.329832,-0.093768,0.007517,-0.000144,0.112316,0.503109,-0.024129,1.263628,-0.359817,0.086009,-0.048408,-0.043558,0.03145,1.252166,0.158341,-0.024174,-0.019263,0.009206,0.01274,-0.03828,0.10589,0.209527,-0.033071,-0.006285,-9.9e-05,-0.037533,0.000975,0.097792,0.022945,-0.043148,-0.245328,-0.009627,0.002079,0.011331,-0.122763,-0.217356,1.080508,2.384186e-07,-0.004469,0.049962,0.495737,0.116301,-0.110655,-0.044347,-0.163311,0.0986,0.0239,-0.237007,-0.440662,-0.012463,-0.125458,0.136239,0.442488,-0.024187,-0.024024,0.267151,293661,347906,4,True,False


##### use PCA 

In [None]:
from sklearn.decomposition import PCA
decomp = PCA(n_components =25)
pca_data = decomp.fit_transform(df.ix[:, 'f1':'f57'].values)
print decomp.explained_variance_

In [None]:
# Share of the total variance explained by each retained component.
print decomp.explained_variance_ratio_

In [None]:
# Spread the PCA scores back over the full mask length (zeros outside predict_mask),
# giving one row per component.
new_factors =  new_factors_array (pca_data)
new_factors.shape

In [None]:
# Fit the model on the slices using the PCA-only factors (timed with %time),
# then print the likelihood returned by the fit.
# NOTE(review): the meaning of depth / lmbd / fit_afresh is defined by Model.fit_slices,
# which is not visible in this notebook.
%time new_model_coefs, new_model_step1prob, new_model_step2prob, new_model_likelihood, inds \
    =   mod.fit_slices(tsav, new_factors, depth=3, lmbd=10, verbose=False, fit_afresh=True)

print 'new_model_likelihood'   
print new_model_likelihood
print '..................'

##### Add new polynomial factors

In [None]:
from itertools import combinations
# Label the PCA score columns 'pca_f1' .. 'pca_f<k>'. k is read from pca_data so the
# labels stay in step with the number of components actually produced.
df_pca = pd.DataFrame(data=pca_data,
                      columns=['pca_f%s' % i for i in range(1, pca_data.shape[1] + 1)])
df_pca.head()

In [None]:
# Append the element-wise product of every unordered pair of columns as a new
# column named '<first>_<second>'.
# combinations() materialises its input before iterating, so the columns added
# inside the loop are not themselves paired during this pass.
# NOTE(review): re-running this cell pairs the already-added product columns as well.
for i, j in combinations(df_pca.columns, 2):
    df_pca[i +'_'+j] = df_pca[i].values * df_pca[j].values
df_pca.head()

In [None]:
# Spread every column of df_pca over the full mask length (zeros outside predict_mask),
# giving one row per column of df_pca.
new_factors =  new_factors_array (df_pca)
new_factors.shape

In [None]:
# Fit the model on the slices using the factors built from df_pca (timed with %time),
# overwriting the results of any earlier fit, then print the returned likelihood.
# NOTE(review): the meaning of depth / lmbd / fit_afresh is defined by Model.fit_slices,
# which is not visible in this notebook.
%time new_model_coefs, new_model_step1prob, new_model_step2prob, new_model_likelihood, inds \
    =   mod.fit_slices(tsav, new_factors, depth=3, lmbd=10, verbose=False, fit_afresh=True)

print 'new_model_likelihood'   
print new_model_likelihood
print '..................'

In [None]:
# Stack the original preprocessed factors on top of the df_pca-derived factors,
# so both sets are available as rows of a single array.
new_factors =  np.vstack((factors,new_factors_array (df_pca)))
new_factors.shape

In [None]:
# Fit the model on the slices using the stacked factor array (timed with %time),
# overwriting the results of any earlier fit, then print the returned likelihood.
# NOTE(review): the meaning of depth / lmbd / fit_afresh is defined by Model.fit_slices,
# which is not visible in this notebook.
%time new_model_coefs, new_model_step1prob, new_model_step2prob, new_model_likelihood, inds \
    =   mod.fit_slices(tsav, new_factors, depth=3, lmbd=10, verbose=False, fit_afresh=True)

print 'new_model_likelihood'   
print new_model_likelihood
print '..................'

In [None]:
# Pass the step-1 probabilities and coefficients of the most recent fit, together with
# the out-of-sample mask and av, to write_dic_to_simdata under the given file name.
# NOTE(review): new_model_* hold whatever fit cell ran last; run top to bottom that is the
# fit on original + PCA/product factors -- confirm the file name reflects the intended model.
from prediction.tools.clustering import write_dic_to_simdata, dic_to_tenzor, ll_diff
write_dic_to_simdata('simdata_new_pca_polyfactors.p', new_model_step1prob, new_model_coefs, mod.oos, av =av)