In [1]:
# ThoroughBet Simulation


## Load necessary modules

In [2]:
import numpy as np
from scipy.spatial.distance import cosine

from utils import settings, timestamp, YEAR
from utils.arrayview import ArrayView, TimeseriesView


from prediction.models.preprocessing import Model
from prediction.models.prediction import factornames_trimmed
from prediction.models.parameters import factor_build_end
from prediction.tools.clustering import write_dic_to_simdata, dic_to_tenzor

## Load data

In [3]:
# Main ArrayView of the notebook: it is passed to Model, indexed for event ids in
# old_data, and supplies the non-factor columns of the DataFrame built further down.
av = ArrayView.from_file(settings.paths.join('brain_final2cut.av.bcolz'))

In [4]:
# Weather ArrayView.
# NOTE(review): `av_w` is not referenced again in the visible part of this notebook;
# confirm it is still needed before keeping this load.
av_w = ArrayView.from_file(settings.paths.join('weather.av.bcolz'))

In [5]:
# Load the consecutively numbered slice files into a dict keyed by slice index.
# The loop stops at the first index for which ArrayView.from_file raises ValueError
# (presumably because that slice file does not exist - TODO confirm); any other
# exception type propagates. After the loop `sl` equals the number of slices loaded.
tsav = {}
sl = 0
while True:
    try:
        tsav[sl] = ArrayView.from_file(settings.paths.join('brain_final2_slice_%s.av.bcolz' % sl))
    except ValueError:
        break
    sl += 1

In [6]:
# Model built on `av` with oos_start = factor_build_end + YEAR. Its is1 / is2 / oos
# attributes and its fit_slices method are used by the cells below.
mod = Model(av, oos_start=factor_build_end+YEAR)

In [7]:
# Factor identifiers forwarded to mod._preprocess_factors as `high_kurtosis_factors`.
high_kurtosis_factors = [
    'z64f5be67e', 'z90adc182a', 'z7081bf371', 'z34b808e99', 'z757be272e',
    'z5a85cd6a9', 'zf991b634a', 'z62651f605', 'zd002b7067', 'z2ef7fedca',
    'z6f11029f7', 'z412893062', 'z919b9585a', 'z89b0eda37', 'z31780b3f4',
    'z6631693d3', 'z0b27f29ad', 'zd7cd94e4c', 'zf5b2aef2a',
]

# Factor identifiers forwarded to mod._preprocess_factors as `price_factors`.
price_factors = [
    'zb392bb74a', 'z6809c316d', 'zd678f0538', 'z027f9f0f5',
    'z88e79930c', 'z4a72dc02f', 'z1a3573928', 'z7b15df227',
]

In [8]:
# Build the preprocessed factor matrix (timed with the %time line magic).
# NOTE(review): this calls the underscore-prefixed Model._preprocess_factors directly;
# confirm there is no public entry point that should be used instead.
# NOTE(review): the trailing backslash looks like it is needed so the whole call is
# handed to the line magic as one logical line - verify before removing it.
%time factors = mod._preprocess_factors(factornames_trimmed, high_kurtosis_factors = high_kurtosis_factors,\
                                        price_factors = price_factors, verbose=True)

INFO:models:Getting factors from av and rescaling...


. . . . .

INFO:models:Filling in missing values...
INFO:models:Computing each factor as linear combination of all the others...


 . . . . .

INFO:models:Number of missing patterns: 7754


 . . . . . . .

INFO:models:Transforming factors by applying CL-model on their Taylor expansions...


 . . .

  coefse = np.sqrt(np.diag(information_matrix))


 . .CPU times: user 3min 9s, sys: 1.81 s, total: 3min 10s
Wall time: 55.5 s



In [9]:
# Union of the is1, is2 and oos selections of the model; the DataFrames and the
# extended factor array built below are restricted to / scattered through this mask.
predict_mask = mod.is1|mod.is2|mod.oos

In [10]:
# Creating new factors from the table columns; first, a helper that masks the events preceding the is1 period
def old_data(num, is1=mod.is1):
    """Mask the rows of ``av`` belonging to the last ``num`` events before ``is1`` starts.

    The event id at the first truthy position of ``is1`` is the cut-off: all
    distinct event ids strictly smaller than it are collected (sorted by
    ``np.unique``) and the ``num`` largest of them are kept.

    Parameters
    ----------
    num : int or number convertible with ``int``
        How many of the most recent earlier events to keep. Values <= 0 select
        no events.
    is1 : boolean array aligned with ``av.event_id``
        Defaults to ``mod.is1`` as evaluated when this function is defined.

    Returns
    -------
    Boolean array with the shape of ``av.event_id``; True where the row's
    event id is one of the selected past events.

    Raises
    ------
    IndexError
        If ``is1`` has no truthy element.
    """
    num = int(num)

    first_is1 = np.where(is1)[0][0]
    earlier_events = np.unique(av.event_id[av.event_id < av.event_id[first_is1]])

    # A plain `[-num:]` would turn num == 0 into `[-0:]`, i.e. *all* earlier events;
    # select an empty set explicitly for non-positive counts instead.
    past_events = earlier_events[-num:] if num > 0 else earlier_events[:0]

    return np.in1d(av.event_id, past_events)

In [11]:
# Rows of the 4000 most recent events that precede the first is1 row.
mask_past = old_data(4000)

In [12]:
# Sanity check: first selected row of the prediction mask vs. first row of the past-events mask.
np.where(predict_mask)[0][0], np.where(mask_past)[0][0]

(1123738, 1085943)

In [13]:
def new_factors_array(X, predict_mask=predict_mask):
    """Scatter the rows of ``X`` into a zero-filled, factor-major array.

    Parameters
    ----------
    X : 2-D array, one row per position selected by ``predict_mask`` and one
        column per factor.
    predict_mask : boolean array; defaults to the notebook-level
        ``predict_mask`` as evaluated when this function is defined.

    Returns
    -------
    Array of shape ``(X.shape[1], predict_mask.shape[0])`` holding ``X.T`` in
    the masked columns and zeros everywhere else.
    """
    n_factors = X.shape[1]
    n_rows = predict_mask.shape[0]

    scattered = np.zeros((n_factors, n_rows))
    scattered[:, predict_mask] = X.T
    return scattered

In [14]:
# Create a DataFrame from the data
def DF(mask, factors, av, factors_names, other_names):
    """Assemble a DataFrame from the selected factor columns plus extra fields.

    Parameters
    ----------
    mask : index (a boolean mask in this notebook) applied along axis 1 of
        ``factors`` and to every ``av[col]``.
    factors : 2-D array laid out as (factor, row).
    av : object supporting ``av[col][mask]`` for every name in ``other_names``.
    factors_names : column labels, one per factor.
    other_names : names looked up in ``av`` and appended as columns, in order.

    Returns
    -------
    pandas.DataFrame with one row per selected position: the factor columns
    first, followed by the ``other_names`` columns.
    """
    import pandas as pd

    frame = pd.DataFrame(factors.T[mask], columns=factors_names)
    for extra in other_names:
        frame[extra] = av[extra][mask]
    return frame

In [15]:
import pandas as pd
pd.set_option('display.max_columns', 90)

# One label per factor row (f1, f2, ...). The count is taken from `factors` itself
# rather than hard-coded, so labels and data cannot drift apart.
col_names = ['f{}'.format(i) for i in range(1, factors.shape[0] + 1)]

# Factor columns for the predict_mask rows plus identifying / outcome columns from `av`.
df = DF(predict_mask, factors, av, col_names,
        ['event_id', 'runner_id', 'result', 'start_time', 'jockey', 'trainer', 'prize'])

# Subset membership flags, aligned with the rows kept by predict_mask.
df['is1'] = mod.is1[predict_mask]
df['oos'] = mod.oos[predict_mask]

df.head()

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18,f19,f20,f21,f22,f23,f24,f25,f26,f27,f28,f29,f30,f31,f32,f33,f34,f35,f36,f37,f38,f39,f40,f41,f42,f43,f44,f45,f46,f47,f48,f49,f50,f51,f52,f53,f54,f55,f56,f57,event_id,runner_id,result,start_time,jockey,trainer,prize,is1,oos
0,0.066411,1.812371,0.140033,0.007517,-0.000144,0.370123,0.8869,-0.000888,1.492165,-0.504111,0.174814,0.783295,0.47842,0.02652,1.220996,0.317993,-0.000908,-0.000375,0.004848,-0.017851,0.043987,-0.237318,1.049784,-0.046792,-0.006285,0.000628,-0.065017,0.001235,-0.018441,0.274973,-0.487135,0.084544,-0.116107,0.002079,-0.001362,-0.160866,0.091133,0.966224,2.384186e-07,2.49246,0.753668,-0.000403,0.234473,0.250251,0.023256,0.628183,0.0986,-0.014886,-0.089427,0.53691,-0.012463,-0.143313,0.943943,0.417261,-0.000921,-0.00078,0.1193,293661,360456,3,1443704000.0,5870,5165,3235.0,True,False
1,-0.154959,0.615217,-0.069783,-0.001667,0.001293,0.163312,-0.100933,-0.016348,0.145181,0.085654,0.057041,0.43098,0.132121,0.013572,0.413222,0.189882,-0.016198,-0.002962,0.014569,-0.015469,-0.026517,0.277365,-0.374422,0.029834,0.056561,-0.000221,0.116112,-0.002068,-0.024272,0.090061,-0.030274,-0.498561,0.035182,-0.01871,0.029094,0.031624,-0.061174,0.254454,2.384186e-07,1.648349,0.073634,-0.650115,0.177804,-0.052191,-0.034483,0.370725,-0.247407,-0.023127,0.550295,-0.121513,0.112166,0.040612,0.224962,0.057615,-0.016085,-0.01733,0.026977,293661,375590,5,1443704000.0,10816,448,3235.0,True,False
2,0.066411,0.442863,0.069257,0.007517,-0.000144,0.207937,-0.100933,0.014814,-0.010327,-0.566212,0.355908,0.246179,-0.305634,0.031401,0.364949,0.144203,0.014812,0.017871,0.009206,0.01274,-0.01444,0.264392,0.339009,0.039109,-0.006285,0.000204,0.173482,0.001554,0.055569,-0.092425,-0.26665,0.043125,0.001112,0.002079,-0.011063,0.171808,0.014969,0.155725,2.384186e-07,0.941642,0.146075,-0.109316,0.106961,0.005249,0.083641,0.144267,0.0986,0.08141,-0.152771,-0.183572,-0.012463,0.159303,0.48867,0.036059,0.0148,0.014925,0.051179,293661,374610,7,1443704000.0,10817,10804,3235.0,True,False
3,-0.154959,0.765992,0.129201,-0.02548,-0.000144,0.112316,-0.257098,0.012801,0.304235,0.226926,-0.260937,-0.048408,-0.043558,0.019547,0.54989,-0.284258,0.012796,0.00347,0.009206,0.011642,-0.004117,-0.122674,-0.421721,0.048495,-0.006285,-3.1e-05,0.239139,0.001421,-0.018441,-0.13433,0.250868,0.486319,0.086636,0.002079,-0.009765,0.1904,0.130183,0.447011,2.384186e-07,0.498932,-0.041847,0.597555,0.001817,-0.037324,-0.019826,-0.163311,-0.247407,-0.014886,0.461782,0.289931,-0.012463,0.088135,-0.126984,-0.495465,0.012784,0.012911,-0.262583,293661,373638,1,1443704000.0,63,64,3235.0,True,False
4,0.066411,0.329832,-0.093768,0.007517,-0.000144,0.112316,0.503109,-0.024129,1.263628,-0.359817,0.086009,-0.048408,-0.043558,0.03145,1.252166,0.158341,-0.024174,-0.019263,0.009206,0.01274,-0.03828,0.10589,0.209527,-0.033071,-0.006285,-9.9e-05,-0.037533,0.000975,0.097792,0.022945,-0.043148,-0.245328,-0.009627,0.002079,0.011331,-0.122763,-0.217356,1.080508,2.384186e-07,-0.004469,0.049962,0.495737,0.116301,-0.110655,-0.044347,-0.163311,0.0986,0.0239,-0.237007,-0.440662,-0.012463,-0.125458,0.136239,0.442488,-0.024187,-0.024024,0.267151,293661,347906,4,1443704000.0,10921,299,3235.0,True,False


In [16]:
from sklearn.decomposition import PCA
seed = 7

# Single source of truth for the number of principal components; the column
# labels below are derived from it instead of repeating the number.
n_pca_components = 51
pca = PCA(n_components=n_pca_components, random_state=seed)
name_pca = ['pca_f%s' % i for i in range(1, n_pca_components + 1)]

# NOTE(review): the PCA is fitted on every row of `df`, i.e. on all predict_mask rows,
# which include the mod.oos rows. If oos is meant to be held out, fit on the in-sample
# rows only and apply pca.transform to the rest.
df_pca = pd.DataFrame(data=pca.fit_transform(df.loc[:, 'f1':'f57']), columns=name_pca)

df_pca.head()

Unnamed: 0,pca_f1,pca_f2,pca_f3,pca_f4,pca_f5,pca_f6,pca_f7,pca_f8,pca_f9,pca_f10,pca_f11,pca_f12,pca_f13,pca_f14,pca_f15,pca_f16,pca_f17,pca_f18,pca_f19,pca_f20,pca_f21,pca_f22,pca_f23,pca_f24,pca_f25,pca_f26,pca_f27,pca_f28,pca_f29,pca_f30,pca_f31,pca_f32,pca_f33,pca_f34,pca_f35,pca_f36,pca_f37,pca_f38,pca_f39,pca_f40,pca_f41,pca_f42,pca_f43,pca_f44,pca_f45,pca_f46,pca_f47,pca_f48,pca_f49,pca_f50,pca_f51
0,-3.96563,0.475024,-0.458396,-0.437761,0.459468,-0.971297,1.009715,-0.549283,0.092487,-0.436671,0.288159,-0.383869,-0.496406,-0.225241,0.044298,-0.001694,0.345873,0.129309,-0.170947,-0.282424,-0.260633,-0.128401,-0.24067,0.047086,-0.015175,0.021386,-0.281638,0.329844,0.152668,0.181906,-0.228965,0.047199,0.166749,-0.096677,0.114633,-0.095767,0.149982,-0.029129,0.014214,0.094271,-0.029527,-0.010815,0.02253,0.042726,0.032898,0.01826,-0.021068,0.001954,0.030846,-0.016273,0.004811
1,-1.368812,-0.006947,-0.092028,-0.709256,0.177596,-0.400748,-0.432677,-0.340324,0.244105,0.464674,0.609016,0.010729,-0.893683,-0.408752,0.082643,0.075791,-0.125073,0.150471,-0.358299,-0.02455,0.008662,0.027252,0.206629,0.008957,0.04562,-0.173685,0.195684,-0.064586,0.095305,-0.031151,-0.207162,-0.355872,0.019206,-0.143756,0.186557,-0.090538,0.103422,-0.022464,-0.111724,-0.304364,0.128531,0.066386,0.052475,-0.144639,-0.144402,-0.027196,0.045439,0.002482,0.008272,-0.004227,-0.014723
2,-0.958706,-0.110968,0.112799,-0.253621,-0.233773,-0.4487,0.078242,-0.387077,0.210951,-0.204571,0.384731,-0.326743,-0.060698,-0.473298,-0.043503,0.410968,0.384015,0.309765,-0.252184,-0.007075,-0.142722,0.248955,-0.103244,0.067656,0.101819,0.092514,0.138265,-0.016673,0.041744,-0.014458,-0.112835,0.044704,0.104382,-0.042308,-0.151314,0.008361,-0.047636,0.021052,0.013151,0.018956,-0.008926,-0.081109,-0.029422,0.005524,0.022653,-0.021301,-0.000937,-0.018574,-0.025988,-0.002146,0.0678
3,-0.91103,-0.01337,-0.459013,0.196185,0.053641,-0.314621,-0.861711,0.039117,-0.175894,-0.135009,-0.103966,0.133928,0.165781,-0.216123,-0.379095,-0.3778,0.037477,-0.211301,0.185288,-0.417862,0.734184,-0.095048,0.299701,0.000799,0.003822,0.098978,0.11154,-0.062437,-0.296788,-0.111029,0.207358,-0.071403,-0.07887,0.042843,-0.023222,-0.00542,-0.014227,0.043674,-0.001294,-0.147973,0.053836,-0.00015,-0.00604,0.043153,0.045375,-0.001744,0.001126,-0.026722,-0.012083,0.010317,0.000236
4,-1.30464,0.336474,-1.493462,0.310679,0.046025,0.595846,0.370766,0.064179,-0.141242,-0.33912,0.037521,-0.478799,0.449629,-0.190379,-0.11085,-0.112689,0.162289,0.145524,-0.033562,0.236474,-0.643673,-7.1e-05,-0.018297,0.029151,0.046942,0.097443,0.177624,0.031643,0.131602,0.126083,-0.175732,-0.063868,-0.065255,0.025453,-0.01258,0.007448,-0.143996,0.034184,0.061798,0.051067,-0.024434,-0.035129,-0.019685,-0.009579,0.029039,0.069514,-0.037928,0.013388,-0.041592,0.000152,-0.030064


In [17]:
# One unit basis vector per PCA column, stacked as the rows of an identity matrix.
nor_vectors = np.diag(np.ones(len(name_pca)))

In [18]:
def matrix_cosine(matrix, vector):
    """Cosine distance between every row of ``matrix`` and ``vector``.

    Parameters
    ----------
    matrix : 2-D array, one observation per row.
    vector : 1-D array whose length equals the number of columns of ``matrix``.

    Returns
    -------
    1-D array holding ``1 - cos(angle)`` for each row. Rows with zero norm (or a
    zero ``vector``) produce a division by zero and are not handled specially.
    """
    # Product of the row lengths and the vector length, i.e. the cosine denominator.
    scale = np.linalg.norm(matrix, axis=1) * np.linalg.norm(vector)
    return 1 - matrix.dot(vector) / scale

In [19]:
%%time 
# Cosine distance of every row (in PCA space) to each direction in `nor_vectors`,
# stored as columns cos_1 ... cos_N of df_pca; `name_cos` collects the new labels.
name_cos = []

# The PCA block does not change inside the loop (only cos_* columns are added), so
# extract it once instead of re-slicing the DataFrame on every iteration.
pca_values = df_pca[name_pca].values
matrix_norms = np.linalg.norm(pca_values, axis=1)

for i, v in enumerate(nor_vectors):

    col = 'cos_' + str(i + 1)
    # Use the norm of the current direction rather than that of nor_vectors[0], so
    # the result stays correct if the directions are ever not all unit length.
    vector_norm = np.linalg.norm(v)
    df_pca[col] = 1 - np.divide(pca_values.dot(v), np.multiply(matrix_norms, vector_norm))
    name_cos.append(col)

CPU times: user 6.99 s, sys: 184 ms, total: 7.17 s
Wall time: 1.82 s


In [20]:
# Raise the column display limit so the pca_* and cos_* columns are all shown,
# then inspect the last ten rows.
pd.set_option('display.max_columns', 110)
df_pca.tail(10)

Unnamed: 0,pca_f1,pca_f2,pca_f3,pca_f4,pca_f5,pca_f6,pca_f7,pca_f8,pca_f9,pca_f10,pca_f11,pca_f12,pca_f13,pca_f14,pca_f15,pca_f16,pca_f17,pca_f18,pca_f19,pca_f20,pca_f21,pca_f22,pca_f23,pca_f24,pca_f25,pca_f26,pca_f27,pca_f28,pca_f29,pca_f30,pca_f31,pca_f32,pca_f33,pca_f34,pca_f35,pca_f36,pca_f37,pca_f38,pca_f39,pca_f40,pca_f41,pca_f42,pca_f43,pca_f44,pca_f45,pca_f46,pca_f47,pca_f48,pca_f49,pca_f50,pca_f51,cos_1,cos_2,cos_3,cos_4,cos_5,cos_6,cos_7,cos_8,cos_9,cos_10,cos_11,cos_12,cos_13,cos_14,cos_15,cos_16,cos_17,cos_18,cos_19,cos_20,cos_21,cos_22,cos_23,cos_24,cos_25,cos_26,cos_27,cos_28,cos_29,cos_30,cos_31,cos_32,cos_33,cos_34,cos_35,cos_36,cos_37,cos_38,cos_39,cos_40,cos_41,cos_42,cos_43,cos_44,cos_45,cos_46,cos_47,cos_48,cos_49,cos_50,cos_51
181979,0.211059,0.05407,0.795858,-0.382294,-0.002393,-0.148049,0.33723,0.274592,-0.152361,-0.40414,0.088301,-0.342493,-0.067645,0.405253,-0.208383,0.248669,0.258422,-0.136216,0.242593,-0.060472,-0.080809,-0.036422,-0.321072,0.175922,0.194853,-0.03167,-0.108172,0.073268,0.170937,0.13689,-0.292075,-0.011445,0.143889,-0.090297,-0.123755,-0.009325,0.049854,-0.050399,-0.054588,-0.019705,-0.02146,0.000215,0.096617,-0.023062,-0.032873,0.057042,-0.007014,-0.018469,0.00278,-0.011142,-0.0089,0.856752,0.963302,0.459843,1.259467,1.001624,1.100482,0.771118,0.813632,1.103409,1.274294,0.940069,1.232454,1.045912,0.72495,1.141432,0.831226,0.824606,1.092451,0.83535,1.041043,1.054846,1.02472,1.217915,0.8806,0.867752,1.021495,1.073418,0.950272,0.883983,0.907092,1.198234,1.007768,0.902341,1.061285,1.083994,1.006329,0.966163,1.034207,1.03705,1.013374,1.014565,0.999854,0.934425,1.015653,1.022312,0.961285,1.00476,1.012535,0.998113,1.007562,1.006041
181980,0.721035,-0.082679,-0.848246,-0.581641,-0.048463,0.215278,-0.806905,-0.475266,0.220289,-0.239829,-0.262521,0.202969,-0.061466,0.072381,0.20862,-0.060288,-0.149071,0.327345,0.052632,-0.144824,0.070166,-0.136132,0.154311,0.066727,-0.032392,-0.087445,-0.137621,0.090301,-0.058077,-0.127949,0.196954,0.186229,0.044436,-0.111812,-0.061102,-0.111549,0.215595,-0.055803,-0.021514,-0.062224,-0.027549,-0.004759,0.004299,-0.041577,-0.007073,0.031065,-0.003847,-0.011595,0.009052,0.001678,0.008601,0.597234,1.046184,1.473826,1.324902,1.027071,0.879747,1.450733,1.265481,0.876948,1.133967,1.146643,0.886622,1.034335,0.959568,0.883466,1.033676,1.083271,0.817147,0.9706,1.080898,0.960806,1.076043,0.913802,0.962726,1.018094,1.048847,1.076874,0.949558,1.032441,1.071472,0.889983,0.895973,0.975179,1.062458,1.034131,1.062311,0.87957,1.031171,1.012018,1.034758,1.015389,1.002658,0.997599,1.023225,1.003951,0.982647,1.002149,1.006477,0.994944,0.999063,0.995195
181981,1.692738,-0.219342,-0.134714,0.519794,0.943814,-0.713607,0.680144,-0.225059,-0.530265,0.341555,-0.165134,0.144984,0.33471,-0.023614,0.051197,-0.111221,-0.007463,-0.150731,-0.015587,-0.174697,0.056703,0.20057,-0.045,-0.010865,-0.203451,0.026715,0.073639,-0.017371,-0.119201,0.129613,0.083644,0.131863,-0.131548,-0.051737,0.101705,0.007219,-0.072997,0.001956,0.046797,0.146782,0.072005,0.029037,0.030708,-0.033533,-0.056242,-0.043098,0.039586,0.04982,0.011285,-0.003145,-0.007938,0.307089,1.089786,1.055144,0.787226,0.613656,1.29211,0.721588,1.092127,1.217061,0.860187,1.067596,0.940652,0.862988,1.009666,0.979043,1.045528,1.003055,1.061701,1.006381,1.071511,0.976789,0.917898,1.01842,1.004447,1.083281,0.989065,0.969856,1.007111,1.048794,0.946944,0.965761,0.946023,1.053848,1.021178,0.958368,0.997045,1.029881,0.999199,0.980844,0.939916,0.970525,0.988114,0.98743,1.013727,1.023022,1.017642,0.983796,0.979606,0.995381,1.001287,1.003249
181982,-1.812888,0.102399,0.252073,-0.673932,-0.52225,-0.854582,-0.467537,-0.176926,0.923483,0.017217,-0.2495,-0.458307,0.054009,0.420046,-0.35582,-0.15016,-0.006861,-0.232413,-0.282938,-0.506817,0.182472,0.110418,0.20341,-0.190111,0.014969,0.040793,-0.093644,0.06387,-0.029692,-0.018305,-0.288024,0.091321,-0.107956,0.350536,0.117863,-0.103272,-0.088038,0.046236,0.116649,0.074016,0.016732,-0.043987,0.088667,0.20257,0.23419,0.060343,-0.004825,0.01465,0.025548,0.037734,-0.002836,1.664509,0.962466,0.907603,1.247028,1.191429,1.313245,1.171374,1.064852,0.6615,0.993689,1.091454,1.167991,0.980203,0.846033,1.130425,1.055041,1.002515,1.08519,1.10371,1.185772,0.933115,0.959527,0.925441,1.069685,0.994513,0.985048,1.034325,0.976589,1.010883,1.00671,1.105574,0.966526,1.039571,0.871512,0.956798,1.037854,1.03227,0.983052,0.957243,0.97287,0.993867,1.016123,0.967499,0.925749,0.914158,0.977882,1.001768,0.99463,0.990635,0.986169,1.001039
181983,-1.352817,0.226384,0.268604,-0.07364,0.09723,0.051188,-0.162312,-0.200879,0.420369,0.38927,0.033909,0.022665,0.282691,-0.32552,-0.178908,0.215604,0.165224,-0.015817,-0.06682,-0.030703,-0.179009,0.047273,-0.056042,-0.169904,-0.017591,-0.053434,0.109515,-0.079134,-0.001937,-0.132688,-0.171081,0.023648,0.100862,-0.185796,-0.028445,0.129773,-0.160382,0.035679,0.010514,0.028954,-0.061234,-0.00885,0.00883,0.136853,-0.16452,0.01675,0.065316,-0.013027,-0.064454,-0.084009,-0.022659,1.786772,0.86834,0.843785,1.042827,0.943453,0.97023,1.094398,1.116827,0.755521,0.773608,0.980279,0.986819,0.835593,1.189316,1.10405,0.874609,0.903909,1.009199,1.038861,1.017856,1.104108,0.972507,1.032593,1.098813,1.01023,1.031076,0.936308,1.046023,1.001126,1.077169,1.099497,0.986247,0.941341,1.108055,1.016543,0.924527,1.093275,0.97925,0.993885,0.983161,1.035613,1.005147,0.994865,0.920409,1.095682,0.990259,0.962013,1.007576,1.037485,1.048858,1.013178
181984,-0.205857,0.046213,-0.262328,-0.544579,0.391583,-0.263815,-0.091872,-0.187139,-0.607862,0.036442,-0.275287,-0.201388,0.106388,-0.249283,0.988086,0.066394,-0.007639,0.128353,0.117004,0.196537,0.169025,-0.418349,-0.062843,0.139732,-0.063081,0.003425,-0.019238,0.022993,0.160207,0.196936,0.221904,-0.10407,-0.104814,-0.271816,0.043671,0.013885,0.003504,-0.016249,0.049688,0.010381,-0.010131,0.001626,0.083532,-0.039795,-0.02452,0.01787,-0.022241,0.041709,0.056745,-0.00531,0.007664,1.124184,0.972122,1.15825,1.328519,0.763777,1.159147,1.055422,1.112892,1.366694,0.978016,1.166067,1.121488,0.935821,1.15038,0.403935,0.959948,1.004608,0.922571,0.929417,0.881438,0.898035,1.25237,1.03791,0.915706,1.038054,0.997934,1.011606,0.986129,0.903355,0.881198,0.866136,1.06278,1.063229,1.163974,0.973655,0.991624,0.997886,1.009802,0.970025,0.993738,1.006112,0.999019,0.949609,1.024006,1.014792,0.98922,1.013417,0.974839,0.965768,1.003203,0.995377
181985,-0.560418,-0.066324,0.209394,0.603091,0.034402,0.135804,0.606321,0.344407,0.363943,-0.367062,0.536879,0.780592,-0.102542,-0.038813,-0.51607,0.12361,0.110038,-0.150697,0.010191,0.263108,0.025078,-0.043705,0.188683,-0.023777,0.048957,-0.086187,0.007946,0.162077,-0.050827,-0.011938,0.067505,0.076057,0.058427,0.126559,-0.220457,-0.177614,0.049682,0.007364,-0.133332,-0.027734,0.082446,-0.209408,0.072863,-0.510571,0.424915,0.051416,-0.158318,-0.027375,-0.016481,0.199499,-0.01082,1.296653,1.035108,0.889159,0.680759,0.981789,0.928113,0.679049,0.817691,0.80735,1.194301,0.715808,0.5868,1.05428,1.020545,1.273177,0.934568,0.941752,1.07977,0.994605,0.860726,0.986725,1.023135,0.900122,1.012586,0.974085,1.045622,0.995794,0.914206,1.026905,1.006319,0.964267,0.95974,0.969072,0.933007,1.116697,1.094018,0.973701,0.996102,1.070578,1.014681,0.956358,1.110848,0.961431,1.270267,0.775075,0.972784,1.083805,1.014491,1.008724,0.894397,1.005727
181986,1.080464,-0.125079,0.117892,0.357487,0.082169,0.091252,-0.567474,-0.045162,-0.118669,0.095505,0.07216,-0.352333,-0.038792,-0.332391,0.626519,-0.147912,-0.154666,0.157899,0.030803,-0.121196,0.119843,-0.092696,-0.183843,0.400229,-0.03302,0.001834,0.163093,-0.124005,-0.07843,0.057751,0.097834,-0.074739,-0.074104,0.000237,0.016498,0.159829,0.187909,-0.059914,0.018854,0.156631,0.112,0.253506,-0.19257,0.089835,-0.142241,-0.15384,-0.028278,0.024257,0.014283,-0.00449,0.061224,0.371812,1.072722,0.931457,0.792155,0.952226,0.946946,1.329933,1.026258,1.068995,0.944473,0.958046,1.204849,1.022554,1.193254,0.635738,1.085997,1.089924,0.908196,0.982091,1.070464,0.930323,1.053894,1.106887,0.767304,1.019198,0.998934,0.905177,1.072097,1.0456,0.966423,0.943119,1.043454,1.043085,0.999862,0.990408,0.907074,0.890749,1.034834,0.989038,0.908934,0.934883,0.85261,1.111962,0.947769,1.0827,1.089444,1.016441,0.985897,0.991696,1.00261,0.964404
181987,1.322641,0.038814,-0.234455,0.012107,-0.160549,0.335845,0.813282,0.47036,-0.144126,-0.782871,0.227114,0.354059,-0.317756,0.787691,-0.957388,0.264934,0.45828,-0.018025,-0.018637,0.248426,-0.289012,0.469922,-0.141957,-0.014098,0.129316,0.001226,-0.289352,-0.024394,0.018035,0.040297,0.02805,-0.051894,0.226373,-0.003471,0.22752,0.014412,0.101259,-0.047992,0.014958,-0.090137,-0.112244,0.035137,0.000278,0.139267,-0.230695,-0.01437,0.151181,-0.063509,0.003662,-0.143638,-0.051802,0.473491,0.984549,1.09333,0.99518,1.06391,0.866309,0.676254,0.812762,1.057373,1.31164,0.909592,0.859059,1.12649,0.686441,1.381111,0.894537,0.817571,1.007175,1.007419,0.901108,1.115048,0.812937,1.056509,1.005612,0.948523,0.999512,1.115183,1.009711,0.992821,0.983959,0.988834,1.020658,0.909887,1.001382,0.90943,0.994263,0.959692,1.019105,0.994046,1.035881,1.044681,0.986013,0.999889,0.944561,1.091834,1.00572,0.939819,1.025281,0.998542,1.057179,1.020621
181988,1.528876,-0.222407,-0.35118,0.319467,0.077415,0.504309,-0.130409,-0.20466,-0.837138,0.611499,-0.345275,-0.145286,0.016002,-0.26173,0.393581,-0.372469,-0.564377,0.130698,0.210396,-0.049357,-0.028397,-0.072864,0.052593,-0.14207,-0.07955,0.092344,0.121681,-0.021408,-0.017356,-0.132053,0.043813,0.039677,-0.098787,-0.016249,-0.15665,-0.037014,-0.093934,0.034877,-0.07733,-0.152112,-0.027567,-0.028024,-0.061599,-0.018159,-0.09713,0.021833,-0.002834,0.023295,-0.019302,0.000213,0.01923,0.320578,1.098836,1.156062,0.858031,0.965597,0.775888,1.057953,1.09095,1.372018,0.728254,1.153438,1.064564,0.992889,1.116311,0.825095,1.165523,1.250805,0.941919,0.906501,1.021934,1.012619,1.03238,0.976628,1.063135,1.035351,0.958963,0.945926,1.009513,1.007713,1.058684,0.98053,0.982368,1.0439,1.007221,1.069614,1.016449,1.041743,0.984501,1.034365,1.067598,1.012251,1.012454,1.027374,1.00807,1.043164,0.990298,1.00126,0.989648,1.008578,0.999905,0.991454


In [21]:
#['cos_{}'.format(i+1) for i in range(df_max[name_pca].shape[1])]

In [22]:
# Select the factor columns explicitly: `df_pca.values` would also pick up any column
# added to df_pca later (an `event_id` column is attached further down), which would
# silently become an extra "factor" if this cell were re-run.
factors_new = new_factors_array(df_pca[name_pca + name_cos].values, predict_mask=predict_mask)
factors_new.shape, factors.shape

((102, 1631851), (57, 1631851))

In [23]:
%%time
# Fit on the extended factor set (PCA components + cosine distances).
(model_coefs, model_step1prob, model_step2prob,
 model_likelihood, ints) = mod.fit_slices(
    tsav, factors_new, depth=3, lmbd=10, verbose=False, fit_afresh=True)

print('LL  new factors')
print(model_likelihood)
print('..................')

  step2factors = np.concatenate((fb, fl, np.log(probs).reshape((1, -1))), axis=0)


. . . . . . . . . . 10
LL  new factors
[[-1808.71557812 -1992.82943243 -1992.82943243]
 [-1814.4090244  -1988.92765527 -1988.92765527]
 [-1819.66062556 -1986.73887511 -1986.73887511]
 [-1824.12421928 -1973.87326735 -1973.87326735]
 [-1844.11188405 -1950.43933282 -1950.43933282]
 [-1861.74520165 -1944.34215217 -1944.34215217]
 [-1902.22076716 -1919.06177509 -1919.06177509]
 [-1919.38657427 -1899.38725376 -1899.38725376]
 [-1922.10047255 -1899.48668594 -1899.48668594]
 [-1926.83740096 -1893.03254249 -1893.03254249]
 [    0.             0.             0.        ]]
..................
CPU times: user 6min 56s, sys: 1.68 s, total: 6min 58s
Wall time: 4min 59s


In [24]:
# Baseline fit on the original factors, with the same settings as the new-factor fit.
# Note that `ints` from the previous fit is overwritten here.
(old_model_coefs, old_model_step1prob, old_model_step2prob,
 old_model_likelihood, ints) = mod.fit_slices(
    tsav, factors, depth=3, lmbd=10, verbose=False, fit_afresh=True)

print(old_model_likelihood)
print('..................')

. . . . . . . . . . 10
[[-1809.6054581  -1997.1291991  -1997.1291991 ]
 [-1815.2956033  -1993.38729757 -1993.38729757]
 [-1820.46573327 -1991.13279384 -1991.13279384]
 [-1825.04841825 -1978.80723507 -1978.80723507]
 [-1845.34679474 -1957.00310259 -1957.00310259]
 [-1863.07407718 -1951.28943996 -1951.28943996]
 [-1903.75005812 -1924.17873453 -1924.17873453]
 [-1921.12493606 -1902.65155706 -1902.65155706]
 [-1923.89396228 -1902.50302182 -1902.50302182]
 [-1928.8879264  -1895.42641802 -1895.42641802]
 [    0.             0.             0.        ]]
..................


In [25]:
# Hand the new-factor step-1 probabilities and coefficients, together with the oos
# selection, to write_dic_to_simdata under the name 'simdata_new_factors_direction.p'.
# NOTE(review): presumably this writes a simulation input file - confirm the output
# location and format in prediction.tools.clustering.
write_dic_to_simdata('simdata_new_factors_direction.p', model_step1prob, model_coefs, mod.oos, av =av)

In [33]:
# Average every cosine-distance column per event (one row per event_id).
# NOTE(review): this adds an `event_id` column to df_pca in place. Re-running the
# earlier cell that passes `df_pca.values` to new_factors_array after this one would
# include event_id among the factors.
df_pca['event_id'] = df['event_id']
df_cos = df_pca.loc[:,'cos_1':'cos_51'].groupby(df_pca['event_id']).mean()
df_cos.head()

Unnamed: 0_level_0,cos_1,cos_2,cos_3,cos_4,cos_5,cos_6,cos_7,cos_8,cos_9,cos_10,cos_11,cos_12,cos_13,cos_14,cos_15,cos_16,cos_17,cos_18,cos_19,cos_20,cos_21,cos_22,cos_23,cos_24,cos_25,cos_26,cos_27,cos_28,cos_29,cos_30,cos_31,cos_32,cos_33,cos_34,cos_35,cos_36,cos_37,cos_38,cos_39,cos_40,cos_41,cos_42,cos_43,cos_44,cos_45,cos_46,cos_47,cos_48,cos_49,cos_50,cos_51
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
293661,1.095792,0.994007,1.011225,0.977154,0.973338,0.977658,1.049414,1.000369,0.981387,0.996478,0.990889,1.008413,1.006978,1.002633,1.00854,0.997568,0.986594,1.005529,0.986894,0.99592,0.985788,0.990883,1.009079,0.994576,0.991504,0.984495,0.983,0.998787,1.006202,1.006594,0.999237,0.999092,1.001359,0.99637,1.004886,0.997014,1.002142,0.998557,1.00092,1.005058,0.999184,1.002646,0.998527,1.000729,0.999938,1.001018,0.999019,1.000739,1.000654,0.999596,1.000511
293662,1.025135,0.979642,1.025552,0.992886,0.969676,0.994301,0.955914,1.02803,1.012701,1.003368,1.005515,0.995615,0.991229,1.001656,1.005208,1.008859,0.988792,1.001711,0.995409,0.979123,1.01301,1.025566,1.007082,1.004161,0.997656,0.98754,0.985548,0.996589,1.00975,1.007894,1.004133,1.009986,1.004703,0.997008,1.00634,1.001369,1.000799,1.000202,0.996975,1.002385,0.998332,0.997793,0.996886,1.002168,0.99985,1.00346,0.997962,1.000378,0.997231,1.000167,0.999904
293663,1.011137,1.003384,1.00466,0.958547,1.022334,1.001572,1.024124,0.998489,0.9815,0.984965,0.969786,1.005971,0.998021,1.009537,0.991968,0.998966,1.021593,0.99693,1.005226,0.978555,0.986599,1.009311,1.009341,1.004019,1.006003,1.005937,1.001762,1.008899,1.007479,1.001174,1.007209,1.005057,1.007169,1.002005,1.002979,1.002996,0.997694,0.999174,1.00386,1.003514,0.997164,0.99952,0.996967,1.00763,1.004028,0.999908,0.994414,1.000442,0.996174,0.998233,0.99855
293664,0.893475,1.027394,1.020122,1.003275,0.980024,0.998363,1.038385,0.997252,1.035559,1.021719,0.954591,1.017489,0.98721,0.991652,1.01063,1.011965,1.025384,0.97685,1.002648,0.99848,0.968368,1.011126,0.999998,0.962511,0.984131,0.989343,0.971042,0.989141,0.989329,0.991079,1.006707,0.999673,0.994402,1.013263,1.001517,0.99109,0.998787,1.000976,1.005643,1.000362,1.001541,1.00151,1.002814,1.000977,1.000006,1.002049,0.9965,1.002766,0.995952,0.999726,0.998088
293665,1.059561,1.002933,0.997229,0.97676,1.004475,1.05209,1.032997,1.009778,0.942629,0.974189,0.957041,1.005772,1.030626,1.006366,0.996323,0.998388,1.013236,0.996004,0.987724,0.974568,0.990866,1.020934,1.012389,0.996887,0.991306,0.992306,0.983831,0.985945,0.999543,1.011462,1.014309,1.008134,0.99592,0.999427,0.997972,0.999264,0.994718,1.000905,1.003551,1.002153,0.999989,1.004694,1.000479,1.005647,1.007181,1.002705,0.999116,1.000095,1.001242,0.999854,1.000654


In [37]:
# Remove cluster labels left over from a previous clustering run. The 'cluster' column
# is only created by the clustering cell that follows, so on a fresh kernel it does not
# exist yet; errors='ignore' keeps this cell from raising KeyError in that case.
df_cos.drop('cluster', axis=1, inplace=True, errors='ignore')

In [38]:
from sklearn.mixture import BayesianGaussianMixture

# Cluster the events on the cosine features only. Fitting on `df_cos.values` would
# also feed a pre-existing 'cluster' label column into the model whenever this cell
# is re-run, so select the feature block explicitly and reuse it for fit/predict/score.
cos_features = df_cos.loc[:, 'cos_1':'cos_51'].values

# Fixed random_state (the notebook-level `seed`) so labels and score are reproducible.
BGM = BayesianGaussianMixture(n_components=5, random_state=seed)
BGM.fit(cos_features)
df_cos['cluster'] = BGM.predict(cos_features)

# Score of the fitted mixture on the same features, as returned by BGM.score.
score = BGM.score(cos_features)
print(score)

183.577236753
