In [14]:
# ThoroughBet Simulation


## Load necessary modules

In [15]:
import numpy as np

from utils import settings, timestamp, YEAR
from utils.arrayview import ArrayView, TimeseriesView


from prediction.models.preprocessing import Model
from prediction.models.prediction import factornames_trimmed
from prediction.models.parameters import factor_build_end
from prediction.tools.clustering import write_dic_to_simdata, ll_diff, ll_for_each_cluster, ll_for_mix_clusters, dic_to_tenzor

## Load data

In [16]:
# Main data set: an ArrayView read from the 'brain_final2cut' bcolz store
# located under settings.paths.
av = ArrayView.from_file(settings.paths.join('brain_final2cut.av.bcolz'))

In [17]:
# Weather ArrayView.
# NOTE(review): `av_w` is not referenced again in the visible part of this
# notebook -- confirm it is still needed before keeping this load.
av_w = ArrayView.from_file(settings.paths.join('weather.av.bcolz'))

In [18]:
# Load the consecutively numbered slice files (0, 1, 2, ...) into `tsav`,
# keyed by slice number. Loading stops at the first slice for which
# ArrayView.from_file raises ValueError; `sl` then holds the number of
# slices that were loaded.
tsav = {}
sl = 0
while True:
    slice_name = 'brain_final2_slice_%s.av.bcolz' % sl
    try:
        slice_view = ArrayView.from_file(settings.paths.join(slice_name))
    except ValueError:
        break
    else:
        tsav[sl] = slice_view
        sl += 1

In [19]:
# Model built on `av`; its out-of-sample start is set to factor_build_end + YEAR.
mod = Model(av, oos_start=factor_build_end+YEAR)

In [20]:
# Factor names handed to mod._preprocess_factors as `high_kurtosis_factors`.
high_kurtosis_factors =  ['z64f5be67e', 'z90adc182a', 'z7081bf371', 'z34b808e99', 'z757be272e', 'z5a85cd6a9',
                         'zf991b634a', 'z62651f605', 'zd002b7067', 'z2ef7fedca', 'z6f11029f7', 'z412893062',
                          'z919b9585a', 'z89b0eda37', 'z31780b3f4', 'z6631693d3', 'z0b27f29ad', 'zd7cd94e4c', 
                          'zf5b2aef2a']
# Factor names handed to mod._preprocess_factors as `price_factors`.
price_factors = ['zb392bb74a', 'z6809c316d', 'zd678f0538', 'z027f9f0f5', 'z88e79930c', 'z4a72dc02f',
                 'z1a3573928', 'z7b15df227']

In [21]:
# Build the preprocessed factor matrix used by the later cells
# (about 6 minutes wall time in the recorded run below).
# NOTE(review): this calls a leading-underscore method of Model -- confirm
# there is no public entry point that should be used instead.
%time factors = mod._preprocess_factors(factornames_trimmed, high_kurtosis_factors = high_kurtosis_factors,\
                                        price_factors = price_factors, verbose=True)

INFO:models:Getting factors from av and rescaling...


. . . . .

INFO:models:Filling in missing values...
INFO:models:Computing each factor as linear combination of all the others...


 . . . . .

INFO:models:Number of missing patterns: 7754


 . . . . . . .

INFO:models:Transforming factors by applying CL-model on their Taylor expansions...


 . . . . .CPU times: user 5min 43s, sys: 16 s, total: 5min 59s
Wall time: 6min 5s



In [22]:
# Rows that belong to any of the model's three masks (is1, is2 or oos).
predict_mask = mod.is1|mod.is2|mod.oos

In [23]:
def old_data(num, is1=mod.is1):
    """Mask the rows of `av` belonging to the last `num` events before `is1` starts.

    Parameters
    ----------
    num : int (or anything `int()` accepts)
        How many distinct event ids to keep, counted backwards from the event
        of the first row flagged in `is1`. Values <= 0 select no events.
    is1 : boolean array aligned with `av.event_id`
        Mask whose first True row marks the boundary. Defaults to the value of
        `mod.is1` at the time this function was defined.

    Returns
    -------
    numpy boolean array with one entry per element of `av.event_id`; True for
    rows whose event id is among the selected earlier events.

    Raises
    ------
    IndexError
        If `is1` contains no True entry.
    """
    first_is1 = np.flatnonzero(is1)[0]
    earlier_events = np.unique(av.event_id[av.event_id < av.event_id[first_is1]])

    # Slice with an explicit start index: the previous `[-int(num):]` turned
    # into `[-0:]` for num == 0 and silently selected *every* earlier event.
    keep = max(int(num), 0)
    start = max(len(earlier_events) - keep, 0)
    past_events = earlier_events[start:]

    return np.in1d(av.event_id, past_events)

In [24]:
# Rows of the 4000 most recent events that precede the first is1 row.
mask_past = old_data(4000)

In [25]:
# Position of the first True row in each mask (predict_mask, then mask_past).
np.flatnonzero(predict_mask)[0], np.flatnonzero(mask_past)[0]

(1123738, 1085943)

In [26]:
def new_factors_array(X, predict_mask=predict_mask):
    """Scatter the rows of `X` into the columns of a full-length factor array.

    The k-th row of `X` is written to the column of the k-th True entry of
    `predict_mask`; every other column stays zero.

    Parameters
    ----------
    X : 2-D numpy array, shape (n_selected_rows, n_features)
        Must have at least as many rows as `predict_mask` has True entries;
        any surplus rows are ignored.
    predict_mask : 1-D boolean array-like
        Defaults to the value of the global `predict_mask` at the time this
        function was defined.

    Returns
    -------
    numpy float array of shape (n_features, len(predict_mask)).

    Raises
    ------
    IndexError
        If `X` has fewer rows than `predict_mask` has True entries.
    """
    mask = np.asarray(predict_mask, dtype=bool)
    n_selected = int(mask.sum())
    if X.shape[0] < n_selected:
        raise IndexError('X has %d rows but predict_mask selects %d columns'
                         % (X.shape[0], n_selected))

    factors_new = np.zeros((X.shape[1], mask.shape[0]))
    # One masked assignment instead of a Python loop over every mask entry.
    factors_new[:, mask] = X[:n_selected].T
    return factors_new

In [61]:
# Sanity check: do the rows whose event_id occurs in df_pca coincide with predict_mask?
# NOTE(review): `df_pca` is only created in a later cell (this cell's execution
# count is out of sequence), so this fails on Restart & Run All -- move it
# below the PCA section or remove it.
np.all(np.in1d(av.event_id, df_pca['event_id'].values) == predict_mask)

False

In [27]:
import pandas as pd
pd.set_option('display.max_columns', 90)

# One 'f<i>' column per row of `factors`. The count is read from `factors`
# rather than hard-coded, so the names cannot drift out of sync with the
# factor matrix. NOTE(review): the PCA cell further down still slices
# 'f1':'f57' explicitly.
n_factors = factors.shape[0]
col_names = ['f{}'.format(i) for i in range(1, n_factors + 1)]

# `factors` is indexed (factor, row); keep only the predict_mask rows and
# transpose so that each DataFrame row is one `av` row.
df = pd.DataFrame(data=factors[:, predict_mask].T, columns=col_names)

# Identifiers, outcome, start time and sample flags for the same rows.
df['event_id'] = av.event_id[predict_mask]
df['runner_id'] = av.runner_id[predict_mask]
df['result'] = av.result[predict_mask]
df['time'] = av.start_time[predict_mask]
df['is1'] = mod.is1[predict_mask]
df['oos'] = mod.oos[predict_mask]

In [66]:
from sklearn.decomposition import PCA
seed =7

# Number of principal components kept; also drives the column names below so
# the two can no longer disagree.
n_pca_components = 51
pca = PCA(n_components=n_pca_components, random_state=seed)
name_pca = ['pca_f%s' % i for i in range(1, n_pca_components + 1)]

# NOTE(review): the PCA is fitted on every row of `df`, i.e. is1, is2 and oos
# rows together -- confirm that using out-of-sample rows in the fit is intended.
# `.loc` replaces the deprecated `.ix`; the label slice 'f1':'f57' is inclusive
# on both ends in either indexer.
df_pca = pd.DataFrame(data=pca.fit_transform(df.loc[:, 'f1':'f57']), columns=name_pca)

df_pca.head()

Unnamed: 0,pca_f1,pca_f2,pca_f3,pca_f4,pca_f5,pca_f6,pca_f7,pca_f8,pca_f9,pca_f10,pca_f11,pca_f12,pca_f13,pca_f14,pca_f15,pca_f16,pca_f17,pca_f18,pca_f19,pca_f20,pca_f21,pca_f22,pca_f23,pca_f24,pca_f25,pca_f26,pca_f27,pca_f28,pca_f29,pca_f30,pca_f31,pca_f32,pca_f33,pca_f34,pca_f35,pca_f36,pca_f37,pca_f38,pca_f39,pca_f40,pca_f41,pca_f42,pca_f43,pca_f44,pca_f45,pca_f46,pca_f47,pca_f48,pca_f49,pca_f50,pca_f51
0,-3.96563,0.475024,-0.458396,-0.437761,0.459468,-0.971297,1.009715,-0.549283,0.092487,-0.436671,0.288159,-0.383869,-0.496406,-0.225241,0.044298,-0.001694,0.345873,0.129309,-0.170947,-0.282424,-0.260633,-0.128401,-0.24067,0.047086,-0.015175,0.021386,-0.281638,0.329844,0.152668,0.181906,-0.228965,0.047199,0.166749,-0.096677,0.114633,-0.095767,0.149982,-0.029129,0.014214,0.094271,-0.029527,-0.010815,0.02253,0.042726,0.032898,0.01826,-0.021068,0.001954,0.030846,-0.016273,0.004811
1,-1.368812,-0.006947,-0.092028,-0.709256,0.177596,-0.400748,-0.432677,-0.340324,0.244105,0.464674,0.609016,0.01073,-0.893683,-0.408752,0.082643,0.075791,-0.125073,0.150471,-0.358299,-0.02455,0.008662,0.027252,0.206629,0.008957,0.04562,-0.173685,0.195684,-0.064586,0.095305,-0.031151,-0.207162,-0.355872,0.019206,-0.143756,0.186557,-0.090538,0.103422,-0.022464,-0.111724,-0.304364,0.128531,0.066386,0.052475,-0.144639,-0.144402,-0.027196,0.045439,0.002482,0.008272,-0.004227,-0.014723
2,-0.958706,-0.110968,0.112799,-0.253621,-0.233773,-0.4487,0.078242,-0.387077,0.210951,-0.204571,0.384731,-0.326743,-0.060698,-0.473298,-0.043503,0.410968,0.384015,0.309765,-0.252184,-0.007075,-0.142722,0.248955,-0.103244,0.067656,0.101819,0.092514,0.138265,-0.016673,0.041744,-0.014458,-0.112835,0.044704,0.104382,-0.042308,-0.151314,0.008361,-0.047636,0.021052,0.013151,0.018956,-0.008926,-0.081109,-0.029422,0.005524,0.022653,-0.021301,-0.000937,-0.018574,-0.025988,-0.002146,0.0678
3,-0.91103,-0.01337,-0.459013,0.196185,0.053641,-0.314621,-0.861711,0.039117,-0.175894,-0.135009,-0.103966,0.133928,0.165781,-0.216123,-0.379095,-0.3778,0.037477,-0.211301,0.185288,-0.417862,0.734184,-0.095048,0.299701,0.000799,0.003822,0.098978,0.11154,-0.062437,-0.296788,-0.111029,0.207358,-0.071403,-0.07887,0.042843,-0.023222,-0.00542,-0.014227,0.043674,-0.001294,-0.147973,0.053836,-0.00015,-0.00604,0.043153,0.045375,-0.001744,0.001126,-0.026722,-0.012083,0.010317,0.000236
4,-1.30464,0.336474,-1.493462,0.310679,0.046025,0.595846,0.370766,0.064179,-0.141242,-0.33912,0.037521,-0.478799,0.449629,-0.190379,-0.11085,-0.112689,0.162289,0.145524,-0.033562,0.236474,-0.643673,-7.1e-05,-0.018297,0.029151,0.046942,0.097443,0.177624,0.031643,0.131602,0.126083,-0.175732,-0.063868,-0.065255,0.025453,-0.01258,0.007448,-0.143996,0.034184,0.061798,0.051067,-0.024434,-0.035129,-0.019685,-0.009579,0.029039,0.069514,-0.037928,0.013388,-0.041592,0.000152,-0.030064


In [68]:
# Euclidean norm of each row's PCA coordinates. A single vectorised call over
# the whole matrix replaces the row-by-row DataFrame.apply.
df_pca['radius'] = np.linalg.norm(df_pca[name_pca].values, axis=1)

In [69]:
df_pca['radius'].describe()

count    181989.000000
mean          2.550329
std           1.369593
min           0.378159
25%           1.800111
50%           2.213164
75%           2.915041
max          18.895350
Name: radius, dtype: float64

In [70]:
df_pca['event_id'] = df['event_id']
# Largest radius among the rows of each event, broadcast back to every row of
# that event. Assigning via transform (instead of re-binding df_pca to a
# self-join) keeps the cell idempotent: re-running it simply overwrites
# 'radius_max' instead of colliding with the column added by a previous run.
df_pca['radius_max'] = df_pca.groupby('event_id')['radius'].transform('max')
df_pca.head()

Unnamed: 0,pca_f1,pca_f2,pca_f3,pca_f4,pca_f5,pca_f6,pca_f7,pca_f8,pca_f9,pca_f10,pca_f11,pca_f12,pca_f13,pca_f14,pca_f15,pca_f16,pca_f17,pca_f18,pca_f19,pca_f20,pca_f21,pca_f22,pca_f23,pca_f24,pca_f25,pca_f26,pca_f27,pca_f28,pca_f29,pca_f30,pca_f31,pca_f32,pca_f33,pca_f34,pca_f35,pca_f36,pca_f37,pca_f38,pca_f39,pca_f40,pca_f41,pca_f42,pca_f43,pca_f44,pca_f45,pca_f46,pca_f47,pca_f48,pca_f49,pca_f50,pca_f51,radius,event_id,radius_max
0,-3.96563,0.475024,-0.458396,-0.437761,0.459468,-0.971297,1.009715,-0.549283,0.092487,-0.436671,0.288159,-0.383869,-0.496406,-0.225241,0.044298,-0.001694,0.345873,0.129309,-0.170947,-0.282424,-0.260633,-0.128401,-0.24067,0.047086,-0.015175,0.021386,-0.281638,0.329844,0.152668,0.181906,-0.228965,0.047199,0.166749,-0.096677,0.114633,-0.095767,0.149982,-0.029129,0.014214,0.094271,-0.029527,-0.010815,0.02253,0.042726,0.032898,0.01826,-0.021068,0.001954,0.030846,-0.016273,0.004811,4.510677,293661,5.316903
1,-1.368812,-0.006947,-0.092028,-0.709256,0.177596,-0.400748,-0.432677,-0.340324,0.244105,0.464674,0.609016,0.01073,-0.893683,-0.408752,0.082643,0.075791,-0.125073,0.150471,-0.358299,-0.02455,0.008662,0.027252,0.206629,0.008957,0.04562,-0.173685,0.195684,-0.064586,0.095305,-0.031151,-0.207162,-0.355872,0.019206,-0.143756,0.186557,-0.090538,0.103422,-0.022464,-0.111724,-0.304364,0.128531,0.066386,0.052475,-0.144639,-0.144402,-0.027196,0.045439,0.002482,0.008272,-0.004227,-0.014723,2.284585,293661,5.316903
2,-0.958706,-0.110968,0.112799,-0.253621,-0.233773,-0.4487,0.078242,-0.387077,0.210951,-0.204571,0.384731,-0.326743,-0.060698,-0.473298,-0.043503,0.410968,0.384015,0.309765,-0.252184,-0.007075,-0.142722,0.248955,-0.103244,0.067656,0.101819,0.092514,0.138265,-0.016673,0.041744,-0.014458,-0.112835,0.044704,0.104382,-0.042308,-0.151314,0.008361,-0.047636,0.021052,0.013151,0.018956,-0.008926,-0.081109,-0.029422,0.005524,0.022653,-0.021301,-0.000937,-0.018574,-0.025988,-0.002146,0.0678,1.634844,293661,5.316903
3,-0.91103,-0.01337,-0.459013,0.196185,0.053641,-0.314621,-0.861711,0.039117,-0.175894,-0.135009,-0.103966,0.133928,0.165781,-0.216123,-0.379095,-0.3778,0.037477,-0.211301,0.185288,-0.417862,0.734184,-0.095048,0.299701,0.000799,0.003822,0.098978,0.11154,-0.062437,-0.296788,-0.111029,0.207358,-0.071403,-0.07887,0.042843,-0.023222,-0.00542,-0.014227,0.043674,-0.001294,-0.147973,0.053836,-0.00015,-0.00604,0.043153,0.045375,-0.001744,0.001126,-0.026722,-0.012083,0.010317,0.000236,1.863281,293661,5.316903
4,-1.30464,0.336474,-1.493462,0.310679,0.046025,0.595846,0.370766,0.064179,-0.141242,-0.33912,0.037521,-0.478799,0.449629,-0.190379,-0.11085,-0.112689,0.162289,0.145524,-0.033562,0.236474,-0.643673,-7.1e-05,-0.018297,0.029151,0.046942,0.097443,0.177624,0.031643,0.131602,0.126083,-0.175732,-0.063868,-0.065255,0.025453,-0.01258,0.007448,-0.143996,0.034184,0.061798,0.051067,-0.024434,-0.035129,-0.019685,-0.009579,0.029039,0.069514,-0.037928,0.013388,-0.041592,0.000152,-0.030064,2.438887,293661,5.316903
5,0.000535,0.13452,0.596155,0.402677,0.30002,0.493215,-0.346666,0.246078,0.379585,0.212086,0.022802,-0.011649,-0.123005,0.515537,-0.003459,-0.046785,0.092989,-0.273655,0.527123,0.267024,0.513868,-0.25472,-0.383389,0.09521,0.024512,0.136582,0.077999,0.08057,-0.137795,-0.198805,0.070151,0.133378,-0.006246,0.083279,-0.022083,0.002876,0.02081,-0.01676,-0.017828,0.022481,-0.030027,0.011096,0.019567,0.008383,0.018752,0.000505,0.017003,0.008444,0.04063,0.009491,-0.057232,1.602946,293661,5.316903
6,-0.957834,0.229514,-0.203633,1.101768,0.282164,0.773431,-0.215984,0.506458,-0.43464,0.040251,-0.260971,0.14943,0.237915,0.53614,0.429116,0.149656,-0.260472,-0.332732,0.008406,0.025503,-0.546819,0.289024,-0.293931,-0.106773,0.202367,0.161234,-0.070617,0.073264,0.218016,-0.045353,0.053643,-0.061382,0.04286,0.119892,-0.095153,0.096932,0.026709,-0.067741,0.052954,0.128038,-0.037895,-0.006387,0.058605,0.02603,-0.020248,-0.022058,-0.021927,0.018073,0.040196,-0.018838,-0.009946,2.213521,293661,5.316903
7,2.462464,-0.261173,0.160705,-0.385356,0.154379,-0.115922,-0.157528,-0.103482,-0.000773,0.125062,-0.413551,0.4119,0.252282,0.04123,0.295538,-0.320853,-0.180385,0.216677,0.18045,-0.121318,-0.013936,-0.073435,0.319274,0.10129,-0.253398,0.233162,-0.223069,-0.070703,-0.261051,0.143701,0.011369,0.227519,-0.13265,-0.019788,0.114215,0.024445,-0.008316,0.09737,-0.035611,-0.07934,0.002638,-0.036082,-0.060787,0.038189,0.025903,-0.009168,-0.001215,-0.019059,-0.075668,0.0154,-0.002613,2.758966,293661,5.316903
8,2.311735,-0.33316,0.888049,-0.055469,0.293719,0.81761,0.002488,0.085601,0.488964,0.118073,-0.488595,0.117333,-0.098115,0.184198,-0.624981,0.163312,0.00992,-0.11681,0.082097,0.331556,0.122922,0.418558,-0.013515,-0.10462,-0.004308,-0.263821,0.112994,-0.020507,0.089697,0.170798,0.228067,0.228036,-0.076414,0.054422,-0.06218,0.139406,-0.064391,-0.016524,-0.003993,0.161795,-0.064668,0.017555,0.042095,-0.033056,-0.023866,-0.017041,0.040438,0.006658,-0.001982,0.006961,0.028112,2.933358,293661,5.316903
9,4.691919,-0.449914,0.948822,-0.169847,-1.533237,-0.428813,0.553352,0.438735,-0.663541,0.155227,-0.075147,0.37774,0.5663,0.236688,0.310296,0.060093,-0.466631,-0.01725,-0.16837,-0.007328,0.228148,-0.432113,0.227442,-0.138757,-0.152199,-0.403794,-0.238782,-0.280415,-0.033398,-0.221694,0.154105,-0.12831,0.026238,-0.02336,-0.048874,-0.087747,-0.022356,-0.043664,0.028336,0.055066,0.010471,0.074634,-0.079341,0.023266,0.013894,0.010232,-0.020931,0.013356,0.037368,-0.000838,0.013619,5.316903,293661,5.316903


In [73]:
# For every event, the index label of its row with the largest radius.
# idxmax() states this explicitly; apply(np.argmax) only yields labels on
# pandas versions where Series.argmax is an alias of idxmax.
max_radius = df_pca.groupby('event_id')['radius'].idxmax()

In [74]:
# One row per event: the PCA coordinates of its max-radius row.
# `max_radius` holds index *labels*, so select with .loc (the former .iloc was
# only correct because df_pca has a default 0..n-1 index). The .copy() makes
# df_max an independent frame, so adding columns to it later does not trigger
# SettingWithCopyWarning.
df_max = df_pca.loc[max_radius.values, name_pca + ['event_id']].copy()

In [75]:
# In-sample flag of each event's max-radius row. `max_radius` holds index
# labels, so look them up with .loc; the assignment aligns on those labels.
df_max['is1'] = df.loc[max_radius.values, 'is1']

In [76]:
df_max.head()

Unnamed: 0,pca_f1,pca_f2,pca_f3,pca_f4,pca_f5,pca_f6,pca_f7,pca_f8,pca_f9,pca_f10,pca_f11,pca_f12,pca_f13,pca_f14,pca_f15,pca_f16,pca_f17,pca_f18,pca_f19,pca_f20,pca_f21,pca_f22,pca_f23,pca_f24,pca_f25,pca_f26,pca_f27,pca_f28,pca_f29,pca_f30,pca_f31,pca_f32,pca_f33,pca_f34,pca_f35,pca_f36,pca_f37,pca_f38,pca_f39,pca_f40,pca_f41,pca_f42,pca_f43,pca_f44,pca_f45,pca_f46,pca_f47,pca_f48,pca_f49,pca_f50,pca_f51,event_id,is1
9,4.691919,-0.449914,0.948822,-0.169847,-1.533237,-0.428813,0.553352,0.438735,-0.663541,0.155227,-0.075147,0.37774,0.5663,0.236688,0.310296,0.060093,-0.466631,-0.01725,-0.16837,-0.007328,0.228148,-0.432113,0.227442,-0.138757,-0.152199,-0.403794,-0.238782,-0.280415,-0.033398,-0.221694,0.154105,-0.12831,0.026238,-0.02336,-0.048874,-0.087747,-0.022356,-0.043664,0.028336,0.055066,0.010471,0.074634,-0.079341,0.023266,0.013894,0.010232,-0.020931,0.013356,0.037368,-0.000838,0.013619,293661,True
16,-3.970241,-0.162222,0.404456,0.307221,-0.805654,0.438222,-1.129917,0.968541,0.427536,0.892115,0.285503,0.188584,-0.650898,-0.168965,0.056062,-0.036568,0.408664,0.435107,0.075067,0.081057,0.230703,0.371322,0.179907,-0.014915,-0.330678,-0.206459,-0.089993,0.336668,0.05711,0.195535,0.23609,0.157076,0.172642,0.048234,0.060378,0.005622,0.110438,0.056431,-0.141493,-0.012251,0.013835,-0.100256,-0.036284,0.100952,-0.038118,0.132528,-0.076812,0.193731,-0.050362,-0.015881,-0.030421,293662,True
24,-3.232836,0.237165,-0.272715,-0.623113,0.848147,-0.197541,-0.169192,0.227447,-0.103452,-0.4,-0.431264,0.12488,-0.233259,-0.379633,-0.223627,-0.094527,0.315743,-0.257729,0.225395,-0.420748,-0.290764,0.099113,0.135497,-0.145315,0.020058,-0.14055,0.137613,0.250853,0.155612,0.0983,-0.198457,-0.012839,0.139257,-0.012181,0.031985,0.008579,-0.32086,0.069528,0.179752,0.07843,-0.002521,0.091253,-0.087994,-0.050898,-0.007073,0.003155,-0.007592,-0.154382,0.039227,0.049126,-0.016845,293663,True
33,-3.025733,0.452009,0.569835,-0.012156,0.077119,0.603476,-0.061175,0.494815,0.255032,0.180839,-0.497236,0.153445,-0.157729,-0.019355,0.237743,-0.176041,-0.021561,-0.084512,0.033696,0.214463,-0.162928,0.270032,-0.102771,-0.47829,0.016493,-0.253174,-0.357991,-0.286963,0.102944,-0.108259,0.003469,0.035401,-0.100359,0.193346,0.0293,-0.149154,-0.102377,0.009684,0.024743,0.058796,-0.086154,0.002345,-0.103037,-0.026533,-0.028263,-0.021843,0.002863,-0.016609,0.014546,0.00379,0.019878,293664,True
47,4.187463,-0.394494,0.479898,-0.233034,-0.596459,0.218469,0.878399,0.214015,-0.842525,-0.479583,-0.51523,-0.088774,0.163516,0.01194,-0.007484,-0.268606,0.137545,-0.14231,-0.164881,-0.369439,-0.232278,0.263299,0.006496,-0.085548,-0.124817,-0.210908,-0.281515,-0.296583,0.074475,0.192288,0.204259,0.114066,-0.072448,0.01538,-0.064868,0.059536,-0.154255,0.013063,0.059729,0.07747,-0.046465,0.089331,0.000975,0.060426,0.082871,0.036766,-0.02011,-0.000661,0.043974,0.0026,-0.014989,293665,True


In [77]:
df_max.tail()

Unnamed: 0,pca_f1,pca_f2,pca_f3,pca_f4,pca_f5,pca_f6,pca_f7,pca_f8,pca_f9,pca_f10,pca_f11,pca_f12,pca_f13,pca_f14,pca_f15,pca_f16,pca_f17,pca_f18,pca_f19,pca_f20,pca_f21,pca_f22,pca_f23,pca_f24,pca_f25,pca_f26,pca_f27,pca_f28,pca_f29,pca_f30,pca_f31,pca_f32,pca_f33,pca_f34,pca_f35,pca_f36,pca_f37,pca_f38,pca_f39,pca_f40,pca_f41,pca_f42,pca_f43,pca_f44,pca_f45,pca_f46,pca_f47,pca_f48,pca_f49,pca_f50,pca_f51,event_id,is1
181954,-0.420782,0.367295,-2.215311,0.147098,-1.450481,0.00525,-0.060005,-0.634016,0.295776,-0.457399,-0.214347,-0.200175,0.036148,0.191502,-0.059799,0.178121,0.041926,0.174877,0.114498,-0.145265,0.002056,-0.11718,-0.178593,0.012704,-0.068009,0.227211,0.22543,0.088106,0.200091,-0.129507,0.027137,0.139372,-0.041286,-0.004751,0.069629,0.132614,0.212089,-0.09429,-0.137207,0.080912,-0.014117,0.033936,-0.029023,-0.044744,-0.054258,-0.048317,0.001716,-0.028512,0.046213,0.019799,0.023445,348040,False
181964,0.446541,0.033489,-1.222378,0.056826,-0.804497,0.762249,0.001153,-0.813617,-0.397407,0.145464,-0.134352,-0.285464,-0.141053,-0.24918,-0.032567,-0.244437,0.002551,-0.169704,-0.218846,0.519649,-0.105973,0.005506,-0.168053,0.203887,-0.064271,-0.070512,0.170159,-0.204129,0.007461,0.130664,0.14789,-0.195335,-0.00129,-0.151047,0.15806,-0.113908,0.140165,-0.095795,0.073935,0.249861,0.088759,0.015884,0.148719,0.10391,0.073405,0.071875,-0.045114,-0.01971,0.030754,-0.016497,0.088911,348041,False
181967,-2.898259,0.082841,1.085912,-0.379985,0.152615,-0.031621,0.425386,0.391143,-0.390524,0.173267,0.449516,-0.483977,0.069807,-0.283488,0.394206,-0.070617,-0.05191,0.027058,0.068334,0.118263,0.101347,0.013189,-0.081431,0.028748,-0.013882,-0.069852,-0.026399,0.235593,0.108625,0.102518,-0.139234,-0.06278,0.017846,-0.159302,-0.02768,0.075878,0.004525,0.021143,-0.091958,-0.01206,-0.035594,0.094391,0.038643,0.057716,0.016988,0.003894,-0.030481,0.01223,0.013907,0.028344,-0.024252,348042,False
181981,1.692738,-0.219342,-0.134714,0.519794,0.943814,-0.713607,0.680144,-0.225059,-0.530265,0.341555,-0.165134,0.144984,0.33471,-0.023614,0.051197,-0.111221,-0.007463,-0.150731,-0.015587,-0.174697,0.056703,0.20057,-0.045,-0.010865,-0.203451,0.026715,0.073639,-0.017371,-0.119201,0.129613,0.083644,0.131863,-0.131548,-0.051737,0.101705,0.007219,-0.072997,0.001956,0.046797,0.146782,0.072005,0.029037,0.030708,-0.033533,-0.056242,-0.043098,0.039586,0.04982,0.011285,-0.003145,-0.007938,348043,False
181982,-1.812888,0.102399,0.252073,-0.673932,-0.52225,-0.854582,-0.467537,-0.176926,0.923483,0.017217,-0.2495,-0.458307,0.054009,0.420046,-0.35582,-0.15016,-0.006861,-0.232413,-0.282938,-0.506817,0.182472,0.110418,0.20341,-0.190111,0.014969,0.040793,-0.093644,0.06387,-0.029692,-0.018305,-0.288024,0.091321,-0.107956,0.350536,0.117863,-0.103272,-0.088038,0.046236,0.116649,0.074016,0.016732,-0.043987,0.088667,0.20257,0.23419,0.060343,-0.004825,0.01465,0.025548,0.037734,-0.002836,348044,False


In [78]:
from sklearn.mixture import BayesianGaussianMixture
# Bayesian Gaussian mixture with 11 components, seeded for reproducibility.
BGM = BayesianGaussianMixture(n_components =11, random_state= seed)
# The active fit uses only the max-radius rows flagged is1; the commented-out
# lines are alternative training sets left by the author.
#%time BGM.fit(df_max[name_pca].values) # fit is1 | oos
%time BGM.fit(df_max[name_pca].values[df_max.is1.values]) # fit only is1
#%time BGM.fit(df[col_names].values[df.is1.values])

CPU times: user 1min 45s, sys: 804 ms, total: 1min 45s
Wall time: 56.2 s


BayesianGaussianMixture(covariance_prior=None, covariance_type='full',
            degrees_of_freedom_prior=None, init_params='kmeans',
            max_iter=100, mean_precision_prior=None, mean_prior=None,
            n_components=11, n_init=1, random_state=7, reg_covar=1e-06,
            tol=0.001, verbose=0, verbose_interval=10, warm_start=False,
            weight_concentration_prior=None,
            weight_concentration_prior_type='dirichlet_process')

In [79]:
# Assign every row of df_max (one max-radius row per event, is1 or not) to a
# mixture component, then show the first 50 labels.
clusters = BGM.predict(df_max[name_pca].values)
#clusters = BGM.predict(df_pca[['radius_max']].values)
clusters[:50]

array([ 1, 10, 10, 10,  1,  9, 10, 10,  1, 10, 10,  5, 10, 10,  0,  3, 10,
        5, 10,  7,  3, 10,  6,  4,  4,  1,  7,  1,  1,  1,  5, 10,  8,  0,
       10,  3,  9,  3, 10,  9,  5,  7,  8,  9,  5,  6, 10, 10,  0,  7])

In [80]:
# name_cl: the distinct cluster labels; fr_cl: how many df_max rows (one per
# event) received each label.
name_cl, fr_cl =np.unique(clusters, return_counts =True)
print name_cl
print fr_cl

[ 0  1  2  3  4  5  6  7  8  9 10]
[1338  994  330 1509 1731 2209 1814 1757 1098 1448 3609]


In [81]:
df_max['cluster'] = clusters

In [82]:
# Give every row the cluster of its event's max-radius row. Mapping event_id
# through an event_id -> cluster Series (instead of re-binding df_pca to a
# join) keeps the cell re-runnable: a second run overwrites 'cluster' rather
# than failing on the already-present column.
event_cluster = df_max.set_index('event_id')['cluster']
df_pca['cluster'] = df_pca['event_id'].map(event_cluster)
df_pca.head()

Unnamed: 0,pca_f1,pca_f2,pca_f3,pca_f4,pca_f5,pca_f6,pca_f7,pca_f8,pca_f9,pca_f10,pca_f11,pca_f12,pca_f13,pca_f14,pca_f15,pca_f16,pca_f17,pca_f18,pca_f19,pca_f20,pca_f21,pca_f22,pca_f23,pca_f24,pca_f25,pca_f26,pca_f27,pca_f28,pca_f29,pca_f30,pca_f31,pca_f32,pca_f33,pca_f34,pca_f35,pca_f36,pca_f37,pca_f38,pca_f39,pca_f40,pca_f41,pca_f42,pca_f43,pca_f44,pca_f45,pca_f46,pca_f47,pca_f48,pca_f49,pca_f50,pca_f51,radius,event_id,radius_max,cluster
0,-3.96563,0.475024,-0.458396,-0.437761,0.459468,-0.971297,1.009715,-0.549283,0.092487,-0.436671,0.288159,-0.383869,-0.496406,-0.225241,0.044298,-0.001694,0.345873,0.129309,-0.170947,-0.282424,-0.260633,-0.128401,-0.24067,0.047086,-0.015175,0.021386,-0.281638,0.329844,0.152668,0.181906,-0.228965,0.047199,0.166749,-0.096677,0.114633,-0.095767,0.149982,-0.029129,0.014214,0.094271,-0.029527,-0.010815,0.02253,0.042726,0.032898,0.01826,-0.021068,0.001954,0.030846,-0.016273,0.004811,4.510677,293661,5.316903,1
1,-1.368812,-0.006947,-0.092028,-0.709256,0.177596,-0.400748,-0.432677,-0.340324,0.244105,0.464674,0.609016,0.01073,-0.893683,-0.408752,0.082643,0.075791,-0.125073,0.150471,-0.358299,-0.02455,0.008662,0.027252,0.206629,0.008957,0.04562,-0.173685,0.195684,-0.064586,0.095305,-0.031151,-0.207162,-0.355872,0.019206,-0.143756,0.186557,-0.090538,0.103422,-0.022464,-0.111724,-0.304364,0.128531,0.066386,0.052475,-0.144639,-0.144402,-0.027196,0.045439,0.002482,0.008272,-0.004227,-0.014723,2.284585,293661,5.316903,1
2,-0.958706,-0.110968,0.112799,-0.253621,-0.233773,-0.4487,0.078242,-0.387077,0.210951,-0.204571,0.384731,-0.326743,-0.060698,-0.473298,-0.043503,0.410968,0.384015,0.309765,-0.252184,-0.007075,-0.142722,0.248955,-0.103244,0.067656,0.101819,0.092514,0.138265,-0.016673,0.041744,-0.014458,-0.112835,0.044704,0.104382,-0.042308,-0.151314,0.008361,-0.047636,0.021052,0.013151,0.018956,-0.008926,-0.081109,-0.029422,0.005524,0.022653,-0.021301,-0.000937,-0.018574,-0.025988,-0.002146,0.0678,1.634844,293661,5.316903,1
3,-0.91103,-0.01337,-0.459013,0.196185,0.053641,-0.314621,-0.861711,0.039117,-0.175894,-0.135009,-0.103966,0.133928,0.165781,-0.216123,-0.379095,-0.3778,0.037477,-0.211301,0.185288,-0.417862,0.734184,-0.095048,0.299701,0.000799,0.003822,0.098978,0.11154,-0.062437,-0.296788,-0.111029,0.207358,-0.071403,-0.07887,0.042843,-0.023222,-0.00542,-0.014227,0.043674,-0.001294,-0.147973,0.053836,-0.00015,-0.00604,0.043153,0.045375,-0.001744,0.001126,-0.026722,-0.012083,0.010317,0.000236,1.863281,293661,5.316903,1
4,-1.30464,0.336474,-1.493462,0.310679,0.046025,0.595846,0.370766,0.064179,-0.141242,-0.33912,0.037521,-0.478799,0.449629,-0.190379,-0.11085,-0.112689,0.162289,0.145524,-0.033562,0.236474,-0.643673,-7.1e-05,-0.018297,0.029151,0.046942,0.097443,0.177624,0.031643,0.131602,0.126083,-0.175732,-0.063868,-0.065255,0.025453,-0.01258,0.007448,-0.143996,0.034184,0.061798,0.051067,-0.024434,-0.035129,-0.019685,-0.009579,0.029039,0.069514,-0.037928,0.013388,-0.041592,0.000152,-0.030064,2.438887,293661,5.316903,1


In [83]:
df_pca.tail()

Unnamed: 0,pca_f1,pca_f2,pca_f3,pca_f4,pca_f5,pca_f6,pca_f7,pca_f8,pca_f9,pca_f10,pca_f11,pca_f12,pca_f13,pca_f14,pca_f15,pca_f16,pca_f17,pca_f18,pca_f19,pca_f20,pca_f21,pca_f22,pca_f23,pca_f24,pca_f25,pca_f26,pca_f27,pca_f28,pca_f29,pca_f30,pca_f31,pca_f32,pca_f33,pca_f34,pca_f35,pca_f36,pca_f37,pca_f38,pca_f39,pca_f40,pca_f41,pca_f42,pca_f43,pca_f44,pca_f45,pca_f46,pca_f47,pca_f48,pca_f49,pca_f50,pca_f51,radius,event_id,radius_max,cluster
181984,-0.205857,0.046213,-0.262328,-0.544579,0.391583,-0.263815,-0.091872,-0.187139,-0.607862,0.036442,-0.275287,-0.201388,0.106388,-0.249283,0.988086,0.066394,-0.007639,0.128353,0.117004,0.196537,0.169025,-0.418349,-0.062843,0.139732,-0.063081,0.003425,-0.019238,0.022993,0.160207,0.196936,0.221904,-0.10407,-0.104814,-0.271816,0.043671,0.013885,0.003504,-0.016249,0.049688,0.010381,-0.010131,0.001626,0.083532,-0.039795,-0.02452,0.01787,-0.022241,0.041709,0.056745,-0.00531,0.007664,1.657682,348044,2.728162,4
181985,-0.560418,-0.066324,0.209394,0.603091,0.034402,0.135804,0.606321,0.344407,0.363943,-0.367062,0.536879,0.780592,-0.102542,-0.038813,-0.51607,0.12361,0.110038,-0.150697,0.010191,0.263108,0.025078,-0.043705,0.188683,-0.023777,0.048957,-0.086187,0.007946,0.162077,-0.050827,-0.011938,0.067505,0.076057,0.058427,0.126559,-0.220457,-0.177614,0.049682,0.007364,-0.133332,-0.027734,0.082446,-0.209408,0.072863,-0.510571,0.424915,0.051416,-0.158318,-0.027375,-0.016481,0.199499,-0.01082,1.889139,348044,2.728162,4
181986,1.080464,-0.125079,0.117892,0.357487,0.082169,0.091252,-0.567474,-0.045162,-0.118669,0.095505,0.07216,-0.352333,-0.038792,-0.332391,0.626519,-0.147912,-0.154666,0.157899,0.030803,-0.121196,0.119843,-0.092696,-0.183843,0.400229,-0.03302,0.001834,0.163093,-0.124005,-0.07843,0.057751,0.097834,-0.074739,-0.074104,0.000237,0.016498,0.159829,0.187909,-0.059914,0.018854,0.156631,0.112,0.253506,-0.19257,0.089835,-0.142241,-0.15384,-0.028278,0.024257,0.014283,-0.00449,0.061224,1.719968,348044,2.728162,4
181987,1.322641,0.038814,-0.234455,0.012107,-0.160549,0.335845,0.813282,0.47036,-0.144126,-0.782871,0.227114,0.354059,-0.317756,0.787691,-0.957388,0.264934,0.45828,-0.018025,-0.018637,0.248426,-0.289012,0.469922,-0.141957,-0.014098,0.129316,0.001226,-0.289352,-0.024394,0.018035,0.040297,0.02805,-0.051894,0.226373,-0.003471,0.22752,0.014412,0.101259,-0.047992,0.014958,-0.090137,-0.112244,0.035137,0.000278,0.139267,-0.230695,-0.01437,0.151181,-0.063509,0.003662,-0.143638,-0.051802,2.512098,348044,2.728162,4
181988,1.528876,-0.222407,-0.35118,0.319467,0.077415,0.504309,-0.130409,-0.20466,-0.837138,0.611499,-0.345275,-0.145286,0.016002,-0.26173,0.393581,-0.372469,-0.564377,0.130698,0.210396,-0.049357,-0.028397,-0.072864,0.052593,-0.14207,-0.07955,0.092344,0.121681,-0.021408,-0.017356,-0.132053,0.043813,0.039677,-0.098787,-0.016249,-0.15665,-0.037014,-0.093934,0.034877,-0.07733,-0.152112,-0.027567,-0.028024,-0.061599,-0.018159,-0.09713,0.021833,-0.002834,0.023295,-0.019302,0.000213,0.01923,2.250259,348044,2.728162,4


In [42]:
#df_pca['cluster'] = clusters

In [84]:
# Clusters that received 400 events or fewer are folded into one extra label, 11.
rare_cluster_relabel = {}
for label, n_events in zip(name_cl, fr_cl):
    if n_events <= 400:
        rare_cluster_relabel[label] = 11
df_pca['cluster'] = df_pca['cluster'].replace(rare_cluster_relabel)

In [85]:
# Keep copies of the model's original masks: the per-cluster fitting loop
# below overwrites mod.is1 / mod.is2 / mod.oos, and the baseline fit restores
# them from these copies.
is1 = mod.is1.copy()
is2 = mod.is2.copy()
oos = mod.oos.copy()

In [86]:
df_pca['cluster'].value_counts()

10    33755
5     22746
4     18656
7     17289
6     17189
3     15632
0     14568
9     14477
8     12472
1      9908
11     5297
Name: cluster, dtype: int64

In [87]:
# Radius distribution of the rows in the merged rare-cluster label (11).
# `.loc` replaces the deprecated `.ix`; a boolean row mask plus a column label
# behaves the same in both.
df_pca.loc[df_pca['cluster'] == 11, 'radius'].describe()

count    5297.000000
mean        6.161232
std         4.430684
min         1.961421
25%         3.821712
50%         4.190160
75%         5.113119
max        18.895350
Name: radius, dtype: float64

In [88]:
df_pca['cluster'].unique()

array([ 1, 10,  9,  5,  0,  3,  7,  6,  4,  8, 11])

In [89]:
%%time
model_coefs, model_step1prob, model_step2prob, model_likelihood = {}, {}, {}, {}
train_val_test = {}
    
for cl in df_pca['cluster'].unique():

    events_tr = df_pca['event_id'][df.is1.values & (df_pca['cluster'] ==cl)].values
    events_ts = df_pca['event_id'][df.oos.values & (df_pca['cluster'] ==cl)].values
    train_val_test[cl] = events_tr, events_tr, events_ts
       
        
    mod.is1 = np.in1d(av.event_id, events_tr)
    mod.is2 = np.in1d(av.event_id, events_tr)
    mod.oos = np.in1d(av.event_id, events_ts)

        
         
    model_coefs[cl], model_step1prob[cl], model_step2prob[cl], model_likelihood[cl]\
    = mod.fit_slices(tsav, factors,  depth=3, lmbd=10, verbose=False, fit_afresh=True)

    print 'cluster {}  number  {}'.format(cl, df_pca['cluster'].value_counts()[cl])
    print 'LL  {}          {}            {}'.format (len(events_tr), len(events_tr), len(events_ts))
    print model_likelihood[cl]
    print '..................'

. . . . . . . . . . 10
cluster 1  number  9908
LL  7541          7541            2367
[[-1677.20289616 -1781.05918683 -1781.05918683]
 [-1674.9021682  -1777.11448796 -1777.11448796]
 [-1678.98021447 -1766.19597252 -1766.19597252]
 [-1670.80294596 -1752.78431763 -1752.78431763]
 [-1689.52509483 -1731.82275672 -1731.82275672]
 [-1698.90619836 -1722.19876846 -1722.19876846]
 [-1739.97309349 -1758.6834853  -1758.6834853 ]
 [-1753.98917413 -1776.16346442 -1776.16346442]
 [-1743.07102253 -1761.49921566 -1761.49921566]
 [-1732.93524184 -1758.99813183 -1758.99813183]
 [    0.             0.             0.        ]]
..................
. . . . . . . . . . 10
cluster 10  number  33755
LL  20916          20916            12839
[[-1649.537185   -1837.08054072 -1837.08054072]
 [-1653.55299908 -1838.28369477 -1838.28369477]
 [-1657.84603456 -1839.56125189 -1839.56125189]
 [-1662.50145941 -1817.97453035 -1817.97453035]
 [-1675.83815275 -1801.74898641 -1801.74898641]
 [-1716.0544172  -1766.85633838 -17

In [90]:
# Put the original masks back (the per-cluster loop overwrote them) ...
mod.is1 = is1
mod.is2 = is2
mod.oos = oos

# ... and fit the baseline ("old") model on them with the same fit_slices
# settings as the per-cluster fits.
old_model_coefs, old_model_step1prob, old_model_step2prob, old_model_likelihood \
    =   mod.fit_slices(tsav, factors, depth=3, lmbd=10, verbose=False, fit_afresh=True)

print old_model_likelihood
print '..................'

. . . . . . . . . . 10
[[-1809.60545794 -1997.12919856 -1997.12919856]
 [-1815.29560313 -1993.38729693 -1993.38729693]
 [-1820.46573311 -1991.13279316 -1991.13279316]
 [-1825.04841809 -1978.8072344  -1978.8072344 ]
 [-1845.34679456 -1957.00310184 -1957.00310184]
 [-1863.07407698 -1951.28943929 -1951.28943929]
 [-1903.75005795 -1924.17873405 -1924.17873405]
 [-1921.12493591 -1902.65155685 -1902.65155685]
 [-1923.8939621  -1902.50302162 -1902.50302162]
 [-1928.88792622 -1895.4264179  -1895.4264179 ]
 [    0.             0.             0.        ]]
..................


In [94]:
# Cluster labels selected for the simulation exports further down
# (cl_list is re-assigned before each export).
cl_list = [10, 9, 1, 6]
# event_id -> cluster label, taken from the first row of each event.
df1 = df_pca.groupby('event_id')['cluster'].first()

In [92]:
# Per-row cluster code aligned with `av`: 0 means the row's event has no
# cluster; otherwise the code is the 1-based position of the event's cluster
# in df_pca['cluster'].unique() -- NOT the cluster label itself.
cl_n = np.zeros((len(av.event_id)))
for i,k in enumerate(df_pca['cluster'].unique()):
    mask = np.in1d(av.event_id,df1.index[df1 ==k])
    cl_n[mask] = i+1
            
np.unique(cl_n, return_counts =True)

(array([  0.,   1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,
         11.]),
 array([1449862,    9908,   33755,   14477,   22746,   14568,   15632,
          17289,   17189,   18656,   12472,    5297]))

In [93]:
np.unique(cl_n[oos], return_counts =True)

(array([  1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,  11.]),
 array([ 2367, 12839,  4808,  7083,  4805,  4085,  3929,  4124,  6367,
         3577,  2291]))

In [95]:
# Export simulation data with the per-cluster results passed for the labels in cl_list.
# The printed count sums fr_cl, i.e. the per-event label counts taken before
# the rare clusters were merged into label 11.
# NOTE(review): how write_dic_to_simdata combines the baseline and per-cluster
# arguments is not visible here -- see prediction.tools.clustering.
cl_list = [10, 9, 1, 6]
print 'count events  ',fr_cl[np.in1d(name_cl, cl_list)].sum()
write_dic_to_simdata('simdata_max_radius_claster.p', old_model_step1prob, old_model_coefs, mod.oos, av =av, data =df1,
                    cluster_step1probs =model_step1prob, cluster_coefs =model_coefs, cluster_names =cl_list)

count events   7865


In [96]:
# Column-wise mean of the baseline likelihoods; the last row is dropped
# because it is all zeros in the output printed by the baseline fit.
old_model_likelihood[:-1].mean(axis =0)

array([-1865.6492966 , -1949.35087946, -1949.35087946])

In [97]:
# Same column-wise mean (last row excluded) for each per-cluster model, for
# comparison with the baseline figures above.
for cl in df_pca['cluster'].unique():
    print 'cluster   ', cl
    print model_likelihood[cl][:-1].mean(axis =0)
    print '.................'

cluster    1
[-1706.028805   -1758.65197873 -1758.65197873]
.................
cluster    10
[-1710.93846292 -1783.6658981  -1783.6658981 ]
.................
cluster    9
[-1707.68244908 -1841.81721083 -1841.81721083]
.................
cluster    5
[-1987.96470228 -2106.67310486 -2106.67310486]
.................
cluster    0
[-2032.75522223 -2102.42332001 -2102.42332001]
.................
cluster    3
[-1935.69701619 -2051.43355806 -2051.43355806]
.................
cluster    7
[-1896.28706098 -1953.93867134 -1953.93867134]
.................
cluster    6
[-1671.92912664 -1880.93150794 -1880.93150794]
.................
cluster    4
[-1895.15785243 -2023.917965   -2023.917965  ]
.................
cluster    8
[-2015.51932318 -2077.46622197 -2077.46622197]
.................
cluster    11
[-2017.82520628 -2220.05530298 -2220.05530298]
.................


In [98]:
# Second export: same call as the first one, with cluster 7 added to cl_list
# and a different output file name.
# NOTE(review): this cell is a copy of the previous export with one value
# changed -- a small helper taking (filename, cl_list) would remove the duplication.
cl_list = [10, 9, 1, 6, 7]
print 'count events  ',fr_cl[np.in1d(name_cl, cl_list)].sum()
write_dic_to_simdata('simdata_max_radius_claster2.p', old_model_step1prob, old_model_coefs, mod.oos, av =av, data =df1,
                    cluster_step1probs =model_step1prob, cluster_coefs =model_coefs, cluster_names =cl_list)

count events   9622


In [99]:
cl_list = [10, 9, 1, 6, 7, 4]
print 'count events  ',fr_cl[np.in1d(name_cl, cl_list)].sum()

write_dic_to_simdata('simdata_max_radius_claster3.p', old_model_step1prob, old_model_coefs, mod.oos, av =av, data =df1,
                    cluster_step1probs =model_step1prob, cluster_coefs =model_coefs, cluster_names =cl_list)

count events   11353


In [131]:
# Preview the first rows of the cos_1 .. cos_N columns of df_max, where N is the
# number of columns in df_max[name_pca].
n_cos = df_max[name_pca].shape[1]
cos_cols = ['cos_{}'.format(i + 1) for i in range(n_cos)]
df_max[cos_cols].head()

Unnamed: 0,cos_1,cos_2,cos_3,cos_4,cos_5,cos_6,cos_7,cos_8,cos_9,cos_10,cos_11,cos_12,cos_13,cos_14,cos_15,cos_16,cos_17,cos_18,cos_19,cos_20,cos_21,cos_22,cos_23,cos_24,cos_25,cos_26,cos_27,cos_28,cos_29,cos_30,cos_31,cos_32,cos_33,cos_34,cos_35,cos_36,cos_37,cos_38,cos_39,cos_40,cos_41,cos_42,cos_43,cos_44,cos_45,cos_46,cos_47,cos_48,cos_49,cos_50,cos_51
9,0.117547,1.08462,0.821546,1.031945,1.28837,1.080651,0.895926,0.917483,1.124798,0.970805,1.014134,0.928955,0.893491,0.955484,0.94164,0.988698,1.087764,1.003244,1.031667,1.001378,0.95709,1.081271,0.957223,1.026097,1.028625,1.075945,1.04491,1.05274,1.006281,1.041696,0.971016,1.024133,0.995065,1.004393,1.009192,1.016503,1.004205,1.008212,0.994671,0.989643,0.998031,0.985963,1.014922,0.995624,0.997387,0.998076,1.003937,0.997488,0.992972,1.000158,0.997439
16,1.850288,1.034742,0.913379,0.934204,1.172543,0.906148,1.241989,0.792572,0.908437,0.80894,0.938855,0.959612,1.1394,1.036186,0.987994,1.007832,0.912478,0.906815,0.983923,0.98264,0.950591,0.920476,0.96147,1.003194,1.07082,1.044216,1.019273,0.927897,0.987769,0.958123,0.949438,0.96636,0.963026,0.98967,0.987069,0.998796,0.976348,0.987914,1.030303,1.002624,0.997037,1.021471,1.007771,0.978379,1.008164,0.971617,1.016451,0.95851,1.010786,1.003401,1.006515
24,1.88461,0.935104,1.074624,1.170504,0.767919,1.054054,1.046296,0.937763,1.028308,1.109453,1.118008,0.965829,1.063827,1.10388,1.061192,1.025866,0.913602,1.070523,0.938324,1.115131,1.079562,0.972879,0.962924,1.039763,0.994511,1.038459,0.962345,0.931358,0.957419,0.973102,1.054304,1.003513,0.961895,1.003333,0.991248,0.997652,1.087798,0.980975,0.950814,0.978539,1.00069,0.97503,1.024078,1.013927,1.001935,0.999137,1.002077,1.042244,0.989266,0.986557,1.004609
33,1.889062,0.867185,0.832563,1.003572,0.97734,0.822678,1.017975,0.854607,0.925063,0.946863,1.146105,0.954913,1.046346,1.005687,0.930143,1.051727,1.006335,1.024833,0.990099,0.936983,1.047874,0.920655,1.030198,1.140538,0.995154,1.074391,1.10519,1.084319,0.969752,1.03181,0.998981,0.989598,1.029489,0.943188,0.991391,1.043827,1.030082,0.997155,0.99273,0.982724,1.025315,0.999311,1.030276,1.007796,1.008305,1.006418,0.999159,1.00488,0.995726,0.998886,0.994159
47,0.091347,1.085603,0.895865,1.050567,1.129428,0.952594,0.809393,0.95356,1.182823,1.104067,1.111802,1.019263,0.964518,0.997409,1.001624,1.058286,0.970154,1.03088,1.035778,1.080166,1.050403,0.942866,0.998591,1.018563,1.027085,1.045766,1.061087,1.064357,0.983839,0.958275,0.955677,0.975248,1.015721,0.996663,1.014076,0.987081,1.033472,0.997165,0.987039,0.983189,1.010083,0.980616,0.999788,0.986888,0.982017,0.992022,1.004364,1.000144,0.990458,0.999436,1.003253


In [132]:
%time BGM.fit(df_max[['cos_{}'.format(i+1) for i in range(df_max[name_pca].shape[1])]].values[df_max.is1.values]) # fit only is1

CPU times: user 2min 36s, sys: 1.67 s, total: 2min 38s
Wall time: 1min 41s


BayesianGaussianMixture(covariance_prior=None, covariance_type='full',
            degrees_of_freedom_prior=None, init_params='kmeans',
            max_iter=100, mean_precision_prior=None, mean_prior=None,
            n_components=11, n_init=1, random_state=7, reg_covar=1e-06,
            tol=0.001, verbose=0, verbose_interval=10, warm_start=False,
            weight_concentration_prior=None,
            weight_concentration_prior_type='dirichlet_process')

In [133]:
# Label every row of df_max with BGM.predict on the cos_1 .. cos_N columns, then
# tabulate the distinct labels (name_cl) and how many rows got each one (fr_cl).
cos_cols = ['cos_{}'.format(i + 1) for i in range(df_max[name_pca].shape[1])]
clusters = BGM.predict(df_max[cos_cols].values)
name_cl, fr_cl = np.unique(clusters, return_counts=True)
name_cl, fr_cl

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10]),
 array([1792, 2401, 1972,  327, 1853, 1199, 1317, 3057,  630, 1254, 2035]))