In [1]:
# ThoroughBet Simulation


## Load necessary modules

In [2]:
import numpy as np

from utils import settings, timestamp, YEAR
from utils.arrayview import ArrayView, TimeseriesView


from prediction.models.preprocessing import Model
from prediction.models.prediction import factornames_trimmed
from prediction.models.parameters import factor_build_end
from prediction.tools.clustering import write_dic_to_simdata, dic_to_tenzor, ll_diff

## Load data

In [3]:
# Main ArrayView used throughout the notebook, read from a bcolz store whose
# path is resolved through settings.paths.
av = ArrayView.from_file(settings.paths.join('brain_final2cut.av.bcolz'))

In [4]:
# Weather ArrayView, read from a bcolz store resolved through settings.paths.
# NOTE(review): av_w is not referenced again in the visible part of this
# notebook -- confirm it is still needed.
av_w = ArrayView.from_file(settings.paths.join('weather.av.bcolz'))

In [5]:
# Load consecutively numbered slice views (0, 1, 2, ...) into `tsav`, keyed by
# slice number, and stop at the first slice whose load raises ValueError
# (presumably the first slice that does not exist -- TODO confirm).
# NOTE(review): the path below is absolute and user-specific, unlike the
# settings.paths-based loads used for the other views.
tsav, sl = {}, 0
while True:
    try:
        tsav[sl] = ArrayView.from_file(
            '/home/oleg/thbmodel/racingdata/brain_final2_slice_%s.av.bcolz' % sl)
    except ValueError:
        break
    else:
        sl += 1

In [6]:
# Build the model on `av`; the out-of-sample start is set one YEAR after
# factor_build_end.
mod = Model(av, oos_start=factor_build_end+YEAR)

In [7]:
# Factor ids handed to the preprocessing step as `high_kurtosis_factors`.
high_kurtosis_factors = [
    'z64f5be67e', 'z90adc182a', 'z7081bf371', 'z34b808e99',
    'z757be272e', 'z5a85cd6a9', 'zf991b634a', 'z62651f605',
    'zd002b7067', 'z2ef7fedca', 'z6f11029f7', 'z412893062',
    'z919b9585a', 'z89b0eda37', 'z31780b3f4', 'z6631693d3',
    'z0b27f29ad', 'zd7cd94e4c', 'zf5b2aef2a',
]

# Factor ids handed to the preprocessing step as `price_factors`.
price_factors = [
    'zb392bb74a', 'z6809c316d', 'zd678f0538', 'z027f9f0f5',
    'z88e79930c', 'z4a72dc02f', 'z1a3573928', 'z7b15df227',
]

In [8]:
# Preprocess the trimmed factor list with the model and keep the result in
# `factors`; %time reports how long the call takes.
# NOTE(review): _preprocess_factors is a private method of Model -- confirm it
# is the intended entry point.
%time factors = mod._preprocess_factors(factornames_trimmed, high_kurtosis_factors = high_kurtosis_factors,\
                                        price_factors = price_factors, verbose=True)

INFO:models:Getting factors from av and rescaling...


. . . . .

INFO:models:Filling in missing values...
INFO:models:Computing each factor as linear combination of all the others...


 . . . . .

INFO:models:Number of missing patterns: 7754


 . . . . . . .

INFO:models:Transforming factors by applying CL-model on their Taylor expansions...


 . . .

  coefse = np.sqrt(np.diag(information_matrix))


 . .CPU times: user 3min 44s, sys: 2.58 s, total: 3min 46s
Wall time: 1min 7s



In [9]:
# Rows selected by any of the model's three masks (is1, is2 or oos).
predict_mask = mod.is1|mod.is2|mod.oos

In [10]:
def old_data(num, is1=mod.is1):
    """Mask the rows of ``av`` that belong to the last ``num`` events before ``is1``.

    Parameters
    ----------
    num : number
        How many of the most recent earlier event ids to keep (cast to int).
    is1 : boolean array
        Mask over the rows of ``av``; the default is ``mod.is1`` as it was
        when this function was defined.

    Returns
    -------
    Boolean array over the rows of ``av``, True where the row's event id is
    one of the selected earlier events.
    """
    # Event id of the first row flagged in is1 acts as the upper boundary.
    first_flagged_row = np.nonzero(is1 == True)[0][0]
    boundary_event = av.event_id[first_flagged_row]

    # np.unique returns sorted ids, so the tail holds the largest ones.
    earlier_events = np.unique(av.event_id[av.event_id < boundary_event])
    selected_events = earlier_events[-int(num):]

    return np.in1d(av.event_id, selected_events)

In [11]:
# Mask of the rows belonging to the 4000 most recent events before is1.
mask_past = old_data(4000)

In [12]:
# First row index selected by predict_mask and by mask_past, shown as a pair.
np.where(predict_mask ==True)[0][0], np.where(mask_past ==True)[0][0]

(1123738, 1085943)

In [13]:
def new_factors_array(X, predict_mask=predict_mask):
    """Scatter the rows of ``X`` into a zero-filled (columns of X, len(mask)) array.

    ``X`` holds one row per True entry of ``predict_mask``. The result has one
    row per column of ``X`` and one column per entry of ``predict_mask``;
    positions where the mask is False stay zero.
    """
    n_features = X.shape[1]
    n_rows = predict_mask.shape[0]

    factors_new = np.zeros((n_features, n_rows))
    factors_new[:, predict_mask] = X.T
    return factors_new

In [14]:
def DF(mask, factors, av, factors_names, other_names):
    """Build a DataFrame of the masked factor values plus extra ``av`` columns.

    Parameters
    ----------
    mask : boolean array
        Selects the columns of ``factors`` (and entries of each ``av`` field).
    factors : 2-D array
        One row per factor; transposed so each factor becomes a column.
    av : mapping-like
        Indexed as ``av[name][mask]`` for every name in ``other_names``.
    factors_names : list of str
        Column names for the factor columns.
    other_names : list of str
        Names of the ``av`` fields appended after the factor columns.

    Returns
    -------
    pandas.DataFrame with the factor columns followed by the ``av`` columns.
    """
    import pandas as pd

    selected_factors = factors[:, mask].T
    frame = pd.DataFrame(data=selected_factors, columns=factors_names)

    for field in other_names:
        frame[field] = av[field][mask]

    return frame

In [15]:
import pandas as pd
pd.set_option('display.max_columns', 90)

# One generated column name per factor row (f1..fN). Derived from `factors`
# so the count cannot drift from the actual number of factors (DF requires
# one name per row of `factors`).
col_names = ['f{}'.format(i) for i in range(1, factors.shape[0] + 1)]

# Factor values for the rows in predict_mask, followed by the race/runner
# fields copied from `av`.
df = DF(predict_mask, factors, av, col_names,
        ['event_id', 'runner_id', 'result', 'start_time', 'going',
         'obstacle', 'distance', 'prize'])

# Per-row membership in the model's is1 and oos masks, restricted to the
# same rows as the frame.
df['is1'] = mod.is1[predict_mask]
df['oos'] = mod.oos[predict_mask]
df.head()

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18,f19,f20,f21,f22,f23,f24,f25,f26,f27,f28,f29,f30,f31,f32,f33,f34,f35,f36,f37,f38,f39,f40,f41,f42,f43,f44,f45,f46,f47,f48,f49,f50,f51,f52,f53,f54,f55,f56,f57,event_id,runner_id,result,start_time,going,obstacle,distance,prize,is1,oos
0,0.066411,1.812371,0.140033,0.007517,-0.000144,0.370123,0.8869,-0.000888,1.492165,-0.504111,0.174814,0.783295,0.47842,0.02652,1.220996,0.317993,-0.000908,-0.000375,0.004848,-0.017851,0.043987,-0.237318,1.049784,-0.046792,-0.006285,0.000628,-0.065017,0.001235,-0.018441,0.274973,-0.487135,0.084544,-0.116107,0.002079,-0.001362,-0.160866,0.091133,0.966224,2.384186e-07,2.49246,0.753668,-0.000403,0.234473,0.250251,0.023256,0.628183,0.0986,-0.014886,-0.089427,0.53691,-0.012463,-0.143313,0.943943,0.417261,-0.000921,-0.00078,0.1193,293661,360456,3,1443704000.0,GD-FM,F,1700.784058,3235.0,True,False
1,-0.154959,0.615217,-0.069783,-0.001667,0.001293,0.163312,-0.100933,-0.016348,0.145181,0.085654,0.057041,0.43098,0.132121,0.013572,0.413222,0.189882,-0.016198,-0.002962,0.014569,-0.015469,-0.026517,0.277365,-0.374422,0.029834,0.056561,-0.000221,0.116112,-0.002068,-0.024272,0.090061,-0.030274,-0.498561,0.035182,-0.01871,0.029094,0.031624,-0.061174,0.254454,2.384186e-07,1.648349,0.073634,-0.650115,0.177804,-0.052191,-0.034483,0.370725,-0.247407,-0.023127,0.550295,-0.121513,0.112166,0.040612,0.224962,0.057615,-0.016085,-0.01733,0.026977,293661,375590,5,1443704000.0,GD-FM,F,1700.784058,3235.0,True,False
2,0.066411,0.442863,0.069257,0.007517,-0.000144,0.207937,-0.100933,0.014814,-0.010327,-0.566212,0.355908,0.246179,-0.305634,0.031401,0.364949,0.144203,0.014812,0.017871,0.009206,0.01274,-0.01444,0.264392,0.339009,0.039109,-0.006285,0.000204,0.173482,0.001554,0.055569,-0.092425,-0.26665,0.043125,0.001112,0.002079,-0.011063,0.171808,0.014969,0.155725,2.384186e-07,0.941642,0.146075,-0.109316,0.106961,0.005249,0.083641,0.144267,0.0986,0.08141,-0.152771,-0.183572,-0.012463,0.159303,0.48867,0.036059,0.0148,0.014925,0.051179,293661,374610,7,1443704000.0,GD-FM,F,1700.784058,3235.0,True,False
3,-0.154959,0.765992,0.129201,-0.02548,-0.000144,0.112316,-0.257098,0.012801,0.304235,0.226926,-0.260937,-0.048408,-0.043558,0.019547,0.54989,-0.284258,0.012796,0.00347,0.009206,0.011642,-0.004117,-0.122674,-0.421721,0.048495,-0.006285,-3.1e-05,0.239139,0.001421,-0.018441,-0.13433,0.250868,0.486319,0.086636,0.002079,-0.009765,0.1904,0.130183,0.447011,2.384186e-07,0.498932,-0.041847,0.597555,0.001817,-0.037324,-0.019826,-0.163311,-0.247407,-0.014886,0.461782,0.289931,-0.012463,0.088135,-0.126984,-0.495465,0.012784,0.012911,-0.262583,293661,373638,1,1443704000.0,GD-FM,F,1700.784058,3235.0,True,False
4,0.066411,0.329832,-0.093768,0.007517,-0.000144,0.112316,0.503109,-0.024129,1.263628,-0.359817,0.086009,-0.048408,-0.043558,0.03145,1.252166,0.158341,-0.024174,-0.019263,0.009206,0.01274,-0.03828,0.10589,0.209527,-0.033071,-0.006285,-9.9e-05,-0.037533,0.000975,0.097792,0.022945,-0.043148,-0.245328,-0.009627,0.002079,0.011331,-0.122763,-0.217356,1.080508,2.384186e-07,-0.004469,0.049962,0.495737,0.116301,-0.110655,-0.044347,-0.163311,0.0986,0.0239,-0.237007,-0.440662,-0.012463,-0.125458,0.136239,0.442488,-0.024187,-0.024024,0.267151,293661,347906,4,1443704000.0,GD-FM,F,1700.784058,3235.0,True,False


In [16]:
from sklearn.decomposition import PCA
seed = 7
pca = PCA(n_components=51, random_state=seed)
# One output column name per principal component (pca_f1..pca_fN).
name_pca = ['pca_f%s' % i for i in range(1, pca.n_components + 1)]

# Project the factor columns onto the principal components.
# .loc replaces the deprecated .ix indexer; col_names are exactly the factor
# columns f1..fN that the frame was built with.
# NOTE(review): the PCA is fitted on every row of df, i.e. on is1, is2 and oos
# rows together -- confirm that fitting on out-of-sample rows is intended.
df_pca = pd.DataFrame(data=pca.fit_transform(df.loc[:, col_names]), columns=name_pca)

df_pca.head()

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  


Unnamed: 0,pca_f1,pca_f2,pca_f3,pca_f4,pca_f5,pca_f6,pca_f7,pca_f8,pca_f9,pca_f10,pca_f11,pca_f12,pca_f13,pca_f14,pca_f15,pca_f16,pca_f17,pca_f18,pca_f19,pca_f20,pca_f21,pca_f22,pca_f23,pca_f24,pca_f25,pca_f26,pca_f27,pca_f28,pca_f29,pca_f30,pca_f31,pca_f32,pca_f33,pca_f34,pca_f35,pca_f36,pca_f37,pca_f38,pca_f39,pca_f40,pca_f41,pca_f42,pca_f43,pca_f44,pca_f45,pca_f46,pca_f47,pca_f48,pca_f49,pca_f50,pca_f51
0,-3.96563,0.475024,-0.458396,-0.437761,0.459468,-0.971297,1.009715,-0.549283,0.092487,-0.436671,0.288159,-0.383869,-0.496406,-0.225241,0.044298,-0.001694,0.345873,0.129309,-0.170947,-0.282424,-0.260633,-0.128401,-0.24067,0.047086,-0.015175,0.021386,-0.281638,0.329844,0.152668,0.181906,-0.228965,0.047199,0.166749,-0.096677,0.114633,-0.095767,0.149982,-0.029129,0.014214,0.094271,-0.029527,-0.010815,0.02253,0.042726,0.032898,0.01826,-0.021068,0.001954,0.030846,-0.016273,0.004811
1,-1.368812,-0.006947,-0.092028,-0.709256,0.177596,-0.400748,-0.432677,-0.340324,0.244105,0.464674,0.609016,0.010729,-0.893683,-0.408752,0.082643,0.075791,-0.125073,0.150471,-0.358299,-0.02455,0.008662,0.027252,0.206629,0.008957,0.04562,-0.173685,0.195684,-0.064586,0.095305,-0.031151,-0.207162,-0.355872,0.019206,-0.143756,0.186557,-0.090538,0.103422,-0.022464,-0.111724,-0.304364,0.128531,0.066386,0.052475,-0.144639,-0.144402,-0.027196,0.045439,0.002482,0.008272,-0.004227,-0.014723
2,-0.958706,-0.110968,0.112799,-0.253621,-0.233773,-0.4487,0.078242,-0.387077,0.210951,-0.204571,0.384731,-0.326743,-0.060698,-0.473298,-0.043503,0.410968,0.384015,0.309765,-0.252184,-0.007075,-0.142722,0.248955,-0.103244,0.067656,0.101819,0.092514,0.138265,-0.016673,0.041744,-0.014458,-0.112835,0.044704,0.104382,-0.042308,-0.151314,0.008361,-0.047636,0.021052,0.013151,0.018956,-0.008926,-0.081109,-0.029422,0.005524,0.022653,-0.021301,-0.000937,-0.018574,-0.025988,-0.002146,0.0678
3,-0.91103,-0.01337,-0.459013,0.196185,0.053641,-0.314621,-0.861711,0.039117,-0.175894,-0.135009,-0.103966,0.133928,0.165781,-0.216123,-0.379095,-0.3778,0.037477,-0.211301,0.185288,-0.417862,0.734184,-0.095048,0.299701,0.000799,0.003822,0.098978,0.11154,-0.062437,-0.296788,-0.111029,0.207358,-0.071403,-0.07887,0.042843,-0.023222,-0.00542,-0.014227,0.043674,-0.001294,-0.147973,0.053836,-0.00015,-0.00604,0.043153,0.045375,-0.001744,0.001126,-0.026722,-0.012083,0.010317,0.000236
4,-1.30464,0.336474,-1.493462,0.310679,0.046025,0.595846,0.370766,0.064179,-0.141242,-0.33912,0.037521,-0.478799,0.449629,-0.190379,-0.11085,-0.112689,0.162289,0.145524,-0.033562,0.236474,-0.643673,-7.1e-05,-0.018297,0.029151,0.046942,0.097443,0.177624,0.031643,0.131602,0.126083,-0.175732,-0.063868,-0.065255,0.025453,-0.01258,0.007448,-0.143996,0.034184,0.061798,0.051067,-0.024434,-0.035129,-0.019685,-0.009579,0.029039,0.069514,-0.037928,0.013388,-0.041592,0.000152,-0.030064


In [17]:
df_pca['event_id'] = df['event_id']
# Keep only the winning runners (result == 1) with their PCA coordinates and
# event id. A single .loc selection plus .copy() gives an independent frame,
# so columns can be added to df_winner later without writing into a slice of
# df_pca (SettingWithCopyWarning).
df_winner = df_pca.loc[df['result'] == 1, name_pca + ['event_id']].copy()

In [18]:
# Show the last rows of the winners frame.
df_winner.tail()

Unnamed: 0,pca_f1,pca_f2,pca_f3,pca_f4,pca_f5,pca_f6,pca_f7,pca_f8,pca_f9,pca_f10,pca_f11,pca_f12,pca_f13,pca_f14,pca_f15,pca_f16,pca_f17,pca_f18,pca_f19,pca_f20,pca_f21,pca_f22,pca_f23,pca_f24,pca_f25,pca_f26,pca_f27,pca_f28,pca_f29,pca_f30,pca_f31,pca_f32,pca_f33,pca_f34,pca_f35,pca_f36,pca_f37,pca_f38,pca_f39,pca_f40,pca_f41,pca_f42,pca_f43,pca_f44,pca_f45,pca_f46,pca_f47,pca_f48,pca_f49,pca_f50,pca_f51,event_id
181957,2.34745,-0.100543,-0.334828,-0.175671,-0.179005,-0.210946,-0.199818,0.041611,-0.059043,-0.330114,-0.193491,-0.25457,0.378511,0.675069,0.073055,-0.18703,-0.06043,-0.172028,-0.377847,0.127879,-0.041111,-0.423021,-0.282202,0.192302,-0.1327,0.024536,-0.154612,-0.161062,-0.077698,-0.092356,0.153722,0.028502,0.023486,-0.007815,0.112169,-0.017988,0.064274,-0.050321,-0.083449,0.043324,0.063299,0.132962,0.146946,0.031076,-0.022518,0.042288,-0.031309,-0.025016,-0.209699,0.049595,0.036,348040
181964,0.446541,0.033489,-1.222378,0.056826,-0.804497,0.762249,0.001153,-0.813617,-0.397407,0.145464,-0.134352,-0.285464,-0.141053,-0.24918,-0.032567,-0.244437,0.002551,-0.169704,-0.218846,0.519649,-0.105973,0.005506,-0.168053,0.203887,-0.064271,-0.070512,0.170159,-0.204129,0.007461,0.130664,0.14789,-0.195335,-0.00129,-0.151047,0.15806,-0.113908,0.140165,-0.095795,0.073935,0.249861,0.088759,0.015884,0.148719,0.103909,0.073405,0.071875,-0.045114,-0.01971,0.030754,-0.016497,0.088911,348041
181971,-0.164926,0.067174,0.491288,-0.639513,0.114992,1.000453,0.631925,-0.81664,1.003752,-0.372973,-0.124748,0.467406,-0.231623,-0.205614,-0.401444,0.223316,0.292653,-0.262529,0.422491,-0.13245,0.28521,-0.065408,-0.184555,0.113024,-0.155853,0.246227,-0.085645,-0.01885,0.27855,0.096669,0.144906,-0.015112,0.201984,0.14354,0.249521,0.097246,-0.178665,0.030642,-0.019237,-0.027219,0.111184,0.084722,0.046927,-0.006709,0.003695,-0.018969,-0.078802,-0.048503,-0.031204,-0.014812,0.035665,348042
181981,1.692738,-0.219342,-0.134714,0.519794,0.943814,-0.713607,0.680144,-0.225059,-0.530265,0.341555,-0.165134,0.144984,0.33471,-0.023614,0.051197,-0.111221,-0.007463,-0.150731,-0.015587,-0.174697,0.056703,0.20057,-0.045,-0.010865,-0.203451,0.026715,0.073639,-0.017371,-0.119201,0.129613,0.083644,0.131863,-0.131548,-0.051737,0.101705,0.007219,-0.072997,0.001956,0.046797,0.146782,0.072005,0.029037,0.030708,-0.033533,-0.056242,-0.043098,0.039586,0.04982,0.011285,-0.003145,-0.007938,348043
181982,-1.812888,0.102399,0.252073,-0.673932,-0.52225,-0.854582,-0.467537,-0.176926,0.923483,0.017217,-0.2495,-0.458307,0.054009,0.420046,-0.35582,-0.15016,-0.006861,-0.232413,-0.282938,-0.506817,0.182472,0.110418,0.20341,-0.190111,0.014969,0.040793,-0.093644,0.06387,-0.029692,-0.018305,-0.288024,0.091321,-0.107956,0.350536,0.117863,-0.103272,-0.088038,0.046236,0.116649,0.074016,0.016732,-0.043987,0.088667,0.20257,0.23419,0.060343,-0.004825,0.01465,0.025548,0.037734,-0.002836,348044


In [19]:
# Identity matrix with one row per PCA column: row k is the unit vector along
# the k-th principal axis.
nor_vectors = np.identity(df_pca[name_pca].shape[1])

In [20]:
# Cosine distance (1 - cosine similarity) between every winner's PCA vector
# and every direction in nor_vectors; one cos_<k> column per direction.
winner_vectors = df_winner[name_pca].values

# Norms of the winner vectors (one per row) and of each direction vector.
# Each direction gets its own norm instead of reusing the norm of the first.
matrix_norms = np.linalg.norm(winner_vectors, axis=1)
vector_norms = np.linalg.norm(nor_vectors, axis=1)

# All dot products in one matrix product: entry (r, k) compares winner r with
# direction k.
cos_dist = 1 - np.divide(winner_vectors.dot(nor_vectors.T),
                         np.outer(matrix_norms, vector_norms))

name_cos = ['cos_' + str(k + 1) for k in range(len(nor_vectors))]
# Assign column by column so the cos_* columns keep their numeric order and
# re-running the cell overwrites them instead of duplicating them.
for cos_name, cos_column in zip(name_cos, cos_dist.T):
    df_winner[cos_name] = cos_column

In [21]:
# Widen the display limit so the PCA and cos_* columns are all shown, then
# look at the last ten winners.
pd.set_option('display.max_columns', 110)
df_winner.tail(10)

Unnamed: 0,pca_f1,pca_f2,pca_f3,pca_f4,pca_f5,pca_f6,pca_f7,pca_f8,pca_f9,pca_f10,pca_f11,pca_f12,pca_f13,pca_f14,pca_f15,pca_f16,pca_f17,pca_f18,pca_f19,pca_f20,pca_f21,pca_f22,pca_f23,pca_f24,pca_f25,pca_f26,pca_f27,pca_f28,pca_f29,pca_f30,pca_f31,pca_f32,pca_f33,pca_f34,pca_f35,pca_f36,pca_f37,pca_f38,pca_f39,pca_f40,pca_f41,pca_f42,pca_f43,pca_f44,pca_f45,pca_f46,pca_f47,pca_f48,pca_f49,pca_f50,pca_f51,event_id,cos_1,cos_2,cos_3,cos_4,cos_5,cos_6,cos_7,cos_8,cos_9,cos_10,cos_11,cos_12,cos_13,cos_14,cos_15,cos_16,cos_17,cos_18,cos_19,cos_20,cos_21,cos_22,cos_23,cos_24,cos_25,cos_26,cos_27,cos_28,cos_29,cos_30,cos_31,cos_32,cos_33,cos_34,cos_35,cos_36,cos_37,cos_38,cos_39,cos_40,cos_41,cos_42,cos_43,cos_44,cos_45,cos_46,cos_47,cos_48,cos_49,cos_50,cos_51
181912,-0.477686,0.109361,-0.056161,0.91458,-0.904739,-0.041272,0.576404,-0.002466,0.03087,-0.375486,-0.581624,-0.153817,0.20173,0.286129,0.487511,0.056934,-0.26166,-0.134552,0.18405,0.005368,-0.360007,-0.248688,0.459467,0.120446,-0.092835,0.04859,0.082258,0.042486,0.122306,-0.033337,-0.03676,-0.14026,0.119471,0.041349,-0.192145,0.033287,-0.085161,0.017781,-0.123643,0.021145,0.018652,0.091271,-0.01339,0.070377,0.049677,0.00904,-0.004055,0.036713,0.043558,0.016816,-0.006555,348035,1.245336,0.943833,1.028844,0.530278,1.464668,1.021197,0.703963,1.001266,0.984145,1.192847,1.298718,1.078999,0.896393,0.853046,0.749618,0.970759,1.134387,1.069105,0.905473,0.997243,1.184897,1.127724,0.764021,0.93814,1.047679,0.975045,0.957753,0.978179,0.937184,1.017122,1.01888,1.072037,0.93864,0.978763,1.098684,0.982904,1.043738,0.990868,1.063502,0.98914,0.99042,0.953124,1.006877,0.963855,0.974486,0.995357,1.002083,0.981144,0.977629,0.991363,1.003366
181916,-0.069705,-0.014259,0.180493,-0.49935,0.703352,-0.164158,0.28891,-0.004151,0.301327,-0.084002,0.578644,0.304035,0.312488,-0.495298,0.337092,-0.009163,-0.078073,-0.086692,-0.237219,0.277968,-0.293277,0.229099,0.201743,-0.022607,-0.037163,-0.102205,-0.10142,0.122075,0.222947,-0.121841,0.127857,0.031497,0.210727,-0.153054,0.017294,-0.158553,-0.033362,0.011872,-0.00512,0.054447,-0.013092,0.034165,-0.007024,0.084819,0.006203,-0.009814,0.005198,-0.002573,-0.013169,0.009915,0.007448,348036,1.04475,1.009154,0.884125,1.320578,0.548454,1.105388,0.814523,1.002665,0.806551,1.053928,0.628516,0.804812,0.799386,1.317977,0.78359,1.005883,1.050122,1.055655,1.152293,0.821547,1.188281,0.85292,0.870483,1.014514,1.023859,1.065614,1.065111,0.921629,0.85687,1.078221,0.917917,0.97978,0.864715,1.098259,0.988897,1.101789,1.021418,0.992378,1.003287,0.965046,1.008405,0.978066,1.004509,0.945547,0.996018,1.006301,0.996663,1.001652,1.008454,0.993635,0.995218
181927,-1.345241,0.159079,0.116341,0.506872,-0.159032,-1.368643,0.173116,0.279453,-0.934759,-0.249042,0.573034,-0.225098,-0.004024,0.407235,-0.361267,0.291857,-0.007398,0.471653,0.285426,0.158763,-0.390301,-0.102354,0.040219,-0.096412,0.073291,0.010134,0.039173,-0.115565,-0.055572,-0.095728,0.380535,0.024592,-0.074146,-0.000161,-0.055169,0.026434,-0.044547,0.029118,-0.18763,-0.038034,0.017244,-0.071232,0.026736,0.030278,0.030684,-0.013854,0.109848,-0.021858,0.041891,0.027186,-0.018279,348037,1.525167,0.937897,0.954582,0.802123,1.062084,1.534303,0.932417,0.890905,1.36492,1.097223,0.776294,1.087876,1.001571,0.84102,1.141035,0.886062,1.002888,0.815872,0.888573,0.938021,1.152369,1.039958,0.984299,1.037638,0.971388,0.996044,0.984707,1.045115,1.021695,1.037371,0.851443,0.9904,1.028946,1.000063,1.021538,0.98968,1.017391,0.988633,1.073249,1.014848,0.993268,1.027808,0.989562,0.98818,0.988021,1.005409,0.957116,1.008533,0.983646,0.989387,1.007136
181934,-3.323079,0.205726,0.593426,-0.116868,-0.051936,-1.002001,-0.369207,0.753756,-0.009457,0.153512,-0.085058,-0.232132,-0.445498,-0.241983,0.602776,-0.237257,-0.379724,0.279706,0.181352,-0.032902,0.045122,-0.162371,0.103653,-0.116657,0.185517,0.173218,0.348594,-0.439935,-0.08484,-0.211219,0.045359,0.033233,0.071449,0.060407,0.007378,-0.060237,0.122381,-0.026316,-0.081273,-0.014706,-0.048272,0.139217,-0.033051,-0.03424,-0.006293,-0.023008,-0.024114,0.053783,0.023932,0.019637,0.030428,348038,1.865784,0.946401,0.845391,1.030448,1.013531,1.261058,1.096192,0.803619,1.002464,0.960004,1.022161,1.060479,1.116069,1.063045,0.842955,1.061814,1.098932,0.927126,0.952751,1.008572,0.988244,1.042304,0.972995,1.030393,0.951666,0.95487,0.909179,1.114619,1.022104,1.05503,0.988182,0.991342,0.981385,0.984262,0.998078,1.015694,0.968115,1.006856,1.021175,1.003831,1.012577,0.963729,1.008611,1.008921,1.00164,1.005994,1.006283,0.985988,0.993765,0.994884,0.992072
181943,-0.61847,0.235083,-0.351832,-0.329897,-0.231972,0.141431,0.325838,-0.270053,-0.179281,-0.312232,-0.380848,-0.018538,-0.456412,-0.004798,-0.047867,0.165421,0.197334,0.635976,-0.118141,-0.099006,0.113962,0.1222,-0.053628,0.026133,0.033393,0.097449,0.030971,0.016783,0.089331,0.026081,-0.096806,0.044448,0.080034,-0.176127,-0.206926,0.097872,0.038799,-0.006901,-0.058776,-0.103493,-0.037326,0.020558,-0.012368,0.021116,-0.031039,0.009715,-0.00954,-0.014056,0.005533,0.009604,0.032897,348039,1.429108,0.836894,1.244109,1.22889,1.160947,0.901872,0.773926,1.187369,1.124389,1.216634,1.264241,1.012862,1.316669,1.003329,1.033211,0.885227,0.863085,0.558746,1.081969,1.068693,0.920931,0.915215,1.037208,0.981868,0.976831,0.932388,0.978512,0.988356,0.93802,0.981904,1.067166,0.969161,0.944471,1.122201,1.14357,0.932094,0.973081,1.004788,1.04078,1.071806,1.025898,0.985736,1.008581,0.985349,1.021536,0.99326,1.006619,1.009752,0.996161,0.993337,0.977175
181957,2.34745,-0.100543,-0.334828,-0.175671,-0.179005,-0.210946,-0.199818,0.041611,-0.059043,-0.330114,-0.193491,-0.25457,0.378511,0.675069,0.073055,-0.18703,-0.06043,-0.172028,-0.377847,0.127879,-0.041111,-0.423021,-0.282202,0.192302,-0.1327,0.024536,-0.154612,-0.161062,-0.077698,-0.092356,0.153722,0.028502,0.023486,-0.007815,0.112169,-0.017988,0.064274,-0.050321,-0.083449,0.043324,0.063299,0.132962,0.146946,0.031076,-0.022518,0.042288,-0.031309,-0.025016,-0.209699,0.049595,0.036,348040,0.134565,1.037067,1.123441,1.064765,1.065994,1.077769,1.073667,0.984659,1.021768,1.121703,1.071334,1.093852,0.860454,0.751122,0.973067,1.068953,1.022279,1.063422,1.139301,0.952855,1.015156,1.155955,1.10404,0.929104,1.048923,0.990954,1.057001,1.059379,1.028645,1.034049,0.943327,0.989492,0.991342,1.002881,0.958647,1.006632,0.976304,1.018552,1.030765,0.984028,0.976664,0.950981,0.945825,0.988543,1.008302,0.98441,1.011543,1.009223,1.07731,0.981716,0.986728
181964,0.446541,0.033489,-1.222378,0.056826,-0.804497,0.762249,0.001153,-0.813617,-0.397407,0.145464,-0.134352,-0.285464,-0.141053,-0.24918,-0.032567,-0.244437,0.002551,-0.169704,-0.218846,0.519649,-0.105973,0.005506,-0.168053,0.203887,-0.064271,-0.070512,0.170159,-0.204129,0.007461,0.130664,0.14789,-0.195335,-0.00129,-0.151047,0.15806,-0.113908,0.140165,-0.095795,0.073935,0.249861,0.088759,0.015884,0.148719,0.103909,0.073405,0.071875,-0.045114,-0.01971,0.030754,-0.016497,0.088911,348041,0.796499,0.984738,1.557071,0.974103,1.366631,0.652622,0.999475,1.370787,1.181109,0.933708,1.061228,1.130094,1.064282,1.113558,1.014842,1.111397,0.998838,1.077339,1.099734,0.763182,1.048295,0.997491,1.076586,0.907083,1.02929,1.032134,0.922454,1.093027,0.9966,0.940453,0.932603,1.08902,1.000588,1.068836,0.927968,1.051911,0.936123,1.043657,0.966306,0.886131,0.95955,0.992761,0.932225,0.952646,0.966548,0.967245,1.02056,1.008982,0.985985,1.007518,0.959481
181971,-0.164926,0.067174,0.491288,-0.639513,0.114992,1.000453,0.631925,-0.81664,1.003752,-0.372973,-0.124748,0.467406,-0.231623,-0.205614,-0.401444,0.223316,0.292653,-0.262529,0.422491,-0.13245,0.28521,-0.065408,-0.184555,0.113024,-0.155853,0.246227,-0.085645,-0.01885,0.27855,0.096669,0.144906,-0.015112,0.201984,0.14354,0.249521,0.097246,-0.178665,0.030642,-0.019237,-0.027219,0.111184,0.084722,0.046927,-0.006709,0.003695,-0.018969,-0.078802,-0.048503,-0.031204,-0.014812,0.035665,348042,1.07141,0.970915,0.787282,1.276896,0.950211,0.566824,0.726389,1.353589,0.565395,1.16149,1.054014,0.797623,1.100288,1.089027,1.173817,0.903309,0.873287,1.11367,0.81707,1.057348,0.87651,1.028321,1.079909,0.951063,1.067481,0.893389,1.037082,1.008162,0.879393,0.958144,0.937258,1.006543,0.912545,0.93785,0.891962,0.957895,1.077359,0.986733,1.008329,1.011785,0.95186,0.963317,0.979681,1.002905,0.9984,1.008213,1.03412,1.021001,1.013511,1.006413,0.984558
181981,1.692738,-0.219342,-0.134714,0.519794,0.943814,-0.713607,0.680144,-0.225059,-0.530265,0.341555,-0.165134,0.144984,0.33471,-0.023614,0.051197,-0.111221,-0.007463,-0.150731,-0.015587,-0.174697,0.056703,0.20057,-0.045,-0.010865,-0.203451,0.026715,0.073639,-0.017371,-0.119201,0.129613,0.083644,0.131863,-0.131548,-0.051737,0.101705,0.007219,-0.072997,0.001956,0.046797,0.146782,0.072005,0.029037,0.030708,-0.033533,-0.056242,-0.043098,0.039586,0.04982,0.011285,-0.003145,-0.007938,348043,0.307089,1.089786,1.055144,0.787226,0.613656,1.29211,0.721588,1.092127,1.217061,0.860187,1.067596,0.940652,0.862988,1.009666,0.979043,1.045528,1.003055,1.061701,1.006381,1.071511,0.976789,0.917898,1.01842,1.004447,1.083281,0.989065,0.969856,1.007111,1.048794,0.946944,0.965761,0.946023,1.053848,1.021178,0.958368,0.997045,1.029881,0.999199,0.980844,0.939916,0.970525,0.988114,0.98743,1.013727,1.023022,1.017642,0.983796,0.979606,0.995381,1.001287,1.003249
181982,-1.812888,0.102399,0.252073,-0.673932,-0.52225,-0.854582,-0.467537,-0.176926,0.923483,0.017217,-0.2495,-0.458307,0.054009,0.420046,-0.35582,-0.15016,-0.006861,-0.232413,-0.282938,-0.506817,0.182472,0.110418,0.20341,-0.190111,0.014969,0.040793,-0.093644,0.06387,-0.029692,-0.018305,-0.288024,0.091321,-0.107956,0.350536,0.117863,-0.103272,-0.088038,0.046236,0.116649,0.074016,0.016732,-0.043987,0.088667,0.20257,0.23419,0.060343,-0.004825,0.01465,0.025548,0.037734,-0.002836,348044,1.664509,0.962466,0.907603,1.247028,1.191429,1.313245,1.171374,1.064852,0.6615,0.993689,1.091454,1.167991,0.980203,0.846033,1.130425,1.055041,1.002515,1.08519,1.10371,1.185772,0.933115,0.959527,0.925441,1.069685,0.994513,0.985048,1.034325,0.976589,1.010883,1.00671,1.105574,0.966526,1.039571,0.871512,0.956798,1.037854,1.03227,0.983052,0.957243,0.97287,0.993867,1.016123,0.967499,0.925749,0.914158,0.977882,1.001768,0.99463,0.990635,0.986169,1.001039


In [22]:
#['cos_{}'.format(i+1) for i in range(df_max[name_pca].shape[1])]

In [23]:
from sklearn.mixture import BayesianGaussianMixture
BGM = BayesianGaussianMixture(n_components =5, random_state= seed) # mixture model with 5 components

%time BGM.fit(df_winner[name_cos].values) # fit the clustering model on the winners' cos_* features
#%time BGM.fit(df[col_names].values[df.is1.values])

CPU times: user 26.6 s, sys: 544 ms, total: 27.1 s
Wall time: 6.86 s




BayesianGaussianMixture(covariance_prior=None, covariance_type='full',
            degrees_of_freedom_prior=None, init_params='kmeans',
            max_iter=100, mean_precision_prior=None, mean_prior=None,
            n_components=5, n_init=1, random_state=7, reg_covar=1e-06,
            tol=0.001, verbose=0, verbose_interval=10, warm_start=False,
            weight_concentration_prior=None,
            weight_concentration_prior_type='dirichlet_process')

In [24]:
clusters = BGM.predict(df_winner[name_cos].values)  # cluster label for each row of df_winner (one row per winning runner)
name_cl, fr_cl =np.unique(clusters, return_counts =True)
name_cl, fr_cl # cluster labels and the number of winners assigned to each

(array([0, 1, 2, 3, 4]), array([2822, 3509, 2825, 4269, 4412]))

In [25]:
# Store each winner's predicted cluster label next to its features.
df_winner['cluster'] = clusters

In [26]:
# Give every runner row the cluster of its event's winner.
# A per-event lookup (instead of a join) keeps df_pca row-for-row aligned with
# df even if an event has more than one winner row (the first one is used),
# and lets the cell be re-run without a column-overlap error.
event_cluster = (df_winner
                 .drop_duplicates(subset='event_id')
                 .set_index('event_id')['cluster'])
df_pca['cluster'] = df_pca['event_id'].map(event_cluster)
df_pca.head()

Unnamed: 0,pca_f1,pca_f2,pca_f3,pca_f4,pca_f5,pca_f6,pca_f7,pca_f8,pca_f9,pca_f10,pca_f11,pca_f12,pca_f13,pca_f14,pca_f15,pca_f16,pca_f17,pca_f18,pca_f19,pca_f20,pca_f21,pca_f22,pca_f23,pca_f24,pca_f25,pca_f26,pca_f27,pca_f28,pca_f29,pca_f30,pca_f31,pca_f32,pca_f33,pca_f34,pca_f35,pca_f36,pca_f37,pca_f38,pca_f39,pca_f40,pca_f41,pca_f42,pca_f43,pca_f44,pca_f45,pca_f46,pca_f47,pca_f48,pca_f49,pca_f50,pca_f51,event_id,cluster
0,-3.96563,0.475024,-0.458396,-0.437761,0.459468,-0.971297,1.009715,-0.549283,0.092487,-0.436671,0.288159,-0.383869,-0.496406,-0.225241,0.044298,-0.001694,0.345873,0.129309,-0.170947,-0.282424,-0.260633,-0.128401,-0.24067,0.047086,-0.015175,0.021386,-0.281638,0.329844,0.152668,0.181906,-0.228965,0.047199,0.166749,-0.096677,0.114633,-0.095767,0.149982,-0.029129,0.014214,0.094271,-0.029527,-0.010815,0.02253,0.042726,0.032898,0.01826,-0.021068,0.001954,0.030846,-0.016273,0.004811,293661,1
1,-1.368812,-0.006947,-0.092028,-0.709256,0.177596,-0.400748,-0.432677,-0.340324,0.244105,0.464674,0.609016,0.010729,-0.893683,-0.408752,0.082643,0.075791,-0.125073,0.150471,-0.358299,-0.02455,0.008662,0.027252,0.206629,0.008957,0.04562,-0.173685,0.195684,-0.064586,0.095305,-0.031151,-0.207162,-0.355872,0.019206,-0.143756,0.186557,-0.090538,0.103422,-0.022464,-0.111724,-0.304364,0.128531,0.066386,0.052475,-0.144639,-0.144402,-0.027196,0.045439,0.002482,0.008272,-0.004227,-0.014723,293661,1
2,-0.958706,-0.110968,0.112799,-0.253621,-0.233773,-0.4487,0.078242,-0.387077,0.210951,-0.204571,0.384731,-0.326743,-0.060698,-0.473298,-0.043503,0.410968,0.384015,0.309765,-0.252184,-0.007075,-0.142722,0.248955,-0.103244,0.067656,0.101819,0.092514,0.138265,-0.016673,0.041744,-0.014458,-0.112835,0.044704,0.104382,-0.042308,-0.151314,0.008361,-0.047636,0.021052,0.013151,0.018956,-0.008926,-0.081109,-0.029422,0.005524,0.022653,-0.021301,-0.000937,-0.018574,-0.025988,-0.002146,0.0678,293661,1
3,-0.91103,-0.01337,-0.459013,0.196185,0.053641,-0.314621,-0.861711,0.039117,-0.175894,-0.135009,-0.103966,0.133928,0.165781,-0.216123,-0.379095,-0.3778,0.037477,-0.211301,0.185288,-0.417862,0.734184,-0.095048,0.299701,0.000799,0.003822,0.098978,0.11154,-0.062437,-0.296788,-0.111029,0.207358,-0.071403,-0.07887,0.042843,-0.023222,-0.00542,-0.014227,0.043674,-0.001294,-0.147973,0.053836,-0.00015,-0.00604,0.043153,0.045375,-0.001744,0.001126,-0.026722,-0.012083,0.010317,0.000236,293661,1
4,-1.30464,0.336474,-1.493462,0.310679,0.046025,0.595846,0.370766,0.064179,-0.141242,-0.33912,0.037521,-0.478799,0.449629,-0.190379,-0.11085,-0.112689,0.162289,0.145524,-0.033562,0.236474,-0.643673,-7.1e-05,-0.018297,0.029151,0.046942,0.097443,0.177624,0.031643,0.131602,0.126083,-0.175732,-0.063868,-0.065255,0.025453,-0.01258,0.007448,-0.143996,0.034184,0.061798,0.051067,-0.024434,-0.035129,-0.019685,-0.009579,0.029039,0.069514,-0.037928,0.013388,-0.041592,0.000152,-0.030064,293661,1


In [27]:
# Show the last rows of df_pca, including the new cluster column.
df_pca.tail()

Unnamed: 0,pca_f1,pca_f2,pca_f3,pca_f4,pca_f5,pca_f6,pca_f7,pca_f8,pca_f9,pca_f10,pca_f11,pca_f12,pca_f13,pca_f14,pca_f15,pca_f16,pca_f17,pca_f18,pca_f19,pca_f20,pca_f21,pca_f22,pca_f23,pca_f24,pca_f25,pca_f26,pca_f27,pca_f28,pca_f29,pca_f30,pca_f31,pca_f32,pca_f33,pca_f34,pca_f35,pca_f36,pca_f37,pca_f38,pca_f39,pca_f40,pca_f41,pca_f42,pca_f43,pca_f44,pca_f45,pca_f46,pca_f47,pca_f48,pca_f49,pca_f50,pca_f51,event_id,cluster
181984,-0.205857,0.046213,-0.262328,-0.544579,0.391583,-0.263815,-0.091872,-0.187139,-0.607862,0.036442,-0.275287,-0.201388,0.106388,-0.249283,0.988086,0.066394,-0.007639,0.128353,0.117004,0.196537,0.169025,-0.418349,-0.062843,0.139732,-0.063081,0.003425,-0.019238,0.022993,0.160207,0.196936,0.221904,-0.10407,-0.104814,-0.271816,0.043671,0.013885,0.003504,-0.016249,0.049688,0.010381,-0.010131,0.001626,0.083532,-0.039795,-0.02452,0.01787,-0.022241,0.041709,0.056745,-0.00531,0.007664,348044,1
181985,-0.560418,-0.066324,0.209394,0.603091,0.034402,0.135804,0.606321,0.344407,0.363943,-0.367062,0.536879,0.780592,-0.102542,-0.038813,-0.51607,0.12361,0.110038,-0.150697,0.010191,0.263108,0.025078,-0.043705,0.188683,-0.023777,0.048957,-0.086187,0.007946,0.162077,-0.050827,-0.011938,0.067505,0.076057,0.058427,0.126559,-0.220457,-0.177614,0.049682,0.007364,-0.133332,-0.027734,0.082446,-0.209408,0.072863,-0.510571,0.424915,0.051416,-0.158318,-0.027375,-0.016481,0.199499,-0.01082,348044,1
181986,1.080464,-0.125079,0.117892,0.357487,0.082169,0.091252,-0.567474,-0.045162,-0.118669,0.095505,0.07216,-0.352333,-0.038792,-0.332391,0.626519,-0.147912,-0.154666,0.157899,0.030803,-0.121196,0.119843,-0.092696,-0.183843,0.400229,-0.03302,0.001834,0.163093,-0.124005,-0.07843,0.057751,0.097834,-0.074739,-0.074104,0.000237,0.016498,0.159829,0.187909,-0.059914,0.018854,0.156631,0.112,0.253506,-0.19257,0.089835,-0.142241,-0.15384,-0.028278,0.024257,0.014283,-0.00449,0.061224,348044,1
181987,1.322641,0.038814,-0.234455,0.012107,-0.160549,0.335845,0.813282,0.47036,-0.144126,-0.782871,0.227114,0.354059,-0.317756,0.787691,-0.957388,0.264934,0.45828,-0.018025,-0.018637,0.248426,-0.289012,0.469922,-0.141957,-0.014098,0.129316,0.001226,-0.289352,-0.024394,0.018035,0.040297,0.02805,-0.051894,0.226373,-0.003471,0.22752,0.014412,0.101259,-0.047992,0.014958,-0.090137,-0.112244,0.035137,0.000278,0.139267,-0.230695,-0.01437,0.151181,-0.063509,0.003662,-0.143638,-0.051802,348044,1
181988,1.528876,-0.222407,-0.35118,0.319467,0.077415,0.504309,-0.130409,-0.20466,-0.837138,0.611499,-0.345275,-0.145286,0.016002,-0.26173,0.393581,-0.372469,-0.564377,0.130698,0.210396,-0.049357,-0.028397,-0.072864,0.052593,-0.14207,-0.07955,0.092344,0.121681,-0.021408,-0.017356,-0.132053,0.043813,0.039677,-0.098787,-0.016249,-0.15665,-0.037014,-0.093934,0.034877,-0.07733,-0.152112,-0.027567,-0.028024,-0.061599,-0.018159,-0.09713,0.021833,-0.002834,0.023295,-0.019302,0.000213,0.01923,348044,1


In [28]:
#df_pca['cluster'] = clusters

In [29]:
#df_pca['cluster'] =df_pca['cluster'].replace({x: 11 for x,fr in zip(name_cl, fr_cl) if fr <=400 })

In [30]:
# Keep independent copies of the model's masks; the per-cluster fitting cell
# further down overwrites mod.is1, mod.is2 and mod.oos.
is1 = mod.is1.copy()
is2 = mod.is2.copy()
oos = mod.oos.copy()

In [31]:
# Number of runner rows assigned to each cluster.
df_pca['cluster'].value_counts()

3    45342
4    43806
1    34490
2    29229
0    29122
Name: cluster, dtype: int64

In [32]:
# Distinct cluster labels present in df_pca, in order of first appearance.
df_pca['cluster'].unique()

array([1, 0, 4, 3, 2])

In [35]:
# Mean of every cos_* feature over the winners of each cluster: one row per
# cluster label, one column per cos_* feature.
df_clusters = df_winner.groupby('cluster')[name_cos].mean()
df_clusters

Unnamed: 0_level_0,cos_1,cos_2,cos_3,cos_4,cos_5,cos_6,cos_7,cos_8,cos_9,cos_10,cos_11,cos_12,cos_13,cos_14,cos_15,cos_16,cos_17,cos_18,cos_19,cos_20,cos_21,cos_22,cos_23,cos_24,cos_25,cos_26,cos_27,cos_28,cos_29,cos_30,cos_31,cos_32,cos_33,cos_34,cos_35,cos_36,cos_37,cos_38,cos_39,cos_40,cos_41,cos_42,cos_43,cos_44,cos_45,cos_46,cos_47,cos_48,cos_49,cos_50,cos_51
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
0,1.883961,0.926552,1.017811,0.952609,0.957486,1.036076,1.027393,0.970277,0.993106,1.017051,1.001407,0.988449,1.02026,0.985638,1.008982,1.010774,1.01182,1.000568,0.997233,0.998694,1.003401,0.993832,1.000805,0.999836,0.998529,1.003631,0.998998,0.996148,0.998189,0.999417,0.999149,0.999986,0.998549,1.003105,0.995774,1.000524,0.993852,1.001579,1.002758,1.000318,0.998751,1.001344,0.998016,0.998471,0.999465,0.999998,0.999664,1.000689,0.999457,1.000341,0.998616
1,1.233835,0.979553,0.983689,0.956658,1.005553,1.029219,1.032264,1.024337,1.014491,0.992159,0.99844,0.999349,0.998668,0.996502,1.003786,0.993691,0.994445,1.008005,0.993105,0.999521,1.004043,1.00157,0.997464,0.993334,0.999569,1.002051,1.007431,1.002845,1.00015,0.995548,0.999585,1.005663,1.002385,1.003094,1.000957,1.000018,1.001905,1.003087,1.002045,0.998439,1.001479,0.99938,0.995626,1.004499,1.003247,1.000626,1.000169,1.000316,0.999404,1.001101,0.998866
2,1.703996,0.933867,0.903257,1.144037,1.066554,1.017891,0.982475,0.975577,0.997252,1.024321,1.024742,1.060539,1.005688,1.003782,0.969119,0.987801,1.000139,0.996274,1.006005,0.996384,1.003591,1.001059,1.004408,1.020114,0.987976,0.996012,0.992497,1.004486,0.999476,1.002132,1.003877,1.005218,1.003746,0.99811,1.000667,1.002543,1.00592,1.000148,1.002809,1.002708,1.004895,1.002483,1.00022,1.001189,0.999129,0.999032,0.998041,0.999554,0.999736,1.000617,1.001587
3,1.137039,0.982015,0.991559,0.978488,0.969397,0.958997,0.98571,1.038949,1.02581,1.003557,0.99933,0.985166,0.982729,1.007856,1.022711,0.982397,0.999839,1.000845,0.997755,0.995329,1.007667,1.008402,1.006113,0.985947,0.998495,1.000332,1.004412,1.000695,1.000783,0.999172,1.00129,1.001484,0.994354,1.002659,0.998207,1.000044,1.00221,0.995372,0.999603,0.996386,0.995284,0.99652,0.999844,1.000915,1.000331,0.998948,1.001859,0.998447,1.002018,0.999548,0.999826
4,1.324256,1.025765,0.962364,0.980675,1.019693,1.020808,1.034657,1.001438,1.030345,1.00384,1.010164,1.02012,1.003185,0.992981,1.002082,0.988804,0.994696,0.992206,0.99926,1.003807,1.004405,1.002621,1.000969,1.003173,1.00421,1.00128,1.003388,1.005249,0.997124,0.999656,0.999274,0.997123,0.99993,0.999054,0.999433,0.999488,1.00293,1.002069,0.999242,0.997856,1.003446,0.999473,1.001504,1.000881,0.999442,0.997392,0.99981,0.998843,0.999865,1.000475,0.999775


In [36]:
from itertools import combinations
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.spatial` is loaded, so the attribute lookup below could fail on
# a fresh kernel. This form still binds the name `scipy`.
import scipy.spatial.distance

# Print the cosine distance between the rows of `df_clusters` for every
# unordered pair of cluster labels.
# NOTE(review): the printed label says "corelation", but the value is
# scipy's cosine *distance* (1 - cosine similarity), so values near 0 mean the
# two rows point in almost the same direction. The label text is left as-is to
# match the recorded output.
for cl1, cl2 in combinations(df_clusters.index, 2):
    cos_dist1 = scipy.spatial.distance.cosine(df_clusters.loc[cl1, :], df_clusters.loc[cl2, :])
    print('step clusters  {}  and  {} corelation = {}'.format(cl1, cl2, cos_dist1))

step clusters  0  and  1 corelation = 0.00399904539471
step clusters  0  and  2 corelation = 0.000996250422145
step clusters  0  and  3 corelation = 0.00531708683426
step clusters  0  and  4 corelation = 0.00310537136482
step clusters  1  and  2 corelation = 0.00257036169566
step clusters  1  and  3 corelation = 0.000188622225776
step clusters  1  and  4 corelation = 0.000126660859564
step clusters  2  and  3 corelation = 0.00352207978163
step clusters  2  and  4 corelation = 0.00177324010475
step clusters  3  and  4 corelation = 0.000462422296193


In [37]:
# Persist the per-cluster table `df_clusters` as CSV; the path is relative, so
# the file lands in the kernel's current working directory.
df_clusters.to_csv('winners_clusters.csv')

#### Fit our model separately for each cluster (each cluster gets its own model)

In [33]:
%%time
model_coefs, model_step1prob, model_step2prob, model_likelihood, cluster_inds = {}, {}, {}, {}, {}
train_val_test = {}  # events for each cluster
    
for cl in df_pca['cluster'].unique():

    events_tr = df['event_id'][df.is1.values & (df_pca['cluster'] ==cl)].values
    events_ts = df['event_id'][df.oos.values & (df_pca['cluster'] ==cl)].values
    train_val_test[cl] = events_tr, events_tr, events_ts
       
        
    mod.is1 = np.in1d(av.event_id, events_tr)  # mask of Model for train
    mod.is2 = np.in1d(av.event_id, events_tr)
    mod.oos = np.in1d(av.event_id, events_ts)  # mask of Model for test

        
         
    model_coefs[cl], model_step1prob[cl], model_step2prob[cl], model_likelihood[cl], cluster_inds[cl]\
    = mod.fit_slices(tsav, factors,  depth=3, lmbd=10, verbose=False, fit_afresh=True)

    print 'cluster {}  number  {}'.format(cl, df_pca['cluster'].value_counts()[cl])
    print 'LL  {}          {}            {}'.format (len(events_tr), len(events_tr), len(events_ts))
    print model_likelihood[cl]
    print '..................'

  step2factors = np.concatenate((fb, fl, np.log(probs).reshape((1, -1))), axis=0)


. . . . . . . . . . 10
cluster 1  number  34490
LL  26057          26057            8433
[[-1895.49795514 -2133.03234692 -2133.03234692]
 [-1902.27962593 -2127.695476   -2127.695476  ]
 [-1907.75934279 -2120.02236568 -2120.02236568]
 [-1912.48498197 -2101.13818955 -2101.13818955]
 [-1936.39067889 -2065.32506169 -2065.32506169]
 [-1959.88243541 -2061.77929345 -2061.77929345]
 [-1999.95045598 -2073.81562179 -2073.81562179]
 [-2025.71981647 -2089.13010733 -2089.13010733]
 [-2027.96123895 -2091.67125179 -2091.67125179]
 [-2027.98558489 -2099.14004839 -2099.14004839]
 [    0.             0.             0.        ]]
..................
. . . . . . . . . . 10
cluster 0  number  29122
LL  19585          19585            9537
[[-688.53807983 -885.65719107 -885.65719107]
 [-680.68659966 -874.90791274 -874.90791274]
 [-683.82546274 -869.91605982 -869.91605982]
 [-675.36441995 -853.89460091 -853.89460091]
 [-663.33823187 -820.08236719 -820.08236719]
 [-626.78815809 -802.3481409  -802.3481409 ]
 [-6

#### Fit the old model for comparison

In [34]:
# Put the Model's train / validation / test masks back to `is1`, `is2`, `oos`
# (defined outside this cell; the per-cluster loop overwrote them) and refit a
# single model as the baseline to compare the cluster models against.
mod.is1, mod.is2, mod.oos = is1, is2, oos

(old_model_coefs,
 old_model_step1prob,
 old_model_step2prob,
 old_model_likelihood,
 inds) = mod.fit_slices(tsav, factors, depth=3, lmbd=10, verbose=False, fit_afresh=True)

print(old_model_likelihood)
print('..................')

. . . . . . . . . . 10
[[-1809.6054581  -1997.1291991  -1997.1291991 ]
 [-1815.2956033  -1993.38729757 -1993.38729757]
 [-1820.46573327 -1991.13279384 -1991.13279384]
 [-1825.04841825 -1978.80723507 -1978.80723507]
 [-1845.34679474 -1957.00310259 -1957.00310259]
 [-1863.07407718 -1951.28943996 -1951.28943996]
 [-1903.75005812 -1924.17873453 -1924.17873453]
 [-1921.12493606 -1902.65155706 -1902.65155706]
 [-1923.89396228 -1902.50302182 -1902.50302182]
 [-1928.8879264  -1895.42641802 -1895.42641802]
 [    0.             0.             0.        ]]
..................


##### Compare the old model with the per-cluster models

In [35]:
# For every cluster, splice the cluster model's step-2 probabilities into the
# old model's probabilities and print how the likelihood changes.
for cl, (train, val, test) in train_val_test.items():
    # mask, taken at the old model's `inds`, of entries whose event is in this cluster
    # NOTE(review): assumes model_step2prob[cl] is aligned with `inds` from the
    # old-model fit rather than with cluster_inds[cl] — TODO confirm.
    cluster_events = np.append(train, test)
    cluster_mask = np.in1d(av.event_id, cluster_events)[inds]
    # cluster model's win probability inside the cluster, old model's elsewhere
    prob_mix = np.where(cluster_mask, model_step2prob[cl], old_model_step2prob)

    print('cluster   {}'.format(cl))
    print('diff likelihood ')
    ll_delta = ll_diff(prob_mix, old_model_step2prob, train, val, test, inds, av=av, tsav=tsav)
    print(ll_delta.mean(axis=0))
    print('ll')
    # [:-1] skips the last row, which is all zeros in the recorded likelihood output
    old_mean_ll = old_model_likelihood[:-1].mean(axis=0)
    print(model_likelihood[cl][:-1].mean(axis=0) - old_mean_ll)

cluster   0
diff likelihood 
[ 297.21495632  297.21495632  495.02743333]
ll
[ 1221.92448652  1118.1455194   1118.1455194 ]
cluster   1
diff likelihood 
[ 107.56698579  107.56698579   60.18268001]
ll
[ -93.94191487 -146.9240963  -146.9240963 ]
cluster   2
diff likelihood 
[ 448.95894915  448.95894915  582.20549662]
ll
[ 857.48372228  836.36957666  836.36957666]
cluster   3
diff likelihood 
[ 143.00142856  143.00142856   75.6751042 ]
ll
[-232.78808773 -241.41372927 -241.41372927]
cluster   4
diff likelihood 
[ 54.78797979  54.78797979  14.79841782]
ll
[-55.43603999 -14.15139504 -14.15139504]


#### Write the results to a file

In [38]:
# Series indexed by event_id whose value is the first cluster label seen for that event.
df1 = df_pca.groupby('event_id')['cluster'].first()
# Labels of the clusters whose models replace the old model in the written data.
cl_list = [0, 1, 2, 3, 4]
print('count events   {}'.format(fr_cl[np.in1d(name_cl, cl_list)].sum()))
# NOTE(review): the recorded run of this cell ends with
# "TypeError: 'NoneTraversal' object is not callable" after the count is
# printed — presumably raised inside write_dic_to_simdata; TODO investigate.
write_dic_to_simdata(
    'simdata_winner_direction_claster.p',
    old_model_step1prob,
    old_model_coefs,
    oos,
    av=av,
    data=df1,
    cluster_step1probs=model_step1prob,
    cluster_coefs=model_coefs,
    cluster_names=cl_list
)

count events   17837


TypeError: 'NoneTraversal' object is not callable