In [1]:
# ThoroughBet Simulation


## Load necessary modules

In [2]:
import numpy as np
from scipy.spatial.distance import cosine

from utils import settings, timestamp, YEAR
from utils.arrayview import ArrayView, TimeseriesView


from prediction.models.preprocessing import Model
from prediction.models.prediction import factornames_trimmed
from prediction.models.parameters import factor_build_end
from prediction.tools.clustering import write_dic_to_simdata, dic_to_tenzor

## Load data

In [3]:
av = ArrayView.from_file(settings.paths.join('racing_data_azd.av.bcolz'))

In [4]:
av_w = ArrayView.from_file(settings.paths.join('weather.av.bcolz'))

In [5]:
tsav = {}
sl = 0
while True:
    try:
        tsav[sl] = ArrayView.from_file(settings.paths.join('brain_final2_slice_%s.av.bcolz' % sl))
    except ValueError:
        break
    sl += 1

In [6]:
#mod = Model(av, build_end = factor_build_end - YEAR/4, oos_start=factor_build_end+YEAR)
mod = Model(av, oos_start=factor_build_end+  YEAR)

In [7]:
#mod.is1, mod.is2, mod.oos = mod.model_mask(mod.strata, mod.av.start_time, mod.av.result, mod.av.course,
#                                          t0 = factor_build_end - YEAR/4, t1 = mod.oos_start -YEAR/4,
#                                          t2 = mod.oos_start, depth =3)

In [8]:
high_kurtosis_factors =  ['z64f5be67e', 'z90adc182a', 'z7081bf371', 'z34b808e99', 'z757be272e', 'z5a85cd6a9',
                         'zf991b634a', 'z62651f605', 'zd002b7067', 'z2ef7fedca', 'z6f11029f7', 'z412893062',
                          'z919b9585a', 'z89b0eda37', 'z31780b3f4', 'z6631693d3', 'z0b27f29ad', 'zd7cd94e4c', 
                          'zf5b2aef2a']
price_factors = ['zb392bb74a', 'z6809c316d', 'zd678f0538', 'z027f9f0f5', 'z88e79930c', 'z4a72dc02f',
                 'z1a3573928', 'z7b15df227']

In [9]:
%time factors = mod._preprocess_factors(factornames_trimmed, high_kurtosis_factors = high_kurtosis_factors,\
                                        price_factors = price_factors, verbose=True)

INFO:models:Getting factors from av and rescaling...


. . . . .

INFO:models:Filling in missing values...
INFO:models:Computing each factor as linear combination of all the others...


 . . . . .

INFO:models:Number of missing patterns: 8165


 . . . . . . . .

INFO:models:Transforming factors by applying CL-model on their Taylor expansions...


 . . . . .CPU times: user 3min 31s, sys: 1.88 s, total: 3min 33s
Wall time: 1min 3s



In [10]:
predict_mask = mod.is1|mod.is2|mod.oos

In [11]:

def old_data(num, is1=mod.is1):
    
    first_is1 = np.where(is1 ==True)[0][0]
    past_events = np.unique(av.event_id[av.event_id < av.event_id[first_is1]])[-int(num):]
    
    return np.in1d(av.event_id, past_events)

In [12]:
mask_past = old_data(4000)

In [13]:
mask_val = old_data(3000, mod.oos)

In [14]:
mask_past.sum(), mask_val.sum()

(37795, 27726)

In [15]:
np.where(predict_mask ==True)[0][0], np.where(mask_past ==True)[0][0]

(1123738, 1085943)

In [16]:
def new_factors_array (X, predict_mask =predict_mask):
    
    
    factors_new = np.zeros((X.shape[1], predict_mask.shape[0]))
    factors_new[:, predict_mask] = X.T
    
    return factors_new

In [17]:
# создаем дата фрейм из данных 
def DF(mask, factors, av, factors_names, other_names):
    
    import pandas as pd
    
    df = pd.DataFrame(data =factors[:, mask].T , columns = factors_names)
    for col in other_names :
        
        df[col] = av[col][mask]
        
    return df

In [18]:
import pandas as pd
pd.set_option('display.max_columns', 90)

col_names = ['f{}'.format(i) for i in range(1,58)]

df = DF (predict_mask, factors, av, col_names, ['event_id', 'runner_id', 'result', 'start_time', 'jockey', 'trainer',
                                               'prize'])
df['is1'] = mod.is1 [predict_mask]
#df['is2'] = mask_val [predict_mask]
df['oos'] = mod.oos [predict_mask]


df.head()

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18,f19,f20,f21,f22,f23,f24,f25,f26,f27,f28,f29,f30,f31,f32,f33,f34,f35,f36,f37,f38,f39,f40,f41,f42,f43,f44,f45,f46,f47,f48,f49,f50,f51,f52,f53,f54,f55,f56,f57,event_id,runner_id,result,start_time,jockey,trainer,prize,is1,oos
0,0.06641,1.812243,0.140033,0.007745,-0.000153,0.368391,0.886524,-0.001048,1.485403,-0.493447,0.174814,0.783236,0.477577,0.025752,1.219818,0.3137,-0.001038,-0.000633,0.004911,-0.017951,0.043984,-0.237237,1.034481,-0.045829,-0.005794,0.001333,-0.065022,0.001234,-0.018684,0.275482,-0.486973,0.078513,-0.116078,0.001665,-0.001337,-0.160823,0.091133,0.965444,2.384186e-07,2.492563,0.756727,-0.080026,0.235397,0.250485,0.023326,0.628332,0.0986,-0.014976,-0.089875,0.536839,-0.012052,-0.143187,0.945663,0.415251,-0.001041,-0.000991,0.118205,293661,360456,3,1443704000.0,5870,5165,3235.0,True,False
1,-0.154957,0.615122,-0.069783,-0.003173,0.001375,0.16133,-0.100893,-0.014863,0.169336,0.091599,0.057041,0.430749,0.136594,0.014466,0.412627,0.186385,-0.014956,-0.004117,0.013987,-0.015613,-0.026509,0.277255,-0.352035,0.028523,0.052144,-0.000224,0.116157,-0.002069,-0.021106,0.087584,-0.03054,-0.478374,0.035151,-0.014985,0.028866,0.03202,-0.061174,0.254077,2.384186e-07,1.648311,0.064365,-0.58235,0.184824,-0.053766,-0.034648,0.368781,-0.247407,-0.022042,0.548488,-0.12149,0.108465,0.040574,0.224466,0.061359,-0.01493,-0.015404,0.031394,293661,375590,5,1443704000.0,10816,448,3235.0,True,False
2,0.06641,0.442785,0.069257,0.007745,-0.000153,0.205659,-0.100893,0.014627,-0.009653,-0.555511,0.355908,0.247443,-0.303278,0.030828,0.364401,0.141475,0.014634,0.018316,0.009269,0.012953,-0.014429,0.264289,0.330698,0.038722,-0.005794,0.000564,0.173537,0.001555,0.054967,-0.092341,-0.266757,0.042518,0.001093,0.001665,-0.011037,0.172273,0.014969,0.155435,2.384186e-07,0.941561,0.148362,-0.060844,0.106616,0.005437,0.083749,0.145608,0.0986,0.08115,-0.153352,-0.183545,-0.012052,0.159183,0.489453,0.035043,0.014632,0.014698,0.049137,293661,374610,7,1443704000.0,10817,10804,3235.0,True,False
3,-0.154957,0.765883,0.129201,-0.025521,-0.000153,0.11243,-0.256966,0.012617,0.296299,0.198545,-0.260937,-0.048559,-0.044413,0.018731,0.549172,-0.279403,0.012624,0.003226,0.009269,0.011364,-0.004105,-0.12262,-0.420019,0.047166,-0.005794,-0.000809,0.239203,0.001421,-0.018684,-0.133672,0.250497,0.481832,0.086591,0.001665,-0.00974,0.190861,0.130183,0.446482,2.384186e-07,0.498852,-0.03661,0.348958,0.002126,-0.037165,-0.019922,-0.163246,-0.247407,-0.014976,0.470949,0.289911,-0.012052,0.088062,-0.128523,-0.493842,0.012622,0.012686,-0.261736,293661,373638,1,1443704000.0,63,64,3235.0,True,False
4,0.06641,0.329767,-0.093768,0.007745,-0.000153,0.11243,0.502805,-0.024248,1.255799,-0.329878,0.086009,-0.048559,-0.044413,0.030879,1.250973,0.155369,-0.024233,-0.020092,0.009269,0.012953,-0.038274,0.105863,0.202454,-0.032224,-0.005794,2e-06,-0.03753,0.000976,0.09693,0.023229,-0.043407,-0.246352,-0.009642,0.001665,0.011356,-0.122628,-0.217356,1.079721,2.384186e-07,-0.004528,0.054179,0.022644,0.116945,-0.11049,-0.044355,-0.163246,0.0986,0.023745,-0.238098,-0.440632,-0.012052,-0.12535,0.136879,0.440444,-0.024237,-0.024211,0.265512,293661,347906,4,1443704000.0,10921,299,3235.0,True,False


In [19]:
#df.loc[df.is1 &df.is2, 'is1']=False

In [20]:
from sklearn.decomposition import PCA
seed =7
pca = PCA(n_components =51, random_state =seed)
name_pca = ['pca_f%s'%i for i in range(1,52)]

df_pca = pd.DataFrame(data = pca.fit_transform(df.loc[:,'f1':'f57']), columns =name_pca) 
#df_pca = df_pca.append(pd.DataFrame(data = pca.transform(df.ix[df.oos.values,'f1':'f57']), columns =name_pca))

df_pca.head()

Unnamed: 0,pca_f1,pca_f2,pca_f3,pca_f4,pca_f5,pca_f6,pca_f7,pca_f8,pca_f9,pca_f10,pca_f11,pca_f12,pca_f13,pca_f14,pca_f15,pca_f16,pca_f17,pca_f18,pca_f19,pca_f20,pca_f21,pca_f22,pca_f23,pca_f24,pca_f25,pca_f26,pca_f27,pca_f28,pca_f29,pca_f30,pca_f31,pca_f32,pca_f33,pca_f34,pca_f35,pca_f36,pca_f37,pca_f38,pca_f39,pca_f40,pca_f41,pca_f42,pca_f43,pca_f44,pca_f45,pca_f46,pca_f47,pca_f48,pca_f49,pca_f50,pca_f51
0,-3.95304,-0.57626,-0.43786,-0.462102,0.672344,-0.693298,0.393534,-1.026562,-0.511706,0.145472,0.515999,-0.276328,-0.456568,-0.356363,-0.090854,-0.028276,-0.475345,-0.013399,-0.103967,-0.195848,-0.280302,-0.131751,-0.244838,0.060567,-0.133785,-0.145987,-0.350677,-0.075274,0.097802,0.237358,-0.187394,0.020422,0.17,-0.072247,0.032189,0.136419,0.169233,-0.088922,0.080483,-0.055433,-0.026963,0.02492,-0.013718,-0.041085,0.028237,0.014575,-0.02495,0.010142,-0.030849,-0.017814,0.004279
1,-1.398486,-0.047329,-0.130185,-0.679285,0.265606,-0.097027,0.771933,0.452562,-0.16421,0.04398,0.174593,-0.085769,-0.126028,-0.984862,-0.017243,-0.013368,-0.030447,-0.088102,-0.422572,0.014184,0.007804,0.030452,0.250111,0.000386,0.240268,-0.087136,0.131277,-0.085088,-0.026949,0.028189,-0.121991,-0.338809,-0.001715,-0.13365,-0.003034,0.195024,0.14705,-0.083824,-0.306087,0.130725,0.126432,-0.045441,-0.089456,0.133228,-0.139099,-0.01827,0.039998,0.001008,-0.004347,-0.001682,-0.01622
2,-0.968868,0.085577,0.108196,-0.252562,-0.145451,-0.379468,0.247287,-0.092799,-0.366308,0.216832,0.344856,0.218102,-0.381525,-0.343827,-0.158002,-0.487023,-0.522191,-0.156522,-0.116211,0.061975,-0.138777,0.255321,-0.103779,0.066646,0.076277,0.099712,0.104698,-0.040998,0.084311,-0.006415,-0.119434,0.022054,0.120175,-0.03863,0.034459,-0.1269,-0.029642,0.036887,0.023853,-0.009688,-0.002752,0.076313,0.047426,0.002341,0.012933,-0.025566,-0.001282,-0.010964,0.028927,-0.004685,0.06784
3,-0.891411,0.002056,-0.436944,0.198052,0.037906,-0.509616,-0.13716,0.794958,-0.072281,-0.120396,-0.020608,0.163839,0.153713,-0.082283,0.286551,0.227551,-0.02636,0.132672,0.117662,-0.491114,0.68347,-0.119229,0.317466,-0.010119,0.022053,0.15869,0.045466,0.187895,0.100806,-0.233132,0.199443,-0.034617,-0.072,0.041404,0.023015,-0.020009,-0.019725,0.057894,-0.114692,0.098766,0.039317,-0.002938,0.029464,-0.046456,0.057012,-0.000306,0.005925,-0.03384,0.008413,0.012611,0.001659
4,-1.275461,-0.368518,-1.481801,0.298958,2.9e-05,0.519721,-0.380882,-0.345212,0.002347,-0.061163,0.138806,0.281519,-0.440855,0.195924,-0.035182,-0.010729,-0.23203,-0.165433,0.107548,0.308902,-0.611968,0.005885,-0.031447,0.019448,0.039175,0.101489,0.109407,-0.103734,0.181036,0.141778,-0.16837,-0.109622,-0.046516,0.012436,0.017602,0.015962,-0.12917,0.068135,0.078552,0.002699,-0.024419,0.033941,0.028457,0.008836,0.034469,0.063986,-0.034762,0.004277,0.04211,-0.000615,-0.029691


In [21]:
nor_vectors = np.diag(np.ones(df_pca[name_pca].shape[1]))

In [22]:
def matrix_cosine(matrix,vector):
    
    matrix_norms = np.linalg.norm(matrix, axis=1)
    vector_norm = np.linalg.norm(vector)
    return 1- np.divide(matrix.dot(vector),np.multiply(matrix_norms, vector_norm))

In [23]:
%%time 
name_cos = []
matrix_norms = np.linalg.norm(df_pca[name_pca].values, axis=1)
vector_norm = np.linalg.norm(nor_vectors[0])

for i, v  in enumerate(nor_vectors):
    
    df_pca['cos_' + str(i+1)] = 1- np.divide(df_pca[name_pca].values.dot(v),np.multiply(matrix_norms, vector_norm))
    name_cos.append('cos_' + str(i+1))

CPU times: user 9.3 s, sys: 440 ms, total: 9.74 s
Wall time: 2.47 s


In [24]:
pd.set_option('display.max_columns', 110)
df_pca.tail(10)

Unnamed: 0,pca_f1,pca_f2,pca_f3,pca_f4,pca_f5,pca_f6,pca_f7,pca_f8,pca_f9,pca_f10,pca_f11,pca_f12,pca_f13,pca_f14,pca_f15,pca_f16,pca_f17,pca_f18,pca_f19,pca_f20,pca_f21,pca_f22,pca_f23,pca_f24,pca_f25,pca_f26,pca_f27,pca_f28,pca_f29,pca_f30,pca_f31,pca_f32,pca_f33,pca_f34,pca_f35,pca_f36,pca_f37,pca_f38,pca_f39,pca_f40,pca_f41,pca_f42,pca_f43,pca_f44,pca_f45,pca_f46,pca_f47,pca_f48,pca_f49,pca_f50,pca_f51,cos_1,cos_2,cos_3,cos_4,cos_5,cos_6,cos_7,cos_8,cos_9,cos_10,cos_11,cos_12,cos_13,cos_14,cos_15,cos_16,cos_17,cos_18,cos_19,cos_20,cos_21,cos_22,cos_23,cos_24,cos_25,cos_26,cos_27,cos_28,cos_29,cos_30,cos_31,cos_32,cos_33,cos_34,cos_35,cos_36,cos_37,cos_38,cos_39,cos_40,cos_41,cos_42,cos_43,cos_44,cos_45,cos_46,cos_47,cos_48,cos_49,cos_50,cos_51
240401,1.694483,0.244582,-0.928025,0.507679,0.19278,-0.795934,0.074988,-0.153152,-0.135547,0.324434,0.066384,0.000494,-0.044171,0.392586,0.142108,-0.412131,0.020166,-0.036471,-0.409671,-0.045757,0.100416,0.041644,-0.203931,-0.077942,-0.134179,-0.029442,0.004551,0.059056,-0.037429,-0.057585,0.008941,0.04976,-0.048632,-0.025211,-0.039879,-0.043898,-0.000392,-0.019663,-0.009069,-0.033482,-0.071308,-0.100352,-0.045337,0.019212,0.00627,-0.067534,0.002213,0.01639,-0.023385,-0.023279,0.010402,0.278553,0.895866,1.395118,0.78385,0.917921,1.338879,0.968073,1.065206,1.057711,0.861868,0.971736,0.99979,1.018806,0.832851,0.939496,1.17547,0.991414,1.015528,1.174422,1.019481,0.957247,0.982269,1.086826,1.033185,1.057128,1.012535,0.998062,0.974856,1.015936,1.024518,0.996193,0.978814,1.020706,1.010734,1.016979,1.01869,1.000167,1.008372,1.003861,1.014255,1.03036,1.042726,1.019303,0.99182,0.997331,1.028754,0.999058,0.993022,1.009956,1.009911,0.995571
240402,-1.775967,-0.071053,0.524885,-0.387818,-0.683481,0.028008,0.124051,-0.394131,0.271904,0.219327,-0.195616,0.072245,0.000579,0.062021,-0.275457,-0.135324,-0.387643,-0.260343,0.300388,-0.329348,0.469903,-0.167697,0.03461,-0.048407,0.043714,0.056361,-0.100615,-0.017418,0.118992,-0.001026,0.017262,-0.126141,0.204814,-0.122534,0.077499,-0.174321,-0.017181,0.005287,0.002736,0.016652,0.006856,0.011263,0.020771,-0.009766,-0.00727,-0.013121,0.003449,-0.015828,0.032382,0.026946,0.02067,1.771453,1.030865,0.771998,1.168462,1.296894,0.987834,0.946114,1.171205,0.881889,0.904728,1.084973,0.968618,0.999748,0.973059,1.119654,1.058783,1.168386,1.113089,0.869516,1.143064,0.795881,1.072845,0.984966,1.021027,0.981011,0.975518,1.043706,1.007566,0.948312,1.000446,0.992502,1.054794,0.911032,1.053227,0.966336,1.075722,1.007463,0.997703,0.998812,0.992767,0.997022,0.995108,0.990977,1.004242,1.003158,1.005699,0.998502,1.006875,0.985934,0.988295,0.991021
240403,-0.073984,-0.032816,-0.22308,0.30466,-0.018086,-0.031169,-0.144697,-0.08189,-0.232255,-0.812334,0.258544,0.12314,-0.098577,-0.26241,-0.304523,0.615114,-0.159945,-0.631646,-0.189491,0.410922,-0.189131,-0.457474,0.243587,-0.144952,0.209798,0.036138,-0.026731,0.019832,0.060938,-0.143025,-0.110118,0.156102,-0.020513,0.052565,0.1316,-0.086201,0.120146,0.005979,-0.063358,0.052989,-0.01178,-0.087195,0.093208,0.020654,0.009091,-0.035903,-0.044962,0.00626,0.049528,-0.001218,0.012722,1.045383,1.02013,1.136843,0.813114,1.011094,1.01912,1.088761,1.050234,1.142471,1.498305,0.841403,0.924463,1.06047,1.160968,1.186802,0.622674,1.098114,1.387467,1.116239,0.74793,1.116018,1.280625,0.850578,1.088917,0.871305,0.977832,1.016397,0.987834,0.962619,1.087735,1.067549,0.904243,1.012583,0.967755,0.919273,1.052878,0.9263,0.996332,1.038865,0.967495,1.007226,1.053488,0.942824,0.98733,0.994423,1.022024,1.027581,0.99616,0.969618,1.000747,0.992196
240404,-0.00172,-0.054334,-0.250013,-0.427764,-0.167211,0.315168,-0.147826,0.60799,-0.116327,0.414713,0.198479,-0.120117,0.314847,-0.209838,0.231712,-0.081837,-0.018852,-0.058634,0.090401,-0.063589,-0.423482,0.334922,-0.030064,0.295767,-0.046165,-0.000937,0.130114,-0.001049,-0.185126,-0.163692,0.011871,0.014421,-0.085404,0.024681,-0.1437,0.077337,0.082272,-0.067722,0.021892,-0.047718,-0.055284,-0.142607,-0.027069,-0.039109,-0.026649,-0.04983,0.041074,-0.014278,-0.004565,-0.023854,-0.008906,1.001294,1.04088,1.188104,1.32184,1.125806,0.762875,1.111221,0.542562,1.087522,0.687979,0.850669,1.090373,0.763116,1.157878,0.825665,1.061572,1.014183,1.044115,0.931984,1.047843,1.318618,0.748012,1.02262,0.777471,1.034733,1.000705,0.902105,1.000789,1.139285,1.123158,0.991069,0.98915,1.064256,0.98143,1.108116,0.941814,0.9381,1.050953,0.983529,1.035902,1.041594,1.107294,1.020366,1.029425,1.02005,1.037491,0.969097,1.010742,1.003434,1.017948,1.0067
240405,0.169718,0.027748,-0.357479,-0.114998,0.334078,-0.047008,-0.064134,-0.002777,-0.057135,-0.396472,-0.141513,0.191923,-0.153816,-0.178657,0.536353,-0.118437,0.513315,0.405071,-0.01641,0.002613,0.20136,0.058466,-0.100382,-0.132201,-0.226298,-0.046117,0.128321,-0.073722,-0.023687,-0.035939,0.00643,-0.007747,-0.03766,0.075635,-0.00523,-0.031204,0.000341,-1e-05,0.047644,-0.035475,-0.074303,-0.086477,-0.008861,0.006566,-0.008769,-0.022249,-0.014081,0.007156,-0.031297,-0.002204,0.006746,0.859844,0.977085,1.295213,1.094968,0.724112,1.03882,1.052963,1.002293,1.047183,1.327414,1.116864,0.841507,1.127024,1.147539,0.557069,1.097807,0.576095,0.665484,1.013552,0.997842,0.833713,0.951717,1.082897,1.109174,1.186881,1.038084,0.89403,1.060881,1.019561,1.029679,0.99469,1.006398,1.031101,0.93754,1.004319,1.025769,0.999718,1.000008,0.960655,1.029296,1.061361,1.071415,1.007317,0.994578,1.007241,1.018374,1.011628,0.99409,1.025846,1.00182,0.994429
240406,-0.057891,0.109783,-0.280131,0.205777,-0.294866,-0.705154,0.12365,-0.594046,0.263151,-0.165157,-0.259733,0.194884,-0.260066,-0.01446,0.219119,-0.305415,-0.181774,0.018945,-0.398209,-0.036815,-0.016126,0.078565,-0.148916,0.122718,0.158633,0.154717,-0.173759,-0.12646,-0.08106,0.123786,-0.086993,0.023905,-0.066088,-0.031325,-0.029827,0.086481,-0.047087,0.025735,-0.104956,0.05212,0.025987,0.009242,0.025303,0.001076,0.003663,0.066247,0.006375,-0.016585,-0.01503,0.002469,-0.000651,1.042013,0.920328,1.203299,0.850662,1.213992,1.511749,0.910264,1.431116,0.809024,1.119859,1.188496,0.858567,1.188737,1.010494,0.840979,1.221648,1.131918,0.986251,1.288991,1.026718,1.011703,0.942983,1.108072,0.91094,0.884876,0.887717,1.126102,1.091776,1.058828,0.910165,1.063133,0.982651,1.047962,1.022733,1.021646,0.937238,1.034172,0.981323,1.07617,0.962175,0.981141,0.993293,0.981637,0.999219,0.997342,0.951923,0.995374,1.012036,1.010908,0.998208,1.000473
240407,0.175549,-0.031567,0.815984,0.073585,0.490177,0.910351,-0.427646,-0.974151,-0.123103,-0.174728,0.398385,-0.153992,0.451003,-0.106697,0.148777,-0.216483,-0.379045,-0.017441,0.062616,-0.119353,0.110242,0.113489,-0.13277,-0.261682,-0.080375,0.023216,-0.14689,-0.087656,0.021216,0.108499,0.109093,-0.042089,0.097872,0.091027,0.051246,0.139662,-0.040475,-0.006522,-0.002245,0.033247,0.092702,0.040935,-0.065723,-0.012549,0.005644,0.014324,0.107937,0.046641,-0.039715,-0.047862,-0.004526,0.910261,1.016137,0.582874,0.962384,0.749424,0.534634,1.21861,1.49798,1.06293,1.08932,0.796348,1.07872,0.76945,1.054543,0.923946,1.110665,1.193766,1.008915,0.967991,1.061012,0.943645,0.941985,1.067871,1.13377,1.041087,0.988132,1.075089,1.044809,0.989155,0.944536,0.944232,1.021516,0.949968,0.953467,0.973803,0.928606,1.020691,1.003334,1.001148,0.983004,0.952611,0.979074,1.033597,1.006415,0.997115,0.992678,0.944823,0.976158,1.020302,1.024467,1.002314
240408,0.023515,-0.041884,-0.12518,0.376441,-0.359941,-0.222719,0.156593,0.375414,-0.043906,-0.175236,0.510398,-0.205635,-0.347314,0.263966,-0.187185,-0.017998,0.154958,0.165375,0.127201,-0.154127,-0.22516,0.181468,0.091684,0.195557,0.119232,-0.040883,0.088206,0.036497,-0.005156,0.035157,0.048993,-0.009315,-0.042771,0.040545,0.03331,0.086635,-0.054762,-0.001152,0.042803,-0.02372,0.011242,0.04987,-0.042371,0.001764,0.009356,0.036827,-0.020764,0.007187,-0.001306,0.006,-0.016537,0.979685,1.036185,1.108145,0.674787,1.310959,1.19241,0.864717,0.675674,1.037931,1.151389,0.559059,1.177651,1.30005,0.771956,1.161712,1.015548,0.866129,0.85713,0.890109,1.133153,1.194519,0.843227,0.920793,0.831055,0.896994,1.03532,0.923797,0.968469,1.004454,0.969627,0.957674,1.008048,1.036951,0.964973,0.971223,0.925155,1.047309,1.000995,0.963021,1.020492,0.990288,0.956916,1.036605,0.998476,0.991917,0.968185,1.017938,0.993791,1.001128,0.994817,1.014287
240409,1.298414,0.128639,0.219697,0.262407,0.916089,-0.277912,0.527303,1.026878,-0.338449,1.188885,-0.585043,-0.13752,0.103601,0.126431,-0.22821,-0.076453,0.415549,0.42112,0.166636,-0.175669,0.38646,-0.204601,-0.073092,0.162585,-0.225719,0.034013,0.074179,-0.132018,0.022427,0.097512,0.01545,0.074873,-0.112218,0.090616,-0.004593,-0.090882,0.028771,-0.01873,0.009816,-0.032788,0.036222,0.122364,-0.031951,0.054368,0.037234,-0.012427,-0.038886,-0.0081,0.009951,0.007636,-0.01292,0.502813,0.950742,0.915874,0.899519,0.649212,1.106418,0.798086,0.606789,1.129599,0.544754,1.224024,1.052659,0.960329,0.951587,1.087386,1.029275,0.840879,0.838745,0.936192,1.067267,0.852017,1.078346,1.027988,0.937743,1.086432,0.986976,0.971595,1.050552,0.991412,0.962661,0.994084,0.97133,1.04297,0.965301,1.001759,1.034801,0.988983,1.007172,0.996241,1.012555,0.98613,0.953145,1.012235,0.979181,0.985742,1.004758,1.01489,1.003102,0.996189,0.997076,1.004947
240410,0.242366,-0.034514,-0.324682,-0.292288,-0.216761,0.030431,-0.147294,0.036713,0.376119,-0.098999,-0.183899,0.035072,-0.010258,0.319646,-0.140587,0.336832,0.04344,-0.042447,-0.143133,0.465369,-0.314068,0.06286,0.115344,-0.189387,0.047181,-0.216509,0.027173,0.381993,0.071455,-0.021272,-0.011987,-0.084009,0.061969,-0.22121,-0.110307,-0.007507,-0.072024,0.057134,0.045668,-0.015307,-0.031642,0.082606,0.036691,-0.023006,-0.022298,0.01613,-0.040143,-0.012453,5.2e-05,0.03209,0.0034,0.797579,1.028826,1.27117,1.244116,1.181036,0.974584,1.123018,0.969337,0.68587,1.082683,1.15359,0.970708,1.008568,0.733035,1.117417,0.718682,0.96372,1.035451,1.119543,0.61133,1.262306,0.9475,0.903666,1.158174,0.960595,1.180826,0.977306,0.680964,0.940322,1.017767,1.010012,1.070164,0.948244,1.184752,1.092127,1.00627,1.060154,0.952282,0.961859,1.012784,1.026427,0.931009,0.969356,1.019214,1.018623,0.986528,1.033527,1.010401,0.999957,0.973199,0.99716


In [25]:
#['cos_{}'.format(i+1) for i in range(df_max[name_pca].shape[1])]

In [26]:
factors_new = new_factors_array (df_pca.values, predict_mask =predict_mask)
factors_new.shape, factors.shape

((102, 1805375), (57, 1805375))

In [27]:
#mod.is1, mod.is2, mod.oos = new_factors_array(df[['is1','is2','oos']]).astype(bool)

In [35]:
#from prediction.tools.helpers import strata_scale_down

is1 = mod.is1.copy()
is2 = mod.is1.copy()
oos = mod.oos.copy()
#strata = strata_scale_down(av.event_id)

In [28]:
%%time
new_model_coefs, new_model_step1prob, new_model_step2prob, new_model_likelihood, ints\
    = mod.fit_slices(tsav, factors_new,  depth=3, lmbd=10, verbose=False, fit_afresh=True)

print 'LL  new factors'
print new_model_likelihood
print '..................'

  step2factors = np.concatenate((fb, fl, np.log(probs).reshape((1, -1))), axis=0)


. . . . . . . . . . 10
LL  new factors
[[-1854.53772101 -1897.63612144 -1897.63612144]
 [-1856.65088517 -1900.00590953 -1900.00590953]
 [-1858.21193282 -1902.78844132 -1902.78844132]
 [-1858.35379367 -1904.36870966 -1904.36870966]
 [-1863.50580391 -1910.85010195 -1910.85010195]
 [-1867.53003018 -1918.28177941 -1918.28177941]
 [-1883.55808712 -1932.68131965 -1932.68131965]
 [-1888.42917315 -1936.98619598 -1936.98619598]
 [-1889.79244303 -1939.252269   -1939.252269  ]
 [-1889.67749806 -1941.84114075 -1941.84114075]
 [    0.             0.             0.        ]]
..................
CPU times: user 8min 40s, sys: 3.75 s, total: 8min 44s
Wall time: 6min 41s


In [29]:
old_model_coefs, old_model_step1prob, old_model_step2prob, old_model_likelihood , ints\
    =   mod.fit_slices(tsav, factors, depth=3, lmbd=10, verbose=False, fit_afresh=True)

print old_model_likelihood
print '..................'

. . . . . . . . . . 10
[[-1856.72540883 -1897.73592604 -1897.73592604]
 [-1858.81114746 -1900.18095349 -1900.18095349]
 [-1860.3113265  -1903.02064196 -1903.02064196]
 [-1860.42261087 -1904.68529626 -1904.68529626]
 [-1865.67547942 -1911.09207575 -1911.09207575]
 [-1869.73271791 -1918.66289469 -1918.66289469]
 [-1885.62973036 -1933.11737206 -1933.11737206]
 [-1890.60868995 -1937.4647503  -1937.4647503 ]
 [-1891.9667941  -1939.7005181  -1939.7005181 ]
 [-1891.86169787 -1942.35404469 -1942.35404469]
 [    0.             0.             0.        ]]
..................


In [30]:
write_dic_to_simdata('simdata_New__clusters_direction.p', new_model_step1prob, new_model_coefs, mod.oos, av =av)
write_dic_to_simdata('simdata_New__old_model.p', old_model_step1prob, old_model_coefs, mod.oos, av =av)

In [31]:
df_pca['event_id'] = df['event_id']
df_cos = df_pca.loc[:,'cos_1':'cos_51'].groupby(df_pca['event_id']).mean()
df_cos.head()

Unnamed: 0_level_0,cos_1,cos_2,cos_3,cos_4,cos_5,cos_6,cos_7,cos_8,cos_9,cos_10,cos_11,cos_12,cos_13,cos_14,cos_15,cos_16,cos_17,cos_18,cos_19,cos_20,cos_21,cos_22,cos_23,cos_24,cos_25,cos_26,cos_27,cos_28,cos_29,cos_30,cos_31,cos_32,cos_33,cos_34,cos_35,cos_36,cos_37,cos_38,cos_39,cos_40,cos_41,cos_42,cos_43,cos_44,cos_45,cos_46,cos_47,cos_48,cos_49,cos_50,cos_51
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
293661,1.099408,1.008777,1.012219,0.976042,0.977904,0.978751,1.005168,0.944636,0.998453,0.985194,0.998768,0.998302,1.009469,1.009064,0.990565,1.002348,1.00656,0.991467,0.9837,1.001285,0.984609,0.99059,1.009813,0.994602,0.994954,0.984056,0.990432,1.001354,0.985555,1.011699,0.999951,0.998772,1.001881,0.996957,1.001885,1.004919,1.00146,0.997829,1.00493,0.997343,0.999183,0.996703,1.00091,0.999391,0.999933,1.001056,0.99889,1.000815,0.99933,0.999506,1.000508
293662,1.034704,1.020276,1.019598,0.997314,0.970187,1.002387,1.043255,1.042145,1.036125,1.008333,1.001983,0.992436,0.99865,0.996603,0.993302,0.996576,1.005888,0.99797,0.988966,0.978638,1.010471,1.024886,1.008265,1.003063,0.997338,0.989853,0.990676,0.999372,0.983255,1.013287,1.002347,1.010307,1.004724,0.997614,0.998692,1.003656,0.998793,1.000386,0.999897,0.996163,0.998285,1.000204,1.004196,0.998215,0.99951,1.002739,0.99732,1.00048,1.003006,1.000045,0.999884
293663,1.041353,0.999034,1.021235,0.957426,1.013768,1.018861,1.028091,0.962624,0.989368,0.991537,0.984853,0.960929,1.012752,1.017557,1.00673,0.997704,0.969503,1.013624,1.017271,0.991669,0.987797,1.010117,1.004882,1.004648,0.997263,1.007678,0.996923,0.988568,1.012667,1.002655,1.011232,1.00668,1.006944,1.001647,0.993386,1.000697,0.998176,0.999409,1.004741,1.000422,0.997047,1.000419,1.002294,0.99027,1.005232,0.999958,0.992774,1.004571,1.006753,0.99643,0.99838
293664,0.948855,0.974613,1.006513,1.008711,0.982126,1.057088,1.123436,0.964068,1.01625,1.021979,0.954072,0.966955,1.044712,0.98668,0.990354,0.979721,0.978721,1.034202,1.022507,0.998101,0.970761,1.007047,1.008432,0.961437,0.979693,0.993497,0.988132,1.017762,0.986021,0.990275,1.00896,0.998405,0.994558,1.008648,1.007997,1.001624,0.999595,1.001164,1.002538,1.0068,1.003578,1.000915,0.995236,0.995387,1.002018,1.000619,0.994754,1.005492,1.005643,0.999021,0.996936
293665,1.060874,0.999677,1.00058,0.975697,0.993581,1.029471,0.930963,0.966885,0.995214,0.956852,0.984796,0.988983,1.015931,1.036031,0.994281,0.995127,0.981903,1.004401,0.990972,0.982159,0.989085,1.018959,1.01416,0.995987,0.990585,0.995318,0.998385,1.011849,0.983014,1.012756,1.010878,1.008432,0.997702,0.999565,1.004485,0.998279,0.992241,1.003621,1.00337,1.001832,0.999646,0.994901,1.000429,0.99425,1.007892,1.002671,0.999327,0.999927,0.998295,0.999905,1.000652


In [44]:
from sklearn.mixture import BayesianGaussianMixture
BGM = BayesianGaussianMixture(n_components =7, random_state =seed)
BGM.fit(df_cos.loc[:,'cos_1':'cos_51'].values)
df_cos['cluster'] = BGM.predict(df_cos.loc[:,'cos_1':'cos_51'].values)
score = BGM.score(df_cos.loc[:,'cos_1':'cos_51'].values)
print score

184.734641965


In [45]:
df_cos.loc[:,'cos_1':'cos_51'].groupby(df_cos['cluster']).mean()

Unnamed: 0_level_0,cos_1,cos_2,cos_3,cos_4,cos_5,cos_6,cos_7,cos_8,cos_9,cos_10,cos_11,cos_12,cos_13,cos_14,cos_15,cos_16,cos_17,cos_18,cos_19,cos_20,cos_21,cos_22,cos_23,cos_24,cos_25,cos_26,cos_27,cos_28,cos_29,cos_30,cos_31,cos_32,cos_33,cos_34,cos_35,cos_36,cos_37,cos_38,cos_39,cos_40,cos_41,cos_42,cos_43,cos_44,cos_45,cos_46,cos_47,cos_48,cos_49,cos_50,cos_51
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
0,1.003363,1.00294,1.008645,0.99324,0.996733,0.99742,0.989818,0.991767,1.016158,0.990199,1.002809,0.990635,1.002749,1.002947,0.998621,0.996853,1.000568,1.000048,1.003885,1.001254,1.001985,1.000976,1.00211,0.999778,0.999739,0.998039,1.000492,0.997924,1.005272,0.996892,0.999883,1.003203,1.001371,0.999554,1.000616,1.00011,1.001374,0.999854,1.001828,0.999681,1.000068,0.999711,1.000007,0.996051,1.003431,1.000793,0.999523,0.999876,0.999776,0.999938,1.000611
1,1.0277,1.004272,0.991543,0.987299,0.989396,0.993722,1.001433,1.000713,1.012326,1.010622,1.010513,1.000042,1.006848,0.996227,0.998699,1.007942,1.001706,1.003097,0.998489,1.002582,1.001172,1.003135,0.999213,0.99684,1.003352,0.999535,1.000622,1.001217,1.000853,0.999295,1.000939,1.000074,0.999782,1.000224,0.999371,1.000689,1.002625,0.999042,1.000017,0.999786,1.000249,1.001592,0.999804,1.001113,1.002232,1.000456,1.000771,0.999929,0.99994,0.999745,1.000078
2,1.013926,1.003142,0.997772,0.989584,0.996032,0.994328,1.012253,0.997514,1.013405,1.006789,1.001032,0.995218,0.999756,0.999736,1.00032,1.005852,1.003463,1.000884,0.99854,1.001875,1.000832,1.003598,0.999371,0.997027,1.00168,1.00267,1.002159,0.999511,1.000827,0.999636,1.001048,1.000867,1.000432,0.999742,0.999711,1.000972,1.00131,0.998936,1.000106,0.999544,1.000316,1.000812,0.999768,1.000236,1.000986,0.999817,1.000024,0.999782,0.99995,1.00018,1.000204
3,1.105561,0.53733,1.005971,0.999233,1.001429,0.99315,1.003195,1.003028,1.002315,0.999974,1.002391,0.999898,1.000371,0.994672,0.999746,0.999941,1.001164,1.000215,1.001089,0.999155,1.000628,1.00082,0.99896,1.00019,1.00214,0.999169,1.000799,1.000042,1.000793,0.999615,0.999844,1.00053,1.000037,0.99996,0.999927,0.999951,0.999806,0.999134,1.000012,0.99994,1.000038,1.000085,1.000043,0.999815,1.000175,1.000064,1.000034,0.999931,1.000008,0.999961,0.999997
4,1.003383,1.001087,1.000092,0.995739,0.996502,1.004322,1.021159,0.999836,1.00982,0.999888,1.003329,0.996756,1.002793,1.002477,0.998249,1.000855,1.002358,0.999569,0.998787,1.000636,1.001491,1.001268,1.000114,0.999008,1.00175,1.000844,1.001079,0.999367,1.002554,0.998679,1.000384,1.000605,1.00088,0.99943,0.999726,1.000852,1.001621,1.000538,1.000724,0.999783,0.999931,1.000287,1.000167,0.999262,1.000952,1.000212,1.000208,0.999324,1.000018,1.000054,1.000195
5,1.041258,1.007638,1.01091,0.988586,1.001492,0.998893,1.012868,0.996484,1.017299,1.000483,1.006251,0.993024,1.002471,1.000629,0.999881,1.002332,1.005861,1.002759,0.998611,1.005048,1.001835,1.003986,1.001674,0.998803,1.003186,1.003288,1.003262,0.996922,1.010035,0.996765,1.000485,1.00184,1.000887,0.998853,0.997978,1.002281,1.003201,0.999449,1.001617,0.999573,1.000208,1.001167,0.99942,0.999211,1.002494,1.000564,1.000192,0.999506,1.000108,0.99992,1.000272
6,1.003827,1.001126,0.995133,0.993143,0.995053,0.995314,1.001807,0.999052,1.00724,1.005208,1.002836,0.998224,1.000108,0.999347,0.998731,1.004315,1.001087,1.000745,0.999363,1.001604,1.000486,1.002187,0.999778,0.997889,1.001139,1.000602,1.000451,1.000328,1.000427,1.000048,1.000576,1.00009,1.000094,1.000188,0.999901,1.000546,1.00108,0.999452,1.000005,0.999805,1.000244,1.000515,0.999975,1.000121,1.000578,1.00004,1.000099,0.99984,0.999971,1.000049,1.00012


In [46]:
df_cos['cluster'].value_counts()

6    7938
4    4016
0    3765
2    3486
1    2153
5    1686
3     470
Name: cluster, dtype: int64

#### build the clusters model

In [47]:
from prediction.tools.clustering import step2prob_clusters_model, ll_diff

In [49]:
clusters_list = df_cos['cluster'].unique()
clusters_list

array([0, 4, 5, 6, 2, 1, 3])

In [50]:
cluster_model_coefs, cluster_model_step1prob, cluster_model_step2prob, cluster_model_likelihood, ints, train_val_test\
    = step2prob_clusters_model (df_cos['cluster'], clusters_list, is1, is2, oos, verbose =True, 
                                av =av, factors =factors, tsav =tsav, mod =mod)

. . . . . . . . . . 10
cluster 0  number  3765
LL  2030          2030            1735
[[-1720.84979707 -1752.00263482 -1752.00263482]
 [-1723.64879879 -1753.45518632 -1753.45518632]
 [-1726.27153128 -1752.65890995 -1752.65890995]
 [-1726.96707757 -1751.61268153 -1751.61268153]
 [-1736.15078426 -1751.82861105 -1751.82861105]
 [-1743.07844724 -1759.11436613 -1759.11436613]
 [-1765.71747773 -1759.44288744 -1759.44288744]
 [-1764.81368861 -1753.84967714 -1753.84967714]
 [-1767.49856073 -1751.11994074 -1751.11994074]
 [-1760.15275478 -1760.26640221 -1760.26640221]
 [    0.             0.             0.        ]]
. . . . . . . . . . 10
cluster 4  number  4016
LL  2467          2467            1549
[[-1793.83933375 -1835.47636576 -1835.47636576]
 [-1795.31235777 -1842.18550025 -1842.18550025]
 [-1794.65458193 -1846.54080432 -1846.54080432]
 [-1790.24253536 -1844.24430424 -1844.24430424]
 [-1785.29541286 -1858.7969874  -1858.7969874 ]
 [-1784.5074671  -1865.91605664 -1865.91605664]
 [-1788.733

In [51]:
mod.is1 = is1
mod.is2 = is2
mod.oos = oos

##### compare with old model

In [52]:
total_ll = [0.0, 0.0, 0.0]
total_number = 0
good_clusters = []
threshold = 1.
threshold_size_cluster = 500

for cluster in train_val_test.keys():
    
    train, val, test = train_val_test[cluster] #events for each cluster
    
    cluster_mask = np.in1d(av.event_id, np.append(np.append(train, val),test))[ints] #mask for each cluster
    #replace the win probability if the event in the cluster, another use the old model
    prob_mix = np.where(cluster_mask, cluster_model_step2prob[cluster], old_model_step2prob)
    print 'cluster {}  number  {}'.format(cluster, df_cos['cluster'].value_counts()[cluster])
    
    print 'diff_likelihood'
    mean_diff_ll = ll_diff(prob_mix, old_model_step2prob, train, val, test, ints, av =av, tsav =tsav).mean(axis =0)
    print mean_diff_ll
    if (mean_diff_ll[0] >threshold) & (df_cos['cluster'].value_counts()[cluster] > threshold_size_cluster):
        good_clusters.append(cluster)
    total_ll += mean_diff_ll * df_cos['cluster'].value_counts()[cluster]
    total_number += df_cos['cluster'].value_counts()[cluster]
print '......................'
print 'total likelihood ', total_ll/total_number
print '......................'
print 'good clusters  ', good_clusters

cluster 0  number  3765
diff_likelihood
[ 23.89331443  23.89331443  21.25923287]
cluster 1  number  2153
diff_likelihood
[ 3.12636542  3.12636542 -1.77574963]
cluster 2  number  3486
diff_likelihood
[ 3.40871198  3.40871198  5.46024866]
cluster 3  number  470
diff_likelihood
[ 20.41127431  20.41127431  11.33799838]
cluster 4  number  4016
diff_likelihood
[  5.28736783   5.28736783 -28.17356759]
cluster 5  number  1686
diff_likelihood
[ 2.80949592  2.80949592  2.76267345]
cluster 6  number  7938
diff_likelihood
[ 4.49269112  4.49269112  2.86094729]
......................
total likelihood  [ 7.64647971  7.64647971  0.62958784]
......................
good clusters   [0, 1, 2, 4, 5, 6]


##### write result to file

In [53]:
write_dic_to_simdata('simdata_New_7_clustering_directions.p', old_model_step1prob, old_model_coefs, oos, av =av, 
                     data =df_cos['cluster'], cluster_step1probs =cluster_model_step1prob, 
                     cluster_coefs =cluster_model_coefs, cluster_names =good_clusters )

##### compare with directions model

In [41]:
total_ll = [0.0, 0.0, 0.0]
total_number = 0
good_clusters = []
threshold = 1.
threshold_size_cluster = 500

for cluster in train_val_test.keys():
    
    train, val, test = train_val_test[cluster] #events for each cluster
    
    cluster_mask = np.in1d(av.event_id, np.append(np.append(train, val),test))[ints] #mask for each cluster
    #replace the win probability if the event in the cluster, another use the old model
    prob_mix = np.where(cluster_mask, cluster_model_step2prob[cluster], new_model_step2prob)
    print 'cluster {}  number  {}'.format(cluster, df_cos['cluster'].value_counts()[cluster])
    
    print 'diff_likelihood'
    mean_diff_ll = ll_diff(prob_mix, new_model_step2prob, train, val, test, ints, av =av, tsav =tsav).mean(axis =0)
    print mean_diff_ll
    if (mean_diff_ll[0] >threshold) & (df_cos['cluster'].value_counts()[cluster] > threshold_size_cluster):
        good_clusters.append(cluster)
    total_ll += mean_diff_ll * df_cos['cluster'].value_counts()[cluster]
    total_number += df_cos['cluster'].value_counts()[cluster]
print '......................'
print 'total likelihood ', total_ll/total_number
print '......................'
print 'good clusters  ', good_clusters

cluster 0  number  4262
diff_likelihood
[ 16.46735839  16.46735839  16.03434831]
cluster 1  number  3214
diff_likelihood
[ 3.30445602  3.30445602 -5.46338468]
cluster 2  number  10205
diff_likelihood
[ 0.85506412  0.85506412  1.85290471]
cluster 3  number  473
diff_likelihood
[ 27.47273732  27.47273732  31.29459348]
cluster 4  number  5360
diff_likelihood
[  1.68774736   1.68774736 -21.71359183]
......................
total likelihood  [ 4.74488658  4.74488658 -1.35640651]
......................
good clusters   [0, 1, 4]


##### write result to file

In [42]:
write_dic_to_simdata('simdata_New_clustering_directions_new.p', new_model_step1prob, new_model_coefs, oos, av =av, 
                     data =df_cos['cluster'], cluster_step1probs =cluster_model_step1prob, 
                     cluster_coefs =cluster_model_coefs, cluster_names =good_clusters )

In [45]:
write_dic_to_simdata('simdata_clustering_directions_new_034.p', new_model_step1prob, new_model_coefs, oos, av =av, 
                     data =df_cos['cluster'], cluster_step1probs =cluster_model_step1prob, 
                     cluster_coefs =cluster_model_coefs, cluster_names =[0, 3, 4] )

In [58]:
df_clusters = df_cos.groupby('cluster').mean()
df_clusters

Unnamed: 0_level_0,cos_1,cos_2,cos_3,cos_4,cos_5,cos_6,cos_7,cos_8,cos_9,cos_10,cos_11,cos_12,cos_13,cos_14,cos_15,cos_16,cos_17,cos_18,cos_19,cos_20,cos_21,cos_22,cos_23,cos_24,cos_25,cos_26,cos_27,cos_28,cos_29,cos_30,cos_31,cos_32,cos_33,cos_34,cos_35,cos_36,cos_37,cos_38,cos_39,cos_40,cos_41,cos_42,cos_43,cos_44,cos_45,cos_46,cos_47,cos_48,cos_49,cos_50,cos_51
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
0,1.005337,0.998804,0.995519,0.992494,0.994452,0.995031,1.00117,1.007398,1.006487,0.997681,1.001337,1.000671,0.997313,1.00094,1.002581,0.99602,0.998696,0.998627,0.999226,1.001699,1.000384,1.002268,0.999958,0.99767,1.001125,1.000283,1.001063,0.999596,0.999668,1.000202,1.000571,0.999969,1.000266,1.000179,1.000714,0.9997,1.001079,0.999711,0.999904,1.00014,1.000298,0.999504,1.000194,0.999807,1.000715,1.000064,1.000088,0.999803,1.000019,1.000064,1.000121
1,1.037465,0.993865,1.004028,0.987454,0.996569,0.991272,1.001658,1.013886,1.00819,0.998714,1.005906,1.00535,0.992258,1.005848,1.004177,0.997045,0.996577,0.995753,0.9982,1.004743,1.00162,1.004107,1.00147,0.998222,1.002658,1.002479,1.006588,1.002627,0.999026,0.999198,1.000779,1.000648,1.001123,0.999372,1.001974,1.000083,1.002481,0.999955,1.000719,1.001401,1.000554,0.998988,1.000819,0.999629,1.002738,1.000487,1.000302,0.999515,0.999898,0.999867,1.000213
2,1.09259,1.459843,1.006679,0.999747,1.000547,0.992885,0.996882,1.002243,1.00079,1.000881,1.003676,1.001153,0.995225,0.997691,1.000504,1.000469,0.999398,0.999596,1.001395,0.999688,1.000615,1.000608,0.999254,0.999988,1.001563,0.998901,1.001382,1.000492,0.99959,0.999882,0.999912,1.000326,1.000155,0.999963,1.000057,1.000004,1.000034,0.99939,0.99998,1.000085,0.999967,0.999892,0.999992,1.000265,1.000276,1.000053,1.000045,0.999859,0.999991,0.999991,1.000005
3,1.010226,0.997455,1.000566,0.993659,0.994538,0.995496,0.998326,1.010082,1.004134,0.996748,1.001144,1.003732,0.996751,1.002746,1.002294,0.997912,0.996432,0.999466,0.998831,1.001747,1.000854,1.001538,1.000309,0.998012,1.002065,1.000288,1.003064,1.000641,0.999355,0.999325,1.000319,1.000148,1.000456,0.999183,1.001041,0.999742,1.001527,1.000777,1.000239,1.000475,0.999993,0.999754,1.000317,1.000308,1.000837,1.000292,1.000285,0.999371,0.999919,0.99999,1.000121
4,1.004509,0.997725,1.006995,0.993994,0.998237,1.000341,1.006657,1.017148,0.99092,0.992068,0.994269,1.002052,0.996966,1.002412,1.0004,1.00234,1.001114,0.999359,1.002865,1.001453,1.002206,1.001006,1.00142,0.999734,0.99853,0.999733,1.004034,1.003817,0.998994,0.99813,1.000332,1.003114,1.001274,0.999309,0.999958,0.999333,1.001135,1.000068,1.001129,1.001442,1.000272,1.000457,1.000543,1.003468,1.003014,1.000799,0.999781,0.999843,1.0,1.000088,1.000504


In [49]:
df_clusters.index

Int64Index([0, 1, 2, 3, 4], dtype='int64', name=u'cluster')

In [52]:
from itertools import combinations
import scipy
for cl1, cl2 in combinations(df_clusters.index, 2):
    cos_dist1 = scipy.spatial.distance.cosine(df_clusters.loc[cl1,:], df_clusters.loc[cl2,:])
    #cos_dist2 = scipy.spatial.distance.cosine(cluster_model_coefs[cl1][:,2], cluster_model_coefs[cl2][:,2])
    print 'step clusters  {}  and  {} corelation = {}'.format(cl1, cl2, cos_dist1)
    #print 'step 2 clusters  {}  and  {} corelation = {}'.format(cl1, cl2, cos_dist2)

step clusters  0  and  1 corelation = 1.27605926779e-05
step clusters  0  and  2 corelation = 0.00205118343695
step clusters  0  and  3 corelation = 1.0122415971e-06
step clusters  0  and  4 corelation = 7.37005586049e-06
step clusters  1  and  2 corelation = 0.00206841987024
step clusters  1  and  3 corelation = 9.13418364656e-06
step clusters  1  and  4 corelation = 1.91604114963e-05
step clusters  2  and  3 corelation = 0.0020578498482
step clusters  2  and  4 corelation = 0.00207413082489
step clusters  3  and  4 corelation = 5.7793296967e-06


In [54]:
df_winners = pd.read_csv('winners_clusters.csv', index_col =0)
df_winners

Unnamed: 0_level_0,cos_1,cos_2,cos_3,cos_4,cos_5,cos_6,cos_7,cos_8,cos_9,cos_10,cos_11,cos_12,cos_13,cos_14,cos_15,cos_16,cos_17,cos_18,cos_19,cos_20,cos_21,cos_22,cos_23,cos_24,cos_25,cos_26,cos_27,cos_28,cos_29,cos_30,cos_31,cos_32,cos_33,cos_34,cos_35,cos_36,cos_37,cos_38,cos_39,cos_40,cos_41,cos_42,cos_43,cos_44,cos_45,cos_46,cos_47,cos_48,cos_49,cos_50,cos_51
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
0,1.883961,0.926552,1.017811,0.952609,0.957486,1.036076,1.027393,0.970277,0.993106,1.017051,1.001407,0.988449,1.02026,0.985638,1.008982,1.010774,1.01182,1.000568,0.997233,0.998694,1.003401,0.993832,1.000805,0.999836,0.998529,1.003631,0.998998,0.996148,0.998189,0.999417,0.999149,0.999986,0.998549,1.003105,0.995774,1.000524,0.993852,1.001579,1.002758,1.000318,0.998751,1.001344,0.998016,0.998471,0.999465,0.999998,0.999664,1.000689,0.999457,1.000341,0.998616
1,1.233835,0.979553,0.983689,0.956658,1.005553,1.029219,1.032264,1.024337,1.014491,0.992159,0.99844,0.999349,0.998668,0.996502,1.003786,0.993691,0.994445,1.008005,0.993105,0.999521,1.004043,1.00157,0.997464,0.993334,0.999569,1.002051,1.007431,1.002845,1.00015,0.995548,0.999585,1.005663,1.002385,1.003094,1.000957,1.000018,1.001905,1.003087,1.002045,0.998439,1.001479,0.99938,0.995626,1.004499,1.003247,1.000626,1.000169,1.000316,0.999404,1.001101,0.998866
2,1.703996,0.933867,0.903257,1.144037,1.066554,1.017891,0.982475,0.975577,0.997252,1.024321,1.024742,1.060539,1.005688,1.003782,0.969119,0.987801,1.000139,0.996274,1.006005,0.996384,1.003591,1.001059,1.004408,1.020114,0.987976,0.996012,0.992497,1.004486,0.999476,1.002132,1.003877,1.005218,1.003746,0.99811,1.000667,1.002543,1.00592,1.000148,1.002809,1.002708,1.004895,1.002483,1.00022,1.001189,0.999129,0.999032,0.998041,0.999554,0.999736,1.000617,1.001587
3,1.137039,0.982015,0.991559,0.978488,0.969397,0.958997,0.98571,1.038949,1.02581,1.003557,0.99933,0.985166,0.982729,1.007856,1.022711,0.982397,0.999839,1.000845,0.997755,0.995329,1.007667,1.008402,1.006113,0.985947,0.998495,1.000332,1.004412,1.000695,1.000783,0.999172,1.00129,1.001484,0.994354,1.002659,0.998207,1.000044,1.00221,0.995372,0.999603,0.996386,0.995284,0.99652,0.999844,1.000915,1.000331,0.998948,1.001859,0.998447,1.002018,0.999548,0.999826
4,1.324256,1.025765,0.962364,0.980675,1.019693,1.020808,1.034657,1.001438,1.030345,1.00384,1.010164,1.02012,1.003185,0.992981,1.002082,0.988804,0.994696,0.992206,0.99926,1.003807,1.004405,1.002621,1.000969,1.003173,1.00421,1.00128,1.003388,1.005249,0.997124,0.999656,0.999274,0.997123,0.99993,0.999054,0.999433,0.999488,1.00293,1.002069,0.999242,0.997856,1.003446,0.999473,1.001504,1.000881,0.999442,0.997392,0.99981,0.998843,0.999865,1.000475,0.999775


In [59]:
min_cos_distance1 = []

for cl1 in df_clusters.index:
    min_distance1 , min_distance2 = 0., 0.
    for cl2 in df_winners.index:
        cos_dist1 = scipy.spatial.distance.cosine(df_clusters.loc[cl1,:], df_winners.loc[cl2,:])
        
        if min_distance1 < cos_dist1:
            min_distance1 = cos_dist1
            s_cl1, s_cl2 = cl1, cl2
        
        print 'step clusters  {}  and  {} corelation = {}'.format(cl1, cl2, cos_dist1)
        
    min_cos_distance1.append((s_cl1, s_cl2))
    
    print 'most closely  {}  and {}'.format(s_cl1, s_cl2)
    

step clusters  0  and  0 corelation = 0.00727806941722
step clusters  0  and  1 corelation = 0.000539636788655
step clusters  0  and  2 corelation = 0.0049342685919
step clusters  0  and  3 corelation = 0.000224297541454
step clusters  0  and  4 corelation = 0.000999466936677
most closely  0  and 0
step clusters  1  and  0 corelation = 0.00677255913562
step clusters  1  and  1 corelation = 0.000413134260451
step clusters  1  and  2 corelation = 0.00456111791303
step clusters  1  and  3 corelation = 0.000146166763797
step clusters  1  and  4 corelation = 0.000830151751681
most closely  1  and 0
step clusters  2  and  0 corelation = 0.00865992517003
step clusters  2  and  1 corelation = 0.00244659362183
step clusters  2  and  2 corelation = 0.0065444004693
step clusters  2  and  3 corelation = 0.00222298567183
step clusters  2  and  4 corelation = 0.00237894431165
most closely  2  and 0
step clusters  3  and  0 corelation = 0.00720290739479
step clusters  3  and  1 corelation = 0.0005222