# ThoroughBet Simulation


## Load necessary modules

In [1]:
import numpy as np
import pickle

from utils import settings, timestamp, YEAR
from utils.arrayview import ArrayView, TimeseriesView


from prediction.models.preprocessing import Model
from prediction.models.prediction import factornames_trimmed
from prediction.models.parameters import factor_build_end

## Load data

In [2]:
av = ArrayView.from_file(settings.paths.join('brain_final2cut.av.bcolz'))

In [3]:
tsav = {}
sl = 0
while True:
    try:
        tsav[sl] = ArrayView.from_file(settings.paths.join('brain_final2_slice_%s.av.bcolz' % sl))
    except ValueError:
        break
    sl += 1

## Preprocessing

In [4]:
mod = Model(av, oos_start=factor_build_end+YEAR)

In [5]:
high_kurtosis_factors =  ['z64f5be67e', 'z90adc182a', 'z7081bf371', 'z34b808e99', 'z757be272e', 'z5a85cd6a9',
                         'zf991b634a', 'z62651f605', 'zd002b7067', 'z2ef7fedca', 'z6f11029f7', 'z412893062',
                          'z919b9585a', 'z89b0eda37', 'z31780b3f4', 'z6631693d3', 'z0b27f29ad', 'zd7cd94e4c', 
                          'zf5b2aef2a']

In [6]:
price_factors = ['zb392bb74a', 'z6809c316d', 'zd678f0538', 'z027f9f0f5', 'z88e79930c', 'z4a72dc02f',
                 'z1a3573928', 'z7b15df227']

In [7]:
%time factors = mod._preprocess_factors(factornames_trimmed, high_kurtosis_factors = high_kurtosis_factors,\
                                        price_factors = price_factors, verbose=True)

INFO:models:Getting factors from av and rescaling...


. . . . .

INFO:models:Filling in missing values...
INFO:models:Computing each factor as linear combination of all the others...


 . . . . .

INFO:models:Number of missing patterns: 10443


 . . . . . . . . . . 10000


INFO:models:Transforming factors by applying CL-model on their Taylor expansions...


. . . . .CPU times: user 6min 28s, sys: 20.8 s, total: 6min 49s
Wall time: 6min 21s



In [8]:
factors.T.shape

(1742772, 57)

In [9]:
predict_mask = mod.is1|mod.is2|mod.oos
train_mask = mod.is1|mod.is2
train_event_id = av.event_id[train_mask]
predict_event_id = av.event_id[predict_mask]
len(predict_mask), len(train_mask), train_mask.sum(), predict_mask.sum(), len(train_event_id), len(np.unique(train_event_id))

(1742772, 1742772, 125714, 150297, 125714, 12375)

In [10]:
import pandas as pd
pd.set_option('display.max_columns', 60)

In [11]:
col_names = ['f{}'.format(i) for i in range(1,58)]
df = pd.DataFrame(data =factors[:, predict_mask].T , columns = col_names)

In [12]:
df['event_id'] = av.event_id[predict_mask]
df['runner_id'] = av.runner_id[predict_mask]
df['result'] = av.result[predict_mask]
df['is1'] = mod.is1[predict_mask]
df['is2'] = mod.is2[predict_mask]
df['oos'] = mod.oos[predict_mask]
df['time'] =av.start_time[predict_mask]
df['obstacle'] = av.obstacle[predict_mask]
df['going'] = av.going[predict_mask]
df['speed'] = av.speed[predict_mask]
df['distance'] = av.distance[predict_mask]
df['prize'] = av.prize[predict_mask]

df.head()

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18,f19,f20,f21,f22,f23,f24,f25,f26,f27,f28,f29,f30,...,f40,f41,f42,f43,f44,f45,f46,f47,f48,f49,f50,f51,f52,f53,f54,f55,f56,f57,event_id,runner_id,result,is1,is2,oos,time,obstacle,going,speed,distance,prize
0,0.0664,1.812544,0.175912,0.007658,-7e-05,0.380432,0.886927,-0.003214,1.493022,-0.510089,0.174814,0.784029,0.492731,0.021176,1.222551,0.332255,-0.003245,0.083893,0.003949,-0.018036,0.044318,-0.236929,1.061229,-0.056129,-0.000462,-0.000142,-0.065776,7e-06,-0.017906,0.294096,...,2.493133,0.780797,0.111163,0.222586,0.005423,0.023559,0.630792,0.0986,-0.015562,-0.102575,0.537004,-0.010811,0.05647,0.934733,0.419065,-0.003162,-0.003211,0.115685,293661,360456,3,True,True,False,1443704000.0,F,GD-FM,15.735644,1700.784058,3235.0
1,-0.154935,0.615559,-0.099403,-0.003315,0.00063,0.151092,-0.100909,0.005656,0.220873,0.121961,0.057041,0.432281,0.044878,0.01792,0.414254,0.201616,0.005911,0.005786,0.022141,-0.015606,-0.026866,0.276907,-0.188122,0.044831,0.004163,-0.001194,0.11179,-0.004704,-0.032841,0.036493,...,1.649089,0.002394,-1.172824,0.226226,0.030542,-0.036105,0.356432,-0.247407,-0.015704,0.589225,-0.121583,0.097302,-0.166858,0.225441,0.053834,0.005173,0.00562,0.052669,293661,375590,5,True,True,False,1443704000.0,F,GD-FM,15.62203,1700.784058,3235.0
2,0.0664,0.443172,0.064974,0.007658,-7e-05,0.203753,-0.100909,0.011806,-0.030797,-0.497101,0.355908,0.236486,-0.301619,0.027384,0.365929,0.153332,0.011789,0.03592,0.00827,0.013054,-0.014764,0.263956,0.298949,0.041541,-0.000462,0.000254,0.168972,0.001457,0.057498,-0.092061,...,0.942577,0.152497,-0.248882,0.10434,-0.036932,0.084404,0.138569,0.0986,0.079891,-0.147373,-0.18363,-0.010811,0.002193,0.481888,0.037694,0.011866,0.011813,0.04467,293661,374610,7,True,True,False,1443704000.0,F,GD-FM,15.565223,1700.784058,3235.0
3,-0.154935,0.766351,0.157395,-0.025148,-7e-05,0.114861,-0.257157,0.00988,0.299221,0.391816,-0.260937,-0.046712,-0.033713,0.013927,0.551059,-0.300384,0.009861,-0.052231,0.00827,0.011259,-0.004403,-0.122487,-0.35003,0.062382,-0.000462,-0.001858,0.234969,0.00183,-0.017906,-0.127139,...,0.499952,-0.072498,0.68855,-0.007017,0.0003,-0.019645,-0.160828,-0.247407,-0.015562,0.30418,0.289913,-0.010811,0.417492,-0.145241,-0.49702,0.009939,0.009887,-0.263563,293661,373638,1,True,True,False,1443704000.0,F,GD-FM,15.849259,1700.784058,3235.0
4,0.0664,0.330113,-0.122287,0.007658,-7e-05,0.114861,0.503362,-0.025454,1.263503,-0.443943,0.086009,-0.046712,-0.033713,0.027452,1.253726,0.168296,-0.025506,-0.016331,0.00827,0.013054,-0.038637,0.105709,0.180534,-0.041776,-0.000462,0.000211,-0.039124,0.002172,0.100686,0.030163,...,-0.00348,0.045434,0.591243,0.105751,0.069936,-0.044642,-0.160828,0.0986,0.022905,-0.19369,-0.440603,-0.010811,-0.426567,0.135662,0.444311,-0.025412,-0.025456,0.263005,293661,347906,4,True,True,False,1443704000.0,F,GD-FM,15.712922,1700.784058,3235.0


In [13]:
#df.to_csv('/home/oleg/thbmodel/racehorse_data2.csv')

In [13]:
ts_mask = av.start_time >= float(timestamp('2015-08-01'))

for k in tsav.keys():
    tsav[k] = tsav[k][predict_mask [ts_mask]]

- групировка по забегам 
- df_f сгрупированы факторы по забегу c вычислением минимальной разницы между сортированными факторами для значение в забеге
- df1 сгрупированы  переменные которые общие для всех участников звбега 

In [14]:
def mean_max_min(data):
    
    n, m = data.shape
    max_min = np.zeros((n,m))
    index  = data.index
    #print index
    for i,j in enumerate(index):
        #print np.setdiff1d(index, i)
        data_ = data.ix[np.setdiff1d(index, j), :].values
        #print data_[:2]
        max_min[i, :] = (data_.max(axis =0) - data_.min(axis =0))/n
        
    return pd.DataFrame(data = max_min, columns = data.columns)

In [17]:
#df_f = df.ix[:,u'f1':u'f57'].groupby(by = df['event_id']).apply(mean_max_min)
for col in df_f.columns:
    df[str('M_'+col)]= df_f[col].values
pd.options.display.max_columns = 140
df.head(10)

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18,f19,f20,f21,f22,f23,f24,f25,f26,f27,f28,f29,f30,f31,f32,f33,f34,f35,f36,f37,f38,f39,f40,f41,f42,f43,f44,f45,f46,f47,f48,f49,f50,f51,f52,f53,f54,f55,f56,f57,event_id,runner_id,result,is1,is2,oos,time,obstacle,going,speed,distance,prize,M_f1,M_f2,M_f3,M_f4,M_f5,M_f6,M_f7,M_f8,M_f9,M_f10,M_f11,M_f12,M_f13,M_f14,M_f15,M_f16,M_f17,M_f18,M_f19,M_f20,M_f21,M_f22,M_f23,M_f24,M_f25,M_f26,M_f27,M_f28,M_f29,M_f30,M_f31,M_f32,M_f33,M_f34,M_f35,M_f36,M_f37,M_f38,M_f39,M_f40,M_f41,M_f42,M_f43,M_f44,M_f45,M_f46,M_f47,M_f48,M_f49,M_f50,M_f51,M_f52,M_f53,M_f54,M_f55,M_f56,M_f57
0,0.0664,1.812544,0.175912,0.007658,-7e-05,0.380432,0.886927,-0.003214,1.493022,-0.510089,0.174814,0.784029,0.492731,0.021176,1.222551,0.332255,-0.003245,0.083893,0.003949,-0.018036,0.044318,-0.236929,1.061229,-0.056129,-0.000462,-0.000142,-0.065776,7e-06,-0.017906,0.294096,-0.485075,-0.084142,-0.112596,-0.002112,-0.003081,-0.167638,0.074538,0.967736,-0.003212,2.493133,0.780797,0.111163,0.222586,0.005423,0.023559,0.630792,0.0986,-0.015562,-0.102575,0.537004,-0.010811,0.05647,0.934733,0.419065,-0.003162,-0.003211,0.115685,293661,360456,3,True,True,False,1443704000.0,F,GD-FM,15.735644,1700.784058,3235.0,0.022134,0.260604,0.043252,0.003281,7e-05,0.155211,0.129012,0.003726,0.424278,0.210182,0.097216,0.121474,0.03465,0.00906,0.311063,0.051448,0.00373,0.008922,0.006,0.002866,0.011607,0.069193,0.110954,0.013933,0.000463,0.00032,0.059297,0.000698,0.013353,0.018471,0.079228,0.080813,0.063538,0.002113,0.003758,0.116451,0.036222,0.21249,0.003726,0.413493,0.099755,0.186137,0.107289,0.011924,0.012905,0.051726,0.037202,0.009559,0.078291,0.089647,0.010811,0.08531,0.137641,0.098123,0.003728,0.003727,0.071853
1,-0.154935,0.615559,-0.099403,-0.003315,0.00063,0.151092,-0.100909,0.005656,0.220873,0.121961,0.057041,0.432281,0.044878,0.01792,0.414254,0.201616,0.005911,0.005786,0.022141,-0.015606,-0.026866,0.276907,-0.188122,0.044831,0.004163,-0.001194,0.11179,-0.004704,-0.032841,0.036493,-0.033482,0.15765,0.033526,0.019013,0.00439,0.030125,-0.090091,0.255435,0.005635,1.649089,0.002394,-1.172824,0.226226,0.030542,-0.036105,0.356432,-0.247407,-0.015704,0.589225,-0.121583,0.097302,-0.166858,0.225441,0.053834,0.005173,0.00562,0.052669,293661,375590,5,True,True,False,1443704000.0,F,GD-FM,15.62203,1700.784058,3235.0,0.022134,0.365223,0.043252,0.003281,0.0,0.155211,0.152782,0.003726,0.44723,0.210182,0.097216,0.156649,0.079435,0.00906,0.311063,0.064512,0.00373,0.013719,0.004612,0.003109,0.01163,0.067898,0.187072,0.013933,0.0,0.00032,0.059297,0.000656,0.011859,0.042123,0.100929,0.080813,0.063538,0.0,0.003758,0.116451,0.036222,0.21249,0.003726,0.497897,0.146564,0.103327,0.107289,0.011924,0.012905,0.079162,0.037202,0.009545,0.049787,0.097761,0.0,0.08531,0.182926,0.098123,0.003728,0.003727,0.071853
2,0.0664,0.443172,0.064974,0.007658,-7e-05,0.203753,-0.100909,0.011806,-0.030797,-0.497101,0.355908,0.236486,-0.301619,0.027384,0.365929,0.153332,0.011789,0.03592,0.00827,0.013054,-0.014764,0.263956,0.298949,0.041541,-0.000462,0.000254,0.168972,0.001457,0.057498,-0.092061,-0.268066,-0.060863,0.000601,-0.002112,0.012036,0.175475,-0.025941,0.156538,0.011807,0.942577,0.152497,-0.248882,0.10434,-0.036932,0.084404,0.138569,0.0986,0.079891,-0.147373,-0.18363,-0.010811,0.002193,0.481888,0.037694,0.011866,0.011813,0.04467,293661,374610,7,True,True,False,1443704000.0,F,GD-FM,15.565223,1700.784058,3235.0,0.022134,0.365223,0.043252,0.003281,7e-05,0.155211,0.152782,0.003533,0.44723,0.210182,0.092662,0.156649,0.052644,0.00906,0.311063,0.064512,0.003537,0.013719,0.006,0.003109,0.01163,0.069193,0.187072,0.013933,0.000463,0.00032,0.059297,0.000698,0.013353,0.042123,0.100929,0.080813,0.063538,0.002113,0.003565,0.116451,0.036222,0.21249,0.003533,0.497897,0.146564,0.186137,0.107289,0.011924,0.008795,0.079162,0.037202,0.003861,0.078291,0.097761,0.010811,0.08531,0.182926,0.098123,0.003535,0.003534,0.071853
3,-0.154935,0.766351,0.157395,-0.025148,-7e-05,0.114861,-0.257157,0.00988,0.299221,0.391816,-0.260937,-0.046712,-0.033713,0.013927,0.551059,-0.300384,0.009861,-0.052231,0.00827,0.011259,-0.004403,-0.122487,-0.35003,0.062382,-0.000462,-0.001858,0.234969,0.00183,-0.017906,-0.127139,0.246585,0.475844,0.083323,-0.002112,0.0101,0.194835,0.175495,0.448261,0.009881,0.499952,-0.072498,0.68855,-0.007017,0.0003,-0.019645,-0.160828,-0.247407,-0.015562,0.30418,0.289913,-0.010811,0.417492,-0.145241,-0.49702,0.009939,0.009887,-0.263563,293661,373638,1,True,True,False,1443704000.0,F,GD-FM,15.849259,1700.784058,3235.0,0.022134,0.365223,0.043252,0.003281,7e-05,0.155211,0.152782,0.003726,0.44723,0.210182,0.097216,0.156649,0.079435,0.00906,0.311063,0.064512,0.00373,0.013719,0.006,0.003109,0.01163,0.069193,0.187072,0.013614,0.000463,0.000254,0.059297,0.000698,0.013353,0.038616,0.100929,0.048994,0.063538,0.002113,0.003758,0.116451,0.031307,0.21249,0.003726,0.497897,0.146564,0.176407,0.107289,0.011924,0.012905,0.079162,0.037202,0.009559,0.078291,0.097761,0.010811,0.08531,0.182926,0.093059,0.003728,0.003727,0.071853
4,0.0664,0.330113,-0.122287,0.007658,-7e-05,0.114861,0.503362,-0.025454,1.263503,-0.443943,0.086009,-0.046712,-0.033713,0.027452,1.253726,0.168296,-0.025506,-0.016331,0.00827,0.013054,-0.038637,0.105709,0.180534,-0.041776,-0.000462,0.000211,-0.039124,0.002172,0.100686,0.030163,-0.046281,-0.332286,-0.009773,-0.002112,-0.025545,-0.128658,-0.186721,1.081958,-0.02545,-0.00348,0.045434,0.591243,0.105751,0.069936,-0.044642,-0.160828,0.0986,0.022905,-0.19369,-0.440603,-0.010811,-0.426567,0.135662,0.444311,-0.025412,-0.025456,0.263005,293661,347906,4,True,True,False,1443704000.0,F,GD-FM,15.712922,1700.784058,3235.0,0.022134,0.365223,0.043252,0.003281,7e-05,0.155211,0.152782,0.00235,0.44723,0.210182,0.097216,0.156649,0.079435,0.009053,0.307945,0.064512,0.002352,0.013719,0.006,0.003109,0.01163,0.069193,0.187072,0.013933,0.000463,0.00032,0.059297,0.000698,0.009034,0.042123,0.100929,0.065159,0.063538,0.002113,0.002367,0.116451,0.035842,0.201068,0.00235,0.497897,0.146564,0.186137,0.107289,0.008614,0.012656,0.079162,0.037202,0.009559,0.0776,0.095261,0.010811,0.08483,0.182926,0.098123,0.002351,0.002351,0.071853
5,0.0664,-0.189406,0.181539,0.007658,-7e-05,0.398373,-0.195555,0.008028,0.392581,0.877782,-0.239266,0.137463,-0.033713,0.002931,-0.148441,-0.064264,0.008008,0.02498,-0.037855,-0.008478,0.034166,0.015319,0.139157,-0.076951,-0.000462,0.000496,0.011734,0.000684,-0.017906,-0.092061,0.524217,-0.025874,0.114601,-0.002112,0.008238,0.316055,0.108989,-0.41966,0.008029,-0.334927,0.122812,-0.344724,-0.00305,-0.039968,-0.042161,-0.160828,0.0986,-0.015562,-0.186771,0.455867,-0.010811,0.426535,0.171052,-0.422339,0.008086,0.008034,-0.214755,293661,372674,8,True,True,False,1443704000.0,F,GD-FM,15.383439,1700.784058,3235.0,0.022134,0.365223,0.042689,0.003281,7e-05,0.155211,0.152782,0.003726,0.44723,0.210182,0.097216,0.156649,0.079435,0.00906,0.311063,0.064512,0.00373,0.013719,0.006,0.003109,0.01163,0.069193,0.187072,0.012934,0.000463,0.00032,0.059297,0.000698,0.013353,0.042123,0.073166,0.080813,0.063538,0.002113,0.003758,0.116451,0.036222,0.21249,0.003726,0.497897,0.146564,0.186137,0.107289,0.011924,0.012905,0.079162,0.037202,0.009559,0.078291,0.097761,0.010811,0.084406,0.182926,0.098123,0.003728,0.003727,0.071853
6,0.0664,0.330113,-0.09906,-0.025148,-7e-05,-0.276169,0.64922,-0.003725,0.844906,0.99682,0.23502,-0.286916,-0.033713,-0.057044,0.516816,0.117115,-0.003757,-0.011905,0.00827,0.000986,0.044089,0.159488,0.300057,-0.066961,-0.000462,0.001343,0.27933,0.002279,-0.017906,-0.000464,0.095582,-0.03174,0.349615,-0.002112,-0.003596,0.491909,0.099083,0.385784,-0.003722,-0.495811,0.312704,0.304515,0.463153,0.036844,-0.012634,-0.160828,0.124609,-0.015562,-0.185965,0.305653,-0.010811,0.219149,-0.160715,0.484206,-0.003673,-0.003722,0.430075,293661,365528,2,True,True,False,1443704000.0,F,GD-FM,15.792453,1700.784058,3235.0,0.022134,0.365223,0.043252,0.003281,7e-05,0.155211,0.152782,0.003726,0.44723,0.198278,0.097216,0.156649,0.079435,0.00906,0.311063,0.064512,0.00373,0.013719,0.006,0.003109,0.01163,0.069193,0.187072,0.013933,0.000463,0.000319,0.054861,0.000688,0.013353,0.042123,0.100929,0.080813,0.040036,0.002113,0.003758,0.098866,0.036222,0.21249,0.003726,0.497897,0.146564,0.186137,0.083596,0.011924,0.012905,0.079162,0.037202,0.009559,0.078291,0.097761,0.010811,0.08531,0.182926,0.094133,0.003728,0.003727,0.055146
7,-0.154935,-0.786903,0.09375,0.007658,-7e-05,-0.692767,-0.524122,0.008948,-0.847322,-0.138604,-0.102708,-0.046712,-0.033713,0.006467,-1.048457,-0.162347,0.008928,-0.03781,0.00827,0.011259,-0.07198,-0.415025,-0.809487,0.051586,-0.000462,0.000608,-0.194061,-0.000486,-0.017906,-0.014544,0.001124,0.120679,-0.224611,-0.002112,0.009163,-0.672603,0.126353,-0.823042,0.008949,-1.132346,-0.447431,0.111163,-0.495238,-0.013308,-0.019645,-0.160828,-0.247407,-0.015562,-0.136263,-0.415604,-0.010811,0.179266,-0.544099,-0.116849,0.009006,0.008954,-0.021121,293661,373315,6,True,True,False,1443704000.0,F,GD-FM,15.593626,1700.784058,3235.0,0.022134,0.365223,0.043252,0.003281,7e-05,0.155211,0.152782,0.003726,0.44723,0.210182,0.097216,0.156649,0.079435,0.00906,0.311063,0.064512,0.00373,0.013719,0.006,0.003109,0.008295,0.051384,0.144881,0.013933,0.000463,0.00032,0.059297,0.000698,0.013353,0.042123,0.100929,0.080813,0.063538,0.002113,0.003758,0.065955,0.036222,0.21249,0.003726,0.497897,0.146564,0.186137,0.107289,0.011924,0.012905,0.079162,0.037202,0.009559,0.078291,0.097761,0.010811,0.08531,0.182926,0.098123,0.003728,0.003727,0.071853
8,0.0664,-1.481857,-0.250981,0.007658,-7e-05,0.578838,-0.640895,-0.000228,-0.656712,0.306359,0.310369,-0.380744,-0.033713,0.002931,-1.270533,-0.312864,-0.000256,0.020995,-0.037855,-0.008478,-0.009005,0.058171,-0.38758,-0.017711,-0.000462,0.001336,-0.313641,-0.004279,-0.017906,-0.092061,-0.062077,-0.175743,-0.285762,-0.002112,-7.1e-05,-0.083055,-0.098779,-1.010068,-0.000226,-1.132346,-0.211861,0.304515,-0.007017,-0.003537,0.043307,-0.160828,0.124609,-0.015562,-0.176926,-0.289736,-0.010811,-0.285919,-0.204194,0.043477,-0.000174,-0.000224,-0.288457,293661,366092,10,True,True,False,1443704000.0,F,GD-FM,15.065318,1700.784058,3235.0,0.022134,0.365223,0.030383,0.003281,7e-05,0.137165,0.141105,0.003726,0.44723,0.210182,0.097216,0.156649,0.079435,0.00906,0.311063,0.063264,0.00373,0.013719,0.006,0.003109,0.01163,0.069193,0.187072,0.013933,0.000463,0.00032,0.047353,0.000698,0.013353,0.042123,0.100929,0.080813,0.057423,0.002113,0.003758,0.116451,0.036222,0.21249,0.003726,0.497897,0.146564,0.186137,0.107289,0.011924,0.012905,0.079162,0.037202,0.009559,0.078291,0.097761,0.010811,0.08531,0.182926,0.098123,0.003728,0.003727,0.069364
9,0.0664,-1.839689,-0.101836,0.007658,-7e-05,-0.973273,-0.219964,-0.011695,-2.979274,-1.105002,-0.616251,-0.782461,-0.033713,-0.063144,-1.856904,-0.132756,-0.011734,-0.053298,0.00827,0.000986,0.043083,-0.105108,-0.244704,0.059187,-0.000462,-0.001052,-0.194195,0.00104,-0.017906,0.057576,0.027475,-0.043527,0.051078,-0.002112,-0.011636,-0.156445,-0.182925,-1.042942,-0.011692,-2.485839,-0.684848,-0.344724,-0.609735,-0.049299,0.023559,-0.160828,0.0986,0.006278,0.23616,-0.137282,-0.010811,-0.421763,-0.894524,-0.446379,-0.011647,-0.011694,-0.118209,293661,359427,9,True,True,False,1443704000.0,F,GD-FM,15.292547,1700.784058,3235.0,0.022134,0.32944,0.043252,0.003281,7e-05,0.127161,0.152782,0.003726,0.234034,0.150691,0.061685,0.116477,0.079435,0.00845,0.252426,0.064512,0.00373,0.013612,0.006,0.003109,0.01163,0.069193,0.187072,0.013933,0.000463,0.00032,0.059297,0.000698,0.013353,0.042123,0.100929,0.080813,0.063538,0.002113,0.003758,0.116451,0.036222,0.209203,0.003726,0.362548,0.122823,0.186137,0.095839,0.01099,0.012905,0.079162,0.037202,0.009559,0.078291,0.097761,0.010811,0.08531,0.147883,0.098123,0.003728,0.003727,0.071853


In [18]:
df_f.shape, df.shape

((150297, 57), (150297, 126))

In [19]:
from prediction.models import clmodel
from prediction.tools.helpers import strata_scale_down

is1 = mod.is1.copy()
is2 = mod.is2.copy()
oos = mod.oos.copy()
strata = strata_scale_down(av.event_id)

In [20]:
df_f.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18,f19,f20,f21,f22,f23,f24,f25,f26,f27,f28,f29,f30,f31,f32,f33,f34,f35,f36,f37,f38,f39,f40,f41,f42,f43,f44,f45,f46,f47,f48,f49,f50,f51,f52,f53,f54,f55,f56,f57
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1
293661,0,0.022134,0.260604,0.043252,0.003281,7e-05,0.155211,0.129012,0.003726,0.424278,0.210182,0.097216,0.121474,0.03465,0.00906,0.311063,0.051448,0.00373,0.008922,0.006,0.002866,0.011607,0.069193,0.110954,0.013933,0.000463,0.00032,0.059297,0.000698,0.013353,0.018471,0.079228,0.080813,0.063538,0.002113,0.003758,0.116451,0.036222,0.21249,0.003726,0.413493,0.099755,0.186137,0.107289,0.011924,0.012905,0.051726,0.037202,0.009559,0.078291,0.089647,0.010811,0.08531,0.137641,0.098123,0.003728,0.003727,0.071853
293661,1,0.022134,0.365223,0.043252,0.003281,0.0,0.155211,0.152782,0.003726,0.44723,0.210182,0.097216,0.156649,0.079435,0.00906,0.311063,0.064512,0.00373,0.013719,0.004612,0.003109,0.01163,0.067898,0.187072,0.013933,0.0,0.00032,0.059297,0.000656,0.011859,0.042123,0.100929,0.080813,0.063538,0.0,0.003758,0.116451,0.036222,0.21249,0.003726,0.497897,0.146564,0.103327,0.107289,0.011924,0.012905,0.079162,0.037202,0.009545,0.049787,0.097761,0.0,0.08531,0.182926,0.098123,0.003728,0.003727,0.071853
293661,2,0.022134,0.365223,0.043252,0.003281,7e-05,0.155211,0.152782,0.003533,0.44723,0.210182,0.092662,0.156649,0.052644,0.00906,0.311063,0.064512,0.003537,0.013719,0.006,0.003109,0.01163,0.069193,0.187072,0.013933,0.000463,0.00032,0.059297,0.000698,0.013353,0.042123,0.100929,0.080813,0.063538,0.002113,0.003565,0.116451,0.036222,0.21249,0.003533,0.497897,0.146564,0.186137,0.107289,0.011924,0.008795,0.079162,0.037202,0.003861,0.078291,0.097761,0.010811,0.08531,0.182926,0.098123,0.003535,0.003534,0.071853
293661,3,0.022134,0.365223,0.043252,0.003281,7e-05,0.155211,0.152782,0.003726,0.44723,0.210182,0.097216,0.156649,0.079435,0.00906,0.311063,0.064512,0.00373,0.013719,0.006,0.003109,0.01163,0.069193,0.187072,0.013614,0.000463,0.000254,0.059297,0.000698,0.013353,0.038616,0.100929,0.048994,0.063538,0.002113,0.003758,0.116451,0.031307,0.21249,0.003726,0.497897,0.146564,0.176407,0.107289,0.011924,0.012905,0.079162,0.037202,0.009559,0.078291,0.097761,0.010811,0.08531,0.182926,0.093059,0.003728,0.003727,0.071853
293661,4,0.022134,0.365223,0.043252,0.003281,7e-05,0.155211,0.152782,0.00235,0.44723,0.210182,0.097216,0.156649,0.079435,0.009053,0.307945,0.064512,0.002352,0.013719,0.006,0.003109,0.01163,0.069193,0.187072,0.013933,0.000463,0.00032,0.059297,0.000698,0.009034,0.042123,0.100929,0.065159,0.063538,0.002113,0.002367,0.116451,0.035842,0.201068,0.00235,0.497897,0.146564,0.186137,0.107289,0.008614,0.012656,0.079162,0.037202,0.009559,0.0776,0.095261,0.010811,0.08483,0.182926,0.098123,0.002351,0.002351,0.071853


In [21]:
col_names = ['f%s'%i for i in range(1,58)] + ['M_f%s'%i for i in range(1,58)]

predict_mask.shape[0], df_f.shape

(1742772, (150297, 57))

In [22]:
factors_new = np.zeros((df_f.shape[1], predict_mask.shape[0]))
X = df_f.values
j=0
for i,flag in enumerate(predict_mask):
    if flag:
        factors_new[:,i] = X[j,:]
        j +=1
factors_new.shape, factors.shape

((57, 1742772), (57, 1742772))

In [23]:
factors_new = np.vstack((factors, factors_new))

In [24]:
factors_new.shape

(114, 1742772)

In [25]:
%%time
model_coefs, model_step1prob, model_step2prob, model_likelihood \
= mod.fit_slices(tsav, factors_new, depth=3, lmbd=10, verbose=False, fit_afresh=True)

print "LL new  model"
print model_likelihood
print 
mod.is1 = is1
mod.is2 = is1 
mod.oos = oos

old_coefs, old_step1prob, old_step2prob, old_likelihood \
= mod.fit_slices(tsav, factors, depth=3, lmbd=10, verbose=False, fit_afresh=True)
print "LL old model"
print old_likelihood

. . . . . . . . . . 10
LL new cluster model
[[-1851.31115584 -1995.16006953 -1995.16006953]
 [-1854.66326777 -1991.14291317 -1991.14291317]
 [-1858.29262186 -1988.69182241 -1988.69182241]
 [-1859.38004196 -1984.83079958 -1984.83079958]
 [-1872.69317589 -1972.07729947 -1972.07729947]
 [-1887.93390404 -1962.28002011 -1962.28002011]
 [-1915.06501813 -1947.95300353 -1947.95300353]
 [-1925.7402651  -1939.73386518 -1939.73386518]
 [-1926.96833766 -1939.58826054 -1939.58826054]
 [-1932.65006964 -1934.46768841 -1934.46768841]
 [    0.             0.             0.        ]]

. . . . . . . . . . 10
LL old model
[[-1851.82404552 -1995.80190523 -1995.80190523]
 [-1855.17692183 -1991.83205299 -1991.83205299]
 [-1858.78848725 -1989.37872712 -1989.37872712]
 [-1859.88547983 -1985.49613881 -1985.49613881]
 [-1873.26577532 -1972.64086846 -1972.64086846]
 [-1888.55464921 -1962.77217781 -1962.77217781]
 [-1915.6768923  -1948.29012377 -1948.29012377]
 [-1926.34347024 -1939.98590585 -1939.98590585]
 [-192

##### write the simdata file

In [42]:
def dic_to_tenzor(dic, key, base):
    '''
    tenzor where first dimention is the number of cluster
    0 = no_cluster
    <dic> dictionary cluster's data that to convert in tenzor
    <key> the list of clusters that use 
    <base> no cluster data
    '''
    
    key_0 = dic.keys()[0]
    tenzor = np.zeros((len(key)+1, dic[key_0].shape[0], dic[key_0].shape[1]))
    try:
        tenzor[0,:,:] = base
    except:
        print 'base and dic[k] have the diferent size'
        return
    for i,k in enumerate(key):
        tenzor[i+1,:,:] = dic[k]
    return tenzor

In [43]:
def clusters_number(data, key, av=av):
    """ 
    list with numbers of clusters 
    <data>  pandas Series index = event_id, data = cluster's names
    <key> the list of clusters that use
    """
    
    cl_number = np.zeros((len(av.event_id)))
    for i,k in enumerate(key):
        mask = np.in1d(av.event_id,data.index[data ==k])
        cl_number = np.where(mask,i+1,cl_number)
    return cl_number

In [44]:
def write_simdata(step1probs, oos, coefs, cluster_number=None, file_ = 'simdata.p'):
    '''
    <step1probs> is expected to be a matrix N_slices x len(av). 
    <oos> is a boolean mask denoting the out of sample range. len(oos) shoud equal len(av)
    <coefs> is a coefficient matrix with the size N_slices x 3
    <cluster_number> is an integer array with the cluster numbers per race. Size: len(av)
    '''
    f = file(settings.paths.join(file_), 'wb')
    if cluster_number is None:
        s1p = step1probs[:, oos]
    else:
        cluster_number = cluster_number[oos]
        s1p = step1probs[:, :, oos]
    pickle.dump([s1p, oos, coefs, cluster_number], f)
    f.close()

In [45]:
def write_dic_to_simdata(file_name, old_step1probs, old_coefs, oos, data=None, av =av,
                         cluster_step1probs =None, cluster_coefs =None, cluster_names =None):
    """
    <file_name> is name of file to record
    <old_step1probs> is expected to be a matrix N_slices x len(av)
    <old_coefs> is a coefficient matrix with the size N_slices x 3
    <oos> is a boolean mask denoting the out of sample range. len(oos) shoud equal len(av)
    <data>  pandas Series index = event_id, data = cluster's names
    <cluster_step1probs> is expected to be a dictionary: key is the cluster name and
                        data are the matrix N_slices x len(av) for each cluster
    <cluster_coefs> is a dictionary : key is the cluster name and data and 
                        data are the coefficient matrix with the size N_slices x 3
    <cluster_number> is an integer array with the cluster numbers per race. Size: len(av)
    """
    
    cl_number= np.zeros((len(av.event_id)))
    
    if cluster_names is not None:
        
        s1prob = dic_to_tenzor(cluster_step1probs, cluster_names, old_step1probs)
        coef_s = dic_to_tenzor(cluster_coefs, cluster_names, old_coefs)
        
        for i,k in enumerate(cluster_names):
            mask = np.in1d(av.event_id,data.index[data ==k])
            cl_number = np.where(mask, i+1, cl_number)
        #cl_number = clusters_number(data, cluster_names, av=av)
          
        
    else:
        
        s1prob = np.zeros((1,old_step1probs.shape[0], old_step1probs.shape[1]))
        s1prob[0,:,:] = old_step1probs
        coef_s = np.zeros((1,old_coefs.shape[0], old_coefs.shape[1]))
        coef_s[0,:,:] = old_coefs
        
        

    #write_simdata(s1prob, oos, coef_s, cl_number, file_ = file_name)
    s1prob = s1prob[:,:,oos]
    cl_number = cl_number[oos]
    
    with open (settings.paths.join(file_name), 'wb') as f:
            pickle.dump( [s1prob, oos, coef_s, cl_number], f)
    return