# ThoroughBet Simulation


## Load necessary modules

In [1]:
import numpy as np
import pickle

from utils import settings, timestamp, YEAR
from utils.arrayview import ArrayView, TimeseriesView


from prediction.models.preprocessing import Model
from prediction.models.prediction import factornames_trimmed
from prediction.models.parameters import factor_build_end

## Load data

In [2]:
av = ArrayView.from_file(settings.paths.join('brain_final2cut.av.bcolz'))

In [3]:
tsav = {}
sl = 0
while True:
    try:
        tsav[sl] = ArrayView.from_file(settings.paths.join('brain_final2_slice_%s.av.bcolz' % sl))
    except ValueError:
        break
    sl += 1

## Preprocessing

In [4]:
mod = Model(av, oos_start=factor_build_end+YEAR)

In [5]:
high_kurtosis_factors =  ['z64f5be67e', 'z90adc182a', 'z7081bf371', 'z34b808e99', 'z757be272e', 'z5a85cd6a9',
                         'zf991b634a', 'z62651f605', 'zd002b7067', 'z2ef7fedca', 'z6f11029f7', 'z412893062',
                          'z919b9585a', 'z89b0eda37', 'z31780b3f4', 'z6631693d3', 'z0b27f29ad', 'zd7cd94e4c', 
                          'zf5b2aef2a']

price_factors = ['zb392bb74a', 'z6809c316d', 'zd678f0538', 'z027f9f0f5', 'z88e79930c', 'z4a72dc02f',
                 'z1a3573928', 'z7b15df227']

In [7]:
%time factors = mod._preprocess_factors(factornames_trimmed, high_kurtosis_factors = high_kurtosis_factors,\
                                        price_factors = price_factors, verbose=True)

INFO:models:Getting factors from av and rescaling...


. . . . .

INFO:models:Filling in missing values...
INFO:models:Computing each factor as linear combination of all the others...


 . . . . .

INFO:models:Number of missing patterns: 7754


 . . . . . . .

INFO:models:Transforming factors by applying CL-model on their Taylor expansions...


 . . . . .CPU times: user 5min 39s, sys: 18.6 s, total: 5min 58s
Wall time: 5min 6s



In [8]:
factors.T.shape

(1631851, 57)

In [9]:
predict_mask = mod.is1|mod.is2|mod.oos

(1631851, 1631851, 125714, 181989, 125714, 12375)

In [10]:
import pandas as pd
pd.set_option('display.max_columns', 60)

In [11]:
col_names = ['f{}'.format(i) for i in range(1,58)]
df = pd.DataFrame(data =factors[:, predict_mask].T , columns = col_names)

In [12]:
df['event_id'] = av.event_id[predict_mask]
df['runner_id'] = av.runner_id[predict_mask]
df['result'] = av.result[predict_mask]
df['is1'] = mod.is1[predict_mask]
df['is2'] = mod.is2[predict_mask]
df['oos'] = mod.oos[predict_mask]
df['time'] =av.start_time[predict_mask]
df['obstacle'] = av.obstacle[predict_mask]
df['going'] = av.going[predict_mask]
df['speed'] = av.speed[predict_mask]
df['distance'] = av.distance[predict_mask]
df['prize'] = av.prize[predict_mask]

df.head()

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18,f19,f20,f21,f22,f23,f24,f25,f26,f27,f28,f29,f30,...,f40,f41,f42,f43,f44,f45,f46,f47,f48,f49,f50,f51,f52,f53,f54,f55,f56,f57,event_id,runner_id,result,is1,is2,oos,time,obstacle,going,speed,distance,prize
0,0.066411,1.812371,0.140033,0.007517,-0.000144,0.370123,0.8869,-0.000888,1.492165,-0.504111,0.174814,0.783295,0.47842,0.02652,1.220996,0.317993,-0.000908,-0.000375,0.004848,-0.017851,0.043987,-0.237318,1.049784,-0.046792,-0.006285,0.000628,-0.065017,0.001235,-0.018441,0.274973,...,2.49246,0.753668,-0.000403,0.234473,0.250251,0.023256,0.628183,0.0986,-0.014886,-0.089427,0.53691,-0.012463,-0.143313,0.943943,0.417261,-0.000921,-0.00078,0.1193,293661,360456,3,True,True,False,1443704000.0,F,GD-FM,15.735644,1700.784058,3235.0
1,-0.154959,0.615217,-0.069783,-0.001667,0.001293,0.163312,-0.100933,-0.016348,0.145181,0.085654,0.057041,0.43098,0.132121,0.013572,0.413222,0.189882,-0.016198,-0.002962,0.014569,-0.015469,-0.026517,0.277365,-0.374422,0.029834,0.056561,-0.000221,0.116112,-0.002068,-0.024272,0.090061,...,1.648349,0.073634,-0.650115,0.177804,-0.052191,-0.034483,0.370725,-0.247407,-0.023127,0.550295,-0.121513,0.112166,0.040612,0.224962,0.057615,-0.016085,-0.01733,0.026977,293661,375590,5,True,True,False,1443704000.0,F,GD-FM,15.62203,1700.784058,3235.0
2,0.066411,0.442863,0.069257,0.007517,-0.000144,0.207937,-0.100933,0.014814,-0.010327,-0.566212,0.355908,0.246179,-0.305634,0.031401,0.364949,0.144203,0.014812,0.017871,0.009206,0.01274,-0.01444,0.264392,0.339009,0.039109,-0.006285,0.000204,0.173482,0.001554,0.055569,-0.092425,...,0.941642,0.146075,-0.109316,0.106961,0.005249,0.083641,0.144267,0.0986,0.08141,-0.152771,-0.183572,-0.012463,0.159303,0.48867,0.036059,0.0148,0.014925,0.051179,293661,374610,7,True,True,False,1443704000.0,F,GD-FM,15.565223,1700.784058,3235.0
3,-0.154959,0.765992,0.129201,-0.02548,-0.000144,0.112316,-0.257098,0.012801,0.304235,0.226926,-0.260937,-0.048408,-0.043558,0.019547,0.54989,-0.284258,0.012796,0.00347,0.009206,0.011642,-0.004117,-0.122674,-0.421721,0.048495,-0.006285,-3.1e-05,0.239139,0.001421,-0.018441,-0.13433,...,0.498932,-0.041847,0.597555,0.001817,-0.037324,-0.019826,-0.163311,-0.247407,-0.014886,0.461782,0.289931,-0.012463,0.088135,-0.126984,-0.495465,0.012784,0.012911,-0.262583,293661,373638,1,True,True,False,1443704000.0,F,GD-FM,15.849259,1700.784058,3235.0
4,0.066411,0.329832,-0.093768,0.007517,-0.000144,0.112316,0.503109,-0.024129,1.263628,-0.359817,0.086009,-0.048408,-0.043558,0.03145,1.252166,0.158341,-0.024174,-0.019263,0.009206,0.01274,-0.03828,0.10589,0.209527,-0.033071,-0.006285,-9.9e-05,-0.037533,0.000975,0.097792,0.022945,...,-0.004469,0.049962,0.495737,0.116301,-0.110655,-0.044347,-0.163311,0.0986,0.0239,-0.237007,-0.440662,-0.012463,-0.125458,0.136239,0.442488,-0.024187,-0.024024,0.267151,293661,347906,4,True,True,False,1443704000.0,F,GD-FM,15.712922,1700.784058,3235.0


In [13]:
df.to_csv('/home/oleg/thbmodel/racehorse_data2.csv')

- групировка по забегам 
- df_f сгрупированы факторы по забегу c вычислением минимальной разницы между сортированными факторами для значение в забеге
- df1 сгрупированы  переменные которые общие для всех участников звбега 

In [14]:
def df_cut(data, result):
    """
    only event_id where there are result-1 and result
    """
    
    event = np.intersect1d(data[data.result == result-1]['event_id'], data[data.result == result]['event_id'])
    mask = np.in1d(data['event_id'],event)
    return data[mask]

In [15]:
def ranking_data(df, list_result =[2, 3, 4], 
                columns = ['f{}'.format(i) for i in range(1,58)] +['result','event_id']):
    
    df_event = pd.Series()
    
    X_data = np.zeros((0,len(columns)-1))
    for n in list_result:
        print ''
        df_r = df_cut(df, n) # only data where are first and second place
        first , second = n-1, n # place 1, 2 
        # data where first place
        X1 = df_r[columns][df_r['result'] == first].drop_duplicates(subset ='event_id').values
        # events which use in data
        df_event = df_event.append(df_r['event_id'][df_r['result'] == first])
        #print df_event.shape
        # data where second place
        X2 = df_r[columns][df_r['result'] == second].drop_duplicates(subset ='event_id').values
        # differance between first and second place mix of the order  from "first -second" or "second -first"
        X = (X1 -X2)* np.random.choice([1,-1], len(X1)).reshape(-1,1) 
        X_data = np.vstack((X_data, X[:, :-1])) # union of set of deta matrix
        #print X.shape, X_data.shape
        mask_class = np.in1d(X_data[:,-1],[-1.,1.]) # only two class "-1, 1"
    return X_data[mask_class], df_event

In [16]:
def softmax(x):
    return np.exp(x)/np.sum(np.exp(x))

In [53]:
def clf_llhood (data, clf, name, result =[2,3,4], 
               columns = ['f{}'.format(i) for i in range(1,58)] +['result','event_id']):
    
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 
    #print np.sum(data.is1), result
    X_y_data, _ = ranking_data(data[data.is1], result, columns)
    X_train, y_train = X_y_data[:,:-1], X_y_data[:,-1]
    
    clf.fit(X_train, y_train)
    print clf
    X_y_data, _ = ranking_data(data[data.oos], result, columns)
    X_test, y_test = X_y_data[:,:-1], X_y_data[:,-1]
    
    y_pred = clf.predict(X_test)
    
    print 'train score  ', logit.score(X_train, y_train)
    print 'test accuracy  %s'%name, accuracy_score(y_test, y_pred)
    print 'test precision %s'%name, precision_score(y_test, y_pred)#, labels = [-1], average ='micro')
    print 'test recall %s'%name, recall_score(y_test, y_pred)#, labels = [-1], average ='micro')
    print 'test f1_score %s'%name, f1_score(y_test, y_pred)#, labels = [-1], average ='micro')
    
    print
    factors = columns[:-2]
    try:
        data['S_%s'%name] = clf.decision_function(data[factors].values)
        data['p_%s'%name] = data['S_%s'%name].groupby(by = data.event_id).apply(softmax)
    except:
        print "not decision_function"
        data['p_%s'%name] = 0.0
    
    data['log_p_%s'%name] = np.log(data['p_%s'%name])
    print 'is1  mean ll %s'%name, data.ix[data.is1 & (data.result ==1),'log_p_%s'%name].mean()*1000
    print 
    print 'oos  mean ll %s'%name, data.ix[data.oos & (data.result ==1),'log_p_%s'%name].mean()*1000
    return data, clf

##### построим алгоритм попарного ранжирования  на LogisticRegression¶

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
logit = LogisticRegression (C =1., penalty ='l2', n_jobs =-1)

In [47]:
df, logit = clf_llhood(df, logit,'LR',[2,4]) 



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


train score   0.570335515548
test accuracy  LR 0.578908211662
test precision LR 0.580702063856
test recall LR 0.604813521955
test f1_score LR 0.592512598992

is1  mean ll LR -2596.72266715

oos  mean ll LR -2660.4926531


In [55]:
from sklearn.svm import SVC

svm = SVC(kernel ='poly')
df, svm = clf_llhood(df, svm,'svm',[2])


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

train score   0.57995959596
test accuracy  svm 0.582021237642
test precision svm 0.618216139689
test recall svm 0.470545977011
test f1_score svm 0.534366714257

is1  mean ll svm -2450.36916075

oos  mean ll svm -2530.52318193


In [56]:
from sklearn.svm import SVC

svm = SVC(kernel ='rbf')
df, svm = clf_llhood(df, svm,'svm_rbf',[2])


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

train score   0.580525252525
test accuracy  svm_rbf 0.593921640425
test precision svm_rbf 0.578646008757
test recall svm_rbf 0.639851024209
test f1_score svm_rbf 0.607711354793

is1  mean ll svm_rbf -2927.80637527

oos  mean ll svm_rbf -3007.06824248


In [57]:
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier()
df, gbc = clf_llhood(df, gbc,'GBC',[2])


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)

train score   0.579878787879
test accuracy  GBC 0.584218235079
test precision GBC 0.586685653257
test recall GBC 0.59753444525
test f1_score GBC 0.592060355667

is1  mean ll GBC -2560.14865191

oos  mean ll GBC -2611.23864575


In [58]:
df.head()

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18,f19,f20,f21,f22,f23,f24,f25,f26,f27,f28,f29,f30,...,f55,f56,f57,event_id,runner_id,result,is1,is2,oos,time,obstacle,going,speed,distance,prize,S_Logist,p_Logist,log_p_Logist,S_LR,p_LR,log_p_LR,S_svm,p_svm,log_p_svm,S_svm_rbf,p_svm_rbf,log_p_svm_rbf,S_GBC,p_GBC,log_p_GBC
0,0.066411,1.812371,0.140033,0.007517,-0.000144,0.370123,0.8869,-0.000888,1.492165,-0.504111,0.174814,0.783295,0.47842,0.02652,1.220996,0.317993,-0.000908,-0.000375,0.004848,-0.017851,0.043987,-0.237318,1.049784,-0.046792,-0.006285,0.000628,-0.065017,0.001235,-0.018441,0.274973,...,-0.000921,-0.00078,0.1193,293661,360456,3,True,True,False,1443704000.0,F,GD-FM,15.735644,1700.784058,3235.0,-0.886454,0.035475,-3.338929,-0.743594,0.035475,-3.338929,-0.819598,0.045058,-3.099814,-1.37049,0.018784,-3.974746,-0.865776,0.041643,-3.178634
1,-0.154959,0.615217,-0.069783,-0.001667,0.001293,0.163312,-0.100933,-0.016348,0.145181,0.085654,0.057041,0.43098,0.132121,0.013572,0.413222,0.189882,-0.016198,-0.002962,0.014569,-0.015469,-0.026517,0.277365,-0.374422,0.029834,0.056561,-0.000221,0.116112,-0.002068,-0.024272,0.090061,...,-0.016085,-0.01733,0.026977,293661,375590,5,True,True,False,1443704000.0,F,GD-FM,15.62203,1700.784058,3235.0,-0.336058,0.061511,-2.788532,-0.172904,0.061511,-2.788532,-0.112023,0.091425,-2.392238,-0.651454,0.038553,-3.25571,-0.321176,0.071788,-2.634033
2,0.066411,0.442863,0.069257,0.007517,-0.000144,0.207937,-0.100933,0.014814,-0.010327,-0.566212,0.355908,0.246179,-0.305634,0.031401,0.364949,0.144203,0.014812,0.017871,0.009206,0.01274,-0.01444,0.264392,0.339009,0.039109,-0.006285,0.000204,0.173482,0.001554,0.055569,-0.092425,...,0.0148,0.014925,0.051179,293661,374610,7,True,True,False,1443704000.0,F,GD-FM,15.565223,1700.784058,3235.0,-0.20929,0.069825,-2.661765,-0.173136,0.069825,-2.661765,-0.062747,0.096043,-2.342963,-0.561874,0.042166,-3.16613,-0.278906,0.074888,-2.591764
3,-0.154959,0.765992,0.129201,-0.02548,-0.000144,0.112316,-0.257098,0.012801,0.304235,0.226926,-0.260937,-0.048408,-0.043558,0.019547,0.54989,-0.284258,0.012796,0.00347,0.009206,0.011642,-0.004117,-0.122674,-0.421721,0.048495,-0.006285,-3.1e-05,0.239139,0.001421,-0.018441,-0.13433,...,0.012784,0.012911,-0.262583,293661,373638,1,True,True,False,1443704000.0,F,GD-FM,15.849259,1700.784058,3235.0,-0.064788,0.08068,-2.517262,-0.024767,0.08068,-2.517262,-0.065262,0.095801,-2.345478,-0.22686,0.058947,-2.831116,-0.379769,0.067703,-2.692627
4,0.066411,0.329832,-0.093768,0.007517,-0.000144,0.112316,0.503109,-0.024129,1.263628,-0.359817,0.086009,-0.048408,-0.043558,0.03145,1.252166,0.158341,-0.024174,-0.019263,0.009206,0.01274,-0.03828,0.10589,0.209527,-0.033071,-0.006285,-9.9e-05,-0.037533,0.000975,0.097792,0.022945,...,-0.024187,-0.024024,0.267151,293661,347906,4,True,True,False,1443704000.0,F,GD-FM,15.712922,1700.784058,3235.0,-0.210551,0.069737,-2.663025,-0.208785,0.069737,-2.663025,-0.126865,0.090078,-2.407081,-0.432343,0.047998,-3.036599,0.026403,0.101626,-2.286455


##### Добавим новых признаков

In [None]:
from sklearn.preprocessing import MinMaxScaler
df_scale = MinMaxScaler().fit_transform(df[col_names])
df_scale['event_id'] = df['event_id']
df_scale.head()

In [None]:
df_scale = df_scale.join(df_scale.groupby(df_scale.event_id).mean(), on ='event_id', rsuffix = '_mean')
df_scale.head()

In [None]:
X_rel = df_scale[col_names].values/df_scale[col_mean].values
X_factor = df_scale[col_names].values *X_rel
X_new = np.hstack((df_scale[col_names].values, X_rel, X_factor))

In [None]:
df_new = pd.DataFrame(X_new, columns = )
df_new['event_id'] = df['event_id']
df_new['result'] = df['result']
df.head()

- построим новый признак с помощью модели logit

In [59]:
def new_factors_array (X, predict_mask =predict_mask):
    
    
    factors_new = np.zeros((X.shape[1], predict_mask.shape[0]))

    j=0
    for i,flag in enumerate(predict_mask):
        if flag:
            factors_new[:,i] = X[j,:]
            j +=1
    return factors_new

In [61]:
X_new = np.log(1 -df[['p_LR', 'p_svm', 'p_svm_rbf', 'p_GBC']].values)
X = np.hstack((df[['log_p_LR', 'log_p_svm', 'log_p_svm_rbf', 'log_p_GBC']].values, X_new))
X_new.shape, X.shape

((181989, 4), (181989, 8))

In [63]:
factors_new = np.vstack((factors, new_factors_array(X)))
factors_new.shape

(65, 1631851)

#### NEW Model & OLD model compare

In [64]:
from prediction.models import clmodel
from prediction.tools.helpers import strata_scale_down

#is1 = mod.is1.copy()
#is2 = mod.is2.copy()
#oos = mod.oos.copy()
strata = strata_scale_down(av.event_id)

In [65]:
new_model_coefs, new_model_step1prob, new_model_step2prob, new_model_likelihood \
    =   mod.fit_slices(tsav, factors_new, depth=3, lmbd=10, verbose=False, fit_afresh=True)

print 'Model with new factor '
print 'LL'
print new_model_likelihood

    
#mod.is1 = is1
#mod.is2 = is1
#mod.oos = oos
    

old_model_coefs, old_model_step1prob, old_model_step2prob, old_model_likelihood \
    =   mod.fit_slices(tsav, factors, depth=3, lmbd=10, verbose=False, fit_afresh=True)
    

print 'Old model '
print 'LL'
print old_model_likelihood



. . . . . . . . . . 10
Model with new factor 
LL
[[-1809.70243941 -1997.47376677 -1997.47376677]
 [-1815.37265321 -1993.74235621 -1993.74235621]
 [-1820.5170977  -1991.46397512 -1991.46397512]
 [-1825.05028225 -1979.15444169 -1979.15444169]
 [-1845.24623878 -1956.28931779 -1956.28931779]
 [-1862.9495832  -1949.96027839 -1949.96027839]
 [-1903.63712001 -1924.4423732  -1924.4423732 ]
 [-1921.01859378 -1903.35031036 -1903.35031036]
 [-1923.74153621 -1902.96599187 -1902.96599187]
 [-1928.72119415 -1895.91106428 -1895.91106428]
 [    0.             0.             0.        ]]
. . . . . . . . . . 10
Old model 
LL
[[-1809.60545794 -1997.12919856 -1997.12919856]
 [-1815.29560313 -1993.38729693 -1993.38729693]
 [-1820.46573311 -1991.13279316 -1991.13279316]
 [-1825.04841809 -1978.8072344  -1978.8072344 ]
 [-1845.34679456 -1957.00310184 -1957.00310184]
 [-1863.07407698 -1951.28943929 -1951.28943929]
 [-1903.75005795 -1924.17873405 -1924.17873405]
 [-1921.12493591 -1902.65155685 -1902.65155685]
 

In [39]:
old_model_step1prob[-1][av.result == 1].shape

(145392,)

In [143]:
def result_sort(ar):
    n = len(ar)
    x = np.zeros((n), dtype=np.int32)
    for i,j in enumerate(np.argsort(ar)):
        x[j]= int(n-i)
    return x

In [145]:
ar =np.array([0.1,0.2,0.01,0.4,0.5,0.3,0.7,1.1,0.21])
df_308708 = df[(df['event_id']== 308708)]
result_sort(df_308708.p)

array([43, 42, 44, 36, 41, 37, 30, 25, 38, 29, 32, 39, 33, 31, 14, 24, 15,
       35, 27, 21, 18, 40, 16, 23, 26, 10, 19,  6, 17, 34, 13, 28,  7, 12,
       11,  8, 22, 20,  2,  9,  1,  4,  3,  5], dtype=int32)

In [137]:
by =

In [146]:
df['p_result'] =df.S.groupby(by =df.event_id).transform(result_sort).astype(int)
df.head()

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18,f19,f20,f21,f22,f23,f24,f25,f26,f27,f28,f29,f30,...,f44,f45,f46,f47,f48,f49,f50,f51,f52,f53,f54,f55,f56,f57,event_id,runner_id,result,is1,is2,oos,time,obstacle,going,speed,distance,prize,S,p,log_p,p_result
0,0.0664,1.812544,0.175912,0.007658,-7e-05,0.380432,0.886927,-0.003214,1.493022,-0.510089,0.174814,0.784029,0.492731,0.021176,1.222551,0.332255,-0.003245,0.083893,0.003949,-0.018036,0.044318,-0.236929,1.061229,-0.056129,-0.000462,-0.000142,-0.065776,7e-06,-0.017906,0.294096,...,0.005423,0.023559,0.630792,0.0986,-0.015562,-0.102575,0.537004,-0.010811,0.05647,0.934733,0.419065,-0.003162,-0.003211,0.115685,293661,360456,3,True,True,False,1443704000.0,F,GD-FM,15.735644,1700.784058,3235.0,-0.816518,0.039211,-3.238797,10
1,-0.154935,0.615559,-0.099403,-0.003315,0.00063,0.151092,-0.100909,0.005656,0.220873,0.121961,0.057041,0.432281,0.044878,0.01792,0.414254,0.201616,0.005911,0.005786,0.022141,-0.015606,-0.026866,0.276907,-0.188122,0.044831,0.004163,-0.001194,0.11179,-0.004704,-0.032841,0.036493,...,0.030542,-0.036105,0.356432,-0.247407,-0.015704,0.589225,-0.121583,0.097302,-0.166858,0.225441,0.053834,0.005173,0.00562,0.052669,293661,375590,5,True,True,False,1443704000.0,F,GD-FM,15.62203,1700.784058,3235.0,-0.242959,0.069583,-2.665238,8
2,0.0664,0.443172,0.064974,0.007658,-7e-05,0.203753,-0.100909,0.011806,-0.030797,-0.497101,0.355908,0.236486,-0.301619,0.027384,0.365929,0.153332,0.011789,0.03592,0.00827,0.013054,-0.014764,0.263956,0.298949,0.041541,-0.000462,0.000254,0.168972,0.001457,0.057498,-0.092061,...,-0.036932,0.084404,0.138569,0.0986,0.079891,-0.147373,-0.18363,-0.010811,0.002193,0.481888,0.037694,0.011866,0.011813,0.04467,293661,374610,7,True,True,False,1443704000.0,F,GD-FM,15.565223,1700.784058,3235.0,-0.167937,0.075004,-2.590215,7
3,-0.154935,0.766351,0.157395,-0.025148,-7e-05,0.114861,-0.257157,0.00988,0.299221,0.391816,-0.260937,-0.046712,-0.033713,0.013927,0.551059,-0.300384,0.009861,-0.052231,0.00827,0.011259,-0.004403,-0.122487,-0.35003,0.062382,-0.000462,-0.001858,0.234969,0.00183,-0.017906,-0.127139,...,0.0003,-0.019645,-0.160828,-0.247407,-0.015562,0.30418,0.289913,-0.010811,0.417492,-0.145241,-0.49702,0.009939,0.009887,-0.263563,293661,373638,1,True,True,False,1443704000.0,F,GD-FM,15.849259,1700.784058,3235.0,-0.115483,0.079043,-2.537762,5
4,0.0664,0.330113,-0.122287,0.007658,-7e-05,0.114861,0.503362,-0.025454,1.263503,-0.443943,0.086009,-0.046712,-0.033713,0.027452,1.253726,0.168296,-0.025506,-0.016331,0.00827,0.013054,-0.038637,0.105709,0.180534,-0.041776,-0.000462,0.000211,-0.039124,0.002172,0.100686,0.030163,...,0.069936,-0.044642,-0.160828,0.0986,0.022905,-0.19369,-0.440603,-0.010811,-0.426567,0.135662,0.444311,-0.025412,-0.025456,0.263005,293661,347906,4,True,True,False,1443704000.0,F,GD-FM,15.712922,1700.784058,3235.0,-0.159802,0.075616,-2.582081,6


In [151]:
df_compare_results =df.ix[(df.result <= 5) & (df.result >= 0)& (df.p_result <= df.result +3) & (df.p_result >= df.result -3), ['event_id', 'result', 'p_result']]
df_compare_results

Unnamed: 0,event_id,result,p_result
1,293661,5,8
4,293661,4,6
11,293662,4,7
25,293663,4,5
30,293663,3,4
31,293663,5,7
34,293664,2,4
35,293664,4,2
37,293664,3,3
45,293665,2,3


In [157]:
df_compare_results[(df.p_result <= df.result +1) & (df.p_result >= df.result -1) & (df.result <= 3)]

  if __name__ == '__main__':


Unnamed: 0,event_id,result,p_result
30,293663,3,4
37,293664,3,3
45,293665,2,3
63,293667,3,3
66,293667,1,2
69,293668,2,1
70,293668,3,2
82,293670,2,2
90,293671,2,3
109,293674,3,3


In [159]:
df_compare_results[(df.p_result == df.result ) & (df.result <= 3)]

  if __name__ == '__main__':


Unnamed: 0,event_id,result,p_result
37,293664,3,3
63,293667,3,3
82,293670,2,2
109,293674,3,3
129,293677,3,3
187,293684,3,3
224,293687,3,3
233,293688,3,3
244,293689,3,3
327,293744,3,3


In [160]:
def P_mean_rank(X):
    S = X.sum()/X.count() - 2*X
    return 1./(1+np.exp(S))

In [164]:
df['log_mean_r'] = np.log(df.S.groupby(by =df.event_id).apply(P_mean_rank))

In [165]:
df.head(10)

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18,f19,f20,f21,f22,f23,f24,f25,f26,f27,f28,f29,f30,...,f45,f46,f47,f48,f49,f50,f51,f52,f53,f54,f55,f56,f57,event_id,runner_id,result,is1,is2,oos,time,obstacle,going,speed,distance,prize,S,p,log_p,p_result,log_mean_r
0,0.0664,1.812544,0.175912,0.007658,-7e-05,0.380432,0.886927,-0.003214,1.493022,-0.510089,0.174814,0.784029,0.492731,0.021176,1.222551,0.332255,-0.003245,0.083893,0.003949,-0.018036,0.044318,-0.236929,1.061229,-0.056129,-0.000462,-0.000142,-0.065776,7e-06,-0.017906,0.294096,...,0.023559,0.630792,0.0986,-0.015562,-0.102575,0.537004,-0.010811,0.05647,0.934733,0.419065,-0.003162,-0.003211,0.115685,293661,360456,3,True,True,False,1443704000.0,F,GD-FM,15.735644,1700.784058,3235.0,-0.816518,0.039211,-3.238797,10,-1.811464
1,-0.154935,0.615559,-0.099403,-0.003315,0.00063,0.151092,-0.100909,0.005656,0.220873,0.121961,0.057041,0.432281,0.044878,0.01792,0.414254,0.201616,0.005911,0.005786,0.022141,-0.015606,-0.026866,0.276907,-0.188122,0.044831,0.004163,-0.001194,0.11179,-0.004704,-0.032841,0.036493,...,-0.036105,0.356432,-0.247407,-0.015704,0.589225,-0.121583,0.097302,-0.166858,0.225441,0.053834,0.005173,0.00562,0.052669,293661,375590,5,True,True,False,1443704000.0,F,GD-FM,15.62203,1700.784058,3235.0,-0.242959,0.069583,-2.665238,8,-0.965335
2,0.0664,0.443172,0.064974,0.007658,-7e-05,0.203753,-0.100909,0.011806,-0.030797,-0.497101,0.355908,0.236486,-0.301619,0.027384,0.365929,0.153332,0.011789,0.03592,0.00827,0.013054,-0.014764,0.263956,0.298949,0.041541,-0.000462,0.000254,0.168972,0.001457,0.057498,-0.092061,...,0.084404,0.138569,0.0986,0.079891,-0.147373,-0.18363,-0.010811,0.002193,0.481888,0.037694,0.011866,0.011813,0.04467,293661,374610,7,True,True,False,1443704000.0,F,GD-FM,15.565223,1700.784058,3235.0,-0.167937,0.075004,-2.590215,7,-0.87512
3,-0.154935,0.766351,0.157395,-0.025148,-7e-05,0.114861,-0.257157,0.00988,0.299221,0.391816,-0.260937,-0.046712,-0.033713,0.013927,0.551059,-0.300384,0.009861,-0.052231,0.00827,0.011259,-0.004403,-0.122487,-0.35003,0.062382,-0.000462,-0.001858,0.234969,0.00183,-0.017906,-0.127139,...,-0.019645,-0.160828,-0.247407,-0.015562,0.30418,0.289913,-0.010811,0.417492,-0.145241,-0.49702,0.009939,0.009887,-0.263563,293661,373638,1,True,True,False,1443704000.0,F,GD-FM,15.849259,1700.784058,3235.0,-0.115483,0.079043,-2.537762,5,-0.815284
4,0.0664,0.330113,-0.122287,0.007658,-7e-05,0.114861,0.503362,-0.025454,1.263503,-0.443943,0.086009,-0.046712,-0.033713,0.027452,1.253726,0.168296,-0.025506,-0.016331,0.00827,0.013054,-0.038637,0.105709,0.180534,-0.041776,-0.000462,0.000211,-0.039124,0.002172,0.100686,0.030163,...,-0.044642,-0.160828,0.0986,0.022905,-0.19369,-0.440603,-0.010811,-0.426567,0.135662,0.444311,-0.025412,-0.025456,0.263005,293661,347906,4,True,True,False,1443704000.0,F,GD-FM,15.712922,1700.784058,3235.0,-0.159802,0.075616,-2.582081,6,-0.865664
5,0.0664,-0.189406,0.181539,0.007658,-7e-05,0.398373,-0.195555,0.008028,0.392581,0.877782,-0.239266,0.137463,-0.033713,0.002931,-0.148441,-0.064264,0.008008,0.02498,-0.037855,-0.008478,0.034166,0.015319,0.139157,-0.076951,-0.000462,0.000496,0.011734,0.000684,-0.017906,-0.092061,...,-0.042161,-0.160828,0.0986,-0.015562,-0.186771,0.455867,-0.010811,0.426535,0.171052,-0.422339,0.008086,0.008034,-0.214755,293661,372674,8,True,True,False,1443704000.0,F,GD-FM,15.383439,1700.784058,3235.0,0.112361,0.099269,-2.309918,4,-0.587086
6,0.0664,0.330113,-0.09906,-0.025148,-7e-05,-0.276169,0.64922,-0.003725,0.844906,0.99682,0.23502,-0.286916,-0.033713,-0.057044,0.516816,0.117115,-0.003757,-0.011905,0.00827,0.000986,0.044089,0.159488,0.300057,-0.066961,-0.000462,0.001343,0.27933,0.002279,-0.017906,-0.000464,...,-0.012634,-0.160828,0.124609,-0.015562,-0.185965,0.305653,-0.010811,0.219149,-0.160715,0.484206,-0.003673,-0.003722,0.430075,293661,365528,2,True,True,False,1443704000.0,F,GD-FM,15.792453,1700.784058,3235.0,-0.53119,0.052158,-2.953469,9,-1.359244
7,-0.154935,-0.786903,0.09375,0.007658,-7e-05,-0.692767,-0.524122,0.008948,-0.847322,-0.138604,-0.102708,-0.046712,-0.033713,0.006467,-1.048457,-0.162347,0.008928,-0.03781,0.00827,0.011259,-0.07198,-0.415025,-0.809487,0.051586,-0.000462,0.000608,-0.194061,-0.000486,-0.017906,-0.014544,...,-0.019645,-0.160828,-0.247407,-0.015562,-0.136263,-0.415604,-0.010811,0.179266,-0.544099,-0.116849,0.009006,0.008954,-0.021121,293661,373315,6,True,True,False,1443704000.0,F,GD-FM,15.593626,1700.784058,3235.0,0.591434,0.160278,-1.830844,2,-0.267274
8,0.0664,-1.481857,-0.250981,0.007658,-7e-05,0.578838,-0.640895,-0.000228,-0.656712,0.306359,0.310369,-0.380744,-0.033713,0.002931,-1.270533,-0.312864,-0.000256,0.020995,-0.037855,-0.008478,-0.009005,0.058171,-0.38758,-0.017711,-0.000462,0.001336,-0.313641,-0.004279,-0.017906,-0.092061,...,0.043307,-0.160828,0.124609,-0.015562,-0.176926,-0.289736,-0.010811,-0.285919,-0.204194,0.043477,-0.000174,-0.000224,-0.288457,293661,366092,10,True,True,False,1443704000.0,F,GD-FM,15.065318,1700.784058,3235.0,0.498571,0.146064,-1.923707,3,-0.314031
9,0.0664,-1.839689,-0.101836,0.007658,-7e-05,-0.973273,-0.219964,-0.011695,-2.979274,-1.105002,-0.616251,-0.782461,-0.033713,-0.063144,-1.856904,-0.132756,-0.011734,-0.053298,0.00827,0.000986,0.043083,-0.105108,-0.244704,0.059187,-0.000462,-0.001052,-0.194195,0.00104,-0.017906,0.057576,...,0.023559,-0.160828,0.0986,0.006278,0.23616,-0.137282,-0.010811,-0.421763,-0.894524,-0.446379,-0.011647,-0.011694,-0.118209,293661,359427,9,True,True,False,1443704000.0,F,GD-FM,15.292547,1700.784058,3235.0,0.831526,0.203772,-1.590753,1,-0.173583


In [175]:
def add_new_factors(new_facotrs, predict_mask =predict_mask):
    
    n,m = new_facotrs.shape
    factors_new = np.zeros((m,predict_mask.shape[0]))
    
    j=0
    for i,flag in enumerate(predict_mask):
        if flag:
            factors_new[:,i] = new_facotrs[j,:]
            j +=1
    return factors_new
    
factors_new = np.vstack((factors, add_new_factors(df[['log_mean_r']].values)))
factors_new.shape, factors.shape

((58, 1742772), (57, 1742772))

In [173]:
df[['log_mean_r']].values.shape

(150297, 1)

In [176]:
new_model_coefs, new_model_step1prob, new_model_step2prob, new_model_likelihood \
    =   mod.fit_slices(tsav, factors_new, depth=3, lmbd=10, verbose=False, fit_afresh=True)

print 'Model with new factor '
print 'LL'
print new_model_likelihood

    
#mod.is1 = is1
#mod.is2 = is1
#mod.oos = oos
    

old_model_coefs, old_model_step1prob, old_model_step2prob, old_model_likelihood \
    =   mod.fit_slices(tsav, factors, depth=3, lmbd=10, verbose=False, fit_afresh=True)
    

print 'Old model '
print 'LL'
print old_model_likelihood


. . . . . . . . . . 10
Model with new factor 
LL
[[-1851.8089964  -1995.51280354 -1995.51280354]
 [-1855.1547098  -1991.53894916 -1991.53894916]
 [-1858.76346104 -1989.09626139 -1989.09626139]
 [-1859.86144973 -1985.23280343 -1985.23280343]
 [-1873.2037878  -1972.33031725 -1972.33031725]
 [-1888.46346448 -1962.41004666 -1962.41004666]
 [-1915.55632013 -1948.01265824 -1948.01265824]
 [-1926.20943873 -1939.76050395 -1939.76050395]
 [-1927.37472777 -1939.46182377 -1939.46182377]
 [-1933.10581927 -1934.24051674 -1934.24051674]
 [    0.             0.             0.        ]]
. . . . . . . . . . 10
Old model 
LL
[[-1851.82404552 -1995.80190523 -1995.80190523]
 [-1855.17692183 -1991.83205299 -1991.83205299]
 [-1858.78848725 -1989.37872712 -1989.37872712]
 [-1859.88547983 -1985.49613881 -1985.49613881]
 [-1873.26577532 -1972.64086846 -1972.64086846]
 [-1888.55464921 -1962.77217781 -1962.77217781]
 [-1915.6768923  -1948.29012377 -1948.29012377]
 [-1926.34347024 -1939.98590585 -1939.98590585]
 