# ThoroughBet Simulation


## Load necessary modules

In [1]:
import numpy as np

from utils import settings, timestamp, YEAR
from utils.arrayview import ArrayView, TimeseriesView


from prediction.models.preprocessing import Model
from prediction.models.prediction import factornames_trimmed
from prediction.models.parameters import factor_build_end
from prediction.tools.clustering import write_dic_to_simdata

AttributeError: 'str' object has no attribute 'general'

## Load data

In [2]:
av = ArrayView.from_file(settings.paths.join('brain_final2cut.av.bcolz'))

In [3]:
tsav = {}
sl = 0
while True:
    try:
        tsav[sl] = ArrayView.from_file(settings.paths.join('brain_final2_slice_%s.av.bcolz' % sl))
    except ValueError:
        break
    sl += 1

In [4]:
mod = Model(av, oos_start=factor_build_end+YEAR)

In [5]:
high_kurtosis_factors =  ['z64f5be67e', 'z90adc182a', 'z7081bf371', 'z34b808e99', 'z757be272e', 'z5a85cd6a9',
                         'zf991b634a', 'z62651f605', 'zd002b7067', 'z2ef7fedca', 'z6f11029f7', 'z412893062',
                          'z919b9585a', 'z89b0eda37', 'z31780b3f4', 'z6631693d3', 'z0b27f29ad', 'zd7cd94e4c', 
                          'zf5b2aef2a']
price_factors = ['zb392bb74a', 'z6809c316d', 'zd678f0538', 'z027f9f0f5', 'z88e79930c', 'z4a72dc02f',
                 'z1a3573928', 'z7b15df227']

In [6]:
%time factors = mod._preprocess_factors(factornames_trimmed, high_kurtosis_factors = high_kurtosis_factors,\
                                        price_factors = price_factors, verbose=True)

INFO:models:Getting factors from av and rescaling...


. . . . .

INFO:models:Filling in missing values...
INFO:models:Computing each factor as linear combination of all the others...


 . . . . .

INFO:models:Number of missing patterns: 7754


 . . . . . . .

INFO:models:Transforming factors by applying CL-model on their Taylor expansions...


 . . . . .CPU times: user 5min 38s, sys: 19.2 s, total: 5min 57s
Wall time: 5min 21s



In [7]:
predict_mask = mod.is1|mod.is2|mod.oos
predict_event_id = av.event_id[predict_mask]

In [8]:
import pandas as pd
pd.set_option('display.max_columns', 90)
col_names = ['f{}'.format(i) for i in range(1,58)]
df = pd.DataFrame(data =factors[:, predict_mask].T , columns = col_names)
df['event_id'] = av.event_id[predict_mask]
df['runner_id'] = av.runner_id[predict_mask]
df['result'] = av.result[predict_mask]
df['is1'] = mod.is1[predict_mask]
df['is2'] = mod.is2[predict_mask]
df['oos'] = mod.oos[predict_mask]
df['time'] =av.start_time[predict_mask]
df['obstacle'] = av.obstacle[predict_mask]
df['going'] = av.going[predict_mask]
df['speed'] = av.speed[predict_mask]
df['distance'] = av.distance[predict_mask]
df['prize'] = av.prize[predict_mask]

In [9]:
df['day'] = df['time'].apply(lambda x: ((x -df['time'].values[0])/(24*3600)).round(0)).astype(int)
df.head()

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18,f19,f20,f21,f22,f23,f24,f25,f26,f27,f28,f29,f30,f31,f32,f33,f34,f35,f36,f37,f38,f39,f40,f41,f42,f43,f44,f45,f46,f47,f48,f49,f50,f51,f52,f53,f54,f55,f56,f57,event_id,runner_id,result,is1,is2,oos,time,obstacle,going,speed,distance,prize,day
0,0.066411,1.812371,0.140033,0.007517,-0.000144,0.370123,0.8869,-0.000888,1.492165,-0.504111,0.174814,0.783295,0.47842,0.02652,1.220996,0.317993,-0.000908,-0.000375,0.004848,-0.017851,0.043987,-0.237318,1.049784,-0.046792,-0.006285,0.000628,-0.065017,0.001235,-0.018441,0.274973,-0.487135,0.084544,-0.116107,0.002079,-0.001362,-0.160866,0.091133,0.966224,2.384186e-07,2.49246,0.753668,-0.000403,0.234473,0.250251,0.023256,0.628183,0.0986,-0.014886,-0.089427,0.53691,-0.012463,-0.143313,0.943943,0.417261,-0.000921,-0.00078,0.1193,293661,360456,3,True,True,False,1443704000.0,F,GD-FM,15.735644,1700.784058,3235.0,0
1,-0.154959,0.615217,-0.069783,-0.001667,0.001293,0.163312,-0.100933,-0.016348,0.145181,0.085654,0.057041,0.43098,0.132121,0.013572,0.413222,0.189882,-0.016198,-0.002962,0.014569,-0.015469,-0.026517,0.277365,-0.374422,0.029834,0.056561,-0.000221,0.116112,-0.002068,-0.024272,0.090061,-0.030274,-0.498561,0.035182,-0.01871,0.029094,0.031624,-0.061174,0.254454,2.384186e-07,1.648349,0.073634,-0.650115,0.177804,-0.052191,-0.034483,0.370725,-0.247407,-0.023127,0.550295,-0.121513,0.112166,0.040612,0.224962,0.057615,-0.016085,-0.01733,0.026977,293661,375590,5,True,True,False,1443704000.0,F,GD-FM,15.62203,1700.784058,3235.0,0
2,0.066411,0.442863,0.069257,0.007517,-0.000144,0.207937,-0.100933,0.014814,-0.010327,-0.566212,0.355908,0.246179,-0.305634,0.031401,0.364949,0.144203,0.014812,0.017871,0.009206,0.01274,-0.01444,0.264392,0.339009,0.039109,-0.006285,0.000204,0.173482,0.001554,0.055569,-0.092425,-0.26665,0.043125,0.001112,0.002079,-0.011063,0.171808,0.014969,0.155725,2.384186e-07,0.941642,0.146075,-0.109316,0.106961,0.005249,0.083641,0.144267,0.0986,0.08141,-0.152771,-0.183572,-0.012463,0.159303,0.48867,0.036059,0.0148,0.014925,0.051179,293661,374610,7,True,True,False,1443704000.0,F,GD-FM,15.565223,1700.784058,3235.0,0
3,-0.154959,0.765992,0.129201,-0.02548,-0.000144,0.112316,-0.257098,0.012801,0.304235,0.226926,-0.260937,-0.048408,-0.043558,0.019547,0.54989,-0.284258,0.012796,0.00347,0.009206,0.011642,-0.004117,-0.122674,-0.421721,0.048495,-0.006285,-3.1e-05,0.239139,0.001421,-0.018441,-0.13433,0.250868,0.486319,0.086636,0.002079,-0.009765,0.1904,0.130183,0.447011,2.384186e-07,0.498932,-0.041847,0.597555,0.001817,-0.037324,-0.019826,-0.163311,-0.247407,-0.014886,0.461782,0.289931,-0.012463,0.088135,-0.126984,-0.495465,0.012784,0.012911,-0.262583,293661,373638,1,True,True,False,1443704000.0,F,GD-FM,15.849259,1700.784058,3235.0,0
4,0.066411,0.329832,-0.093768,0.007517,-0.000144,0.112316,0.503109,-0.024129,1.263628,-0.359817,0.086009,-0.048408,-0.043558,0.03145,1.252166,0.158341,-0.024174,-0.019263,0.009206,0.01274,-0.03828,0.10589,0.209527,-0.033071,-0.006285,-9.9e-05,-0.037533,0.000975,0.097792,0.022945,-0.043148,-0.245328,-0.009627,0.002079,0.011331,-0.122763,-0.217356,1.080508,2.384186e-07,-0.004469,0.049962,0.495737,0.116301,-0.110655,-0.044347,-0.163311,0.0986,0.0239,-0.237007,-0.440662,-0.012463,-0.125458,0.136239,0.442488,-0.024187,-0.024024,0.267151,293661,347906,4,True,True,False,1443704000.0,F,GD-FM,15.712922,1700.784058,3235.0,0


In [10]:
%%time

df_new = pd.DataFrame()

for day, data in df[['runner_id']].groupby(by =df.day):

    mask_days = (df.day <= day).values
    mask_runners = np.in1d(df.runner_id.values, data.runner_id.values)
    df_days = df['runner_id'][mask_days & mask_runners]
    df_count = df_days.value_counts()
    df_new = df_new.append(data.join(df_count, on ='runner_id', rsuffix = '_count')) 

CPU times: user 22.5 s, sys: 64 ms, total: 22.5 s
Wall time: 22.7 s


In [11]:
df_new.tail()

Unnamed: 0,runner_id,runner_id_count
181984,309510,17
181985,134162,17
181986,167167,16
181987,138643,13
181988,311581,9


In [13]:
def expiriance(x, threshold =15):
    if x >= threshold:
        return 1.
    else:
        return float(x)/threshold

In [14]:
df['expiriance'] = df_new['runner_id_count'].apply(expiriance)
df.tail()

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18,f19,f20,f21,f22,f23,f24,f25,f26,f27,f28,f29,f30,f31,f32,f33,f34,f35,f36,f37,f38,f39,f40,f41,f42,f43,f44,f45,f46,f47,f48,f49,f50,f51,f52,f53,f54,f55,f56,f57,event_id,runner_id,result,is1,is2,oos,time,obstacle,going,speed,distance,prize,day,expiriance
181984,-0.001791,0.305292,0.002202,-0.04358,0.000176,-0.830468,0.546406,0.189813,0.08874,-0.054661,-0.355233,0.636184,-0.37376,0.019384,0.005445,-0.167969,0.189934,0.170681,0.201674,0.013882,0.043565,-0.262285,-0.314926,-0.054621,-0.199356,-0.001768,-0.203631,-0.001118,0.057663,0.247347,0.075659,-0.19457,0.036453,-0.047364,0.129998,-0.142565,-0.004153,0.011022,1.192093e-07,0.25928,0.049065,0.049889,0.070893,-0.095818,0.031717,0.258435,0.098857,0.009017,-0.094818,-0.271917,-0.10662,-0.044507,0.044462,-0.001848,0.189937,0.189809,-0.273905,348044,309510,4,False,False,True,1491325000.0,F,GD,14.738428,2212.8479,2911.0,551,1.0
181985,-0.001791,0.158784,-0.094404,0.034454,-0.001054,0.38303,-0.021071,-0.450935,0.240886,0.245064,-0.009981,-0.43083,0.09118,0.050764,0.107631,-0.061776,-0.451033,-0.482679,-0.376243,-0.022931,-0.054987,0.080867,0.966551,0.034155,0.082771,-0.002208,0.042784,0.00113,0.10602,-0.145781,0.055507,0.015852,-0.063126,-0.687815,-0.097964,-0.105352,-0.143244,-0.014678,1.192093e-07,0.25928,0.033785,-0.019956,0.106958,0.28357,-0.031777,-0.310286,0.098857,0.009175,0.15543,0.150893,-0.10662,0.092016,0.134832,0.022426,-0.451031,-0.450936,-0.081369,348044,134162,3,False,False,True,1491325000.0,F,GD,14.820935,2212.8479,2911.0,551,1.0
181986,0.121731,-0.607752,0.112927,0.04038,0.000176,-0.811391,-0.353688,0.098327,-0.171542,-0.08816,-0.11526,0.15657,-0.142259,-0.269422,-0.365013,-0.124193,0.09837,0.038713,0.105367,0.082842,0.019753,0.203087,-0.604638,0.002315,-0.032396,-0.002944,0.289072,0.000932,-0.37102,0.060209,0.340068,-0.053617,0.105134,0.149609,0.030614,0.363646,0.091233,-0.329113,1.192093e-07,-0.365193,0.039436,-0.019956,-0.108792,-0.135291,0.037264,-0.310286,0.098857,-0.033725,-0.085344,-0.211192,-0.10662,0.004154,-0.135623,-0.107339,0.098368,0.098361,-0.049301,348044,167167,7,False,False,True,1491325000.0,F,GD,14.548663,2212.8479,2911.0,551,1.0
181987,-0.45911,-1.184132,0.069584,-0.114837,0.000176,0.713696,0.333819,-0.202482,-0.113766,-0.386204,0.39667,-0.607411,0.014962,0.044718,-0.437826,0.086846,-0.202673,-0.087505,-0.238376,-0.022931,0.008674,-0.305982,0.671716,-0.023509,0.040691,-0.001821,-0.163256,0.00026,-0.170459,0.13615,-0.56036,-0.15181,-0.202459,0.301643,-0.248122,-0.158359,0.072683,-0.12792,1.192093e-07,-0.675358,-0.778373,-0.019956,0.007761,0.053726,0.00609,-0.310286,0.098857,-0.266593,-0.093705,0.063148,-0.10662,-0.13552,-0.675076,-0.022018,-0.202678,-0.202493,-0.008656,348044,138643,5,False,False,True,1491325000.0,F,GD,14.709551,2212.8479,2911.0,551,0.866667
181988,0.121731,-0.850804,-0.073879,0.04038,0.000176,-0.918695,0.22304,0.147873,-0.347024,0.089216,-0.177962,-0.66932,-0.015987,0.053083,-0.279684,-0.13365,0.147968,0.121095,0.195003,-0.014636,-0.011203,-0.142182,-1.057921,0.086318,0.01458,0.013167,-0.254831,0.001233,0.10602,-0.20373,0.35189,-0.199153,0.124085,0.060165,0.121153,-0.111032,-0.039353,-0.156794,1.192093e-07,-0.798513,0.093016,0.049889,-0.191833,-0.003674,-0.024356,-0.310286,-0.247143,0.106391,-0.093148,-0.328325,-0.10662,0.083268,-0.282246,-0.001047,0.147968,0.147897,0.082279,348044,311581,6,False,False,True,1491325000.0,F,GD,14.680673,2212.8479,2911.0,551,0.6


In [15]:
X_new = df.ix[:,'f1':'f57'].values * df['expiriance'].values.reshape(-1,1)

In [51]:
df['expiriance'].values.reshape(-1,1)[:5]#df.ix[:,'f1':'f57'].values[:5]

array([[ 0.06666667],
       [ 0.06666667],
       [ 0.06666667],
       [ 0.06666667],
       [ 0.06666667]])

In [10]:
def new_factors_array (X, predict_mask =predict_mask):
    
    
    factors_new = np.zeros((X.shape[1], predict_mask.shape[0]))

    j=0
    for i,flag in enumerate(predict_mask):
        if flag:
            factors_new[:,i] = X[j,:]
            j +=1
    return factors_new

In [17]:
X_new = np.hstack((X_new, df['expiriance'].values.reshape(-1,1)))

In [18]:
factors_new = np.vstack((factors,new_factors_array(X_new)))
factors_new.shape, factors.shape

((115, 1631851), (57, 1631851))

In [19]:
model_coefs, model_step1prob, model_step2prob, model_likelihood \
= mod.fit_slices(tsav, factors_new, depth=3, lmbd=10, verbose=False, fit_afresh=True)

print "LL factors factorization expiriance"
print model_likelihood
print 


old_coefs, old_step1prob, old_step2prob, old_likelihood \
= mod.fit_slices(tsav, factors, depth=3, lmbd=10, verbose=False, fit_afresh=True)
print "LL old model"
print old_likelihood

. . . . . . . . . . 10
LL factors factorization expiriance
[[-1808.91786872 -1993.75057782 -1993.75057782]
 [-1814.62149716 -1990.63189064 -1990.63189064]
 [-1819.74802034 -1988.71907346 -1988.71907346]
 [-1824.27719558 -1976.5924666  -1976.5924666 ]
 [-1844.51622195 -1955.29398416 -1955.29398416]
 [-1862.0064982  -1948.78386065 -1948.78386065]
 [-1902.39086888 -1920.78753303 -1920.78753303]
 [-1919.50481249 -1901.02548472 -1901.02548472]
 [-1922.23487913 -1900.77455152 -1900.77455152]
 [-1927.0519835  -1894.55226629 -1894.55226629]
 [    0.             0.             0.        ]]

. . . . . . . . . . 10
LL old model
[[-1809.60545794 -1997.12919856 -1997.12919856]
 [-1815.29560313 -1993.38729693 -1993.38729693]
 [-1820.46573311 -1991.13279316 -1991.13279316]
 [-1825.04841809 -1978.8072344  -1978.8072344 ]
 [-1845.34679456 -1957.00310184 -1957.00310184]
 [-1863.07407698 -1951.28943929 -1951.28943929]
 [-1903.75005795 -1924.17873405 -1924.17873405]
 [-1921.12493591 -1902.65155685 -1902.6

In [20]:
write_dic_to_simdata('simdata_runner_expiriance.p', model_step1prob, model_coefs, mod.oos, av =av)

In [26]:
from sklearn.feature_extraction import FeatureHasher
new_ob = FeatureHasher(n_features =4, input_type ='string').fit_transform(df['obstacle'])

In [28]:
new_ob.todense()

matrix([[ 0.,  0., -1.,  0.],
        [ 0.,  0., -1.,  0.],
        [ 0.,  0., -1.,  0.],
        ..., 
        [ 0.,  0., -1.,  0.],
        [ 0.,  0., -1.,  0.],
        [ 0.,  0., -1.,  0.]])

In [11]:
df['obstacle'] = av.obstacle[predict_mask]
df['obstacle'].unique()

array(['F', 'H', 'C', ''], dtype=object)

In [12]:
dic = {x: y for y, x in enumerate(df['obstacle'].unique())}

In [13]:
dic

{'': 3, 'C': 2, 'F': 0, 'H': 1}

In [14]:
from sklearn.preprocessing import OneHotEncoder
new_ob = OneHotEncoder().fit_transform(df[['obstacle']].replace(dic))
new_ob

<181989x4 sparse matrix of type '<type 'numpy.float64'>'
	with 181989 stored elements in Compressed Sparse Row format>

In [15]:
#one_result = lambda x: 1 if (1 <= x)& (x <=3) else 0
df['one_result'] = df['result'].apply(lambda x: 1 if (1 <= x)& (x <=3) else 0)

In [19]:
df_result_ob = pd.DataFrame(new_ob.toarray()*df['one_result'].values.reshape(-1,1), columns = ['F', 'H', 'C', 'None'],
                            dtype=int)
df_result_ob['day'] =df['day']
df_result_ob['runner_id'] =df['runner_id']

In [20]:
df_result_ob.ix[:6,:4].sum(axis =1)

0    1
1    0
2    0
3    1
4    0
5    0
6    1
dtype: int64

In [21]:
df_result_ob.head()

Unnamed: 0,F,H,C,None,day,runner_id
0,1,0,0,0,0,360456
1,0,0,0,0,0,375590
2,0,0,0,0,0,374610
3,1,0,0,0,0,373638
4,0,0,0,0,0,347906


In [22]:
n,m = len(df.runner_id.unique()), len(df.obstacle.unique())

In [23]:
df_runner = pd.DataFrame(np.zeros((n,m)), index = df.runner_id.unique(), columns = ['F', 'H', 'C', 'None'])

In [24]:
def dev(a,b):
    if a ==0:
        return 0
    return float(a/b)

In [31]:
df_days_/df_runner_sum.reshape(-1,1)

ValueError: Empty data passed with indices specified.

In [35]:
#%%time

df_new = pd.DataFrame()

for d, data in df_result_ob.groupby(by =df_result_ob.day):

    mask_days = (df_result_ob.day < d).values
    mask_runner = np.in1d(df_result_ob.runner_id.values, data.runner_id.values)
    df_days_ = df_result_ob.ix[mask_days &mask_runner, :4].groupby(df_result_ob.runner_id).sum()
    df_runner_sum = df_days_.sum(axis =1).values.astype(float)
    
    try:
        df_days = df_days_/df_runner_sum.reshape(-1,1)
    except ValueError:
        df_days  = pd.DataFrame()
    
    if np.any(np.isinf (df_days )):
        print 'devision by zero, have inf'
        break
    df_days =  data.join(df_days.apply(np.nan_to_num), on = 'runner_id')
    df_new = df_new.append(df_days)

ValueError: Other Series must have a name

In [146]:
df_days

Unnamed: 0,0,1,2,3,day,runner_id,event_id,0_rank,1_rank,2_rank,3_rank
181865,0,1,0,0,551,410943,348030,,,,
181866,0,1,0,0,551,492974,348030,,,,
181867,0,0,0,0,551,371025,348030,,,,
181868,0,0,0,0,551,468218,348030,,,,
181869,0,1,0,0,551,367815,348030,,,,
181870,0,0,0,0,551,334767,348030,,,,
181871,0,0,0,0,551,371224,348030,,,,
181872,0,0,0,0,551,493304,348030,,,,
181873,0,0,0,0,551,496094,348030,,,,
181874,0,0,0,0,551,450496,348030,,,,


In [144]:
df_new.tail(10)

Unnamed: 0,0,1,2,3,day,runner_id,event_id,0_rank,1_rank,2_rank,3_rank
181979,0,0,0,0,551,367414,348043,,,,
181980,0,0,0,0,551,340100,348043,,,,
181981,1,0,0,0,551,346448,348043,,,,
181982,1,0,0,0,551,365184,348044,,,,
181983,1,0,0,0,551,157115,348044,,,,
181984,0,0,0,0,551,309510,348044,,,,
181985,1,0,0,0,551,134162,348044,,,,
181986,0,0,0,0,551,167167,348044,,,,
181987,0,0,0,0,551,138643,348044,,,,
181988,0,0,0,0,551,311581,348044,,,,


In [94]:
df_new = df_new.fillna(0)
df_new.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,day,runner_id,0_mean,1_mean,2_mean,3_mean,4_mean,5_mean,6_mean,7_mean,8_mean,9_mean,10_mean,11_mean,12_mean,13_mean
0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,360456,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,375590,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,374610,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,373638,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,347906,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [95]:
df_new.ix[:,16:].max()

0_mean     34.0
1_mean     34.0
2_mean     34.0
3_mean     34.0
4_mean     34.0
5_mean     34.0
6_mean     34.0
7_mean     34.0
8_mean     34.0
9_mean     34.0
10_mean    34.0
11_mean    34.0
12_mean    14.0
13_mean    34.0
dtype: float64

In [96]:
(1/df_new.ix[:,16:].replace({0: 50})).max()

0_mean     29.0
1_mean     14.0
2_mean     28.0
3_mean     28.0
4_mean     31.0
5_mean     21.0
6_mean     37.0
7_mean     43.0
8_mean     18.0
9_mean     24.0
10_mean    20.0
11_mean    23.0
12_mean    23.0
13_mean    33.0
dtype: float64

In [97]:
(1/df_new.ix[:,16:].replace({0: 50})).min()

0_mean     0.02
1_mean     0.02
2_mean     0.02
3_mean     0.02
4_mean     0.02
5_mean     0.02
6_mean     0.02
7_mean     0.02
8_mean     0.02
9_mean     0.02
10_mean    0.02
11_mean    0.02
12_mean    0.02
13_mean    0.02
dtype: float64

In [98]:
from sklearn.preprocessing import MinMaxScaler
X_rank = MinMaxScaler().fit_transform((1/df_new.ix[:,16:].replace({0: 50})))
X_rank =O_H.toarray()*X_rank

In [104]:
from sklearn.decomposition import SparsePCA
X_rank_sparse =SparsePCA(n_components= 2).fit_transform(X_rank)

In [105]:
factors_new = np.vstack((factors,new_factors_array(X_rank_sparse)))
factors_new.shape, factors.shape

((59, 1631851), (57, 1631851))

In [106]:
model_coefs, model_step1prob, model_step2prob, model_likelihood \
= mod.fit_slices(tsav, factors_new, depth=3, lmbd=10, verbose=False, fit_afresh=True)

print "LL factors factorization expiriance"
print model_likelihood
print 

. . . . . . . . . . 10
LL factors factorization expiriance
[[-1809.60545909 -1997.12884624 -1997.12884624]
 [-1815.2956189  -1993.38703599 -1993.38703599]
 [-1820.46573638 -1991.13244643 -1991.13244643]
 [-1825.04842738 -1978.80697191 -1978.80697191]
 [-1845.34686014 -1957.00301836 -1957.00301836]
 [-1863.07413963 -1951.2893558  -1951.2893558 ]
 [-1903.75000456 -1924.17790661 -1924.17790661]
 [-1921.12487565 -1902.65066286 -1902.65066286]
 [-1923.8936869  -1902.50166159 -1902.50166159]
 [-1928.88779239 -1895.42532656 -1895.42532656]
 [    0.             0.             0.        ]]



In [107]:
write_dic_to_simdata('simdata_runner_rank.p', model_step1prob, model_coefs, mod.oos, av =av)

In [112]:
from sklearn.decomposition import PCA
X_pca = PCA(n_components= 51).fit_transform(df[col_names].values)
X_new = np.hstack((X_pca, X_pca*X_rank_sparse[:,0].reshape(-1,1), X_pca*X_rank_sparse[:,1].reshape(-1,1)))
X_new.shape

(181989, 153)

In [114]:
factors_new = new_factors_array(X_new)
factors_new.shape, factors.shape

((153, 1631851), (57, 1631851))

In [115]:
model_coefs, model_step1prob, model_step2prob, model_likelihood \
= mod.fit_slices(tsav, factors_new, depth=3, lmbd=10, verbose=False, fit_afresh=True)

print "LL factors factorization expiriance"
print model_likelihood
print 

. . . . . . . . . . 10
LL factors factorization expiriance
[[-1809.69917201 -1997.31304513 -1997.31304513]
 [-1815.4028655  -1993.66337266 -1993.66337266]
 [-1820.58995505 -1991.49018631 -1991.49018631]
 [-1825.16718309 -1979.18926507 -1979.18926507]
 [-1845.51702198 -1957.55438871 -1957.55438871]
 [-1863.22685966 -1951.76838265 -1951.76838265]
 [-1904.00145682 -1924.62805033 -1924.62805033]
 [-1921.39764123 -1902.68369667 -1902.68369667]
 [-1924.14110873 -1902.45679877 -1902.45679877]
 [-1929.19610401 -1895.2412228  -1895.2412228 ]
 [    0.             0.             0.        ]]



In [116]:
write_dic_to_simdata('simdata_runner_rank_f.p', model_step1prob, model_coefs, mod.oos, av =av)