In [2]:
# ThoroughBet Simulation


## Load necessary modules

In [1]:
import numpy as np

from utils import settings, timestamp, YEAR
from utils.arrayview import ArrayView, TimeseriesView


from prediction.models.preprocessing import Model
from prediction.models.prediction import factornames_trimmed
from prediction.models.parameters import factor_build_end
from prediction.tools.clustering import write_dic_to_simdata, dic_to_tenzor, ll_diff

## Load data

In [2]:
av = ArrayView.from_file(settings.paths.join('brain_final2cut.av.bcolz'))

In [3]:
tsav = {}
sl = 0
while True:
    try:
        tsav[sl] = ArrayView.from_file(settings.paths.join('brain_final2_slice_%s.av.bcolz' % sl))
    except ValueError:
        break
    sl += 1

In [4]:
mod = Model(av, oos_start=factor_build_end+YEAR)

In [5]:
high_kurtosis_factors =  ['z64f5be67e', 'z90adc182a', 'z7081bf371', 'z34b808e99', 'z757be272e', 'z5a85cd6a9',
                         'zf991b634a', 'z62651f605', 'zd002b7067', 'z2ef7fedca', 'z6f11029f7', 'z412893062',
                          'z919b9585a', 'z89b0eda37', 'z31780b3f4', 'z6631693d3', 'z0b27f29ad', 'zd7cd94e4c', 
                          'zf5b2aef2a']
price_factors = ['zb392bb74a', 'z6809c316d', 'zd678f0538', 'z027f9f0f5', 'z88e79930c', 'z4a72dc02f',
                 'z1a3573928', 'z7b15df227']

In [6]:
%time factors = mod._preprocess_factors(factornames_trimmed, high_kurtosis_factors = high_kurtosis_factors,\
                                        price_factors = price_factors, verbose=True)

INFO:models:Getting factors from av and rescaling...


. . . . .

INFO:models:Filling in missing values...
INFO:models:Computing each factor as linear combination of all the others...


 . . . . .

INFO:models:Number of missing patterns: 7754


 . . . . . . .

INFO:models:Transforming factors by applying CL-model on their Taylor expansions...


 . . .

  coefse = np.sqrt(np.diag(information_matrix))


 . .CPU times: user 3min 9s, sys: 1.63 s, total: 3min 11s
Wall time: 55.7 s



In [7]:
predict_mask = mod.is1|mod.is2|mod.oos

In [8]:
# создадим функцию которая добовляет более старые данные
def old_data(num, is1=mod.is1):
    
    first_is1 = np.where(is1 ==True)[0][0]
    past_events = np.unique(av.event_id[av.event_id < av.event_id[first_is1]])[-int(num):]
    
    return np.in1d(av.event_id, past_events)

In [9]:
mask_past = old_data(4000)

In [10]:
np.where(predict_mask ==True)[0][0], np.where(mask_past ==True)[0][0]

(1123738, 1085943)

In [11]:
# создаем новые факторы из столбцов таблицы
def new_factors_array (X, predict_mask =predict_mask):
    
    
    factors_new = np.zeros((X.shape[1], predict_mask.shape[0]))
    factors_new[:, predict_mask] = X.T
    
    return factors_new

In [12]:
# создаем дата фрейм из данных 
def DF(mask, factors, av, factors_names, other_names):
    
    import pandas as pd
    
    df = pd.DataFrame(data =factors[:, mask].T , columns = factors_names)
    for col in other_names :
        
        df[col] = av[col][mask]
        
    return df

In [18]:
predict_mask_old = predict_mask|mask_past

In [19]:
import pandas as pd
pd.set_option('display.max_columns', 90)

col_names = ['f{}'.format(i) for i in range(1,58)]

df = DF (predict_mask_old, factors, av, col_names, ['event_id', 'runner_id', 'result', 'start_time', 'jockey', 'trainer',
                                               'prize', 'speed'])
df['is1'] = mod.is1 [predict_mask_old]
df['oos'] = mod.oos [predict_mask_old]
df.head()

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18,f19,f20,f21,f22,f23,f24,f25,f26,f27,f28,f29,f30,f31,f32,f33,f34,f35,f36,f37,f38,f39,f40,f41,f42,f43,f44,f45,f46,f47,f48,f49,f50,f51,f52,f53,f54,f55,f56,f57,event_id,runner_id,result,start_time,jockey,trainer,prize,speed,is1,oos
0,-0.300425,0.395791,0.128154,0.007678,0.000409,0.266461,0.403035,0.102524,0.190081,-0.081118,0.496736,0.577068,0.566129,0.002369,0.557563,-0.290554,0.102632,0.158571,0.101999,-0.016289,0.012719,-0.165066,0.429672,0.075039,0.377461,-0.005094,-0.052737,0.004409,-0.097898,-0.04974,-0.408912,0.273634,-0.29821,0.004905,0.026865,-0.243684,0.13466,0.617767,2.384186e-07,0.529467,-0.11499,1.050827,0.430937,0.10248,-0.04097,0.731995,0.091984,-0.191077,0.255842,0.006521,0.612698,0.1218,-0.028377,0.176443,0.102635,0.102526,0.140143,289661,337075,7,1440263000.0,2813,177,7439.0,15.835739,False,False
1,-0.079047,0.734631,0.00082,0.002008,0.000409,0.586906,0.693549,0.109356,-0.381902,0.102185,0.001599,-0.222851,0.412028,0.002429,0.519179,0.006526,0.10947,0.118188,0.081228,-0.007011,0.052239,-0.234721,0.688754,-0.044445,0.800941,-0.004916,0.073925,-0.011162,0.038662,0.132302,-0.310634,0.278897,-0.177673,-0.057861,0.022425,-0.209739,-0.044735,0.323752,2.384186e-07,0.380274,0.255982,-2.658586,0.309631,-0.102876,-0.035557,-0.431108,0.091984,-0.058425,0.422626,0.146689,-0.136156,0.095385,-0.644119,-0.140514,0.109473,0.109355,-0.186209,289661,343042,-2,1440263000.0,8918,64,7439.0,13.955957,False,False
2,0.285734,0.734631,0.078515,0.00051,0.000409,0.356276,0.00803,-0.023606,0.000329,0.555535,-0.108936,0.577068,0.108821,0.01847,0.520955,-0.030059,-0.023637,0.013305,-0.025985,-0.016289,0.045093,0.264268,-0.008505,-0.029066,0.303144,-0.004725,0.360712,0.002138,0.038662,-0.221067,0.573874,-0.008419,0.297319,0.069789,-0.000165,0.458564,0.14028,0.34693,2.384186e-07,0.250397,0.402266,1.076248,-0.074991,-0.050474,-0.00812,0.731995,-0.253881,0.130537,-0.206888,0.192649,0.612698,0.001878,1.135393,-0.087405,-0.023638,-0.023604,0.013511,289661,281324,6,1440263000.0,5398,4782,7439.0,15.902196,False,False
3,0.172759,0.234369,0.100693,0.002008,0.000409,0.136677,0.464283,0.009444,0.095405,0.69394,-0.033834,0.365992,0.017717,-0.065297,0.380561,-0.096452,0.009449,0.010176,0.013376,-0.018866,-0.019429,0.084561,0.079373,0.080374,-0.532077,0.047428,0.177036,0.004347,0.078543,0.033728,0.455001,0.238922,0.13171,-0.139804,-0.021004,0.060606,0.106962,0.255895,2.384186e-07,0.20284,0.108072,1.144194,0.249019,-0.004037,0.009909,0.195334,0.091984,0.109085,0.225671,-0.148319,-0.136156,-0.119264,0.83203,0.035556,0.009448,0.009451,-0.052432,289661,301349,1,1440263000.0,8890,1366,7439.0,15.992388,False,False
4,-0.412245,0.122855,-0.225543,0.004341,0.000409,-0.517045,0.255158,0.069344,-0.095044,-0.173903,-0.421254,-0.61783,-0.257109,-0.052952,0.068361,0.089372,0.069417,0.066034,0.100479,-0.008628,-0.02205,0.159811,0.058405,0.080832,0.130999,-0.00505,-0.13487,-0.006987,-0.097898,-0.140647,0.167489,0.031722,0.096828,0.004905,0.044129,-0.069157,0.000311,0.180541,2.384186e-07,0.093072,-0.336946,-2.776815,0.309631,-0.164607,-0.011642,-0.431108,0.117755,-0.22477,0.093832,0.271197,-0.136156,-0.023747,-0.343946,0.017772,0.069418,0.069352,-0.190724,289661,314101,-2,1440263000.0,10205,1782,7439.0,13.955957,False,False


In [14]:
def convert_result(data, r_max =np.inf):
    """ 
    convert result to other format
    for example : 1, 2, 3, 4, 5 , -1,  => 1, 2, 3, 4, 4, 4 if r_max = 4
    data > pd.Series with results
    r_max > maximum place or result that consider as unique 
    """
    
    _max = data.max()
    data = data.apply(lambda x: np.where(x>0, x, _max+1))
    return data.apply(lambda x: np.where(x <=r_max, x, r_max))

###### считаем новый рейтинг лошади

In [15]:
def new_R_Elo (Em_data, S_data, K =20):
    """
    new Elo rating for player
    Em_data > dataframe of the math expected each player win from another with old Elo rating
    S_data  > dataframe of the score each player win from another really
    K       > coffissient 
    return new Elo rating for player as pd.Series
    """
    
    return ((S_data -Em_data)*K).sum(axis =1).rename('rating')#.to_frame()

In [43]:
player = 'runner_id' # choose the player from (runner_id, jockey, trainer)
draw = 0.5 # ничья
loss = 0.0 #  проигрыш
p = 10 # основание степени
N = 400 # коэффициент уменьшения разности рейтингов
f_score = 'speed'# or 'result'
p_N = np.log(p)/N

In [44]:
mask = (df['result'] > 0) & (df['result'] <= 4) #consider only runners with result from 1 to 4

In [45]:
df.loc[~mask, 'speed'] = np.nan

###### считаем сколько очков заработала лошадь в попарном соревновании за один забег

###### считаем мат ожидание что лошадь в попарном соревновании выиграет у другой

###### считаем новый рейтинг лошади

In [46]:
# pd.Series for rating of player, starts Elo rating for players
df_rating = pd.Series(index = df[player].unique(), name ='rating').fillna(200)
df2 = df.copy()
step = True

In [24]:
def matrix_score (data):
    """
    data - 1D array 
    return the matrix with pair difference between all elements of the array
    """
    n = len(data)
    _result = np.ones((n, n), float) *data
    return _result.T - _result

In [47]:
%%time
# for speed difference
f_score = 'speed'
from itertools import combinations

# loop for all event
for event, sub_df in df[['event_id', player, f_score]].groupby('event_id'):
    
    sub_df[f_score] = sub_df[f_score].transform(lambda x: x/x.max())
    sub_df.loc[sub_df[f_score].isnull(), f_score] = sub_df[f_score].min()
    #sub_df.sort_values('result', inplace =True) # sort by result 
    #delate duplicate player
    sub_df.drop_duplicates(player, inplace =True)
    # create the dataframe  with score of results
    _players = sub_df[player].values
    df_score = pd.DataFrame(matrix_score(sub_df[f_score].values), index = _players, columns = _players) 
    # create the matrix with calculate rating 
    _R_em = 1 /(1 + np.exp( p_N * matrix_score(df_rating[sub_df[player]].values)))
    # create the dataframe  with math expected  of win
    df_Em    = pd.DataFrame(np.tril(1 - np.triu(_R_em).T,  -1) + np.triu(_R_em), index = _players, columns = _players)
    
    # convert score result
    K = - 180 
    df_score = df_score * K + draw
    df_score[df_score < 0]  = loss
    df_score[df_score > 1]  = np.power(df_score , 1./4) 
    
    df_ = new_R_Elo(df_Em, df_score) # calculate the change of the Elo rating
    df_rating[df_.index] += df_ # update the Elo rating
    
    df_ = df_.to_frame()
    df_['event_id'] = event
    df_.set_index('event_id', append =True, inplace =True) # DataFrame with new rating
    
    if step:
        df2 = df2.join(df_, on =[player,'event_id']).fillna(0)
    else:
        df2 = df2.join(df_, on =[player,'event_id'], rsuffix ='_new').fillna(0)
        df2['rating'] = df2['rating'] + df2['rating_new']
        #df2['rating'] = df2['rating'].combine_first(df2['rating_new'])
        df2.drop('rating_new', axis =1, inplace =True)
    step = False

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


CPU times: user 1h 13min 25s, sys: 2.87 s, total: 1h 13min 28s
Wall time: 1h 13min 1s


In [24]:
"""
%%time
from itertools import combinations

# loop for all event
for event, sub_df in df.groupby('event_id')[[player, f_score]]:
    
    sub_df['result'] = sub_df[f_score].transform(lambda x: convert_result(x, 4))
    sub_df.sort_values('result', inplace =True) # sort by result 
    #delate duplicate player
    sub_df.drop_duplicates(player, inplace =True)
    # create the dataframe  with score of results
    _players = sub_df[player].values
    df_score = pd.DataFrame(matrix_score(sub_df['result'].values), index = _players, columns = _players) 
    # create the matrix with calculate rating 
    _R_em = 1 /(1 + np.exp( p_N * matrix_score(df_rating[sub_df[player]].values)))
    # create the dataframe  with math expected  of win
    df_Em    = pd.DataFrame(np.tril(1 - np.triu(_R_em).T,  -1) + np.triu(_R_em), index = _players, columns = _players)
    
    # convert score result
    df_score[df_score == 0] = draw
    df_score[df_score < 0]  = loss
    df_score[df_score > 1]  = np.power(df_score , 1./4) 
    
    df_ = new_R_Elo(df_Em, df_score) # calculate the change of the Elo rating
    df_rating[df_.index] += df_ # update the Elo rating
    
    df_ = df_.to_frame()
    df_['event_id'] = event
    df_.set_index('event_id', append =True, inplace =True) # DataFrame with new rating
    
    if step:
        df2 = df2.join(df_, on =[player,'event_id']).fillna(0)
    else:
        df2 = df2.join(df_, on =[player,'event_id'], rsuffix ='_new').fillna(0)
        df2['rating'] = df2['rating'] + df2['rating_new']
        #df2['rating'] = df2['rating'].combine_first(df2['rating_new'])
        df2.drop('rating_new', axis =1, inplace =True)
    step = False
"""

"\n%%time\nfrom itertools import combinations\n\n# loop for all event\nfor event, sub_df in df.groupby('event_id')[[player, f_score]]:\n    \n    sub_df['result'] = sub_df[f_score].transform(lambda x: convert_result(x, 4))\n    sub_df.sort_values('result', inplace =True) # sort by result \n    #delate duplicate player\n    sub_df.drop_duplicates(player, inplace =True)\n    # create the dataframe  with score of results\n    _players = sub_df[player].values\n    df_score = pd.DataFrame(matrix_score(sub_df['result'].values), index = _players, columns = _players) \n    # create the matrix with calculate rating \n    _R_em = 1 /(1 + np.exp( p_N * matrix_score(df_rating[sub_df[player]].values)))\n    # create the dataframe  with math expected  of win\n    df_Em    = pd.DataFrame(np.tril(1 - np.triu(_R_em).T,  -1) + np.triu(_R_em), index = _players, columns = _players)\n    \n    # convert score result\n    df_score[df_score == 0] = draw\n    df_score[df_score < 0]  = loss\n    df_score[df_sc

In [48]:
rating = lambda x: 1.25 -0.25* x if (x >0 )& (x  <4) else 0
rating = lambda x: 1./x/x if (x >0 )& (x  <10) else 0.1
#df['rating'] = df['result'].apply(rating)
print 'correlation old rating and  ELO rating   ',df['result'].apply(rating).corr(df2['rating'])

correlation old rating and  ELO rating    -0.54000094407


In [49]:
df['rating_r'] = df['result'].apply(rating)
df2['rating_r'] = df.groupby(player)['rating_r'].cumsum()/(1+df.groupby(player).cumcount())
df2['rating_r'] = df.groupby(player)['rating_r'].shift().fillna(0)

In [50]:
df2['rating_s'] = df2.groupby(player)['rating'].shift().fillna(0)

In [51]:
df2['dif_rating_s'] = df2.groupby(player)['rating_s'].diff().fillna(0)
df2['dif_rating_r'] = df2.groupby(player)['rating_r'].diff().fillna(0)
df2.tail()

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18,f19,f20,f21,f22,f23,f24,f25,f26,f27,f28,f29,f30,f31,f32,f33,f34,f35,f36,f37,f38,f39,f40,f41,f42,f43,f44,f45,f46,f47,f48,f49,f50,f51,f52,f53,f54,f55,f56,f57,event_id,runner_id,result,start_time,jockey,trainer,prize,speed,is1,oos,rating_r,rating,rating_s,dif_rating_s,dif_rating_r
219779,-0.001791,0.305292,0.002202,-0.04358,0.000176,-0.830468,0.546406,0.189813,0.08874,-0.054661,-0.355233,0.636184,-0.37376,0.019384,0.005445,-0.167969,0.189934,0.170681,0.201674,0.013882,0.043565,-0.262285,-0.314926,-0.054621,-0.199356,-0.001768,-0.203631,-0.001118,0.057663,0.247347,0.075659,-0.19457,0.036453,-0.047364,0.129998,-0.142565,-0.004153,0.011022,1.192093e-07,0.25928,0.049065,0.049889,0.070893,-0.095818,0.031717,0.258435,0.098857,0.009017,-0.094818,-0.271917,-0.10662,-0.044507,0.044462,-0.001848,0.189937,0.189809,-0.273905,348044,309510,4,1491325000.0,10830,12095,2911.0,14.738428,False,True,0.111111,103.901566,77.932782,72.66197,-0.138889
219780,-0.001791,0.158784,-0.094404,0.034454,-0.001054,0.38303,-0.021071,-0.450935,0.240886,0.245064,-0.009981,-0.43083,0.09118,0.050764,0.107631,-0.061776,-0.451033,-0.482679,-0.376243,-0.022931,-0.054987,0.080867,0.966551,0.034155,0.082771,-0.002208,0.042784,0.00113,0.10602,-0.145781,0.055507,0.015852,-0.063126,-0.687815,-0.097964,-0.105352,-0.143244,-0.014678,1.192093e-07,0.25928,0.033785,-0.019956,0.106958,0.28357,-0.031777,-0.310286,0.098857,0.009175,0.15543,0.150893,-0.10662,0.092016,0.134832,0.022426,-0.451031,-0.450936,-0.081369,348044,134162,3,1491325000.0,19689,755,2911.0,14.820935,False,True,0.027778,-28.915965,38.492183,81.676245,-0.083333
219781,0.121731,-0.607752,0.112927,0.04038,0.000176,-0.811391,-0.353688,0.098327,-0.171542,-0.08816,-0.11526,0.15657,-0.142259,-0.269422,-0.365013,-0.124193,0.09837,0.038713,0.105367,0.082842,0.019753,0.203087,-0.604638,0.002315,-0.032396,-0.002944,0.289072,0.000932,-0.37102,0.060209,0.340068,-0.053617,0.105134,0.149609,0.030614,0.363646,0.091233,-0.329113,1.192093e-07,-0.365193,0.039436,-0.019956,-0.108792,-0.135291,0.037264,-0.310286,0.098857,-0.033725,-0.085344,-0.211192,-0.10662,0.004154,-0.135623,-0.107339,0.098368,0.098361,-0.049301,348044,167167,7,1491325000.0,7281,20559,2911.0,0.0,False,True,0.020408,59.894633,62.824269,-11.594957,0.004783
219782,-0.45911,-1.184132,0.069584,-0.114837,0.000176,0.713696,0.333819,-0.202482,-0.113766,-0.386204,0.39667,-0.607411,0.014962,0.044718,-0.437826,0.086846,-0.202673,-0.087505,-0.238376,-0.022931,0.008674,-0.305982,0.671716,-0.023509,0.040691,-0.001821,-0.163256,0.00026,-0.170459,0.13615,-0.56036,-0.15181,-0.202459,0.301643,-0.248122,-0.158359,0.072683,-0.12792,1.192093e-07,-0.675358,-0.778373,-0.019956,0.007761,0.053726,0.00609,-0.310286,0.098857,-0.266593,-0.093705,0.063148,-0.10662,-0.13552,-0.675076,-0.022018,-0.202678,-0.202493,-0.008656,348044,138643,5,1491325000.0,8704,23303,2911.0,0.0,False,True,0.020408,43.220814,48.929955,0.559967,-0.079592
219783,0.121731,-0.850804,-0.073879,0.04038,0.000176,-0.918695,0.22304,0.147873,-0.347024,0.089216,-0.177962,-0.66932,-0.015987,0.053083,-0.279684,-0.13365,0.147968,0.121095,0.195003,-0.014636,-0.011203,-0.142182,-1.057921,0.086318,0.01458,0.013167,-0.254831,0.001233,0.10602,-0.20373,0.35189,-0.199153,0.124085,0.060165,0.121153,-0.111032,-0.039353,-0.156794,1.192093e-07,-0.798513,0.093016,0.049889,-0.191833,-0.003674,-0.024356,-0.310286,-0.247143,0.106391,-0.093148,-0.328325,-0.10662,0.083268,-0.282246,-0.001047,0.147968,0.147897,0.082279,348044,311581,6,1491325000.0,19225,20927,2911.0,0.0,False,True,0.04,-2.453874,14.011646,14.509505,-0.06


In [52]:
df2.loc[df2.is1, :].head(10)

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18,f19,f20,f21,f22,f23,f24,f25,f26,f27,f28,f29,f30,f31,f32,f33,f34,f35,f36,f37,f38,f39,f40,f41,f42,f43,f44,f45,f46,f47,f48,f49,f50,f51,f52,f53,f54,f55,f56,f57,event_id,runner_id,result,start_time,jockey,trainer,prize,speed,is1,oos,rating_r,rating,rating_s,dif_rating_s,dif_rating_r
37795,0.066411,1.812371,0.140033,0.007517,-0.000144,0.370123,0.8869,-0.000888,1.492165,-0.504111,0.174814,0.783295,0.47842,0.02652,1.220996,0.317993,-0.000908,-0.000375,0.004848,-0.017851,0.043987,-0.237318,1.049784,-0.046792,-0.006285,0.000628,-0.065017,0.001235,-0.018441,0.274973,-0.487135,0.084544,-0.116107,0.002079,-0.001362,-0.160866,0.091133,0.966224,2.384186e-07,2.49246,0.753668,-0.000403,0.234473,0.250251,0.023256,0.628183,0.0986,-0.014886,-0.089427,0.53691,-0.012463,-0.143313,0.943943,0.417261,-0.000921,-0.00078,0.1193,293661,360456,3,1443704000.0,5870,5165,3235.0,15.735644,True,False,0.1,-32.410678,15.547942,78.557704,-0.15
37796,-0.154959,0.615217,-0.069783,-0.001667,0.001293,0.163312,-0.100933,-0.016348,0.145181,0.085654,0.057041,0.43098,0.132121,0.013572,0.413222,0.189882,-0.016198,-0.002962,0.014569,-0.015469,-0.026517,0.277365,-0.374422,0.029834,0.056561,-0.000221,0.116112,-0.002068,-0.024272,0.090061,-0.030274,-0.498561,0.035182,-0.01871,0.029094,0.031624,-0.061174,0.254454,2.384186e-07,1.648349,0.073634,-0.650115,0.177804,-0.052191,-0.034483,0.370725,-0.247407,-0.023127,0.550295,-0.121513,0.112166,0.040612,0.224962,0.057615,-0.016085,-0.01733,0.026977,293661,375590,5,1443704000.0,10816,448,3235.0,0.0,True,False,0.0,24.050933,0.0,0.0,0.0
37797,0.066411,0.442863,0.069257,0.007517,-0.000144,0.207937,-0.100933,0.014814,-0.010327,-0.566212,0.355908,0.246179,-0.305634,0.031401,0.364949,0.144203,0.014812,0.017871,0.009206,0.01274,-0.01444,0.264392,0.339009,0.039109,-0.006285,0.000204,0.173482,0.001554,0.055569,-0.092425,-0.26665,0.043125,0.001112,0.002079,-0.011063,0.171808,0.014969,0.155725,2.384186e-07,0.941642,0.146075,-0.109316,0.106961,0.005249,0.083641,0.144267,0.0986,0.08141,-0.152771,-0.183572,-0.012463,0.159303,0.48867,0.036059,0.0148,0.014925,0.051179,293661,374610,7,1443704000.0,10817,10804,3235.0,0.0,True,False,0.1,30.798762,23.639584,23.639584,0.1
37798,-0.154959,0.765992,0.129201,-0.02548,-0.000144,0.112316,-0.257098,0.012801,0.304235,0.226926,-0.260937,-0.048408,-0.043558,0.019547,0.54989,-0.284258,0.012796,0.00347,0.009206,0.011642,-0.004117,-0.122674,-0.421721,0.048495,-0.006285,-3.1e-05,0.239139,0.001421,-0.018441,-0.13433,0.250868,0.486319,0.086636,0.002079,-0.009765,0.1904,0.130183,0.447011,2.384186e-07,0.498932,-0.041847,0.597555,0.001817,-0.037324,-0.019826,-0.163311,-0.247407,-0.014886,0.461782,0.289931,-0.012463,0.088135,-0.126984,-0.495465,0.012784,0.012911,-0.262583,293661,373638,1,1443704000.0,63,64,3235.0,15.849259,True,False,0.0625,-85.059636,41.131703,41.131703,0.0625
37799,0.066411,0.329832,-0.093768,0.007517,-0.000144,0.112316,0.503109,-0.024129,1.263628,-0.359817,0.086009,-0.048408,-0.043558,0.03145,1.252166,0.158341,-0.024174,-0.019263,0.009206,0.01274,-0.03828,0.10589,0.209527,-0.033071,-0.006285,-9.9e-05,-0.037533,0.000975,0.097792,0.022945,-0.043148,-0.245328,-0.009627,0.002079,0.011331,-0.122763,-0.217356,1.080508,2.384186e-07,-0.004469,0.049962,0.495737,0.116301,-0.110655,-0.044347,-0.163311,0.0986,0.0239,-0.237007,-0.440662,-0.012463,-0.125458,0.136239,0.442488,-0.024187,-0.024024,0.267151,293661,347906,4,1443704000.0,10921,299,3235.0,15.712922,True,False,0.1,42.447358,48.378643,32.195684,0.0375
37800,0.066411,-0.189496,0.143254,0.007517,-0.000144,0.395783,-0.195529,0.010864,0.397088,0.822311,-0.239266,0.135545,-0.043558,0.00653,-0.148721,-0.060579,0.010858,0.010002,-0.037326,-0.008528,0.034087,0.015357,0.121058,-0.06734,-0.006285,-0.00036,0.014615,0.001481,-0.018441,-0.092425,0.528255,0.033845,0.118888,0.002079,-0.008533,0.306386,0.10787,-0.419146,2.384186e-07,-0.335775,0.083948,-0.298972,0.007959,0.028957,-0.041888,-0.163311,0.0986,-0.014886,-0.222176,0.455814,-0.012463,-0.104669,0.172006,-0.421129,0.010845,0.010975,-0.213364,293661,372674,8,1443704000.0,8471,10802,3235.0,0.0,True,False,0.027778,34.06262,35.05916,35.05916,-0.072222
37801,0.066411,0.329832,-0.069433,-0.02548,-0.000144,-0.278906,0.648995,-0.001422,0.847041,1.074346,0.23502,-0.288042,-0.043558,-0.06971,0.515679,0.110034,-0.001442,-0.01031,0.009206,0.000805,0.043767,0.159754,0.369388,-0.057384,-0.006285,0.000635,0.282944,0.000182,-0.018441,-0.005967,0.088356,0.036931,0.357333,0.002079,-0.001049,0.473467,0.103462,0.384612,2.384186e-07,-0.496545,0.295928,0.132444,0.476269,-0.082668,-0.012627,-0.163311,0.124609,-0.014886,-0.220523,0.305665,-0.012463,-0.033478,-0.167352,0.482425,-0.001455,-0.001313,0.433394,293661,365528,2,1443704000.0,5398,4782,3235.0,15.792453,True,False,0.012346,-69.371602,23.625285,23.625285,0.012346
37802,-0.154959,-0.786669,0.08905,0.007517,-0.000144,-0.676161,-0.523932,0.011826,-0.835812,-0.17177,-0.102708,-0.048408,-0.043558,0.010969,-1.046903,-0.152868,0.011821,0.011069,0.009206,0.011642,-0.071668,-0.415776,-0.773299,0.046501,-0.006285,-0.000331,-0.198829,0.000995,-0.018441,-0.019253,-0.012238,0.180918,-0.231889,0.002079,-0.009143,-0.661849,0.114869,-0.821292,2.384186e-07,-1.132221,-0.444124,-0.000403,-0.49931,-0.016908,-0.019826,-0.163311,-0.247407,-0.014886,-0.135985,-0.415647,-0.012463,-0.00025,-0.559134,-0.116951,0.011808,0.011937,-0.018297,293661,373315,6,1443704000.0,5024,943,3235.0,0.0,True,False,0.0625,36.094285,42.177769,42.177769,0.0625
37803,0.066411,-1.481173,-0.265544,0.007517,-0.000144,0.576175,-0.640655,0.002234,-0.646301,0.474308,0.310369,-0.381563,-0.043558,0.00653,-1.268393,-0.297752,0.002218,0.002681,-0.037326,-0.008528,-0.0087,0.058277,-0.316419,-0.010853,-0.006285,-7e-05,-0.325943,-0.001708,-0.018441,-0.092425,-0.059038,-0.088881,-0.294858,0.002079,-0.003216,-0.078288,-0.073353,-1.007668,2.384186e-07,-1.132221,-0.23146,0.132444,0.001817,-0.031762,0.042845,-0.163311,0.124609,-0.014886,-0.202853,-0.289713,-0.012463,0.114613,-0.207465,0.042755,0.002205,0.002343,-0.287694,293661,366092,10,1443704000.0,9356,3097,3235.0,0.0,True,False,0.012346,34.06262,35.05916,35.05916,0.012346
37804,0.066411,-1.83877,-0.072265,0.007517,-0.000144,-0.982893,-0.219925,-0.009753,-2.956896,-1.081632,-0.616251,-0.781167,-0.043558,-0.076809,-1.852885,-0.124998,-0.009782,-0.012181,0.009206,0.000805,0.04188,-0.105266,-0.202905,0.051502,-0.006285,-0.000352,-0.198971,-0.004066,-0.018441,0.048845,0.031004,-0.032912,0.053329,0.002079,0.003705,-0.149916,-0.210604,-1.040424,2.384186e-07,-2.480154,-0.685783,-0.298972,-0.62409,0.047053,0.023256,-0.163311,0.0986,0.007131,0.248665,-0.137214,-0.012463,0.004505,-0.904886,-0.445059,-0.009795,-0.009645,-0.116064,293661,359427,9,1443704000.0,9852,157,3235.0,0.0,True,False,0.1,29.699372,19.794038,19.794038,0.1


In [53]:
df2['rating_s'].describe()

count    219784.000000
mean          2.791806
std          61.362819
min        -574.896829
25%         -17.491175
50%           0.000000
75%          38.010958
max         373.040045
Name: rating_s, dtype: float64

In [33]:
from sklearn.preprocessing import scale, MinMaxScaler, MaxAbsScaler
#transform with min-max scale
#df2['rating_s_scale'] = MinMaxScaler().fit_transform(df2['rating_s'].values.reshape(-1,1)) 
#df2['dif_rating_s_scale'] = MinMaxScaler().fit_transform(df2['dif_rating_s'].values.reshape(-1,1))

In [54]:
df2['rating_s_scale'] = df2['rating_s'].groupby(df2.event_id).apply(lambda x: x/(x.max() -x.min()))
df2['dif_rating_s_scale'] = df2['dif_rating_s'].groupby(df2.event_id).apply(lambda x: x/(x.max() -x.min()))

In [34]:
#log rating
#df2['log_rating_s_scale'] = df2['rating_s_scale'].apply(np.log)
#df2['dif_rating_s_scale'] = MinMaxScaler().fit_transform(df2['dif_rating_s'].values.reshape(-1,1))

In [55]:
new_factors =  np.vstack((factors, 
    new_factors_array (df2.loc[df2.is1|df2.oos, ['rating_s_scale', 'dif_rating_s_scale']].values)))
new_factors.shape

(59, 1631851)

#### fit our model 

In [56]:
%time new_model_coefs, new_model_step1prob, new_model_step2prob, new_model_likelihood, inds \
    =   mod.fit_slices(tsav, new_factors, depth=3, lmbd=10, verbose=False, fit_afresh=True)

print 'new_model_likelihood'   
print new_model_likelihood
print '..................'

. . . . . . . . . . 10
CPU times: user 2min 48s, sys: 57.4 s, total: 3min 46s
Wall time: 1min 45s
new_model_likelihood
[[-1809.6054581  -1997.1291991  -1997.1291991 ]
 [-1815.2956033  -1993.38729757 -1993.38729757]
 [-1820.46573327 -1991.13279384 -1991.13279384]
 [-1825.04841825 -1978.80723507 -1978.80723507]
 [-1845.34679474 -1957.00310259 -1957.00310259]
 [-1863.07407718 -1951.28943996 -1951.28943996]
 [-1903.75005812 -1924.17873453 -1924.17873453]
 [-1921.12493606 -1902.65155706 -1902.65155706]
 [-1923.89396228 -1902.50302182 -1902.50302182]
 [-1928.8879264  -1895.42641802 -1895.42641802]
 [    0.             0.             0.        ]]
..................


In [35]:
df2['R_E_R'] = df2['rating_s_scale'] * df2['rating_r'] #product two rating - Elo and simple
df2['dif_R_E_R'] = df2.groupby(player)['R_E_R'].diff().fillna(0)

In [36]:
new_factors =  np.vstack((factors, 
    new_factors_array (df2[['R_E_R', 'dif_R_E_R']].values)))
new_factors.shape

(59, 1631851)

#### fit our model 

In [37]:
%time new_model_coefs, new_model_step1prob, new_model_step2prob, new_model_likelihood, inds \
    =   mod.fit_slices(tsav, new_factors, depth=3, lmbd=10, verbose=False, fit_afresh=True)

print 'new_model_likelihood'   
print new_model_likelihood
print '..................'

. . . . . . . . . . 10
CPU times: user 2min 57s, sys: 57.2 s, total: 3min 54s
Wall time: 1min 50s
new_model_likelihood
[[-1809.55102147 -1997.04476319 -1997.04476319]
 [-1815.24720634 -1993.29163788 -1993.29163788]
 [-1820.41859484 -1991.02572548 -1991.02572548]
 [-1825.01211387 -1978.70720667 -1978.70720667]
 [-1845.33768712 -1956.93724432 -1956.93724432]
 [-1863.06566797 -1951.22448469 -1951.22448469]
 [-1903.71443211 -1924.04991949 -1924.04991949]
 [-1921.10935738 -1902.5585017  -1902.5585017 ]
 [-1923.84416068 -1902.40578304 -1902.40578304]
 [-1928.87046354 -1895.32807994 -1895.32807994]
 [    0.             0.             0.        ]]
..................


#### fit the old model for compare

In [26]:
%time old_model_coefs, old_model_step1prob, old_model_step2prob, old_model_likelihood, inds \
    =   mod.fit_slices(tsav, factors, depth=3, lmbd=10, verbose=False, fit_afresh=True)

print 'old_model_likelihood'
print old_model_likelihood
print '..................'

. . . . . . . . . . 10
CPU times: user 3min 2s, sys: 1min 3s, total: 4min 5s
Wall time: 1min 53s
old_model_likelihood
[[-1809.6054581  -1997.1291991  -1997.1291991 ]
 [-1815.2956033  -1993.38729757 -1993.38729757]
 [-1820.46573327 -1991.13279384 -1991.13279384]
 [-1825.04841825 -1978.80723507 -1978.80723507]
 [-1845.34679474 -1957.00310259 -1957.00310259]
 [-1863.07407718 -1951.28943996 -1951.28943996]
 [-1903.75005812 -1924.17873453 -1924.17873453]
 [-1921.12493606 -1902.65155706 -1902.65155706]
 [-1923.89396228 -1902.50302182 -1902.50302182]
 [-1928.8879264  -1895.42641802 -1895.42641802]
 [    0.             0.             0.        ]]
..................


##### compare old model and models for each cluster

#### write the result in the file 

In [53]:
write_dic_to_simdata('simdata_Elo_factors2.p', new_model_step1prob, new_model_coefs, mod.oos, av =av)