# ThoroughBet Simulation


## Load necessary modules

In [4]:
import numpy as np
import pickle

from utils import settings, timestamp, YEAR
from utils.arrayview import ArrayView, TimeseriesView


from prediction.models.preprocessing import Model
from prediction.models.prediction import factornames_trimmed
from prediction.models.parameters import factor_build_end

ImportError: cannot import name multiarray

## Load data

In [None]:
# Load the main ArrayView from 'brain_final2cut.av.bcolz', resolved through settings.paths.
av = ArrayView.from_file(settings.paths.join('brain_final2cut.av.bcolz'))

In [None]:
# Load the per-slice ArrayViews into a dict keyed by the slice number (0, 1, 2, ...).
tsav = {}
sl = 0
while True:
    try:
        tsav[sl] = ArrayView.from_file(settings.paths.join('brain_final2_slice_%s.av.bcolz' % sl))
    except ValueError:
        # Stop at the first slice number whose load raises ValueError
        # (presumably a missing file -- confirm against ArrayView.from_file).
        break
    sl += 1

## Preprocessing

In [None]:
# Build the preprocessing model on av; oos_start is set to one YEAR after factor_build_end.
mod = Model(av, oos_start=factor_build_end+YEAR)

In [None]:
high_kurtosis_factors =  ['z64f5be67e', 'z90adc182a', 'z7081bf371', 'z34b808e99', 'z757be272e', 'z5a85cd6a9',
                         'zf991b634a', 'z62651f605', 'zd002b7067', 'z2ef7fedca', 'z6f11029f7', 'z412893062',
                          'z919b9585a', 'z89b0eda37', 'z31780b3f4', 'z6631693d3', 'z0b27f29ad', 'zd7cd94e4c', 
                          'zf5b2aef2a']

In [None]:
price_factors = ['zb392bb74a', 'z6809c316d', 'zd678f0538', 'z027f9f0f5', 'z88e79930c', 'z4a72dc02f',
                 'z1a3573928', 'z7b15df227']

In [None]:
# Preprocess the trimmed factor set, passing the high-kurtosis and price factor lists;
# the IPython %time magic reports how long the call takes.
%time factors = mod._preprocess_factors(factornames_trimmed, high_kurtosis_factors = high_kurtosis_factors,\
                                        price_factors = price_factors, verbose=True)

In [None]:
factors.T.shape

In [None]:
# Rows used for prediction: union of the model's is1, is2 and oos masks
# (presumably the two in-sample periods plus the out-of-sample period -- confirm in Model).
predict_mask = mod.is1|mod.is2|mod.oos
# Rows used for training: is1 and is2 only.
train_mask = mod.is1|mod.is2
train_event_id = av.event_id[train_mask]
predict_event_id = av.event_id[predict_mask]
# Sanity check: mask lengths, number of selected rows, and number of distinct training events.
len(predict_mask), len(train_mask), train_mask.sum(), predict_mask.sum(), len(train_event_id), len(np.unique(train_event_id))

In [None]:
import pandas as pd
pd.set_option('display.max_columns', 60)

In [None]:
# Column names f1..f57, one per factor (assumes `factors` has exactly 57 rows -- TODO confirm).
col_names = ['f{}'.format(i) for i in range(1,58)]
# Keep only the predict_mask columns of `factors` and transpose so each DataFrame row is one observation.
df = pd.DataFrame(data =factors[:, predict_mask].T , columns = col_names)

In [None]:
# Attach race/runner metadata and the sample-membership flags to df, every
# array being restricted to the predict_mask rows.
# The column order is significant: a later cell slices the frame from 'is2' onwards.
extra_columns = [
    ('event_id', av.event_id),
    ('runner_id', av.runner_id),
    ('result', av.result),
    ('is1', mod.is1),
    ('is2', mod.is2),
    ('oos', mod.oos),
    ('time', av.start_time),
    ('obstacle', av.obstacle),
    ('going', av.going),
    ('speed', av.speed),
    ('distance', av.distance),
    ('prize', av.prize),
]
for column_name, source in extra_columns:
    df[column_name] = source[predict_mask]

df.head()

In [None]:
#df.to_csv('/home/oleg/thbmodel/racehorse_data2.csv')

- группировка по забегам
- df_f: факторы, сгруппированные по забегу; для каждого фактора вычисляется средняя разница между соседними отсортированными значениями в забеге, (max − min) / count
- df1: сгруппированы переменные, общие для всех участников забега

In [None]:
# Per-race spread of a factor: (max - min) divided by the number of non-null values in the race.
fun_event = lambda x: (x.max()-x.min())/x.count()
# .loc replaces the deprecated .ix indexer (label-based selection, same result here).
df_f = df.loc[:,u'f1'].groupby(df['event_id']).apply(fun_event)

In [None]:
# Apply the same per-race statistic to the remaining factors (f2..f57) and join all
# columns with a single concat, instead of re-copying the growing frame on every iteration.
# .loc replaces the deprecated .ix indexer.
df_f = pd.concat(
    [df_f] + [df.loc[:, f].groupby(df['event_id']).apply(fun_event)
              for f in df.loc[:, u'f2':u'f57'].columns],
    axis=1)

In [None]:
df_f.head()

In [None]:
# Race-level frame: for the columns from 'is2' onwards, take the first row of each race.
# .loc replaces the deprecated .ix indexer.
df1 = df.loc[:,u'is2':].groupby(df['event_id']).first()
# Add the number of non-null 'result' entries per race (presumably the field size).
df1 = df1.join(df['result'].groupby(df['event_id']).count())
df1.head()

In [None]:
df_f.shape, df1.shape

##### Кластеризуем алгоритмом BayesianGaussianMixture

In [None]:
from sklearn.mixture import BayesianGaussianMixture

cl_algor = []
score_list =[]
for eps in [None, 5e-3, 3e-2, 0.1, 0.2, 0.4, 1.0, 10., 1e+2, 1e+3, 1e+5]:
    if eps is None:
        BGM = BayesianGaussianMixture(n_components=12 )
        #BGM = BayesianGaussianMixture(n_components=12 , weight_concentration_prior_type ='dirichlet_distribution')
        eps = 'def'
    else:
        BGM = BayesianGaussianMixture(n_components=12, weight_concentration_prior= eps)
        #BGM = BayesianGaussianMixture(n_components=12, weight_concentration_prior_type ='dirichlet_distribution', 
                                      #weight_concentration_prior= eps) 
                                      
    BGM.fit(df_f.ix[df1.is2,u'f1':u'f57'].values)
    
    al_name = 'BGM_p_{}'.format(eps)
    
    df1[al_name] = BGM.predict(df_f.ix[:,u'f1':u'f57'].values)
    score = BGM.score(df_f.ix[:,u'f1':u'f57'].values)
    cl_algor.append(al_name)
    score_list.append((score, al_name))
    print 'score  ',score
    
    print df1[al_name].value_counts()
    print

- отсортируем в порядке возрастания likelihood

In [None]:
# Order the fitted variants by ascending score and keep only their names.
cl_algor = [name for _score, name in sorted(score_list, key = lambda pair: pair[0])]
cl_algor

In [None]:
# Keep four variants: the two lowest-scoring and the two highest-scoring ones
# (cl_algor is ordered by ascending score at this point).
cl_algor = cl_algor[:2] + cl_algor[-2:] # 4 with max and min score
cl_algor

- переименуем кластеры в порядке убывания числа точек в них

In [None]:
# Relabel the numeric clusters with letters: value_counts() lists labels by
# decreasing frequency, so 'a' becomes the most populated cluster, 'b' the next, etc.
cluster_letters = ['a', 'b', 'c', 'd', 'e', 'f','g','h', 'j', 'q', 'p', 's']
for al in cl_algor:
    labels_by_size = df1[al].value_counts().index
    dic = dict(zip(labels_by_size, cluster_letters))
    df1[al] = df1[al].replace(dic)
    # Mirror the relabelled column into the per-race factor frame.
    df_f[al] = df1[al]
    print(df1[al].value_counts())

In [None]:
# Cluster-size cut-off: cluster_lists() only keeps clusters with more than this many races.
threshold = 400
# Persist the per-race factor statistics together with the relabelled cluster columns.
df_f.to_csv(settings.paths.join('clusterin_data_BGM_p.csv'))
#with open (settings.paths.join('clusterin_data_kMM.pkl'.format(al)), 'wb') as data:
            #pickle.dump( df_f, data)

- сравним разные алгоритмы на совпадение кластеров

In [None]:
from itertools import combinations

# For every pair of selected clusterings, count the races that received the same
# letter in both; pairs agreeing on more than 14000 races are recorded as similar.
simmilar_algorithm = []
for ag1, ag2 in combinations(cl_algor, 2):
    numbe_simmilar = (df1[ag1] == df1[ag2]).sum()
    print('algorithms {}  = {}  in  {} case  from  {}'.format(ag1,ag2, numbe_simmilar, len(df1[ag1])))
    print('')
    if numbe_simmilar > 14000:
        simmilar_algorithm.append((ag1,ag2))

In [None]:
simmilar_algorithm

#### Clustering Outlier

In [None]:
from prediction.models import clmodel
from prediction.tools.helpers import strata_scale_down

# Keep private copies of the model's sample masks: step2a() overwrites
# mod.is1 / mod.is2 / mod.oos for every cluster it fits.
is1 = mod.is1.copy()
is2 = mod.is2.copy()
oos = mod.oos.copy()
# Strata derived from the full event_id array.
# NOTE: `strata` is rebound later in the notebook from av.event_id[predict_mask].
strata = strata_scale_down(av.event_id)

In [None]:
def time_sets_end (data, cl, time = 'time', end = (0.75, 0.85), threshold = 400):
    """
    Find a cut-off time that is shared by all sufficiently large clusters.

    For every cluster of ``data[cl]`` with more than ``threshold`` rows, the
    times are sorted and the values at the ``end[0]`` and ``end[1]`` fractions
    are taken.  The common window runs from the largest lower value to the
    smallest upper value; its midpoint is returned.  If the per-cluster
    windows do not intersect, a message is printed and the smallest upper
    value is returned instead.

    <data> - pandas DataFrame
    <cl>  - column with names of clusters for each event_id
    <time> - column with the start's time for each event_id
    <end> - (low, high) fractions of points that must lie before the cut-off
    <threshold> minimum number of points in the cluster if the cluster is counted

    Raises ValueError when no cluster has more than ``threshold`` points.
    """
    cluster_sizes = data[cl].value_counts()
    big_clusters = cluster_sizes.index[cluster_sizes > threshold]
    if len(big_clusters) == 0:
        raise ValueError('no cluster in {!r} has more than {} points'.format(cl, threshold))

    time_end_min = []
    time_end_max = []

    for cluster in big_clusters:
        # The positional lookups below are quantiles only if the times are in
        # ascending order, so sort explicitly instead of relying on row order.
        time_list = sorted(data[time][data[cl] == cluster].tolist())
        last = len(time_list) - 1
        # Clamp the index so that a fraction of 1.0 selects the last element.
        time_end_min.append(time_list[min(int(len(time_list) * end[0]), last)])  # earliest allowed train end
        time_end_max.append(time_list[min(int(len(time_list) * end[1]), last)])  # latest allowed train end

    time_min = np.max(time_end_min)
    time_max = np.min(time_end_max)

    if time_min <= time_max:
        return (time_min + time_max) / 2.

    print('not itersection of time sets for {}  {}'.format(cl, end))
    return time_max

In [None]:
def cut(data, cl, time = 'time', train = (0.75, 0.85), val = (0.5, 0.65), threshold = 400, verbose=False):
    """ 
    разбивает выборку на train, validation, test так чтобы доля точек в train set была в пределах 'train'
    а оставшиеся точки разбиваются на validation & test в соотношении 'val',
    при этом время всех точек по наростающей
    <data> - pandas dataFrame
    <cl>  - column with names of clusters for each event_id
    <time> - column with the start's time  for each event_id 
    <train> - part of points  for train
    <val> -  part of points  for validation from (all set - train)
    <threshold> minimum number of points in the cluster if the cluster is counte
    """
    
    train_event, val_event, test_event =[], [],[]
    
    time_train_end = time_sets_end( data, cl , end =train, threshold = threshold) # время  окончания трайн
    train_event = data.index[data[time] <= time_train_end].tolist() 
    
    df_val = data[data[time] > time_train_end]
    time_test_start = time_sets_end(df_val , cl, end =val) # время  окончания validation
    
    if verbose:
        print 'time train end', time_train_end
        print 'time test start', time_test_start
    val_event = data.index[(data['time'] >time_train_end) & (data['time'] <= time_test_start)].tolist()
    test_event = data.index[data['time'] > time_test_start].tolist()
    
    return (train_event, val_event, test_event)

In [None]:
# Rows of av that start on or after 2015-08-01.
ts_mask = av.start_time >= float(timestamp('2015-08-01'))

# Restrict every slice view to the rows that are also in predict_mask
# (assumes the slice views cover exactly the ts_mask rows -- TODO confirm).
# NOTE: re-running this cell masks the already-masked views again.
slice_rows = predict_mask [ts_mask]
for k in tsav:
    tsav[k] = tsav[k][slice_rows]

In [None]:
def step2a(data, cluster_list, is1, oos, av =av, tsav = tsav, factors = factors, mod = mod, verbose=False):
    """
    Fit the slice model separately on the races of each cluster.

    For every cluster in ``cluster_list`` the model masks are narrowed to the
    rows whose event_id belongs to that cluster (mod.is1 and mod.is2 both
    become ``is1 & cluster rows``, mod.oos becomes ``oos & cluster rows``) and
    mod.fit_slices() is run with depth=3, lmbd=10, fit_afresh=True.

    NOTE: mod.is1 / mod.is2 / mod.oos are left pointing at the last cluster's
    masks when the function returns; the caller has to restore them.

    <data> pandas Series: index = event_id, values = cluster names
    <cluster_list> clusters to fit
    <is1>, <oos> boolean row masks aligned with av
    <verbose> print cluster size, event counts and the likelihood per cluster

    Returns four dicts keyed by cluster, holding the four values returned by
    mod.fit_slices(): coefs, step1 probabilities, step2 probabilities, likelihood.
    """
    model_coefs = {}
    model_step1prob = {}
    model_step2prob = {}
    model_likelihood = {}

    cluster_sizes = data.value_counts()

    for cluster in cluster_list:
        in_cluster = np.in1d(av.event_id, data.index[data == cluster])

        # is2 deliberately mirrors is1 here; two separate arrays are created.
        mod.is1 = is1 & in_cluster
        mod.is2 = is1 & in_cluster
        mod.oos = oos & in_cluster

        coefs, step1prob, step2prob, likelihood = mod.fit_slices(
            tsav, factors,  depth=3, lmbd=10, verbose=False, fit_afresh=True)
        model_coefs[cluster] = coefs
        model_step1prob[cluster] = step1prob
        model_step2prob[cluster] = step2prob
        model_likelihood[cluster] = likelihood

        if not verbose:
            continue
        print('cluster {}  number  {}'.format(cluster, cluster_sizes[cluster]))
        n_train = len(np.unique(av.event_id[mod.is1]))
        n_test = len(np.unique(av.event_id[mod.oos]))
        print('LL  {}          {}            {}'.format (n_train, n_train, n_test))
        print(likelihood)

    return model_coefs, model_step1prob, model_step2prob, model_likelihood

In [None]:
def cluster_lists(data, oos, threshold, min_test =5, verbose=False):
    """
    Select the clusters that are large enough and have enough test races.

    A cluster is kept when it has more than ``threshold`` entries in ``data``
    and at least ``min_test`` distinct event_ids fall inside ``oos``.
    Reads the module-level ``av``; ``verbose`` is accepted but not used.

    <data>  pandas Series with index are event_id , data are clusters names
    <oos>   test mask
    <threshold> minimum number of points in the cluster if the cluster is counted
    <min_test> minimum number of distinct test events required

    Returns the list of selected cluster names, in value_counts() order.
    """
    counts = data.value_counts()

    def n_test_events(cluster):
        # Distinct out-of-sample events that belong to this cluster.
        in_cluster = np.in1d(av.event_id, data.index[data == cluster])
        return len(np.unique(av.event_id[oos & in_cluster]))

    return [cluster for cluster in counts.index[counts > threshold]
            if n_test_events(cluster) >= min_test]

In [None]:
# Driver cell: fit one model per cluster for each selected clustering variant,
# then fit the reference model without clustering.
#cluster_model_coefs, cluster_model_step1prob, cluster_model_step2prob, cluster_model_likelihood = {}, {}, {}, {}
#old_model_coefs, old_model_step1prob, old_model_step2prob, old_model_likelihood= {}, {}, {}, {}

# Clusters kept for fitting, per clustering variant.
cluster_list ={}

for al in cl_algor:
    print al
    
    cluster_list[al] = cluster_lists(df1[al], oos, threshold = threshold)
    
    # NOTE(review): the four cluster_model_* names are rebound on every iteration,
    # so after the loop they only hold the results of the LAST entry of cl_algor.
    # The per-variant storage (commented out) and the pickling below (wrapped in a
    # string literal, hence never executed) are both disabled.
    #cluster_model_coefs[al], cluster_model_step1prob[al],  cluster_model_step2prob[al],\
    #cluster_model_likelihood[al]= step2 (df1[al], train_val_test[al], verbose =True)
    cluster_model_coefs, cluster_model_step1prob, cluster_model_step2prob, cluster_model_likelihood \
    =    step2a (df1[al], cluster_list[al], is1, oos, verbose =True)
    
    print 
    """
    with open (settings.paths.join('clusterin_{}_cofs.pkl'.format(al)), 'wb') as f_cof:
            pickle.dump( cluster_model_coefs, f_cof)
            
    with open (settings.paths.join('clusterin_{}_step1prob.pkl'.format(al)), 'wb') as f_1prob:
            pickle.dump( cluster_model_step1prob, f_1prob)
            
    with open (settings.paths.join('clusterin_{}_step2prob.pkl'.format(al)), 'wb') as f_2prob:
            pickle.dump( cluster_model_step2prob, f_2prob)
    """      
# Restore the model masks that step2a() narrowed to the last cluster;
# is2 is set equal to is1, as in step2a().
mod.is1 = is1
mod.is2 = is1
mod.oos = oos
    
# Reference fit on all races, without clustering.
#old_model_coefs[al], old_model_step1prob[al], old_model_step2prob[al],\
#old_model_likelihood[al] = mod.fit_slices(tsav, factors, depth=3, lmbd=10, verbose=False, fit_afresh=True)
old_model_coefs, old_model_step1prob, old_model_step2prob, old_model_likelihood \
    =   mod.fit_slices(tsav, factors, depth=3, lmbd=10, verbose=False, fit_afresh=True)
    

print 'No clustering '
print 'LL'
print old_model_likelihood
"""
with open (settings.paths.join('old_model_cofs.pkl'), 'wb') as f_cof:
    pickle.dump( old_model_coefs, f_cof)
            
with open (settings.paths.join('old_model_step1prob.pkl'), 'wb') as f_1prob:
    pickle.dump( old_model_step1prob, f_1prob)
            
with open (settings.paths.join('old_model_step2prob.pkl'), 'wb') as f_2prob:
    pickle.dump( old_model_step2prob, f_2prob)"""

In [None]:
# Mean of every factor statistic within each cluster of the default-prior variant.
# .loc replaces the deprecated .ix indexer.
df_f.loc[:,'f1':'f57'].groupby(df_f['BGM_p_def']).mean()

In [None]:
%pylab inline
col = 'BGM_p_def'
df__f = df_f[(df_f[col]== 'a') |(df_f[col] =='b') |(df_f[col] =='c') |(df_f[col] =='d')]
df__f['f25'].hist( bins = 60, by=df__f[col])

In [None]:
np.unique(df1['BGM_p_def'] ), old_model_step2prob.shape, cluster_model_step2prob.keys()

In [None]:
from utils.accumarray import uaccum
from prediction.tools.helpers import strata_scale_down
# Rebind `strata` to the predict_mask rows only (it was built from the full
# event_id array earlier); ll_diff() and ll_for_mix_clusters() capture this
# value as a default argument when they are defined.
strata = strata_scale_down(av.event_id[predict_mask])

In [None]:
def ll_diff (prob_new, prob_old, is1= is1, oos =oos, 
             av =av, tsav =tsav, strata = strata):
    """
    Log-likelihood difference between two sets of win probabilities.

    For each of the first 10 slices and each of the samples [is1, is2, oos]
    (is2 is taken equal to is1) the score is 1000 * mean(log p) over the
    winners (result == 1) with non-zero probability.  Only rows whose stratum
    has no NaN in log_pmkt_back / log_pmkt_lay of slice ``sl + 1`` are used.

    <prob_new>  probability of new model, indexed [slice, row of predict mask]
    <prob_old>  probability of old model, same layout
    <is1>, <oos> boolean row masks aligned with av
    <strata>    strata passed to uaccum for the per-stratum 'all' reduction

    Returns an (11, 3) array of (new - old) scores; the last row is never
    filled and stays zero.
    """
    n_slices = 10
    ll_new = np.zeros((n_slices + 1, 3))
    ll_base = np.zeros((n_slices + 1, 3))

    # is2 mirrors is1, so the prediction mask is simply is1 | oos.
    predict_mask = is1 | is1 | oos
    results = av.result[predict_mask]
    sample_masks = [is1[predict_mask], is1[predict_mask], oos[predict_mask]]

    def scaled_mean_log(prob):
        # Zero probabilities are dropped before taking the log.
        return np.mean(np.log(prob[prob != 0])) * 1000

    for sl in range(n_slices):
        quotes = tsav[sl + 1]
        has_quotes = ~(np.isnan(quotes.log_pmkt_back) | np.isnan(quotes.log_pmkt_lay))
        good = uaccum(strata, has_quotes, func='all')
        for col, in_sample in enumerate(sample_masks):
            rows = in_sample & good
            winners = results[rows] == 1
            ll_new[sl, col] = scaled_mean_log(prob_new[sl, rows][winners])   # LL new model
            ll_base[sl, col] = scaled_mean_log(prob_old[sl, rows][winners])  # LL old model

    return ll_new - ll_base

In [None]:
def ll_for_each_cluster (data,  new_Model, old_Model, is1 =is1, oos =oos, best ='train', not_list =np.array([]), 
                         av =av, verbose=False):
    """ 
    строим новую модель заменяя в старой один кластер 
    расчитываем изменеие LL в среднем по train, validation, test 
    отбираем кластеры по лучшему среднему улучшению для train or validation
    <data> - pandas Series with index are event_id , data are clusters names 
    <new_Model>  wins probability for each cluster 
    <old_Model>  wins probability for no cluster 
    <train_val_test>  event_id for train , validation , test 
    <best>  the choose from train or validadion
    <not_list>  the list with clusters that to exclude from model
    """
    
    if best == 'train':
        best = 0
    else:
        best = 1
    
    
    mean_new =[]
    is2 = is1
    predict_mask = is1|is2|oos
    
    for cluster in new_Model.keys():
        
        if not cluster in not_list:
            cluster_mask = np.in1d(av.event_id[predict_mask], data.index[data == cluster])
            prob_mix = np.where(cluster_mask , new_Model[cluster] , old_Model )
        

            diff_new_old = ll_diff(prob_mix, old_Model)

            print 'cluster ', cluster
            if verbose:
                print diff_new_old
    
            mean_ll_diff = diff_new_old[:10].mean(axis =0)
            print 'mean  ', mean_ll_diff
            if mean_ll_diff[best] > 0:
                mean_new.append((cluster,mean_ll_diff[best]))
    cl_list = [x[0] for x in sorted(mean_new , key = lambda x: x[1], reverse =True)]
    return cl_list, mean_new

In [None]:
def ll_for_mix_clusters (data, cl_list, new_Model, old_Model, is1 =is1, oos =oos, best ='test', 
                         av =av, tsav =tsav, strata =strata, predict_mask =predict_mask, verbose=False):
    """ 
    строим новую модель заменяя в старой некоторые кластеры из списка полученного от  ll_for_each_cluster()
    расчитываем изменеие LL в среднем по train, validation, test 
    печатаем лучшее улучшение для test
    <data> - pandas Series with index are event_id , data are clusters names
    <cl_list> list for mix clusters
    <new_Model>  wins probability for each cluster 
    <old_Model>  wins probability for no cluster 
    <train_val_test> event_id for train , validation , test 
    <best>  the choose from train or validadion
    """
    
    cl_lists = []
    is2 = is1
    predict_mask = is1|is2|oos
    
    for i in range(len(cl_list)):
        cl_lists.append(cl_list[:i+1])
        
    if best =='test':
        best = 2
    elif best =='val':
        best = 1
    else:
        best = 0

    best_ll = 0.
    best_mix = None
    
    for list_ in cl_lists:
        print list_
        prob_mix = old_Model
        
        for cluster in list_:

                cluster_mask = np.in1d(av.event_id[predict_mask], data.index[data == cluster])
                prob_mix = np.where(cluster_mask , new_Model[cluster],  prob_mix)
                
        diff_new_old = ll_diff(prob_mix, old_Model)
        if verbose:
            print '    train       validation      test'
            print diff_new_old      # differance between mix  and old model
        mean_ll_diff = diff_new_old[:10].mean(axis =0)
        print 'mean ', mean_ll_diff
        if mean_ll_diff[best] > best_ll:
            best_mix = list_
            best_ll = mean_ll_diff[best]


    return best_mix, best_ll

- отбор лучших кластеров по обучающей выборке (train)

In [None]:
# Driver cell: for every clustering variant, rank the clusters by their individual
# gain and then look for the best combination of top-ranked clusters.
# NOTE(review): cluster_model_step2prob is the same object for every `al` below
# (the per-variant reload from pickle is commented out) -- confirm that it really
# corresponds to the variant being evaluated.
cl_list = {}
cl_means ={}
#threshold = 100
#cl_algor = ['BGM_0.001','BGM_0.1', 'BGM_1.0', 'kMM']
for al in cl_algor:
    print al
    
    #with open (settings.paths.join('clusterin_{}_step2prob.pkl'.format(al)), 'rb') as f_2prob:
            #cluster_model_step2prob = pickle.load( f_2prob)
    #with open (settings.paths.join('old_model_step2prob.pkl'), 'rb') as f_2prob:
            #old_model_step2prob = pickle.load( f_2prob)
    
    # Clusters with a positive mean gain, best first, plus the (cluster, gain) pairs.
    cl_list[al], cl_means[al] = ll_for_each_cluster (df1[al],  cluster_model_step2prob, old_model_step2prob)
    # Prints the whole dict accumulated so far, not only the entry for `al`.
    print cl_list
    print 
for al in cl_algor:
    
    #with open (settings.paths.join('clusterin_{}_step2prob.pkl'.format(al)), 'rb') as f_2prob:
            #cluster_model_step2prob = pickle.load( f_2prob)
    #with open (settings.paths.join('old_model_step2prob.pkl'), 'rb') as f_2prob:
            #old_model_step2prob = pickle.load( f_2prob)
         
    # Best prefix of the ranked clusters, chosen by the default criterion (best='test').
    best_cl, best_score = ll_for_mix_clusters (df1[al], cl_list[al], cluster_model_step2prob, old_model_step2prob)
    print 
    print 'for {}  best clusters from train score are {}  best score on test {}'.format(al, best_cl, best_score)
    print 

##### write the simdata file

In [None]:
def dic_to_tenzor(dic, key, base):
    '''
    Stack the no-cluster matrix and the selected cluster matrices into one
    3-D array whose first dimension is the cluster number
    (0 = no_cluster, i + 1 = key[i]).

    <dic> dictionary cluster's data that to convert in tenzor (2-D matrices)
    <key> the list of clusters that use
    <base> no cluster data

    Returns the array of shape (len(key) + 1, rows, cols), or None (after
    printing a message) when <base> does not fit the matrix shape.
    Raises ValueError when <dic> is empty, since the shape is taken from one
    of its matrices.
    '''
    if not dic:
        raise ValueError('dic must contain at least one cluster matrix')

    # Any entry will do as the shape template; all matrices are expected to agree.
    sample = dic[next(iter(dic))]
    tenzor = np.zeros((len(key) + 1, sample.shape[0], sample.shape[1]))
    try:
        tenzor[0, :, :] = base
    except (ValueError, TypeError):
        # Only assignment failures (shape or type mismatch) are handled here;
        # anything else, e.g. KeyboardInterrupt, propagates.
        print('base and dic[k] have the diferent size')
        return None
    for i, k in enumerate(key):
        tenzor[i + 1, :, :] = dic[k]
    return tenzor

In [None]:
def clusters_number(data, key, av=av):
    """
    Number every row of av by the cluster its event belongs to.

    Rows whose event_id is in cluster key[i] get the number i + 1; rows that
    belong to none of the listed clusters keep 0.

    <data>  pandas Series index = event_id, data = cluster's names
    <key> the list of clusters that use

    Returns a float array with one entry per element of av.event_id.
    """
    event_ids = av.event_id
    numbers = np.zeros(len(event_ids))
    for number, name in enumerate(key, 1):
        members = data.index[data == name]
        numbers = np.where(np.in1d(event_ids, members), number, numbers)
    return numbers

In [None]:
def write_simdata(step1probs, oos, coefs, cluster_number=None, file_ = 'simdata.p'):
    '''
    Pickle the out-of-sample part of the step-1 probabilities together with
    the mask, the coefficients and the cluster numbers.

    <step1probs> is expected to be a matrix N_slices x len(av)
                 (with a leading cluster dimension when <cluster_number> is given).
    <oos> is a boolean mask denoting the out of sample range. len(oos) shoud equal len(av)
    <coefs> is a coefficient matrix with the size N_slices x 3
    <cluster_number> is an integer array with the cluster numbers per race. Size: len(av)
    <file_> file name, resolved through settings.paths

    The pickled object is the list [step1probs restricted to oos, oos, coefs,
    cluster_number restricted to oos (or None)].
    '''
    # Slice first, so that a shape error does not leave a truncated file behind.
    if cluster_number is None:
        s1p = step1probs[:, oos]
    else:
        cluster_number = cluster_number[oos]
        s1p = step1probs[:, :, oos]

    # The context manager closes the file even if pickling fails.
    with open(settings.paths.join(file_), 'wb') as f:
        pickle.dump([s1p, oos, coefs, cluster_number], f)

In [None]:
def write_dic_to_simdata(file_name, old_step1probs, old_coefs, oos, data=None, av =av,
                         cluster_step1probs =None, cluster_coefs =None, cluster_names =None):
    """
    Pickle the simulation input for the old model, optionally combined with
    per-cluster models.

    Without <cluster_names> the old matrices are wrapped in a leading
    dimension of length 1 and every row gets cluster number 0.  With
    <cluster_names> the matrices are stacked by dic_to_tenzor() (index 0 =
    old model, i + 1 = cluster_names[i]) and each row gets the number of the
    cluster its event belongs to.

    <file_name> is name of file to record
    <old_step1probs> is expected to be a matrix N_slices x len(av)
    <old_coefs> is a coefficient matrix with the size N_slices x 3
    <oos> is a boolean mask denoting the out of sample range. len(oos) shoud equal len(av)
    <data>  pandas Series index = event_id, data = cluster's names
    <cluster_step1probs> is expected to be a dictionary: key is the cluster name and
                        data are the matrix N_slices x len(av) for each cluster
    <cluster_coefs> is a dictionary : key is the cluster name and
                        data are the coefficient matrix with the size N_slices x 3
    <cluster_names> the list of clusters to include

    The pickled object is the list [step-1 probabilities restricted to oos,
    oos, coefficients, cluster numbers restricted to oos].
    """
    cl_number = np.zeros((len(av.event_id)))

    if cluster_names is None:
        # Old model only: add a leading "cluster" axis of length 1.
        s1prob = np.zeros((1, old_step1probs.shape[0], old_step1probs.shape[1]))
        s1prob[0, :, :] = old_step1probs
        coef_s = np.zeros((1, old_coefs.shape[0], old_coefs.shape[1]))
        coef_s[0, :, :] = old_coefs
    else:
        s1prob = dic_to_tenzor(cluster_step1probs, cluster_names, old_step1probs)
        coef_s = dic_to_tenzor(cluster_coefs, cluster_names, old_coefs)

        # Number the rows by cluster; rows outside the listed clusters stay 0.
        for number, name in enumerate(cluster_names, 1):
            in_cluster = np.in1d(av.event_id, data.index[data == name])
            cl_number = np.where(in_cluster, number, cl_number)

    payload = [s1prob[:, :, oos], oos, coef_s, cl_number[oos]]

    with open (settings.paths.join(file_name), 'wb') as f:
        pickle.dump(payload, f)

In [None]:
cl_algor

In [None]:
threshold

In [None]:
al, cl = 'BGM_p_def' , ['h', 'b', 'q', 'a', 'c', 'e', 'f', 'd', 'j']

In [None]:
file_write = 'simdata{}_{}.p'.format(al,''.join(cl))

In [None]:
al, 'clusterin_{}_cofs.pkl'.format(al), file_write

In [None]:
import pickle

# Reload the per-cluster coefficients and step-1 probabilities saved for variant `al`.
# NOTE(review): the code that writes these files is disabled in the training cell
# (wrapped in a string literal), so they must come from an earlier run -- confirm
# they match the current clustering. pickle.load must only be used on trusted files.
    
with open (settings.paths.join('clusterin_{}_cofs.pkl'.format(al)), 'rb') as f_cof:
    cluster_model_coefs = pickle.load( f_cof)
            
with open (settings.paths.join('clusterin_{}_step1prob.pkl'.format(al)), 'rb') as f_1prob:
    cluster_model_step1prob = pickle.load(f_1prob)
    
#with open (settings.paths.join('old_model_cofs.pkl'.format(al)), 'rb') as f_cof:
    #old_model_coefs = pickle.load( f_cof)
            
#with open (settings.paths.join('old_model_step1prob.pkl'.format(al)), 'rb') as f_1prob:
    #old_model_step1prob = pickle.load(f_1prob)
            

In [None]:
# Write the simulation file that combines the old model with the cluster models listed in `cl`.
write_dic_to_simdata(file_write, old_model_step1prob, old_model_coefs, oos, data=df1[al], av =av,\
        cluster_step1probs = cluster_model_step1prob, cluster_coefs = cluster_model_coefs, cluster_names = cl)

In [None]:
# Write the reference simulation file for the old model only (no clusters).
write_dic_to_simdata('simdata_Oldmodel.p', old_model_step1prob, old_model_coefs, oos)