In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from models import DecisionTreeRegressor_Modified,DecisionTreeClassifier_Modified
from sklearn.model_selection import StratifiedKFold,KFold
from my_forest import RandomForestClassifier_Modified
from tqdm.auto import tqdm
from configs import ionosphere,music, Wids2021,fetal_health
from sklearn.metrics import mean_squared_error, r2_score,roc_auc_score,accuracy_score
from skopt import gp_minimize, forest_minimize
from skopt.utils import use_named_args
from skopt.plots import plot_objective, plot_evaluations, plot_convergence, plot_regret
from skopt.space import Categorical, Integer, Real

In [2]:
def calculate_final_score_class(rf_clf,x_val,y_val,A,N,CFG,to_print=False):
    y_pred = rf_clf.predict_proba(x_val,A,N,to_print)
    auc = roc_auc_score(y_val,y_pred if CFG.multi_class else y_pred[:,1],multi_class='ovo' if CFG.multi_class else "raise")
    return auc

In [3]:
def optimize(space, rf_clf, x_val,y_val,CFG,fold,fun, n_calls=50):
    @use_named_args(space)
    def score(**params):
        final_score = fun(rf_clf,x_val,y_val,CFG=CFG, **params)
        return -final_score
    return gp_minimize(func=score, dimensions=space, n_calls=n_calls)

In [4]:
def predict_proba_with_tta(data, 
                           model, 
                           dummies = None, 
                           num_tta = 4, 
                           alpha   = 0.01, 
                           beta    = 0.01, 
                           seed    = 0):
    '''
    Predicts class probabilities using TTA.
    
    Arguments:
    - data (numpy array): data set with the feature values 
    - model (sklearn model): machine learning model
    - dummies (list): list of column names of dummy features
    - num_tta (integer): number of test-time augmentations
    - alpha (float): noise parameter for continuous features
    - beta (float): noise parameter for dummy features
    - seed (integer): random seed

    Returns:
    - array of predicted probabilities
    '''
    
    # set random seed
    np.random.seed(seed = seed)
    
    # original prediction
    preds = model.predict_proba(data.values) / (num_tta + 1)
     
    # select numeric features
    num_vars = [var for var in data.columns if data[var].dtype != 'object']
        
    # find dummies
    if dummies != None:
        num_vars = list(set(num_vars) - set(dummies))
    
    # synthetic predictions
    for i in range(num_tta):
        
        # copy data
        data_new = data.copy()
    
        # introduce noise to numeric vars
        for var in num_vars:
            data_new[var] = data_new[var] + alpha * np.random.normal(0, 1, size = len(data_new)) * data_new[var].std()
            
        # introduce noise to dummies
        if dummies != None:
            for var in dummies:
                probs = np.random.binomial(1, (1 - beta), size = len(data_new))
                data_new.loc[probs == 0, var] = 1 - data_new.loc[probs == 0, var]
            
        # predict probs
        preds_new = model.predict_proba(data_new) 
        preds    += preds_new / (num_tta + 1)
    
    # return probs
    return preds

In [13]:
def run_one_dataset(CFG,to_print=False):
    #load data
    skf = CFG.kfold(n_splits=CFG.n_folds, random_state=CFG.random_state,shuffle=True)
    df = CFG.df.fillna(0).reset_index(drop=True)
    if CFG.preprocces:
        df = CFG.preprocces(df)
    
    train_features = df.drop(CFG.label_col, axis=1)
    labels = df[CFG.label_col]
    N_df,fet = df.shape

    history = []
    y_preds= []
    y_preds2 = []
    y_true= []
    y_preds_without_augmentation = []

    skf_split = skf.split(train_features,labels) if CFG.kfold == StratifiedKFold else skf.split(train_features)
    for fold,(train_index, val_index) in enumerate(tqdm(skf_split)):
        # build model
        x_train,y_train = train_features.loc[train_index],labels.loc[train_index]
        x_val,y_val = train_features.loc[val_index],labels.loc[val_index]
        
        x_val_models = x_val.values
        
        rf_clf = CFG.model(random_state=CFG.random_state,min_samples_split=N_df//100,min_samples_leaf=N_df//100,max_leaf_nodes=300)
        rf_clf.fit(x_train,y_train)

        # call predict

        space = [Real(0.0, 0.7, name='A'),Integer(1, 2, name='N'),] # Bayesian optimization
        opt_result = optimize(space,rf_clf, x_val_models,y_val,CFG,fold,fun=calculate_final_score_class, n_calls=CFG.n_calls)
        A = opt_result.x[0]
        N = opt_result.x[1]
        print('A:',A,'N:',N)
        y_pred = rf_clf.predict_proba(x_val_models,A,N,to_print=False)

        y_pred2 =  predict_proba_with_tta(data    = x_val, 
                                                         model   = rf_clf, 
                                                         dummies = None,
                                                         num_tta = 5, 
                                                         alpha   = np.sqrt(len(x_train)) / 3000,
                                                         beta    = np.sqrt(len(x_train)) / 30000,
                                                         seed    = 1)

        y_pred_without_augmantation = rf_clf.predict_proba(x_val,0,1,to_print=to_print)


        auc = roc_auc_score(y_val,y_pred if CFG.multi_class else y_pred[:,1],multi_class='ovo' if CFG.multi_class else "raise")
        acc = accuracy_score(y_val.astype(int),y_pred.argmax(1).astype(int))
        
        auc2 = roc_auc_score(y_val,y_pred2 if CFG.multi_class else y_pred2[:,1],multi_class='ovo' if CFG.multi_class else "raise")
        acc2 = accuracy_score(y_val.astype(int),y_pred2.argmax(1).astype(int))
        

        auc_base = roc_auc_score(y_val,y_pred_without_augmantation if CFG.multi_class else y_pred_without_augmantation[:,1],multi_class='ovo' if CFG.multi_class else "raise")
        acc_base = accuracy_score(y_val.astype(int),y_pred_without_augmantation.argmax(1).astype(int))

        print('modified. auc:{auc:.6f}, acc:{acc:.6f}'.format(auc=auc,acc=acc))
        print('modified 2.   auc:{auc:.6f}, acc:{acc:.6f}'.format(auc=auc2,acc=acc2))
        print('normal.   auc:{auc:.6f}, acc:{acc:.6f}'.format(auc=auc_base,acc=acc_base))
        history.append({"model":rf_clf,'y_pred':y_pred,"y_pred_without_random":y_pred_without_augmantation,
                        "auc":auc,
                        "acc":acc,
                        "auc2":auc2,
                        "acc2":acc2,
                        "auc_base":auc_base,
                        "acc_base":acc_base,
                        "y_val":y_val,
                        "alpha":A,"iter":N,"opt_result":opt_result})
        y_preds.append(y_pred)

        y_preds2.append(y_pred2)
        
        y_true.append(y_val)

        y_preds_without_augmentation.append(y_pred_without_augmantation)

    cv_auc = roc_auc_score(np.concatenate(y_true),np.concatenate(y_preds) if CFG.multi_class else np.concatenate(y_preds)[:,1],multi_class='ovo' if CFG.multi_class else "raise")
    cv_acc = accuracy_score(np.concatenate(y_true).astype(int),np.concatenate(y_preds).argmax(1).astype(int))
    
    cv_auc2 = roc_auc_score(np.concatenate(y_true),np.concatenate(y_preds2) if CFG.multi_class else np.concatenate(y_preds2)[:,1],multi_class='ovo' if CFG.multi_class else "raise")
    cv_acc2 = accuracy_score(np.concatenate(y_true).astype(int),np.concatenate(y_preds2).argmax(1).astype(int))
    

    cv_auc_without_augmentation = roc_auc_score(np.concatenate(y_true),np.concatenate(y_preds_without_augmentation) if CFG.multi_class else np.concatenate(y_preds_without_augmentation)[:,1],multi_class='ovo' if CFG.multi_class else "raise")
    msg = f"datasets {CFG.d_name}, val oof auc score {cv_auc}, val oof auc2 score {cv_auc2},  auc score without random  - {cv_auc_without_augmentation}, total size {N_df}, num fet {fet}"
    print(msg)
    history.append(msg)
    
    return history,cv_auc,cv_acc

In [14]:
CFG = ionosphere()
history,cv_auc,cv_acc = run_one_dataset(CFG)

0it [00:00, ?it/s]

Print: False
Print: False
Print: False
Print: False
Print: False
Print: False
Print: False
Print: False
Print: False
Print: False
A: 0.07227894920279558 N: 1
Print: False
Print: False
Error when calling recurse_predict function. e:'>=' not supported between instances of 'numpy.ndarray' and 'str'
Error when calling predict_proba_one function
Error when calling predict_proba_rnd function
Error when calling predict_proba function


TypeError: '>=' not supported between instances of 'numpy.ndarray' and 'str'

In [None]:
# x_val

In [None]:
#load data
skf = CFG.kfold(n_splits=CFG.n_folds, random_state=CFG.random_state,shuffle=True)
df = CFG.df.fillna(0).reset_index(drop=True)
if CFG.preprocces:
    df = CFG.preprocces(df)

train_features = df.drop(CFG.label_col, axis=1)
labels = df[CFG.label_col]
N_df,fet = df.shape

history = []
y_preds= []
y_preds2 = []
y_true= []
y_preds_without_augmentation = []

skf_split = skf.split(train_features,labels) if CFG.kfold == StratifiedKFold else skf.split(train_features)
for fold,(train_index, val_index) in enumerate(tqdm(skf_split)):
    x_train,y_train = train_features.loc[train_index],labels.loc[train_index]
    x_val,y_val = train_features.loc[val_index],labels.loc[val_index]
    
    rf_clf = CFG.model(random_state=CFG.random_state,min_samples_split=N_df//100,min_samples_leaf=N_df//100,max_leaf_nodes=300)
    rf_clf.fit(x_train,y_train)
    
    y_pred = rf_clf.predict_proba(x_val.values,0.7,2,to_print=False)
    auc = roc_auc_score(y_val,y_pred if CFG.multi_class else y_pred[:,1],multi_class='ovo' if CFG.multi_class else "raise")
    print(auc)

In [None]:
x_val

In [13]:
for ind,x_row in enumerate(x_val.values):
    print('x_row',x_row)

x_row [True False 0.99539 -0.05889 0.85243 0.02306 0.83398 -0.37708 1.0 0.0376
 0.85243 -0.17755 0.59755 -0.44945 0.60536 -0.38223 0.84356 -0.38542
 0.58212 -0.32192 0.56971 -0.29674 0.36946 -0.47357 0.56811 -0.51171
 0.41078 -0.46168 0.21266 -0.3409 0.42267 -0.54487 0.18641 -0.453]
x_row [True False 1.0 -0.18829 0.93035 -0.36156 -0.10868 -0.93597 1.0 -0.04549
 0.50874 -0.67743 0.34432 -0.69707 -0.51685 -0.97515 0.05499 -0.62237
 0.33109 -1.0 -0.13151 -0.453 -0.18056 -0.35734 -0.20332 -0.26569 -0.20468
 -0.18401 -0.1904 -0.11593 -0.16626 -0.06288 -0.13738 -0.02447]
x_row [True False 0.50932 -0.93996 1.0 0.26708 -0.0352 -1.0 1.0 -1.0 0.43685
 -1.0 0.0 0.0 -1.0 -0.34265 -0.37681 0.03623 1.0 -1.0 0.0 0.0 0.0 0.0
 -0.16253 0.92236 0.39752 0.26501 0.0 0.0 1.0 0.23188 0.0 0.0]
x_row [False False 1.0 -1.0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 -1.0 -0.71875 1.0 0.0
 0.0 -1.0 1.0 1.0 1.0 -1.0 1.0 1.0 0.5625 -1.0 1.0 1.0 1.0 1.0 -1.0 1.0
 1.0 1.0 1.0]
x_row [True False 0.96071 0.07088 1.0 0.04296 1.0 0.09

In [11]:
CFG = music()
history,cv_auc,cv_acc = run_one_dataset(CFG)

0it [00:00, ?it/s]

Error when calling predict_proba function


IndexError: string index out of range