# Imports

In [1]:
import pandas as pd
import random
import numpy as np
import catboost as cb
from catboost import Pool
import xgboost as xgb
import matplotlib
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error, ConfusionMatrixDisplay
from sklearn.model_selection import ParameterSampler, RandomizedSearchCV
from scipy.stats.distributions import expon
from scipy.stats import uniform
from sklearn.metrics import ConfusionMatrixDisplay
import multiprocessing
import os
import glob
import re
import h2o
import tqdm
import pickle
from autoxgb import AutoXGB
from autoxgb.cli.predict import PredictAutoXGBCommand
from optuna.samplers import TPESampler
import optuna
from sklearn.metrics import mean_squared_error as mse
SEED = 42

%matplotlib notebook
# Function to seed everything
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
seed_everything(SEED)

In [2]:
from autoxgb import AutoXGB

# Functions

In [3]:
def look_at_anti_dist(all_ASR, col_name, col_order=None):
    col_dist = all_ASR.groupby(by='antibiotic_name')[col_name].apply(lambda x: x.value_counts()).reset_index()
    col_dist.columns = ['antibiotic_name', col_name, 'count']
    order = col_dist.groupby(by='antibiotic_name')['count'].apply(sum).sort_values().index
    col_dist = pd.pivot_table(col_dist, values='count', index=['antibiotic_name'],
                    columns=[col_name], aggfunc=np.sum).fillna(0)
    if col_order is not None:
        col_dist = col_dist[col_order]
    ax = col_dist.loc[order].plot.barh(stacked=True, rot=0, figsize=(20,15))
    plt.title('Distribution of ' + col_name+' for each anti-biotics')
    plt.ylabel('antibiotics')
    plt.xlabel('# of measurements')
    plt.legend(loc='lower right')

In [4]:
def print_anti_measure(all_ASR, anti_index):
    anti = all_ASR['antibiotic_name'].value_counts().index[anti_index]
    anti_MIC = all_ASR[all_ASR['antibiotic_name'] == anti]
    anti_MIC['measurement'] = anti_MIC['measurement'].apply(np.log2)
    low = anti_MIC['measurement'].min().round()
    high = anti_MIC['measurement'].max().round()
    hist_range = np.arange(low-0.5, high+1, 1)
    bins_count = pd.DataFrame(anti_MIC.groupby(by='measurement_sign')['measurement'].apply(lambda x: np.histogram(x, bins=hist_range)[0]))
    bins_count = bins_count.merge(pd.DataFrame({'fill': [np.zeros(len(hist_range)-1)]}, index=['=', '<=', '>=', '<', '>']), left_index=True, right_index=True, how='right')
    bins_count['measurement'].fillna(bins_count['fill'], inplace=True)
    pd.DataFrame(bins_count['measurement'].tolist(), index= bins_count.index, columns=hist_range[:-1]+0.5).T.plot.bar(stacked=True)
    plt.title(anti)
    plt.xlabel('log2(mg//L)')
    plt.ylabel('#')


In [5]:
def get_filtered_data(
    data = 'tot_filtered_data.csv', 
    features = 'final_features',
    ASR_data = 'filtered_ASR_data.csv', 
    species_sep = True, 
    species_filter_index=0, 
    naive=True, 
    test_range=False, 
    antibiotic_index=0,
    task='regression', 
    strip_range_train=False,
    distance_range_train=False,
    range_moved=5,
):
    data = pd.read_csv('../resources/'+data)
    with open("../resources/"+ features, "rb") as fp:
        features = pickle.load(fp)
    ASR_data = pd.read_csv('../resources/'+ASR_data)
    
    
    
    species2merge = data[['biosample_id', 'species_fam']]
    filtered_ASR = ASR_data.drop('species_fam', axis=1).merge(species2merge, on='biosample_id')
    filtered_ASR.set_index('biosample_id', inplace=True)
    filtered_ASR = filtered_ASR[filtered_ASR['units']=='mg/L']
    filtered_ASR = filtered_ASR[filtered_ASR['ast_standard']=='CLSI']
    filtered_ASR = filtered_ASR[filtered_ASR['species_fam']!='senterica']
    filtered_ASR = filtered_ASR[filtered_ASR['species_fam']!='spneumoniae']
    data.set_index('biosample_id', inplace=True)
    data.drop(['Unnamed: 0', 'species_fam', 'run_id'], axis=1, inplace=True)
            
    if species_sep:
        species = filtered_ASR['species_fam'].value_counts().reset_index()['index'].iloc[species_filter_index]
        filtered_ASR = filtered_ASR[filtered_ASR['species_fam'] == species]
    else:
        species = None
    
    if test_range:
        test_ASR = filtered_ASR[filtered_ASR['measurement_sign']!='=']    
        
    if naive:
        filtered_ASR = filtered_ASR[filtered_ASR['measurement_sign']=='=']
    
    anti_list = filtered_ASR['antibiotic_name'].value_counts().index.values
    label = anti_list[antibiotic_index]
    
    y = data.loc[filtered_ASR[filtered_ASR['antibiotic_name'] == label].index][label]
    
    if naive:
        if task == 'regression':
            y = y.apply(lambda x: float(x.split(' ')[1]))
        elif task == 'classification':
            y = y.apply(lambda x: str(x.split(' ')[1]))
            
    else:
        if task == 'classification':
            y=y
        elif task == 'regression':
            if strip_range_train:
                y = y.apply(lambda x: float(x.split(' ')[1]))
            elif distance_range_train:
                signs = y.apply(lambda x: str(x.split(' ')[0]))
                y = y.apply(lambda x: float(x.split(' ')[1]))
                
            else:
                print('regression not in the naive approach is not implemented yet.') 
        
            
    
    if test_range:
        range_test_values = data.loc[test_ASR[test_ASR['antibiotic_name'] == label].index][label]
        range_labels = pd.DataFrame({
            'values':[],
            'direction': [],
        })
        range_labels['values'] = range_test_values.apply(lambda x: float(x.split(' ')[1]))
        range_labels['direction'] = range_test_values.apply(lambda x: x.split(' ')[0].replace('=', ''))
    else:
        range_labels = None
        X_range = None
        
    
    X = data.loc[filtered_ASR[filtered_ASR['antibiotic_name'] == label].index][features]
    X.dropna(axis=1, how='all', inplace=True)
    X.fillna(0, inplace=True)
    
    if test_range:
        train_features = X.columns.values
        X_range = data.loc[test_ASR[test_ASR['antibiotic_name'] == label].index][train_features]
        X_range.fillna(0, inplace=True)
    
    return X, y, X_range, range_labels, list(train_features), label, species

In [6]:
def get_filtered_data_multi_anti(
    data = 'tot_filtered_data.csv', 
    features = 'final_features',
    ASR_data = 'filtered_ASR_data.csv', 
    species_sep = True, 
    species_filter_index=0, 
    naive=True, 
    test_range=False, 
    task='regression', 
    strip_range_train=False,
    distance_range_train=False,
    range_moved=5,
    filter_antibiotics_size = 30,
):
    data = pd.read_csv('../resources/'+'tot_filtered_data.csv')
    with open("../resources/"+ 'final_features', "rb") as fp:
        features = pickle.load(fp)
    ASR_data = pd.read_csv('../resources/'+'filtered_ASR_data.csv')



    species2merge = data[['biosample_id', 'species_fam']]
    filtered_ASR = ASR_data.drop('species_fam', axis=1).merge(species2merge, on='biosample_id')
    filtered_ASR.set_index('biosample_id', inplace=True)
    filtered_ASR = filtered_ASR[filtered_ASR['units']=='mg/L']
    filtered_ASR = filtered_ASR[filtered_ASR['ast_standard']=='CLSI']
    filtered_ASR = filtered_ASR[filtered_ASR['species_fam']!='senterica']
    filtered_ASR = filtered_ASR[filtered_ASR['species_fam']!='spneumoniae']
    data.set_index('biosample_id', inplace=True)
    data.drop(['Unnamed: 0', 'species_fam', 'run_id'], axis=1, inplace=True)

    if species_sep:
        species = filtered_ASR['species_fam'].value_counts().reset_index()['index'].iloc[species_filter_index]
        filtered_ASR = filtered_ASR[filtered_ASR['species_fam'] == species]
    else:
        species = None

    if test_range:
        test_ASR = filtered_ASR[filtered_ASR['measurement_sign']!='=']    

    if naive:
        filtered_ASR = filtered_ASR[filtered_ASR['measurement_sign']=='=']

    anti_list = filtered_ASR['antibiotic_name'].value_counts()[filtered_ASR['antibiotic_name'].value_counts() > filter_antibiotics_size].index.values
    filtered_ASR = filtered_ASR[filtered_ASR['antibiotic_name'].apply(lambda x: x in anti_list)]
    filtered_ASR = filtered_ASR.reset_index().set_index(['biosample_id', 'antibiotic_name']).drop('Unnamed: 0', axis=1)

    if test_range:
        test_ASR = test_ASR[test_ASR['antibiotic_name'].apply(lambda x: x in anti_list)]
        test_ASR = test_ASR.reset_index().set_index(['biosample_id', 'antibiotic_name']).drop('Unnamed: 0', axis=1)

    data = data.melt(
        id_vars=features,
        ignore_index=False,
        value_vars=anti_list,
        var_name='antibiotic_name',
        value_name='measurement',
    ).dropna(axis=0, subset=['measurement'])

    data = data.reset_index().set_index(['biosample_id', 'antibiotic_name'])
    label = 'measurement'
    y = data.loc[filtered_ASR.index]['measurement']

    if naive:
        if task == 'regression':
            y = y.apply(lambda x: float(x.split(' ')[1]))
        elif task == 'classification':
            y = y.apply(lambda x: str(x.split(' ')[1]))
    else:
        if task == 'classification':
            y=y
        elif task == 'regression':
            if strip_range_train:
                y = y.apply(lambda x: float(x.split(' ')[1]))
            elif distance_range_train:
                signs = y.apply(lambda x: str(x.split(' ')[0]))
                y = y.apply(lambda x: float(x.split(' ')[1]))

            else:
                print('regression not in the naive approach is not implemented yet.') 


    if test_range:
        range_test_values = data.loc[test_ASR.index]['measurement']
        range_labels = pd.DataFrame({
            'values':[],
            'direction': [],
        })
        range_labels['values'] = range_test_values.apply(lambda x: float(x.split(' ')[1]))
        range_labels['direction'] = range_test_values.apply(lambda x: x.split(' ')[0].replace('=', ''))
    else:
        range_labels = None
        X_range = None


    X = data.loc[filtered_ASR.index][features]
    X.dropna(axis=1, how='all', inplace=True)
    X.fillna(0, inplace=True)

    if test_range:
        train_features = X.columns.values
        X_range = data.loc[test_ASR.index][train_features]
        X_range.fillna(0, inplace=True)
        X_range = X_range.reset_index().set_index('biosample_id')
    
    train_id, test_id = train_test_split(list(set(y.index.get_level_values(0).values)), test_size=0.2, random_state=42)
    X_train = X.loc[train_id,].reset_index().set_index('biosample_id')
    X_test = X.loc[test_id,].reset_index().set_index('biosample_id')
    y_train = y.loc[train_id,].reset_index().set_index('biosample_id')
    y_test = y.loc[test_id,].reset_index().set_index('biosample_id')

    return X_train, y_train, X_test, y_test, X_range, range_labels, list(train_features)+['antibiotic_name'], label, species

# Building models

## CatBoost

In [7]:
def objective(trial):
    X_train, y_train, X_test, y_test, X_range, y_range, features, label, species = get_filtered_data_multi_anti(
        data = 'tot_filtered_data.csv', 
        features = 'final_features',
        ASR_data = 'filtered_ASR_data.csv', 
        species_sep = True, 
        species_filter_index=species_filter_index, 
        naive=naive, 
        strip_range_train=strip_range_train,
        test_range=test_range, 
        task='regression',
    )
    # X_train = np.array(X_train)
    y_train = np.array(y_train['measurement'])
    # X_test = np.array(X_test)
    y_test = np.array(y_test['measurement'])
    
    categorical_features_indices = np.where(X_train.dtypes != np.float)[0]
    train_pool = Pool(X_train, y_train, cat_features=categorical_features_indices)
    test_pool = Pool(X_test, y_test, cat_features=categorical_features_indices)
    # Parameters
    params = set_param(trial)
    # Learning
    
    model = cb.CatBoostRegressor(
        loss_function="RMSE",
        eval_metric="RMSE",
        task_type="GPU",
        l2_leaf_reg=50,
        random_seed=SEED,
        border_count=64,
        cat_features = categorical_features_indices,
        **params
    )
    model.set_feature_names(features)
    model.fit(
        train_pool,
        eval_set=test_pool,
        use_best_model=True,
        verbose=500,
        plot=False,
    )
    # Predict
    preds = model.predict(test_pool)
    y_pred = np.rint(preds)
    
    # Evaluation
    rmse_test = mse(y_test, preds, squared=False)
    rmse_r_test = mse(y_test, y_pred, squared=False)
    print('rmse Score of CatBoost =', rmse_test)
    print('rounded rmse Score of CatBoost =', rmse_r_test)
    return rmse_r_test

In [8]:
def evaluate_best(trial, exp_name):
    # Use same code objective to reproduce the best model
    X_train, y_train, X_test, y_test, X_range, y_range, features, label, species = get_filtered_data_multi_anti(
        data = 'tot_filtered_data.csv', 
        features = 'final_features',
        ASR_data = 'filtered_ASR_data.csv', 
        species_sep = True, 
        species_filter_index=species_filter_index, 
        naive=naive, 
        strip_range_train=strip_range_train,
        test_range=test_range, 
        task='regression',
    )
    
    categorical_features_indices = np.where(X_train.dtypes != np.float)[0]
    train_pool = Pool(X_train, np.array(y_train['measurement']), cat_features=categorical_features_indices)
    test_pool = Pool(X_test, np.array(y_test['measurement']), cat_features=categorical_features_indices)
    range_pool = Pool(X_range, np.array(y_range['values']), cat_features=categorical_features_indices)
    
    # Parameters
    params = set_param(trial)
    # Learning
    model = cb.CatBoostRegressor(
        loss_function="RMSE",
        eval_metric="RMSE",
        task_type="GPU",
        l2_leaf_reg=50,
        random_seed=SEED,
        border_count=64,
        cat_features = categorical_features_indices,
        **params
    )

    # calculate more evaluation metrics
    model.set_feature_names(features)
    model.fit(
        train_pool,
        eval_set=test_pool,
        use_best_model=True,
        verbose=500,
        plot=False,
    )
    pickle.dump(params, open('../experiments/{}/CatBoost_Hyperparameter.pickle'.format(exp_name), 'wb'))
    
    # Predict
    train_preds = model.predict(train_pool)
    pickle.dump(train_preds, open('../experiments/{}/train_pred.pickle'.format(exp_name), 'wb'))
    pickle.dump(y_train, open('../experiments/{}/train_y.pickle'.format(exp_name), 'wb'))
    
    preds = model.predict(test_pool)
    pickle.dump(preds, open('../experiments/{}/test_pred.pickle'.format(exp_name), 'wb'))
    pickle.dump(y_test, open('../experiments/{}/test_y.pickle'.format(exp_name), 'wb'))
    y_pred = np.rint(preds)

    if test_range:
        range_pred = model.predict(range_pool)
        pickle.dump(range_pred, open('../experiments/{}/range_pred.pickle'.format(exp_name), 'wb'))
        pickle.dump(y_range, open('../experiments/{}/range_y.pickle'.format(exp_name), 'wb'))
        
    rmse_test = mse(np.array(y_test['measurement']), preds, squared=False)
    rmse_r_test = mse(np.array(y_test['measurement']), y_pred, squared=False)
        
    return rmse_test, rmse_r_test

In [9]:
species_filter_index_list = [0]
naive = False
strip_range_train=False
train_time = 60
test_range=True
distance_range_train=True
range_moved=5
exp_describtion = 'range_distanced_by{}'.format(range_moved)
n_trials = 1000
def set_param(trial):
    return {
        'iterations' : trial.suggest_int('iterations', 1000, 10000),                                              
        'depth' : trial.suggest_int('depth', 4, 10),                                                    
        'random_strength' :trial.suggest_int('random_strength', 0, 100),                       
        'bagging_temperature' :trial.suggest_loguniform('bagging_temperature', 0.01, 100.00),
        'learning_rate' :trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
        'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter'])
    }

In [10]:
studies = []
for species_filter_index in species_filter_index_list:
    X_train, y_train, X_test, y_test, X_range, y_range, features, label, species = get_filtered_data_multi_anti(
        data = 'tot_filtered_data.csv', 
        features = 'final_features',
        ASR_data = 'filtered_ASR_data.csv', 
        species_sep = True, 
        species_filter_index=species_filter_index, 
        naive=naive, 
        strip_range_train=strip_range_train,
        distance_range_train=distance_range_train,
        range_moved=range_moved,
        test_range=test_range, 
        task='regression',
    )
    exp_name = '{}_{}_for_{}_trials'.format(species, exp_describtion, n_trials)
    os.makedirs('../experiments/{}'.format(exp_name), exist_ok=True)
    study = optuna.create_study(direction = "minimize", sampler = TPESampler(seed=int(SEED)))
    study.optimize(objective, n_trials = n_trials, n_jobs = 1)#multiprocessing.cpu_count())
    evaluate_best(study.best_trial, exp_name)
    print('CatBoost Hyperparameter:', study.best_trial.params)
    studies.append(study)

  
  
[32m[I 2022-03-09 00:27:27,473][0m A new study created in memory with name: no-name-b1851624-c472-41e1-bbd0-e22a04479b68[0m
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()


regression not in the naive approach is not implemented yet.


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
[33m[W 2022-03-09 00:27:31,337][0m Trial 0 failed because of the following error: CatBoostError('catboost/private/libs/target/target_converter.cpp:35: Target value ">= 6.0" cannot be parsed as float')[0m
Traceback (most recent call last):
  File "/home/amitdanw/.conda/envs/myenv/lib/python3.7/site-packages/optuna/study/_optimize.py", line 213, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_1995/3355920613.py", line 41, in objective
    plot=False,
  File "/home/amitdanw/.conda/envs/myenv/lib/python3.7/site-packages/catboost/core.py", line 5302, in fit
    save_snapshot, snapshot_file, snapshot_interval, init_model, callbacks, log_cout, log_cerr)
  File "/home/amitdanw/.conda/envs/myenv/lib/python3.7/site-packages/catboost/core.py", line 2042, in _fit
    train_params["init_model"]
  File "/home/amitdanw/.conda/envs/myenv/lib/python3.7/site

CatBoostError: catboost/private/libs/target/target_converter.cpp:35: Target value ">= 6.0" cannot be parsed as float

## playing with autoxgb

In [None]:
for species_filter_index in species_filter_index_list:
    train, test, X_range, y_range, features, label, species = get_filtered_data_multi_anti(
        data = 'tot_filtered_data.csv', 
        features = 'final_features',
        ASR_data = 'filtered_ASR_data.csv', 
        species_sep = True, 
        species_filter_index=species_filter_index, 
        naive=naive, 
        strip_range_train=strip_range_train,
        test_range=test_range, 
        task='regression',
    )
    exp_name = '{}_{}_for_{}_min'.format(species, exp_describtion, train_time/60)
    train.to_csv('../resources/train_{}.csv'.format(exp_name))
    test.to_csv('../resources/test_{}.csv'.format(exp_name))
    X_range.to_csv('../resources/range_{}.csv'.format(exp_name))
    y_range.to_csv('../resources/y_range_{}.csv'.format(exp_name))
    pd.DataFrame({'features': features}).to_csv('../resources/features_{}.csv'.format(exp_name))
    pd.DataFrame({'label': [label]}).to_csv('../resources/label_{}.csv'.format(exp_name))

    # required parameters:
    train_filename = '../resources/train_{}.csv'.format(exp_name)
    output = '../experiments/{}'.format(exp_name)

    # optional parameters
    test_filename = '../resources/test_{}.csv'.format(exp_name)
    task = 'regression'
    idx = 'biosample_id'
    targets = [label]
    features = features
    categorical_features = None
    use_gpu = True
    num_folds = 5
    seed = 42
    num_trials = 100
    time_limit = train_time
    fast = False

    # Now its time to train the model!
    axgb = AutoXGB(
        train_filename=train_filename,
        output=output,
        test_filename=test_filename,
        task=task,
        idx=idx,
        targets=targets,
        features=features,
        categorical_features=None,
        use_gpu=use_gpu,
        num_folds=num_folds,
        seed=seed,
        num_trials=num_trials,
        time_limit=time_limit,
        fast=fast,
    )
    axgb.train()
    if test_range:
        PredictAutoXGBCommand('../experiments/{}'.format(exp_name), '../resources/range_{}.csv'.format(exp_name), '../experiments/{}/range_preds.csv'.format(exp_name)).execute()

In [None]:
# required parameters:
train_filename = '../resources/train_{}.csv'.format(exp_name)
output = exp_name

# optional parameters
test_filename = '../resources/test_{}.csv'.format(exp_name)
task = 'regression'
idx = 'unique_id'
targets = [y.name]
features = list(X.columns.values)
categorical_features = None
use_gpu = True
num_folds = 5
seed = 42
num_trials = 100
time_limit = 3600
fast = False

# Now its time to train the model!
axgb = AutoXGB(
    train_filename=train_filename,
    output=output,
    test_filename=test_filename,
    task=task,
    idx=idx,
    targets=targets,
    features=features,
    categorical_features=categorical_features,
    use_gpu=use_gpu,
    num_folds=num_folds,
    seed=seed,
    num_trials=num_trials,
    time_limit=time_limit,
    fast=fast,
)
axgb.train()

In [None]:
PredictAutoXGBCommand(exp_name, '../resources/range_{}.csv'.format(exp_name), '{}/range_preds.csv'.format(exp_name)).execute()

### exact results

In [None]:
# exp_name = 'largest_species_and_anti_train_striped_range_60min'

In [None]:
label = pd.read_csv('../resources/label_{}.csv'.format(exp_name)).loc[0, 'label']
y_range = pd.read_csv('../resources/y_range_{}.csv'.format(exp_name)).set_index('unique_id')

In [None]:
y = pd.read_csv('../resources/train_{}.csv'.format(exp_name)).set_index('unique_id')[label]
train_res = pd.read_csv('../notebooks/{}/oof_predictions.csv'.format(exp_name)).set_index('unique_id').merge(y, left_index=True, right_index=True, how='inner')
train_res = train_res.loc[set(train_res.index)-set(y_range.index)]
train_res.columns=['y_pred', 'y_true']
train_res['y_true'] = np.round(train_res['y_true'])
min_true = train_res['y_true'].min()
max_true = train_res['y_true'].max(axis=0)
train_res['y_pred'] = train_res['y_pred'].clip(lower=min_true, upper=max_true)
train_res['residual'] = train_res['y_true'] - train_res['y_pred']
train_res['y_pred'] = np.round(train_res['y_pred'])
train_res['round_residual'] = train_res['y_true'] - train_res['y_pred']
train_res.describe()

In [None]:
y = pd.read_csv('../resources/test_{}.csv'.format(exp_name)).set_index('unique_id')[label]
test_res = pd.read_csv('../notebooks/{}/test_predictions.csv'.format(exp_name)).set_index('unique_id').merge(y, left_index=True, right_index=True, how='inner')
test_res = test_res.loc[set(test_res.index)-set(y_range.index)]
test_res.columns=['y_pred', 'y_true']
test_res['y_true'] = np.round(test_res['y_true'])
min_true = test_res['y_true'].min()
max_true = test_res['y_true'].max(axis=0)
test_res['y_pred'] = test_res['y_pred'].clip(lower=min_true, upper=max_true)
test_res['residual'] = test_res['y_true'] - test_res['y_pred']
test_res['y_pred'] = np.round(test_res['y_pred'])
test_res['round_residual'] = test_res['y_true'] - test_res['y_pred']
test_res.describe()

In [None]:
for key, res in {'Train': train_res, 'Test': test_res}.items():
    print(key)
    print('RMSE: {}'.format(res['residual'].std()))
    print('RMSE after rounding: {}'.format(res['round_residual'].std()))

In [None]:
regression_res = pd.DataFrame({
    'exact RMSE': [train_res['residual'].std(), test_res['residual'].std()],
    'exact_rounded RMSE': [train_res['round_residual'].std(), test_res['round_residual'].std()],
}, index=['train', 'test'])
regression_res

In [None]:
for key, res in {'Train': train_res, 'Test': test_res}.items():
    titles_options = [
        (key+ " Confusion matrix, without normalization", None),
        (key+" Normalized confusion matrix", "true"),
    ]
    for title, normalize in titles_options:
        disp = ConfusionMatrixDisplay.from_predictions(
            np.round(res['y_true']),
            res['y_pred'],
            labels=np.sort(list(set(list(np.round(res['y_true']).unique())).union(set(list(np.round(res['y_pred']).unique()))))),
    #         display_labels=np.sort(list(train_res['y_true'].unique())),
            cmap=plt.cm.Blues,
            normalize=normalize,
        )
        disp.ax_.set_title(title)
        cm = pd.DataFrame(disp.confusion_matrix, index=disp.display_labels, columns=disp.display_labels)
        cm.columns.name='predicted_labels'
        cm.index.name='true_labels'
        cm.to_csv('../notebooks/{}/{}_df_confusion_matrix_{}.csv'.format(exp_name, key, normalize))
        plt.savefig('../notebooks/{}/{}_confusion_matrix_{}.png'.format(exp_name, key, normalize), format='png')
plt.show()

### range result

In [None]:
equal_meaning = True

In [None]:
range_res = pd.read_csv('../notebooks/{}/range_preds.csv'.format(exp_name)).set_index('unique_id').merge(y_range, left_index=True, right_index=True, how='inner')
range_res.columns=['y_pred'] + list(range_res.columns.values)[1:]
range_res['values'] = np.round(range_res['values'])
range_res['updated_values'] = np.nan
range_res['updated_direction'] = np.nan
if equal_meaning:
        range_res.loc[range_res['direction'] == '>=','updated_values'] = range_res['values'] - 1
        range_res.loc[range_res['direction'] == '<=','updated_values'] = range_res['values'] + 1
range_res.loc[range_res['direction'] == '>=','updated_direction'] = '>'
range_res.loc[range_res['direction'] == '<=','updated_direction'] = '<'

range_res.loc[:,'updated_values'].fillna(range_res['values'], inplace=True)
range_res.loc[:,'updated_direction'].fillna(range_res['direction'], inplace=True)

range_res.loc[range_res['updated_direction'] == '>','answer'] = (range_res['y_pred'] > range_res['updated_values'])
range_res.loc[range_res['updated_direction'] == '<','answer'] = (range_res['y_pred'] < range_res['updated_values'])
train_range_res = range_res.loc[set(range_res.index).intersection(set(train.index))]
test_range_res = range_res.loc[set(range_res.index) - set(train.index)]
for key, res in {'train': train_range_res, 'test': test_range_res}.items():
    range_confusion = res.groupby(by=['direction', 'values'])['answer'].agg(['count', 'sum']).replace(True, 1)
    range_confusion['perc'] = range_confusion['sum'] / range_confusion['count']
    range_confusion.columns = ['total', 'in range', 'accuracy']
    range_confusion = pd.DataFrame(range_confusion.stack()).T.swaplevel(i=2, j=0, axis=1)
    range_confusion.index=[key]
    regression_res = pd.concat([regression_res, range_confusion], axis=1)
regression_res_cleaned = pd.DataFrame({})
for col in regression_res.columns:
    if len(regression_res[[col]].columns) > 1:
        regression_res_cleaned[col] = regression_res[[col]].iloc[:,0].fillna(regression_res[[col]].iloc[:,1])
    else:
        regression_res_cleaned[col] = regression_res[[col]]
regression_res = regression_res_cleaned

In [None]:
regression_res

In [None]:
regression_res.to_csv('../notebooks/{}/df_regression_results.csv'.format(exp_name))

In [None]:
label

## Playing with h2o

In [None]:
filtered_data.head()

In [None]:
from h2o.automl import H2OAutoML

# Start the H2O cluster (locally)
h2o.init()

# Import a sample binary outcome train/test set into H2O
trainH2o = h2o.import_file('../resources/train_{}.csv'.format(exp_name))
testH2o = h2o.import_file('../resources/test_{}.csv'.format(exp_name))

# Identify predictors and response
x = features
y = label

# Run AutoML for 20 base models
aml = H2OAutoML(max_models=100, seed=1, max_runtime_secs=86400)
aml.train(x=x, y=y, training_frame=trainH2o)

# View the AutoML Leaderboard
lb = aml.leaderboard
lb.head(rows=lb.nrows)  # Print all rows instead of default (10 rows)

# model_id                                                  auc    logloss    mean_per_class_error      rmse       mse
# ---------------------------------------------------  --------  ---------  ----------------------  --------  --------
# StackedEnsemble_AllModels_AutoML_20181212_105540     0.789801   0.551109                0.333174  0.43211   0.186719
# StackedEnsemble_BestOfFamily_AutoML_20181212_105540  0.788425   0.552145                0.323192  0.432625  0.187165
# XGBoost_1_AutoML_20181212_105540                     0.784651   0.55753                 0.325471  0.434949  0.189181
# XGBoost_grid_1_AutoML_20181212_105540_model_4        0.783523   0.557854                0.318819  0.435249  0.189441
# XGBoost_grid_1_AutoML_20181212_105540_model_3        0.783004   0.559613                0.325081  0.435708  0.189841
# XGBoost_2_AutoML_20181212_105540                     0.78136    0.55888                 0.347074  0.435907  0.190015
# XGBoost_3_AutoML_20181212_105540                     0.780847   0.559589                0.330739  0.43613   0.190209
# GBM_5_AutoML_20181212_105540                         0.780837   0.559903                0.340848  0.436191  0.190263
# GBM_2_AutoML_20181212_105540                         0.780036   0.559806                0.339926  0.436415  0.190458
# GBM_1_AutoML_20181212_105540                         0.779827   0.560857                0.335096  0.436616  0.190633
# GBM_3_AutoML_20181212_105540                         0.778669   0.56179                 0.325538  0.437189  0.191134
# XGBoost_grid_1_AutoML_20181212_105540_model_2        0.774411   0.575017                0.322811  0.4427    0.195984
# GBM_4_AutoML_20181212_105540                         0.771426   0.569712                0.33742   0.44107   0.194543
# GBM_grid_1_AutoML_20181212_105540_model_1            0.769752   0.572583                0.344331  0.442452  0.195764
# GBM_grid_1_AutoML_20181212_105540_model_2            0.754366   0.918567                0.355855  0.496638  0.246649
# DRF_1_AutoML_20181212_105540                         0.742892   0.595883                0.355403  0.452774  0.205004
# XRT_1_AutoML_20181212_105540                         0.742091   0.599346                0.356583  0.453117  0.205315
# DeepLearning_grid_1_AutoML_20181212_105540_model_2   0.741795   0.601497                0.368291  0.454904  0.206937
# XGBoost_grid_1_AutoML_20181212_105540_model_1        0.693554   0.620702                0.40588   0.465791  0.216961
# DeepLearning_1_AutoML_20181212_105540                0.69137    0.637954                0.409351  0.47178   0.222576
# DeepLearning_grid_1_AutoML_20181212_105540_model_1   0.690084   0.661794                0.418469  0.476635  0.227181
# GLM_grid_1_AutoML_20181212_105540_model_1            0.682648   0.63852                 0.397234  0.472683  0.223429
#
# [22 rows x 6 columns]

# The leader model is stored here
aml.leader

In [None]:
preds = aml.predict(testH2o)

## Playing with Amoxicillin - Clavulanic acid as regression

In [None]:
X = filtered_data[features]
y = filtered_data[best_label+'_filtered']

In [None]:
kf = KFold(n_splits=10, shuffle=True, random_state=rng)

In [None]:
tot_actuals = []
tot_predictions = []
tot_rounded_predictions = []
for train_index, test_index in kf.split(X):
    xgb_model = xgb.XGBRegressor(n_jobs=1, tree_method='gpu_hist', gpu_id=0).fit(X.iloc[train_index], y.iloc[train_index])
    predictions = xgb_model.predict(X[test_index])
    actuals = y[test_index]
    rounded_predictions = [min(final_dict.values(), key=lambda x:abs(x-pred)) for pred in predictions]
    print("rounded: ", rounded_predictions)
    print("actual: ", actuals.T[0])
    print(mean_squared_error(actuals, predictions))
    print(confusion_matrix([str(x) for x in actuals.T[0]], [str(x) for x in rounded_predictions]))
    tot_actuals += list(actuals.T[0])
    tot_rounded_predictions += rounded_predictions
    tot_predictions += list(predictions)

In [None]:
print(mean_squared_error(tot_actuals, tot_predictions))

In [None]:
cm = confusion_matrix(
    [str(x) for x in tot_actuals], 
    [str(x) for x in tot_rounded_predictions], 
    labels=[str(x) for x in final_dict.values()],
)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=[str(x) for x in final_dict.keys()],
)
disp.plot() 

### run Parameter Opt

In [None]:
param_grid = {
    'max_depth':[3, 4, 6, 8, 10], 
    'n_estimators': [20, 50, 100, 200],
    'eta': uniform(loc=0, scale=1),
    'gamma': uniform(loc=0, scale=100),
    'min_child_weight': uniform(loc=0, scale=10),
    ''
}

In [None]:
xgb_model = xgb.XGBRegressor(n_jobs=1, tree_method='gpu_hist', gpu_id=0)

In [None]:
kf = KFold(n_splits=10, shuffle=True, random_state=rng)

In [None]:
clf = RandomizedSearchCV(xgb_model, param_distributions=param_grid, random_state=rng, cv = kf)

In [None]:
search = clf.fit(X, y)
search.best_params_
search.best_score_

In [None]:
tot_actuals = []
tot_predictions = []
tot_rounded_predictions = []
for train_index, test_index in kf.split(X):
    xgb_model = xgb.XGBRegressor(n_jobs=1, **search.best_params_).fit(X[train_index], y[train_index])
    predictions = xgb_model.predict(X[test_index])
    actuals = y[test_index]
    rounded_predictions = [min(final_dict.values(), key=lambda x:abs(x-pred)) for pred in predictions]
    tot_actuals += list(actuals.T[0])
    tot_rounded_predictions += rounded_predictions
    tot_predictions += list(predictions)
    
print(mean_squared_error(tot_actuals, tot_predictions))
cm = confusion_matrix(
    [str(x) for x in tot_actuals], 
    [str(x) for x in tot_rounded_predictions], 
    labels=[str(x) for x in final_dict.values()],
)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=[str(x) for x in final_dict.keys()],
)
disp.plot() 

In [None]:
xgb_model