# Contents

1. [Initialization](#imports)
2. [Euthyroid data](#euthyroid)
    1. [Settings](#exp_set)
    2. [Experiments with decision maker](#exp_dm)
    3. [All and free variables](#exp_all)
3. [NHANES data](#nhanes)
    1. [Settings](#nh_set)
    2. [Experiments with decision maker](#nh_dm)
    3. [All and free variables](#nh_all)

# Initialization <a name="imports"></a>

In [None]:
from decision_maker import *

In [None]:
import pickle
import numpy as np
import pandas as pd
import os
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import itertools
import seaborn as sns

In [None]:
# Function used for generation of masked and extended training data 
# - used for pretraining one classifier accepting any combination of acquired variables

def get_boots_data(df, z_init, dupl):
    """Build a masked, duplicated training matrix for the shared classifier.

    Every input row is repeated ``dupl`` times, each time with a different
    randomly chosen subset of the costly columns hidden. A hidden value is
    replaced by 0 and flagged with 1 in a matching indicator column.

    Args:
        df: 2-D NumPy array of features (rows are samples). It is indexed
            as ``df[:, bool_mask]``, so a pandas DataFrame will not work.
        z_init: Sequence with one entry per column of ``df``; ``-1`` marks a
            free column, any other value marks a costly column.
        dupl: Number of masked copies per row. Must lie between 1 and
            ``2 ** n_costly`` because the masks of one row are drawn
            without replacement.

    Returns:
        Array with ``dupl * df.shape[0]`` rows laid out as
        ``[free columns | masked costly columns | mask indicators]``. The
        ``c``-th copy of row ``i`` is stored at row ``c * df.shape[0] + i``.

    Raises:
        ValueError: If ``dupl`` is smaller than 1 or larger than the number
            of distinct masks.
    """
    print('masking started')
    is_free = np.asarray(z_init) == -1
    free_part = df[:, is_free]
    costly_part = df[:, ~is_free]
    n_rows, n_costly = costly_part.shape

    # Fail fast with a readable message instead of the generic NumPy error
    # that would otherwise surface only after all masks have been enumerated.
    n_masks = 2 ** n_costly
    if dupl < 1 or dupl > n_masks:
        raise ValueError(
            'dupl must be between 1 and {} (the number of distinct masks over '
            '{} costly columns), got {}'.format(n_masks, n_costly, dupl))

    # Enumerate every subset of costly columns, ordered by subset size, as a
    # 0/1 row (1 = hidden). NOTE: this grows as 2 ** n_costly.
    subsets = itertools.chain.from_iterable(
        itertools.combinations(range(n_costly), r) for r in range(n_costly + 1))
    masks = np.zeros((n_masks, n_costly))
    for row, subset in enumerate(subsets):
        masks[row, list(subset)] = 1

    # For each data row draw `dupl` distinct masks (uses the global NumPy RNG).
    chosen = np.empty((dupl, n_rows), dtype=np.intp)
    for i in range(n_rows):
        chosen[:, i] = np.random.choice(n_masks, dupl, replace=False)

    # Row-major flattening yields copy 0 of all rows, then copy 1, and so on,
    # which lines up with the tiled feature blocks below.
    mask_block = masks[chosen.reshape(-1)]
    costly_block = np.where(mask_block == 1, 0, np.tile(costly_part, (dupl, 1)))
    free_block = np.tile(free_part, (dupl, 1))

    return np.concatenate([free_block, costly_block, mask_block], axis=1)

In [None]:
# Function used to mask data to be compatible with the pre-trained classifier

def mask_df(df, mask):
    """Apply one fixed acquisition mask to every row of ``df``.

    ``mask`` holds one entry per column of ``df``: -1 for a free variable,
    0 for a costly variable that was not acquired, 1 for a costly variable
    that was acquired. Costly values that were not acquired are replaced by
    0 and flagged with 1 in an indicator column.

    Returns a new array laid out as
    ``[free columns | masked costly columns | indicator columns]``;
    ``df`` itself is left untouched.
    """
    free_cols = np.array(mask) == -1

    free_part = df[:, free_cols].copy()
    costly_part = df[:, ~free_cols].copy()

    # One flag per costly column: True when that column was NOT acquired.
    hidden = [m != 1 for m in mask if m >= 0]
    indicator = np.zeros(costly_part.shape)
    indicator[:, hidden] = 1

    costly_part = np.where(indicator == 1, 0, costly_part)
    costly_with_flags = np.concatenate([costly_part, indicator], axis=1)

    return np.concatenate([free_part, costly_with_flags], axis=1)

# Euthyroid <a name="euthyroid"></a>

### Settings <a name="exp_set"></a>

In [None]:
# Read the pickled euthyroid dataset (later cells use its 'features',
# 'targets' and 'costs' entries). Only unpickle files you trust.
data_dir = '../../data/final_data/'
file_name = 'euthyroid.pkl'

euth_path = data_dir + file_name
with open(euth_path, 'rb') as fh:
    euth = pickle.load(fh)

In [None]:
# Utility matrix, indexed later as U[predicted class, true class].
U = np.array([[0, -10],
              [-1, 0]])

# Each beta multiplies U when the decision maker is fitted (beta * U).
betas = [20, 30, 50, 100]

# Stratified 75/25 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    euth['features'],
    euth['targets'],
    stratify=euth['targets'],
    test_size=0.25,
    random_state=42,
)

In [None]:
# Fit one random forest on masked copies of the training set so that it can
# classify under any combination of acquired variables.
# Zero-cost features are free (-1); all others start as not acquired (0).
z_init = np.where(euth['costs'] == 0, -1, 0)

# Features and labels must be replicated the same number of times.
n_copies = 16
train_X_dupl = get_boots_data(X_train, z_init, n_copies)
train_y_dupl = np.concatenate([y_train] * n_copies)

params = dict(max_depth=11,
              criterion='entropy',
              min_samples_split=15,
              min_samples_leaf=5,
              n_estimators=500,
              random_state=42)

forest = RandomForestClassifier(**params)
forest.fit(train_X_dupl, train_y_dupl)

# Wrap the fitted forest for use as the decision maker's external classifier.
clf_boots = NodeClassifier(forest, 'external')

### Experiments with decision maker <a name="exp_dm"></a>

In [None]:
# Sweep classifier mode x depth x beta and record the mean of every
# evaluation metric on the test and the training set.
results = []
acquisition_paths = {}

# Random-forest settings shared by all runs of the sweep.
internal_clf_params = {'max_depth': 10,
                       'criterion': 'entropy',
                       'min_samples_split': 15,
                       'min_samples_leaf': 5,
                       'n_estimators': 200,
                       'random_state': 42}
policy_rf_params = {'max_depth': 5,
                    'min_samples_split': 15,
                    'min_samples_leaf': 5,
                    'n_estimators': 50,
                    'random_state': 42}

for clf_mode, depth, beta in itertools.product(['internal', 'external'], [-1, 1], betas):
    info = {'clf_mode': clf_mode, 'depth': depth, 'beta': beta}
    print(' >>>> {}'.format(info))

    # Each decision maker gets its own copy of the parameter dicts.
    dm_kwargs = {'depth': depth,
                 'classifier_mode': clf_mode,
                 'policy_class': RandomForestRegressor,
                 'policy_params': dict(policy_rf_params)}
    if clf_mode == 'internal':
        dm_kwargs['classifier_class'] = RandomForestClassifier
        dm_kwargs['classifier_params'] = dict(internal_clf_params)
    else:
        # external mode reuses the forest pre-trained on masked data
        dm_kwargs['classifier_boots'] = clf_boots
    dm = DecisionMaker(**dm_kwargs)

    dm.fit(X_train, y_train, euth['costs'], beta*U)

    prob_test = dm.predict(X_test)
    test_evals = dm.evaluate(y_test)
    for metric in test_evals:
        info['test_{}'.format(metric)] = np.mean(test_evals[metric])

    # Keep the test-set acquisition paths of the internal, depth -1 runs.
    # NOTE(review): presumably predict() overwrites dm.acquisition_paths, so
    # this must stay between the two predict calls - confirm.
    if clf_mode == 'internal' and depth == -1:
        acquisition_paths[beta] = dm.acquisition_paths.copy()

    prob_train = dm.predict(X_train)
    train_evals = dm.evaluate(y_train)
    for metric in train_evals:
        info['train_{}'.format(metric)] = np.mean(train_evals[metric])

    results.append(info)

In [None]:
# One row per run; dividing by beta undoes the beta * U scaling used at fit
# time so that prediction utilities are comparable across betas.
pdf = pd.DataFrame(results)
for split in ('test', 'train'):
    pdf['{}_pred_u_scaled'.format(split)] = (
        pdf['{}_prediction_utility'.format(split)] / pdf['beta'])
pdf

#### plot performances

In [None]:
# Label each run as "<clf_mode>_<depth>" for use as the plot hue.
pdf['key'] = pdf['clf_mode'].str.cat(pdf['depth'].astype('str'), sep='_')

In [None]:
# Test set: mean acquisition cost vs. beta-normalised prediction utility.
sns.scatterplot(data=pdf, x="test_costs", y="test_pred_u_scaled", hue="key")

In [None]:
# Training set: mean acquisition cost vs. beta-normalised prediction utility.
sns.scatterplot(data=pdf, x="train_costs", y="train_pred_u_scaled", hue="key")

#### look at stored acquisition paths

In [None]:
# For every beta, list the distinct stored acquisition paths with how often
# each occurs. Only the last four entries of a path are printed
# (presumably the costly variables - TODO confirm).
for beta, path in acquisition_paths.items():
    print('BETA: {}'.format(beta))
    distinct_paths, counts = np.unique(path, axis=0, return_counts=True)
    for row, n_times in zip(distinct_paths, counts):
        print(' > {}: {}'.format(n_times, row[-4:]))

### All and free variables <a name="exp_all"></a>

#### all variables

In [None]:
# Baseline: a forest that sees every feature; prints the mean utility of its
# decisions on the training set and then on the test set.
classifier_params = dict(max_depth=10,
                         criterion='entropy',
                         min_samples_split=15,
                         min_samples_leaf=5,
                         n_estimators=200,
                         random_state=42)

clf_all = RandomForestClassifier(**classifier_params)
clf_all.fit(X_train, y_train)

for X, y in ((X_train, y_train), (X_test, y_test)):
    prob = clf_all.predict_proba(X)
    # expected utility of each possible decision under the predicted class
    # probabilities (assumes predict_proba columns follow U's class order)
    eu = prob @ U.T
    predicted_classes = eu.argmax(axis=1)
    # realised utility: row = chosen class, column = true class
    u = U[predicted_classes, y]
    print(u.mean())

In [None]:
# Sum of all euthyroid feature costs, shown as the cell output
# (presumably the cost of acquiring every variable - TODO confirm).
np.sum(euth['costs'])

#### free variables

In [None]:
# Baseline: a forest restricted to the zero-cost features; prints the mean
# utility of its decisions on the training set and then on the test set.
is_free = np.where(euth['costs'] == 0, True, False)
classifier_params = dict(max_depth=10,
                         criterion='entropy',
                         min_samples_split=15,
                         min_samples_leaf=5,
                         n_estimators=200,
                         random_state=42)

clf_all = RandomForestClassifier(**classifier_params)
clf_all.fit(X_train[:, is_free], y_train)

free_splits = ((X_train[:, is_free], y_train), (X_test[:, is_free], y_test))
for X, y in free_splits:
    prob = clf_all.predict_proba(X)
    # expected utility per decision, then pick the best decision per sample
    eu = prob @ U.T
    predicted_classes = eu.argmax(axis=1)
    # realised utility: row = chosen class, column = true class
    u = U[predicted_classes, y]
    print(u.mean())

# NHANES <a name="nhanes"></a>

### Settings <a name="nh_set"></a>

In [None]:
# Read the pickled NHANES diabetes dataset. Only unpickle files you trust.
data_dir = '../../data/final_data/'
file_name = 'diabetes_all.pkl'
with open(data_dir + file_name, 'rb') as fh:
    diab = pickle.load(fh)

# Integer class labels (they index the utility matrix later) and a cost
# array without singleton axes.
diab['targets'] = diab['targets'].astype('int64')
diab['costs'] = np.squeeze(diab['costs'])

# consider only costs associated with examination and laboratory tests:
# every feature with a cost below 5 is treated as free.
cheap = diab['costs'] < 5
z_init = np.where(cheap, -1, 0)
is_free = np.where(cheap, True, False)
costs = np.where(cheap, 0, diab['costs'])

In [None]:
# Three-class utility matrix, indexed later as U[predicted class, true class].
U = np.array([[0, -10, -20],
              [-1, 0, -10],
              [-2, -1, 0]])

# Each beta multiplies U when the decision maker is fitted (beta * U).
betas = [10, 15, 30, 50, 100]

# Stratified 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    diab['features'],
    diab['targets'],
    stratify=diab['targets'],
    test_size=0.3,
    random_state=42,
)

In [None]:
# Fit one forest on masked copies of the NHANES training set
# (the code runs for a long time ~1 hour).
# Features and labels must be replicated the same number of times.
n_copies = 10
train_X_dupl = get_boots_data(X_train, z_init, n_copies)
train_y_dupl = np.concatenate(n_copies * [y_train])

params = dict(max_depth=12,
              criterion='entropy',
              min_samples_split=60,
              min_samples_leaf=20,
              n_estimators=1000,
              random_state=42)

forest = RandomForestClassifier(**params)
forest.fit(train_X_dupl, train_y_dupl)

# Wrap the fitted forest for use as the decision maker's external classifier.
clf_boots = NodeClassifier(forest, 'external')

In [None]:
# Persist the wrapped masked-data classifier in the working directory.
with open('clf_boots.pkl', 'wb') as fh:
    pickle.dump(clf_boots, fh)

In [None]:
# Restore the wrapped classifier from disk (presumably so the long training
# cell can be skipped on a re-run). Only unpickle files you trust.
with open('clf_boots.pkl', 'rb') as fh:
    clf_boots = pickle.load(fh)

### Experiments with decision maker <a name="nh_dm"></a>

In [None]:
# Sweep classifier mode x beta at depth 1 and record the mean of every
# evaluation metric on the test and the training set.
results = []

# Random-forest settings shared by all runs of the sweep.
internal_clf_params = {'max_depth': 11,
                       'criterion': 'entropy',
                       'min_samples_split': 60,
                       'min_samples_leaf': 20,
                       'n_estimators': 300,
                       'random_state': 42}
policy_rf_params = {'max_depth': 5,
                    'min_samples_split': 300,
                    'min_samples_leaf': 100,
                    'n_estimators': 10,
                    'random_state': 42}

for clf_mode, depth, beta in itertools.product(['internal', 'external'], [1], betas):
    info = {'clf_mode': clf_mode, 'depth': depth, 'beta': beta}

    # Each decision maker gets its own copy of the parameter dicts.
    dm_kwargs = {'depth': depth,
                 'classifier_mode': clf_mode,
                 'policy_class': RandomForestRegressor,
                 'policy_params': dict(policy_rf_params)}
    if clf_mode == 'internal':
        dm_kwargs['classifier_class'] = RandomForestClassifier
        dm_kwargs['classifier_params'] = dict(internal_clf_params)
    else:
        # external mode reuses the forest pre-trained on masked data
        dm_kwargs['classifier_boots'] = clf_boots
    dm = DecisionMaker(**dm_kwargs)

    # `costs` is the thresholded cost vector (cheap features set to 0).
    dm.fit(X_train, y_train, costs, beta*U)

    # evaluate() is called right after the matching predict();
    # keep each pair together.
    prob_test = dm.predict(X_test)
    test_evals = dm.evaluate(y_test)
    for metric in test_evals:
        info['test_{}'.format(metric)] = np.mean(test_evals[metric])

    prob_train = dm.predict(X_train)
    train_evals = dm.evaluate(y_train)
    for metric in train_evals:
        info['train_{}'.format(metric)] = np.mean(train_evals[metric])

    results.append(info)

In [None]:
# One row per run; dividing by beta undoes the beta * U scaling used at fit
# time so that prediction utilities are comparable across betas.
pdf = pd.DataFrame(results)
for split in ('test', 'train'):
    pdf['{}_pred_u_scaled'.format(split)] = (
        pdf['{}_prediction_utility'.format(split)] / pdf['beta'])
pdf

In [None]:
# Save the results table to the working directory:
with open('diabetes_results_2.pkl', 'wb') as fh:
    pickle.dump(pdf, fh)

### All and free variables <a name="nh_all"></a>

#### internal

In [None]:
# Baseline: a forest that sees every NHANES feature; prints the mean utility
# of its decisions on the training set and then on the test set.
classifier_params = dict(max_depth=11,
                         criterion='entropy',
                         min_samples_split=60,
                         min_samples_leaf=20,
                         n_estimators=300,
                         random_state=42)

clf_all = RandomForestClassifier(**classifier_params)
clf_all.fit(X_train, y_train)

for X, y in ((X_train, y_train), (X_test, y_test)):
    prob = clf_all.predict_proba(X)
    # expected utility of each possible decision under the predicted class
    # probabilities (assumes predict_proba columns follow U's class order)
    eu = prob @ U.T
    predicted_classes = eu.argmax(axis=1)
    # realised utility: row = chosen class, column = true class
    u = U[predicted_classes, y]
    print(u.mean())

In [None]:
# Sum of the raw NHANES feature costs, shown as the cell output.
# NOTE(review): this uses diab['costs'], which still contains the entries
# below 5 that the settings cell zeroes out in `costs`; the decision maker is
# fitted with `costs`, so np.sum(costs) may be the comparable total - confirm.
np.sum(diab['costs'])

In [None]:
# Baseline: a forest restricted to the features whose thresholded cost is 0;
# prints the mean utility on the training set and then on the test set.
is_free = np.where(costs == 0, True, False)
classifier_params = dict(max_depth=11,
                         criterion='entropy',
                         min_samples_split=60,
                         min_samples_leaf=20,
                         n_estimators=300,
                         random_state=42)

clf_all = RandomForestClassifier(**classifier_params)
clf_all.fit(X_train[:, is_free], y_train)

free_splits = ((X_train[:, is_free], y_train), (X_test[:, is_free], y_test))
for X, y in free_splits:
    prob = clf_all.predict_proba(X)
    # expected utility per decision, then pick the best decision per sample
    eu = prob @ U.T
    predicted_classes = eu.argmax(axis=1)
    # realised utility: row = chosen class, column = true class
    u = U[predicted_classes, y]
    print(u.mean())

#### external

In [None]:
# Reload the saved masked-data classifier under the name `clf`.
# Only unpickle files you trust.
with open('clf_boots.pkl', 'rb') as fh:
    clf = pickle.load(fh)

In [None]:
# External classifier with every costly variable marked as acquired (1);
# zero-cost variables are free (-1). Prints the mean utility on the training
# set and then on the test set.
z_all = np.where(costs == 0, -1, 1)
X_train_mask = mask_df(X_train, z_all)
X_test_mask = mask_df(X_test, z_all)

for X, y in ((X_train_mask, y_train), (X_test_mask, y_test)):
    prob = clf.classifier.predict_proba(X)
    # expected utility per decision, then pick the best decision per sample
    eu = prob @ U.T
    predicted_classes = eu.argmax(axis=1)
    # realised utility: row = chosen class, column = true class
    u = U[predicted_classes, y]
    print(u.mean())

In [None]:
# External classifier with no costly variable acquired (0); zero-cost
# variables are free (-1). Prints the mean utility on the training set and
# then on the test set.
z_free = np.where(costs == 0, -1, 0)
X_train_mask = mask_df(X_train, z_free)
X_test_mask = mask_df(X_test, z_free)

for X, y in ((X_train_mask, y_train), (X_test_mask, y_test)):
    prob = clf.classifier.predict_proba(X)
    # expected utility per decision, then pick the best decision per sample
    eu = prob @ U.T
    predicted_classes = eu.argmax(axis=1)
    # realised utility: row = chosen class, column = true class
    u = U[predicted_classes, y]
    print(u.mean())