In [1]:
import numpy as np
from numba import njit
import pickle
from sklearn.ensemble import BaggingClassifier
from sklearn.tree._classes import DecisionTreeClassifier
from joblib import Parallel, delayed
from scipy.stats import entropy, multivariate_normal

from hyppo.tools import multimodal_independence, indep_sim
from hyppo.ksample._utils import k_sample_transform
from tqdm import tqdm

import sys
import os
import multiprocessing as mp
from joblib import Parallel, delayed

C:\Users\siptest\anaconda3\envs\py36\lib\site-packages\numpy\.libs\libopenblas.NOIJJG62EMASZI6NYURL6JBKM4EVBGM7.gfortran-win_amd64.dll
C:\Users\siptest\anaconda3\envs\py36\lib\site-packages\numpy\.libs\libopenblas.PYQHXLVVQ7VESDPUVUADXEVJOBGHJPAY.gfortran-win_amd64.dll
C:\Users\siptest\anaconda3\envs\py36\lib\site-packages\numpy\.libs\libopenblas.TXA6YQSD3GCQQC22GEQ54J2UDCXDXHWN.gfortran-win_amd64.dll
C:\Users\siptest\anaconda3\envs\py36\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll
  stacklevel=1)


In [2]:
def test_stat_helper(tree, tree_idx, X, y, sampled_indices, unsampled_indices, K, kappa=3, base=2):
    # Randomly split the rest into voting and evaluation.

    total_unsampled = len(unsampled_indices)
    # np.random.shuffle(unsampled_indices)
    vote_indices = unsampled_indices[:total_unsampled//2]
    eval_indices = unsampled_indices[total_unsampled//2:]

    # Store the posterior in a num_nodes-by-num_classes matrix.
    # Posteriors in non-leaf cells will be zero everywhere
    # and later changed to uniform.
    node_counts = tree.tree_.n_node_samples
    class_counts = np.zeros((len(node_counts), K))
    est_nodes = tree.apply(X[vote_indices])
    est_classes = y[vote_indices]
    for i in range(len(est_nodes)):
        class_counts[est_nodes[i], est_classes[i]] += 1

    # Total number of estimation points in each leaf.
    row_sums = class_counts.sum(axis=1)
    row_sums[row_sums == 0] = 1  # Avoid divide by zero.
    class_probs = class_counts / row_sums[:, None]

    # Make the nodes that have no estimation indices uniform.
    # This includes non-leaf nodes, but tha t will not affect the estimate.
    class_probs[np.argwhere(class_probs.sum(axis=1) == 0)] = [1 / K]*K
    # Apply finite sample correction and renormalize.
    where_0 = np.argwhere(class_probs == 0)
    for elem in where_0:
        class_probs[elem[0], elem[1]] = 1 / \
            (kappa*class_counts.sum(axis=1)[elem[0]])
    row_sums = class_probs.sum(axis=1)
    class_probs = class_probs / row_sums[:, None]

    # Place evaluation points in their corresponding leaf node.
    # Store evaluation posterior in a num_eval-by-num_class matrix.
    eval_class_probs = class_probs[tree.apply(X[eval_indices])]
    # eval_class_probs = [class_probs[x] for x in tree.apply(X[eval_indices])]
    eval_entropies = [entropy(posterior, base=base)
                      for posterior in eval_class_probs]
    return np.mean(eval_entropies)


In [3]:
def uf(X, y, n_estimators=300, max_samples=.4, base=2, kappa=3, n_null_reps=100):
    """Forest-based mutual-information statistic with a re-split null sample.

    A bagged forest of decision trees is fitted without bootstrap. For every
    tree, the samples not used to build it are shuffled and handed to
    ``test_stat_helper`` to estimate a conditional entropy. The statistic is
    the entropy of the empirical class distribution minus the average
    per-tree conditional entropy.

    Parameters
    ----------
    X : ndarray
        Feature matrix; ``X.shape[0]`` is taken as the number of samples.
    y : ndarray
        One-dimensional class labels (any values; they are re-encoded to
        ``0..K-1`` internally).
    n_estimators : int, default 300
        Number of trees, passed to ``BaggingClassifier``.
    max_samples : float or int, default 0.4
        Passed to ``BaggingClassifier``.
    base : float, default 2
        Logarithm base for all entropies.
    kappa : float, default 3
        Finite-sample correction constant forwarded to ``test_stat_helper``.
    n_null_reps : int, default 100
        Number of null replicates.

    Returns
    -------
    final_stat : float
        The observed statistic.
    new_final_null_dist : list of float
        ``n_null_reps`` replicate statistics.
    """
    # Build forest with default tree parameters.
    # NOTE(review): n_jobs=40 is a hard-coded worker count — confirm it suits
    # the machine this runs on.
    model = BaggingClassifier(DecisionTreeClassifier(),
                              n_estimators=n_estimators,
                              max_samples=max_samples,
                              n_jobs=40,
                              bootstrap=False)
    model.fit(X, y)
    n = X.shape[0]
    K = model.n_classes_
    # Re-encode labels as 0..K-1 so they can index class-count columns.
    _, y = np.unique(y, return_inverse=True)

    # `estimators_samples_` is a property that is rebuilt on every access,
    # so fetch it once instead of once per tree.
    estimators_samples = model.estimators_samples_

    cond_entropy = 0
    null_cond_entropies = np.zeros(n_null_reps)

    for tree_idx, tree in enumerate(model):
        # Indices used to build this tree's partition, and the rest.
        sampled_indices = estimators_samples[tree_idx]
        unsampled_indices = np.delete(np.arange(0, n), sampled_indices)
        np.random.shuffle(unsampled_indices)

        # Observed statistic: one random vote/eval split per tree.
        cond_entropy += test_stat_helper(tree, tree_idx, X, y,
                                         sampled_indices, unsampled_indices, K,
                                         kappa=kappa, base=base)

        # NOTE(review): the null replicates re-split the same (X, y) pairs
        # into voting/evaluation halves; labels are never permuted. Confirm
        # this is the intended null distribution.
        for j in range(n_null_reps):
            # A private generator seeded with j gives replicate j the same
            # shuffle pattern in every tree without reseeding the global
            # NumPy RNG (which would make the shuffle above deterministic
            # for every tree after the first).
            rng = np.random.RandomState(j)
            # permutation() returns a shuffled copy, leaving
            # unsampled_indices untouched.
            null_unsampled_indices = rng.permutation(unsampled_indices)
            null_cond_entropies[j] += test_stat_helper(
                tree, tree_idx, X, y, sampled_indices, null_unsampled_indices, K,
                kappa=kappa, base=base)

    # Entropy of the empirical class distribution, valid for any K. For two
    # classes this equals entropy([mean(y), 1 - mean(y)]).
    class_priors = np.bincount(y, minlength=K) / n
    marginal_entropy = entropy(class_priors, base=base)

    new_final_null_dist = [marginal_entropy - val / n_estimators
                           for val in null_cond_entropies]
    final_stat = marginal_entropy - cond_entropy / n_estimators
    return final_stat, new_final_null_dist

In [4]:
# Experiment configuration for the power simulation.
MAX_SAMPLE_SIZE = 100
STEP_SIZE = 20
# Sample sizes swept by the experiment: 10, 30, 50, 70, 90, 110.
# NOTE(review): the last value (110) exceeds MAX_SAMPLE_SIZE because the range
# starts at 10 rather than at a multiple of STEP_SIZE — confirm this is intended.
SAMP_SIZES = range(10, MAX_SAMPLE_SIZE + STEP_SIZE, STEP_SIZE)
# Number of independent repetitions run per sample size.
POWER_REPS = 20

# Simulation functions to evaluate; only the independence case is enabled.
SIMULATIONS = [
    # Disabled entries left over from an earlier dict-style configuration:
    #   "linear": "Linear",
    #   "multimodal_independence": "Independence"
    #   linear,
    multimodal_independence
]

In [None]:
def estimate_power(sim, alpha=0.05, out_dir='C:/Users/siptest/Desktop/NDD'):
    """Estimate the rejection rate of the ``uf`` test for one simulation.

    For every sample size in ``SAMP_SIZES`` the simulation is drawn
    ``POWER_REPS`` times; each draw is converted to a two-sample problem with
    ``k_sample_transform`` and tested with ``uf``. The p-value is the fraction
    of null replicates that are at least as large as the observed statistic,
    and the power at a sample size is the fraction of repetitions with
    ``pvalue <= alpha``.

    Parameters
    ----------
    sim : callable
        Simulation function, called as ``sim(n_samples, 2)``; it must return
        two arrays accepted by ``k_sample_transform``.
    alpha : float, default 0.05
        Significance level used to count rejections.
    out_dir : str, default 'C:/Users/siptest/Desktop/NDD'
        Directory (must already exist) where the results are written:
        ``<sim name>_power_reps_honest.pkl`` holds every statistic and null
        distribution, ``<sim name>_power_honest.csv`` holds the power curve.

    Returns
    -------
    list of float
        One power estimate per entry of ``SAMP_SIZES``, in order.
    """
    # Name the output files after the simulation actually being run.
    sim_name = getattr(sim, '__name__', 'simulation')

    samp_size_dict = dict()
    samp_size_dict['sample_sizes'] = SAMP_SIZES
    samp_size_dict['n_power_reps'] = POWER_REPS
    power = []
    for n_samples in SAMP_SIZES:
        pvalues = []
        samp_size_dict[n_samples] = {'stats': [], 'null_dists': []}
        for _ in tqdm(range(POWER_REPS)):
            # Re-seed from OS entropy so repetitions are independent draws.
            np.random.seed(None)
            matrix1, matrix2 = sim(n_samples, 2)
            x, y = k_sample_transform([matrix1, matrix2])
            stat, null_dist = uf(x, y.ravel())
            samp_size_dict[n_samples]['stats'].append(stat)
            samp_size_dict[n_samples]['null_dists'].append(null_dist)
            pvalue = np.mean(np.asarray(null_dist) >= stat)
            print("P-value: " + str(pvalue))
            print(f'Test stat: {stat}')
            print(f'Null dist: {null_dist[:5]}')
            pvalues.append(pvalue)
        # Fraction of repetitions that reject at level alpha, computed
        # afresh for each sample size.
        power.append(float(np.mean(np.asarray(pvalues) <= alpha)))

    with open(f'{out_dir}/{sim_name}_power_reps_honest.pkl', 'wb') as handle:
        pickle.dump(samp_size_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

    np.savetxt(f'{out_dir}/{sim_name}_power_honest.csv', power, delimiter=',')

    return power

# Run the power experiment for the first configured simulation.
# This is long-running: the recorded progress bar below shows roughly a
# minute per repetition at the smallest sample size.
estimate_power(SIMULATIONS[0])

  5%|████▏                                                                              | 1/20 [01:08<21:50, 68.95s/it]

P-value: 0.49
Test stat: 0.15098250045277772
Null dist: [0.13246716628897526, 0.16913473683127644, 0.1299241912973622, 0.21112465016096238, 0.1550214701164534]
