In [1]:
import numpy as np
from numba import njit
import pickle
from sklearn.ensemble import BaggingClassifier
from sklearn.tree._classes import DecisionTreeClassifier
from joblib import Parallel, delayed
from scipy.stats import entropy, multivariate_normal

from hyppo.tools import multimodal_independence, indep_sim, rot_ksamp
from hyppo.tools import SIMULATIONS
from hyppo.ksample._utils import k_sample_transform
from tqdm import tqdm
from hyppo.tools import * 

import sys
import os
import multiprocessing as mp
from proglearn import UncertaintyForest
from joblib import Parallel, delayed
sys.path.append('C:\\Users\\siptest\\AppData\\Roaming\\Python\\Python36\\Scripts')
#%load_ext autoreload
#%autoreload 2

In [2]:
class TestStat:
    def __init__(self, uf, oob_func):
        self.uf = uf
        self.oob_func = oob_func

    def __call__(self, X, y):
        if not hasattr(self, "leaf_indices_"):
            self.leaf_indices_ = []
            for tree, voter_indices in zip(self.uf.transformers_, self.uf.voter_indices_):
                self.leaf_indices_.append(tree.transform(X[voter_indices]))
        self._refit_leaves(y)

        _, counts = np.unique(y, return_counts=True)
        H_Y = entropy(counts, base=np.exp(1))
        H_YX = self._apply_oob(X, oob_func)
        return H_Y - H_YX

    def _refit_leaves(self, y):
        for leaf_indices, voter, voter_indices in zip(
            self.leaf_indices_, self.uf.voters_, self.uf.voter_indices_
        ):
            voter.fit(leaf_indices, y[voter_indices])

    def _apply_oob(self, X, func):
        sample_uses = np.zeros((len(X),))
        sample_posteriors = None
        for tree, voter, indices in zip(
            self.transformers_, self.voters_, self.voter_indices_
        ):
            pred_posterior = voter.predict_proba(tree.transform(X[indices]))
            if sample_posteriors is None:
                sample_posteriors = np.zeros((len(X), pred_posterior.shape[1]))
            sample_uses[indices] += 1
            sample_posteriors[indices] += pred_posterior

        non_zeros_indices = np.where(sample_uses > 0)[0]
        sample_posteriors = sample_posteriors[non_zeros_indices] / sample_uses[non_zeros_indices, None]

        return np.mean(func(sample_posteriors))


In [3]:
MAX_SAMPLE_SIZE = 500
STEP_SIZE = 100 
SAMP_SIZES = range(10, MAX_SAMPLE_SIZE + STEP_SIZE, STEP_SIZE)
POWER_REPS = 10

In [4]:
def _perm_stat(calc_stat, x, y):
    permy = np.random.permutation(y)
    perm_stat = calc_stat(x, permy)

    return perm_stat


def perm_test(calc_stat, X, y, reps=1000, workers=1):
    """
    Calculate the p-value via permutation
    """
    # calculate observed test statistic
    stat = calc_stat(X, y)

    # calculate null distribution
    null_dist = np.array(
        Parallel(n_jobs=workers)(
            [delayed(_perm_stat)(calc_stat, X, y) for rep in range(reps)]
        )
    )
    pvalue = (null_dist >= stat).sum() / reps

    # correct for a p-value of 0. This is because, with bootstrapping
    # permutations, a p-value of 0 is incorrect
    if pvalue == 0:
        pvalue = 1 / reps

    return stat, pvalue

In [5]:
def estimate_power(sim, test):
    
    est_power = []
    for i in tqdm(SAMP_SIZES):
        print("sample size" + str(i))
        #temp_power = calc_power(test, sim_type="indep", sim=sim, n=i, p=3, auto=False, noise=True)
        pvalues = []
        for _ in tqdm(range(POWER_REPS)): 
            x, y = rot_ksamp(sim, n=i, p=3, noise=True)
            X, y = k_sample_transform([x, y])
            uf = UncertaintyForest(
                n_estimators=300,
                tree_construction_proportion=0.5,
                kappa=np.inf,
                honest_prior="ignore",
                max_features=1.0,
                n_jobs=-2,
            )
            uf = uf.fit(X, y)

            ce = lambda x: entropy(x, axis=1, base=np.exp(1))
            obs_stat = TestStat(uf, ce)

            reps = 10
            stat, pvalue = perm_test(obs_stat, X, y, reps=reps, workers=-2)
            print(pvalue)
            pvalues.append(pvalue)
        empirical_power = (1 + (pvalues <= 0.05).sum()) / (1 + POWER_REPS) 
        est_power.append(empirical_power)
        
    print(est_power)
        
    
    print(sim + " done")
    np.savetxt('C:/Users/siptest/Desktop/NDD/{}_{}HonestSamplingPower4-7.csv'.format(sim, "UF"),
            est_power, delimiter=',')
    
    return est_power

In [6]:
for sim_name in tqdm(SIMULATIONS.keys()): 
    power = estimate_power(sim_name, "UF")

  0%|                                                                                           | 0/20 [00:00<?, ?it/s]
  0%|                                                                                            | 0/6 [00:00<?, ?it/s][A

  return f(**kwargs)


sample size10


  0%|                                                                                           | 0/10 [00:09<?, ?it/s]
  0%|                                                                                            | 0/6 [00:09<?, ?it/s]
  0%|                                                                                           | 0/20 [00:09<?, ?it/s]


TypeError: Cannot cast array data from dtype('float64') to dtype('int64') according to the rule 'safe'

In [None]:
import seaborn as sns
sns.set(color_codes=True, style='white', context='talk', font_scale=1.5)
PALETTE = sns.color_palette("Set1")
sns.set_palette(PALETTE[1:], n_colors=9)

def plot_power():
    fig, ax = plt.subplots(nrows=4, ncols=5, figsize=(25,20))
    
    sim_title = [
        "Linear",
        "Exponential",
        "Cubic",
        "Joint Normal",
        "Step",
        "Quadratic",
        "W-Shaped",
        "Spiral",
        "Bernoulli",
        "Logarithmic",
        "Fourth Root",
        "Sine 4\u03C0",
        "Sine 16\u03C0",
        "Square",
        "Two Parabolas",
        "Circle",
        "Ellipse",
        "Diamond",
        "Multiplicative",
        "Independence"
    ]
    
    plt.suptitle("Multivariate Independence Testing (Increasing Sample Size)", y=0.93, va='baseline')
    
    for i, row in enumerate(ax):
        for j, col in enumerate(row):
            count = 5*i + j
            sim = list(SIMULATIONS.keys())[count]
            #sim = SIMULATIONS[count]
            
            #for test in tests:
            power = np.genfromtxt('C:/Users/siptest/Desktop/NDD/{}_{}HonestSamplingPower.csv'.format(sim, "UF"),
                                  delimiter=',')
            col.plot(SAMP_SIZES, power, label="UF", lw=2)

            col.set_xticks([])
            if i == 3:
                col.set_xticks([SAMP_SIZES[0], SAMP_SIZES[-1]])
            col.set_ylim(-1.05, 1.05)
            col.set_yticks([])
            if j == 0:
                col.set_yticks([-1, 0, 1])
            col.set_title(sim_title[count])

    fig.text(0.5, 0.07, 'Sample Size', ha='center')
    fig.text(0.07, 0.5, 'Statistical Power', va='center', rotation='vertical')
    leg = plt.legend(bbox_to_anchor=(0.5, 0.07), bbox_transform=plt.gcf().transFigure,
                     ncol=5, loc='upper center')
    leg.get_frame().set_linewidth(0.0)
    for legobj in leg.legendHandles:
        legobj.set_linewidth(5.0)
    plt.subplots_adjust(hspace=.50)
    plt.savefig('C:/Users/siptest/Desktop/NDD/indep_power_sampsize_allSim_HonestUF.png', transparent=True, bbox_inches='tight')

In [None]:
plot_power()