In [7]:
import numpy as np
import pandas as pd
import graphviz
from sklearn import datasets
from sklearn import preprocessing
from sklearn import tree

# Code has NOT been checked for PEP8 yet.

# Reading the contents of the iris dataset for testing purposes.
iris = datasets.load_iris()
# iris = np.column_stack((iris.data, iris.target))

# Number of perturbed samples to be generated.
n = 10000 
# Number of bins for the histograms of continous attributes.
num_bins = 20 

In [8]:
# Fitting a decision tree model to the iris dataset.
clf = tree.DecisionTreeClassifier()
clf = clf.fit(iris.data, iris.target)
dot_data = tree.export_graphviz(clf, out_file=None,
                                feature_names=iris.feature_names,
                                class_names=iris.target_names,
                                filled=True, rounded=True,
                                special_characters=True)
graph = graphviz.Source(dot_data)
# graph

In [9]:
# Functions used to generate perturbed samples.

def discrete_bucketize(np_vector):
    """
    Estimates the histogram buckets for a one-dimension, discrete valued dataset.
    
    Input is the vector, np_vector, of discrete values that is to be bucketized.
    Outputs are two vectors. The first, named values, contains the normalized version of the unique values
    found in np_vector. Normalized to mean zero and unit variance.
    The second output vector, named multinom_rand, contains the multinomial probability distribution for the
    elements of the values vector.
    """
    
    values,freqs = np.unique(np_vector, return_counts=True)
    freqs = freqs/np.sum(freqs)
    values = values.astype(float)
    values = preprocessing.scale(values)
    multinom_rand = np.random.multinomial(n, freqs, 1)[0]
    return(values, multinom_rand)


def continuous_bucketize(num_bins, np_vector):
    """
    Estimates the histogram buckets for a one-dimension, continous valued dataset.
    
    Inputs are the desired number of bins for the histogram, num_bins, and np_vector, the vector of continous
    values that are to be bucketized. 
    Outputs are two vectors. The first, named h_bins, is a vector of length num_bins + 1 that lists the values
    corresponding to the bin edges in the histogram, after they are normalized to mean zero and unit variance.
    The second output vector, named freqs, contains the probabilities that a randomly generated element will
    belong in each of the histogram's bins.
    """
    
    np_vector = preprocessing.scale(np_vector)
    freqs, h_bins = np.histogramdd(np_vector, bins = num_bins)
    freqs = freqs/np.sum(freqs)
    h_bins = np.asarray(h_bins[0])
    return(h_bins, freqs)


def discrete_rand_samples(n, values, multinom_rand):
    """
    Generates n random values following the multinomial probability distribution provided as input.
    
    Inputs are n, the number of random numbers that are to be generated, the vector values that lists
    all the possible values that n can have, and multinom_rand which contains the multinomial probability
    distribution that corresponds to each element in the values vector.
    Output is the vector rand that contains n numbers chosen at random, with replacement, from the
    elements in the values vector, following the multinomial probability distribution that was provided.
    """
    
    rand = np.zeros(n)
    k = 0
    for j in range(0, len(values)):
        rand[k:k+multinom_rand[j]] = values[j]
        k = k + multinom_rand[j]
    return(rand)


def continuous_rand_samples(n, bins, freqs):
    """
    Generates n random values following the probability distribution provided as input.
    
    Inputs are n, the number of random numbers that are to be generated, the vector bins that lists
    the values the bin edges of the histogram, and the vector freqs that lists the probability that
    a value that is randomly generated will be contained by the corresponding histogram bin.
    Each random number that is generated is chosen from within a uniform probability distribution with 
    end values equal to those of a given histogram bin.
    Output is the vector tot_samples that contains n random elements chosen as described above.
    """
    
    tot_samples = np.zeros(1)
    samples_bins = np.random.multinomial(n, freqs, 1)
    for j in range(0, len(freqs)):
        samples = np.random.uniform(bins[j], bins[j+1], samples_bins[0][j])
        tot_samples = np.hstack((tot_samples, samples))
    tot_samples = tot_samples[1:, ]
    return(tot_samples)


In [10]:
# This code uses the iris dataset to evaluate the distributions of each attribute and generate
# the perturbed samples. It calls the functions above.
array = iris.target
values, multinom_rand = discrete_bucketize(array)
output = discrete_rand_samples(n, values, multinom_rand)

perturbed_samples = np.zeros(n)
for j in range(0, iris.data.shape[1] ):
    array = iris.data[:, j]
    h_bins, freqs = continuous_bucketize(num_bins, array)
    output = continuous_rand_samples(n, h_bins, freqs)
    perturbed_samples = np.vstack((perturbed_samples, output))
perturbed_samples = np.transpose(perturbed_samples[1:,])
# Once perturbed samples have been generated, the line of code below uses the decision tree we 
# fitted earlier, to get a predicted classification for each of our perturbed samples.
class_perturb_samples = clf.predict(perturbed_samples)

In [13]:
# Computing the weights (this is not finished)
inst_num = np.round(np.random.uniform(0, iris.data.shape[0], 1))
inst_num = inst_num[0].astype(int)
x = iris.data[inst_num,:]
sigma = np.std(np.sum((perturbed_samples - x)**2, axis=1))
weights = np.exp(-np.sum((perturbed_samples - x)**2, axis=1)/sigma)
weights

array([0.01254673, 0.012059  , 0.01277307, ..., 0.64967583, 0.67483733,
       0.66559725])

In [12]:
# Ignore this code. Used for testing purposes.
x = np.array([[1,2,3,4],[5,6,7,8],[9,10,11,12],[13,14,15,16]])
y = ([1,1,1,1])
sigma = np.var(np.sum((x-y)**2, axis=1))
np.exp(-np.sum((x-y)**2, axis=1)/sigma)

array([0.99981604, 0.99834557, 0.99520183, 0.99040066])