In [1]:
from scipy.special import gamma, loggamma
import numpy as np


class Dirichlet:
    
    def __init__(self, alpha):
        self.alpha = alpha
        
    ## In the following functions:
    
    ## probs correspond to the prior predictive distribution probabilities
    ## expert_probs correspond to the elicited probabilities from the expert
    
    ## sample_probs and sample_expert_probs are the same quantities but for multiple sets of covariates (J), each of which may have different partitions
    
    ## For both sample_probs and sample_expert_probs, the probabilities for each j = 1,...,J are in the j'th row
    
    
    ## Function to calculate the approximation of the MLE of alpha for J=1
        
    def alpha_mle(self, probs, expert_probs):
        
        #assert probs.ndim == 1 and expert_probs.ndim == 1, "This operation requires one set of probabilities only"
        assert np.isclose(np.sum(probs), 1) and np.isclose(np.sum(expert_probs), 1), "Probabilities must sum to 1"
        
        K = len(probs)
        
        kl_divergence = - np.sum([probs[k]*(np.log(expert_probs[k]) - np.log(probs[k])) for k in range(K)])
        
        return (K/2 - 1/2) / kl_divergence
    
    
    ## Function to calculate the same quantity for J>1
        
    def alpha_mle_multiple_samples(self, sample_probs, sample_expert_probs):
        
        J = len(sample_probs) if type(sample_probs[0]) in [list, np.ndarray] else 1
                
        if J == 1: return self.alpha_mle(sample_probs, sample_expert_probs)
                
        assert np.all(np.isclose(np.array([np.sum(probs) for probs in sample_probs]), np.ones(J))) and np.all(np.isclose(np.array([np.sum(probs) for probs in sample_expert_probs]), np.ones(J))), "Probabilities must sum to 1"
        
        nom = 0
        den = 0
        
        for j in range(J):
            
            n_j = len(sample_probs[j])
            
            nom += (n_j - 1)/2
            
            kl_divergence = - np.sum([sample_probs[j][k]*(np.log(sample_expert_probs[j][k]) - np.log(sample_probs[j][k])) for k in range(n_j)])
            
            den += kl_divergence
            
        return nom / den
    
    ## Simple function for the PDF of the Dirichet distribution
    
    def pdf(self, probs, expert_probs):
        
        #assert probs.ndim == 1 and expert_probs.ndim == 1, "Pdf is defined for one set of probabilities only"
        assert np.isclose(np.sum(probs), 1) and np.isclose(np.sum(expert_probs), 1), "Probabilities must sum to 1"
        
        reset = 0
        
        if self.alpha is None:
            reset = 1
            self.alpha = self.alpha_mle_multiple_samples(probs, expert_probs)
        
        num_1 = gamma(self.alpha)
        den_1 = np.prod([gamma(self.alpha*prob) for prob in probs])
        pt_1 = num_1 / den_1
                
        pt_2 = np.prod([expert_probs[i]**(self.alpha*probs[i] - 1) for i in range(len(probs))])
        
        if reset == 1: self.alpha = None
        
        return pt_1 * pt_2
    
    
    ## Function for log likelihood for J=1
        
    def llik(self, probs, expert_probs):
                
        #assert probs.ndim == 1 and expert_probs.ndim == 1, "Likelihood is defined for one set of probabilities only"
        
        assert np.isclose(np.sum(probs), 1) and np.isclose(np.sum(expert_probs), 1), "Probabilities must sum to 1"
        
        reset = 0
        
        if self.alpha is None:
            reset = 1
            self.alpha = self.alpha_mle_multiple_samples(probs, expert_probs)
        
        loggamma_alpha = loggamma(self.alpha)
        
        num_1 = loggamma_alpha
        den_1 = np.sum([loggamma_alpha + loggamma(prob) for prob in probs])
        pt_1 = num_1 - den_1
        
        pt_2 = np.sum([(self.alpha*probs[i] - 1) * np.log(expert_probs[i]) for i in range(len(probs))])
        
        if reset == 1: self.alpha = None
        
        return pt_1 + pt_2
    
    ## Sum of log-likelihoods. This will be used in later stages during optimization
    
    def sum_llik(self, sample_probs: list, sample_expert_probs: list):
        
        J = len(sample_probs) if type(sample_probs[0]) in [list, np.ndarray] else 1
                
        if J == 1: return self.llik(sample_probs, sample_expert_probs)
        
        assert np.all(np.isclose(np.array([np.sum(probs) for probs in sample_probs]), np.ones(J))) and np.all(np.isclose(np.array([np.sum(probs) for probs in sample_expert_probs]), np.ones(J))), "Probabilities must sum to 1"
        
        reset = 0
        
        if self.alpha is None:
            reset = 1
            self.alpha = self.alpha_mle_multiple_samples(sample_probs, sample_expert_probs)
        
        total_llik = 0
        
        for j in range(J):
            
            total_llik += self.llik(sample_probs[j], sample_expert_probs[j])
            
        if reset == 1: self.alpha = None
            
        return total_llik

In [2]:
import os
import pandas as pd

class PPEProbabilities:
    
    def __init__(self, target_type, path):
        self.target_type = target_type
        self.path = path
        
    def get_expert_data(self, expert_input):        
        
        if self.target_type == "discrete":
            
            if self.path:
                if os.path.isfile(expert_input):  ## Checking if the input is a single file or a folder (which is assumed to contain files). Note that if we have different number of partitions for different covariate sets, we need a folder to store them
                    expert_input = pd.read_csv(expert_input, index_col=0)
                    elicited_data = expert_input.to_numpy()
                     
                else: ## if not, then the path must lead to a folder containing multiple files. In the discrete case, we assume that each file contains the classes in column 1 and the probabilities at column 2
                    
                    files = os.listdir(expert_input)

                    # Filter only CSV files
                    csv_files = [file for file in files if file.endswith('.csv')]
                    
                    elicited_covariate_sets = []

                    # Loop through each CSV file and process its contents
                    for csv_file in csv_files:
                        df = pd.read_csv(expert_input + "/" + csv_file, index_col=0)
                        elicited_covariate_set = df.to_numpy()
                        
                        elicited_covariate_sets.append(elicited_covariate_set)
                        
                    elicited_data = np.zeros((elicited_covariate_sets[0].shape[0], len(elicited_covariate_sets) + 1))
                    
                    elicited_data[:,0] = elicited_covariate_sets[0][:,0]
                    
                    for j, set in enumerate(elicited_covariate_sets):
                        
                        elicited_data[:,j+1] = set[:,-1]
                        
            else:
                elicited_data = expert_input   ## the input is a matrix containing the classes and the corresponding probabilities
                
        
            elicited_data = elicited_data.astype(float)  ## ensuring that all values are numerical      
            
            partitions = elicited_data[:,0]
            
            expert_probabilities = [elicited_data[:,j+1] for j in range(elicited_data.shape[1] - 1)]
        
        if self.target_type == "continuous":
            
            ## Goal format: J separate matrices that have three columns; the first two being the partitions and third being the corresponding probabilities
            
            if self.path:
                if os.path.isfile(expert_input):  ## Checking if the input is a single file or a folder (which is assumed to contain files)
                    expert_input = pd.read_csv(expert_input, index_col=0)
                    elicited_data = expert_input.to_numpy()
                     
                else: ## if not, then the path must lead to a folder containing multiple files. In the continuous case, we assume that each file contains three columns; the first two being the partitions and third being the corresponding probabilities
                    
                    files = os.listdir(expert_input)

                    # Filter only CSV files
                    csv_files = [file for file in files if file.endswith('.csv')]
                    
                    elicited_data = []

                    # Loop through each CSV file and process its contents
                    for csv_file in csv_files:
                        df = pd.read_csv(expert_input + "/" + csv_file, index_col=0)
                        elicited_covariate_set = df.to_numpy()
                        
                        elicited_data.append(elicited_covariate_set)
                        
                    
                        
            else:
                elicited_data = expert_input   ## the input is a matrix containing the partitions and the corresponding probabilities
                
        
            elicited_data = [cov_set.astype(float) for cov_set in elicited_data]  ## ensuring that all values are numerical
            
            partitions = [covariate_set[:,[0,1]] for covariate_set in elicited_data]
            expert_probabilities = [covariate_set[:, -1] for covariate_set in elicited_data]
            
            
        return partitions, expert_probabilities
    
    ## discrete data: "partitions" are an array containing the classes and "expert_probabilities" are a matrix with one column for each J
    ## continuous data: "partitions" are a list of length J, containing one partition for each covariate set and "expert_probabilities" is a list of same length, containing the respective probabilities
    
    
    
    def ppd_probs(self, samples, partitions):
                
        
        if self.target_type == "discrete":
                
            J = samples.shape[1] ## Each column in "samples" corresponds to one set of covariates
            
            N_samples = samples.shape[0]
            
            N_classes = len(partitions)
            
            ## Here, the samples come from the prior predictive distribution and contain values for y, which is discrete
            
            ## In order to get the probabilities for each class c, we simply compute #(sample = c) / #(sample)
            
            model_probabilities = []
            
            
            for j in range(J):
                
                cov_set_j = samples[:,j]
                
                probs_list = np.zeros(N_classes)
                
                for i,C in enumerate(partitions):
                                        
                    probs_list[i] = np.sum(cov_set_j == C) / N_samples
                                    
                model_probabilities.append(probs_list)
                
                
        if self.target_type == "continuous":
                        
            
            J = samples.shape[1] ## Each column in "samples" corresponds to one set of covariates
    
            N_samples = samples.shape[0]
                
            ## We want the same format as the one of the elicited probabilities. For that reason,
            ## the output will be a list of probabilities
            
            model_probabilities = []
            
            for j in range(J):
                
                
                partition = np.copy(partitions[j])
                cov_set_j = samples[:,j]
                
                N_partitions = partition.shape[0]
                
                ## When sampling, it is possible that we get a value that is outside the partitions. In that case, we redifine the bounds according to the sampled value
                ## E.g. if the lower bound among all partitions is 15 and we sample the value 12, the new lower bound will be 12
                ## This however should not happen too often in the sampling process, as the lower and upper bounds should be wide enough to contain all samples

                sample_min = np.min(cov_set_j)
                sample_max = np.max(cov_set_j)
                
                if partition[0,0] > sample_min:
                    partition[0,0] = sample_min
                    
                if partition[-1,1] < sample_max:
                    partition[-1,1] = sample_max
                    
                probs_list = np.zeros(N_partitions)
                    
                for i in range(N_partitions):
                    
                    lower_bound = partition[i][0]
                    upper_bound = partition[i][1]
                                        
                    count = np.sum((cov_set_j >= lower_bound) & (cov_set_j <= upper_bound))
                    
                    probs_list[i] = count / N_samples
                                            
                model_probabilities.append(probs_list)


        return model_probabilities

### Some simple checks regarding the Dirichlet class

In [17]:
### Some simple checks:


dir_1 = Dirichlet(1)

incorrect_probs = np.array([0.5,0.1,0.2,0.3])
expert_probs = np.array([0.2,0.15,0.25,0.4])

## both give an error, as they should

## print(dir_1.alpha_mle(incorrect_probs, expert_probs))
## print(dir_1.alpha_mle_multiple_samples(incorrect_probs, expert_probs))

        
probs = np.array([0.3,0.05,0.2,0.45])
expert_probs = np.array([0.2,0.15,0.25,0.4])


print(dir_1.pdf(probs, expert_probs))
print(dir_1.llik(probs, expert_probs))
print(dir_1.alpha_mle(probs, expert_probs))


dir_2 = Dirichlet(None) 
dir_3 = Dirichlet(dir_1.alpha_mle(probs, expert_probs))

##checking that if alpha is None, then it is computed based on dirichlet mle

print(dir_2.llik(probs, expert_probs) == dir_3.llik(probs, expert_probs)) 

print("-----------------")

sample_incorrect_probs = [[0.3,0.05,0.2,0.45], [0.35,0.05,0.2,0.45]]
sample_expert_probs = [[0.2,0.15,0.25,0.4], [0.2,0.15,0.25,0.4]]

## Gives an error

## print(dir_1.sum_llik(sample_incorrect_probs, sample_expert_probs))



sample_probs = [np.array([0.3,0.05,0.2,0.45]), np.array([0.15,0.25,0.5,0.1])]
sample_expert_probs = [np.array([0.2,0.15,0.25,0.4]), np.array([0.1,0.2,0.5,0.2])]

print(dir_1.sum_llik(sample_probs, sample_expert_probs))
print(dir_1.alpha_mle_multiple_samples(sample_probs, sample_expert_probs))

print("----------------")


dir_4 = Dirichlet(dir_1.alpha_mle_multiple_samples(sample_probs, sample_expert_probs))

##checking that if alpha is None, then it is computed based on dirichlet mle

print(dir_4.sum_llik(sample_probs, sample_expert_probs) == dir_2.sum_llik(sample_probs, sample_expert_probs)) 

0.17835805432006951
-1.7239622080059016
19.978004541555926
True
-----------------
-2.7055326287365427
24.515109566030482
----------------
True


In [20]:
## Checking that having different probability dimensions is ok (necessary for having different number of partitions)

dir_1 = Dirichlet(None) 


sample_probs = [np.array([0.3,0.05,0.2,0.45]), np.array([0.3, 0.25, 0.45])] ## 4 and 3 probabilities (partitions)
sample_expert_probs = [np.array([0.2,0.15,0.25,0.4]), np.array([0.2, 0.7, 0.1])]



print(dir_1.sum_llik(sample_probs, sample_expert_probs))
print(dir_1.alpha_mle_multiple_samples(sample_probs, sample_expert_probs))

-20.237703436114796
4.057439841963832


# Running some tests to ensure that the two classes work as expected

## Discrete data

In [5]:


############### Getting the data from a folder ###############

probs_1 = np.array([[0, 1, 2], [0.5, 0.3, 0.2]]).T
probs_2 = np.array([[0, 1, 2], [0.4, 0.4, 0.2]]).T
probs_3 = np.array([[0, 1, 2], [0.3, 0.1, 0.6]]).T

# Folder path to store CSV files
folder_path = '/Users/panos/Desktop/Internship/test_folder'

probs_1 = pd.DataFrame(probs_1)
probs_2 = pd.DataFrame(probs_2)
probs_3 = pd.DataFrame(probs_3)

probs_1.to_csv(folder_path + "/probs_1.csv")
probs_2.to_csv(folder_path + "/probs_2.csv")
probs_3.to_csv(folder_path + "/probs_3.csv")

prob_class = PPEProbabilities(target_type="discrete", path=True)


partitions, expert_probs = prob_class.get_expert_data(folder_path)

print(partitions)
print(expert_probs)


[0. 1. 2.]
[array([0.3, 0.1, 0.6]), array([0.4, 0.4, 0.2]), array([0.5, 0.3, 0.2])]


In [6]:
############### Getting the data from a file ###############

probs_file = np.array([[0, 1, 2], [0.5, 0.3, 0.2], [0.4, 0.4, 0.2], [0.3, 0.1, 0.6]]).T

# Path to store the CSV file

probs_file = pd.DataFrame(probs_file)

probs_file.to_csv('/Users/panos/Desktop/Internship/probs_file.csv')

prob_class = PPEProbabilities(target_type="discrete", path=True)


partitions, expert_probs = prob_class.get_expert_data('/Users/panos/Desktop/Internship/probs_file.csv')

print(partitions)
print(expert_probs)

[0. 1. 2.]
[array([0.5, 0.3, 0.2]), array([0.4, 0.4, 0.2]), array([0.3, 0.1, 0.6])]


In [7]:
############### Feeding the data directly ###############
 

probs = np.array([[0, 1, 2], [0.5, 0.3, 0.2], [0.4, 0.4, 0.2], [0.3, 0.1, 0.6]]).T

prob_class = PPEProbabilities(target_type="discrete", path=False)


partitions, expert_probs = prob_class.get_expert_data(probs)

print(partitions)
print(expert_probs)

[0. 1. 2.]
[array([0.5, 0.3, 0.2]), array([0.4, 0.4, 0.2]), array([0.3, 0.1, 0.6])]


In [9]:
######## Test where we input samples from a simple prior predictive distribution and get model probabilities for the partitions ########

elicited_data_discrete = np.array([[0,1],[0.3,0.7],[0.6,0.4]]).T


cov_set_1 = np.random.binomial(n = 1, p = 0.7, size = 2000)
cov_set_2 = np.random.binomial(n = 1, p = 0.4, size = 2000)

samples = np.vstack((cov_set_1, cov_set_2)).T

prob_class = PPEProbabilities(target_type="discrete", path=False)


partitions, expert_probs = prob_class.get_expert_data(elicited_data_discrete)

model_probs = prob_class.ppd_probs(samples, partitions)

print(partitions)
print(expert_probs)
print(model_probs)
print("------------")

## Feeding these probabilities to dirichlet

dir = Dirichlet(None)

print(dir.alpha_mle_multiple_samples(model_probs, expert_probs))  ## very high alpha, which makes sense since we used the "expert" probabilities to sample

print(dir.sum_llik(model_probs, expert_probs))

dir_2 = Dirichlet(10) ## trying fixed alpha

print(dir_2.sum_llik(model_probs, expert_probs))

[0. 1.]
[array([0.3, 0.7]), array([0.6, 0.4])]
[array([0.2915, 0.7085]), array([0.599, 0.401])]
------------
5712.568749470156
-94695.3210428995
-37.9571996999857


## Continuous data

In [23]:

############### Getting the data from a folder ###############

probs_1 = np.array([[0, 100, 200], [100, 200, 300], [0.5, 0.3, 0.2]]).T  ## partitions (0,100), (100, 200), (200, 300)
probs_2 = np.array([[0, 150, 200], [150, 200, 300], [0.4, 0.4, 0.2]]).T  ## partitions (0,150), (150, 200), (200, 300)
probs_3 = np.array([[0, 100, 150, 200], [100, 150, 200, 300], [0.3, 0.1, 0.4, 0.2]]).T   ## Different number of partitions here! partitions (0,100), (100, 150), (150, 200), (200, 300)

# Folder path to store CSV files
folder_path = '/Users/panos/Desktop/Internship/test_folder_2'

probs_1 = pd.DataFrame(probs_1)
probs_2 = pd.DataFrame(probs_2)
probs_3 = pd.DataFrame(probs_3)

probs_1.to_csv(folder_path + "/probs_1.csv")
probs_2.to_csv(folder_path + "/probs_2.csv")
probs_3.to_csv(folder_path + "/probs_3.csv")

prob_class = PPEProbabilities(target_type="continuous", path=True)


partitions, expert_probs = prob_class.get_expert_data(folder_path)

for partition in partitions:
    print(partition)

print(expert_probs)


[[  0. 100.]
 [100. 150.]
 [150. 200.]
 [200. 300.]]
[[  0. 150.]
 [150. 200.]
 [200. 300.]]
[[  0. 100.]
 [100. 200.]
 [200. 300.]]
[array([0.3, 0.1, 0.4, 0.2]), array([0.4, 0.4, 0.2]), array([0.5, 0.3, 0.2])]


In [22]:
############### Feeding the data directly ###############

probs_1 = np.array([[0, 100, 200], [100, 200, 300], [0.5, 0.3, 0.2]]).T
probs_2 = np.array([[0, 150, 200], [150, 200, 300], [0.4, 0.4, 0.2]]).T
probs_3 = np.array([[0, 100, 150, 200], [100, 150, 200, 300], [0.3, 0.1, 0.4, 0.2]]).T   ## Different number of partitions here!

probs = [probs_1, probs_2, probs_3]

prob_class = PPEProbabilities(target_type="continuous", path=False)


partitions, expert_probs = prob_class.get_expert_data(probs)

for partition in partitions:
    print(partition)

print(expert_probs)

[[  0. 100.]
 [100. 200.]
 [200. 300.]]
[[  0. 150.]
 [150. 200.]
 [200. 300.]]
[[  0. 100.]
 [100. 150.]
 [150. 200.]
 [200. 300.]]
[array([0.5, 0.3, 0.2]), array([0.4, 0.4, 0.2]), array([0.3, 0.1, 0.4, 0.2])]


In [36]:
######## Test where we input samples from a simple prior predictive distribution and get model probabilities for the partitions ########

probs_1 = np.array([[0, 100, 200], [100, 200, 300], [0.5, 0.3, 0.2]]).T
probs_2 = np.array([[0, 150, 200], [150, 200, 300], [0.4, 0.4, 0.2]]).T
probs_3 = np.array([[0, 100, 150, 200], [100, 150, 200, 300], [0.3, 0.1, 0.4, 0.2]]).T   ## Different number of partitions here!

probs = [probs_1, probs_2, probs_3]





samples_1 = np.random.normal(loc = 66, scale = 45, size = 10000)
samples_2 = np.random.normal(loc = 150, scale = 35, size = 10000)
samples_3 = np.random.normal(loc = 200, scale = 35, size = 10000)

samples = np.vstack((samples_1, samples_2, samples_3)).T


prob_class = PPEProbabilities(target_type="continuous", path=False)


partitions, expert_probs = prob_class.get_expert_data(probs)

model_probs = prob_class.ppd_probs(samples, partitions)

print(partitions)
print(expert_probs)
print(model_probs)
print("------------")



dir = Dirichlet(None)

print(dir.alpha_mle_multiple_samples(model_probs, expert_probs))  ## low alpha, which makes sense since we used a "random" distribution to sample, loosely following the expert distribution

print(dir.sum_llik(model_probs, expert_probs))

dir_2 = Dirichlet(10) ## trying fixed alpha

print(dir_2.sum_llik(model_probs, expert_probs))


[array([[  0., 100.],
       [100., 200.],
       [200., 300.]]), array([[  0., 150.],
       [150., 200.],
       [200., 300.]]), array([[  0., 100.],
       [100., 150.],
       [150., 200.],
       [200., 300.]])]
[array([0.5, 0.3, 0.2]), array([0.4, 0.4, 0.2]), array([0.3, 0.1, 0.4, 0.2])]
[array([0.7788, 0.22  , 0.0012]), array([0.5072, 0.415 , 0.0778]), array([0.0024, 0.0749, 0.4232, 0.4995])]
------------
4.482511758834674
-40.166197918365945
-130.1189527892806
