In [None]:
import numpy as np
import random
import math
import itertools
from algorithm_greedy import*
from algorithm_baselines import*
from algorithm_pricebased import*
from algorithm_buyer_insight import*
from concurrent.futures import ProcessPoolExecutor, as_completed

In [None]:
# Function to generate the resulting accuracy of a combination given its individual accuracies and the beta parameter
def generate_combination_accuracies(accuracies, beta):
    """Combine individual dataset accuracies into a single accuracy value.

    Every accuracy is first scaled by ``exp(beta - 1)``. Then:

    * ``beta == 0``: the geometric mean of the scaled accuracies is returned
      (this branch is not capped at 1).
    * otherwise: ``(sum(scaled ** beta) / n ** max(0, 1 - beta)) ** (1 / beta)``
      is returned, capped at 1, where ``n`` is the number of accuracies.

    Args:
        accuracies: Sequence of individual accuracies in the combination.
        beta: Parameter controlling how the accuracies are aggregated.

    Returns:
        The combined accuracy as a scalar.
    """
    values = np.array(accuracies)
    n_values = len(values)
    scaled = values * np.exp(beta - 1)

    # beta == 0 is handled separately because 1 / beta is undefined there.
    if beta == 0:
        return np.exp(np.log(scaled).mean())

    # The normaliser grows with the combination size only when beta < 1.
    normaliser = n_values ** max(0, 1 - beta)
    aggregated = (scaled ** beta).sum() / normaliser
    return min(1, np.power(aggregated, 1 / beta))

In [None]:
#Generate the catalog
def generate_datasets(beta, size_of_V):
    """Generate accuracies for the individual datasets and all their combinations.

    Individual accuracies are drawn from a normal distribution with mean 0.15
    and standard deviation 0.1, then clipped to [0, 1]. Every non-empty subset
    of the datasets is enumerated in order of increasing size and scored with
    ``generate_combination_accuracies``.

    Args:
        beta: Parameter forwarded to ``generate_combination_accuracies``.
        size_of_V: Number of individual datasets to generate.

    Returns:
        A tuple ``(individual_accuracies, combined_accuracies)`` where
        ``combined_accuracies`` holds the accuracies of the subsets that follow
        the first ``size_of_V`` enumerated entries (the single-dataset subsets).
    """
    individual_accuracies = np.clip(np.random.normal(0.15, 0.1, size_of_V), 0, 1)

    # Lazily walk through every non-empty subset, smallest subsets first.
    subset_sizes = range(1, len(individual_accuracies) + 1)
    all_subsets = itertools.chain.from_iterable(
        itertools.combinations(individual_accuracies, r) for r in subset_sizes
    )
    subset_accuracies = np.fromiter(
        (generate_combination_accuracies(subset, beta) for subset in all_subsets),
        dtype=float,
    )

    # The leading size_of_V entries belong to the single-dataset subsets; drop them.
    return individual_accuracies, subset_accuracies[size_of_V:]

In [None]:
# Generate a set of keys for the combinations in the catalog
def generate_keys(catalogue_size, size_of_V):
    """Build tuple keys identifying the dataset combinations in the catalogue.

    The individual datasets are numbered ``1 .. size_of_V``. Keys are produced
    for every non-empty subset, ordered by subset size and, within a size, in
    the order yielded by ``itertools.combinations``. At most ``catalogue_size``
    keys are kept.

    Args:
        catalogue_size: Maximum number of keys to generate (non-negative integer).
        size_of_V: Number of individual datasets.

    Returns:
        A tuple ``(single_key_combinations, multiple_key_combinations,
        combination_keys_as_tuples)``: the one-element keys, the keys with more
        than one element, and the full ordered list of keys.

    Raises:
        ValueError: If ``catalogue_size`` is negative.
    """
    if catalogue_size < 0:
        raise ValueError("catalogue_size must be non-negative")

    original_keys = range(1, size_of_V + 1)

    # Enumerate subsets lazily so that generation stops as soon as the
    # catalogue limit is reached instead of visiting every subset size.
    all_subsets = itertools.chain.from_iterable(
        itertools.combinations(original_keys, n) for n in range(1, size_of_V + 1)
    )
    combination_keys_as_tuples = list(itertools.islice(all_subsets, catalogue_size))

    single_key_combinations = [t for t in combination_keys_as_tuples if len(t) == 1]
    multiple_key_combinations = [t for t in combination_keys_as_tuples if len(t) > 1]

    return single_key_combinations, multiple_key_combinations, combination_keys_as_tuples

In [None]:
# Generate the prices of the combinations in the catalog
def generate_prices(individual_accuracies, combination_accuracies, price_correlation):
    """Generate prices for every individual dataset and every combination.

    Individual prices are built by mixing the standardized accuracies with
    standardized Gaussian noise, weighted by ``price_correlation`` and
    ``sqrt(1 - price_correlation**2)`` respectively. The mix is rescaled to the
    mean and standard deviation of the accuracies and divided by its maximum.
    The price of a combination is the sum of the prices of its member datasets.

    Args:
        individual_accuracies: Array of accuracies of the individual datasets.
        combination_accuracies: Accuracies of the combinations. Not used in the
            price computation; kept so existing callers keep working.
        price_correlation: Weight of the accuracies in the price mix.

    Returns:
        A tuple ``(full_prices, keys_K, keys_U, keys)``: the individual prices
        followed by the combination prices, and the single-dataset keys,
        multi-dataset keys and full key list from ``generate_keys``.
    """
    n_individual = len(individual_accuracies)
    keys_K, keys_U, keys = generate_keys((2 ** n_individual) - 1, n_individual)

    accuracy_mean = np.mean(individual_accuracies)
    accuracy_std = np.std(individual_accuracies)

    random_list = np.random.normal(size=n_individual)
    original_standardized = (individual_accuracies - accuracy_mean) / accuracy_std
    random_standardized = (random_list - np.mean(random_list)) / np.std(random_list)

    # Blend the accuracy signal with the noise, then map back to the accuracy scale.
    new_list = price_correlation * original_standardized + np.sqrt(1 - price_correlation**2) * random_standardized
    new_list_scaled = new_list * accuracy_std + accuracy_mean
    individual_prices = new_list_scaled / np.max(new_list_scaled)

    # Keys are 1-based dataset numbers, hence the `key - 1` when indexing prices.
    combination_prices_list = [
        sum(individual_prices[key - 1] for key in combo) for combo in keys_U
    ]

    full_prices = np.hstack((individual_prices, combination_prices_list))
    return full_prices, keys_K, keys_U, keys

In [None]:
# Create the set of known K datasets and unknown U combinations
def create_U_and_K_sets(original_array, size_of_K, size_of_V, keys):
    """Split the catalogue columns into a known set K and an unknown set U.

    K is made of ``size_of_K`` columns taken from the first ``size_of_V``
    columns of ``original_array``; U is made of every other column. When
    ``size_of_K == size_of_V`` the first ``size_of_V`` columns are used as K in
    their original order; otherwise the K columns are drawn at random without
    replacement.

    Args:
        original_array: 2-D array whose columns are catalogue entries.
        size_of_K: Number of columns to place in K.
        size_of_V: Number of leading columns that K may be drawn from.
        keys: Sequence of keys, one per column of ``original_array``.

    Returns:
        A tuple ``(K, U, keys_K, keys_U)`` with the two column subsets and the
        lists of keys matching their columns.

    Raises:
        AssertionError: If ``size_of_K`` is larger than ``size_of_V``.
    """
    assert size_of_K <= size_of_V, "size_of_K should not be larger than size_of_V"
    num_cols = original_array.shape[1]

    if size_of_K == size_of_V:
        # K is exactly the first size_of_V columns; U is everything after them.
        K = original_array[:, :size_of_V]
        U = original_array[:, size_of_V:]
        keys_K = list(keys[:size_of_V])
        keys_U = list(keys[size_of_V:])
    else:
        # Randomly select size_of_K distinct columns among the first size_of_V.
        random_col_indices = np.random.choice(size_of_V, size=size_of_K, replace=False)
        K = original_array[:, random_col_indices]
        keys_K = [keys[i] for i in random_col_indices]

        # Every column that was not selected for K belongs to U.
        mask = np.ones(num_cols, dtype=bool)
        mask[random_col_indices] = False
        U = original_array[:, mask]
        keys_U = [keys[i] for i in np.flatnonzero(mask)]

    return K, U, keys_K, keys_U

In [None]:
# Filter the sets into K(B) and U(B)
def scale_sets(K, U, budget_scale, keys_K, keys_U):
    """Derive the budget B and restrict/sort K and U accordingly.

    The budget is ``budget_scale`` times the largest value in the second row
    of U. Columns of U whose second-row value exceeds the budget are dropped.
    K is then sorted by its first row in descending order and U by its second
    row in ascending order; the key lists are reordered to match.

    In this notebook the first row holds accuracies and the second row prices.

    Args:
        K: 2-D array of known catalogue entries (columns).
        U: 2-D array of unknown catalogue entries (columns).
        budget_scale: Multiplier applied to the maximum second-row value of U.
        keys_K: Keys matching the columns of K.
        keys_U: Keys matching the columns of U.

    Returns:
        A tuple ``(K, U, keys_K, keys_U, B)`` with the sorted/filtered sets,
        their keys, and the scalar budget ``B``.
    """
    # The budget is a single scalar derived from the maximum of U's second row.
    B = np.max(U[1, :]) * budget_scale

    # Keep only the columns of U whose second-row value fits within the budget.
    valid_indices_u = np.where(U[1] <= B)[0]
    U = U[:, valid_indices_u]
    keys_U = [keys_U[i] for i in valid_indices_u]

    # Sort K by the first row in descending order and reorder its keys.
    sorted_indices_k = np.argsort(-K[0])
    K = K[:, sorted_indices_k]
    keys_K = [keys_K[i] for i in sorted_indices_k]

    # Sort U by the second row in increasing order and reorder its keys.
    sorted_indices_u = np.argsort(U[1])
    U = U[:, sorted_indices_u]
    keys_U = [keys_U[i] for i in sorted_indices_u]
    return K, U, keys_K, keys_U, B

In [None]:
# Run the algorithms and output their performance and exploration counts
def process_task(B, R, K, U, keys, keys_K, keys_U, risk, gamma, epsilon, runs):
    """Run every algorithm on one problem instance and collect the results.

    ``greedy``, ``inv_greedy`` and ``buyer_insight_y`` are called once each.
    ``probabilistic_pricebased``, ``blind_buyer`` and ``epsilon_greedy_bandit``
    are called ``runs`` times each and their outputs are averaged.

    Args:
        B: Budget; also used to select the entries considered for the offline value.
        R: Value passed to every algorithm together with ``B``.
        K: 2-D array of known catalogue entries (columns).
        U: 2-D array of unknown catalogue entries (columns).
        keys: Keys of the full catalogue (passed to ``buyer_insight_y``).
        keys_K: Keys matching the columns of K (passed to ``buyer_insight_y``).
        keys_U: Keys matching the columns of U (passed to ``buyer_insight_y``).
        risk: Hyperparameter passed to ``probabilistic_pricebased``.
        gamma: Hyperparameter passed to ``buyer_insight_y``.
        epsilon: Hyperparameter passed to ``epsilon_greedy_bandit``.
        runs: Number of repetitions for the three averaged algorithms.

    Returns:
        A 13-tuple: the offline value, then the performance of greedy,
        inverse greedy, probabilistic price-based, blind, epsilon-greedy and
        buyer-insight, followed by the exploration counts of greedy, inverse
        greedy, probabilistic price-based, blind, epsilon-greedy and
        buyer-insight.
    """
    probabilistic_pricebased_step = np.zeros(runs)
    blind_step = np.zeros(runs)
    epsilon_step = np.zeros(runs)

    probabilistic_pricebased_count = np.zeros(runs)
    blind_count = np.zeros(runs)
    epsilon_count = np.zeros(runs)

    # Offline reference: the largest first-row value among all entries of K and U
    # whose second-row value is strictly below B.
    # NOTE(review): scale_sets keeps entries with value <= B while this uses < B;
    # confirm whether the strict comparison is intended.
    complete = np.hstack((K, U))
    affordable_values = complete[0][complete[1] < B]
    offline = affordable_values.max()

    greedy_performance, greedy_count = greedy(K, U, B, R)
    inv_greedy_performance, inv_greedy_count = inv_greedy(K, U, B, R)
    buyer_insight_performance, buyer_insight_count = buyer_insight_y(K, U, B, R, keys, keys_K, keys_U, gamma)

    for k in range(runs):
        probabilistic_pricebased_step[k], probabilistic_pricebased_count[k] = probabilistic_pricebased(K, U, B, R, risk)
        blind_step[k], blind_count[k] = blind_buyer(K, U, B, R)
        epsilon_step[k], epsilon_count[k] = epsilon_greedy_bandit(K, U, B, R, epsilon)

    probabilistic_pricebased_performance = np.mean(probabilistic_pricebased_step)
    blind_performance = np.mean(blind_step)
    epsilon_performance = np.mean(epsilon_step)

    return (
        offline,
        greedy_performance,
        inv_greedy_performance,
        probabilistic_pricebased_performance,
        blind_performance,
        epsilon_performance,
        buyer_insight_performance,
        greedy_count,
        inv_greedy_count,
        np.mean(probabilistic_pricebased_count),
        np.mean(blind_count),
        np.mean(epsilon_count),
        buyer_insight_count,
    )

In [None]:
# Choose the parameters of the test run

beta = 1 # Set the beta value for the test run
price_cor = 0 # Set the price-quality correlation value for the test
iterations = 1 # Set how many iterations you want to run and average for your results
R_values = np.linspace(0.001, 0.2, 40) # Set the range of revelation values you want to test (as a fraction of the budget B)
size_of_V = 10 # Set the number of available individual datasets in V
size_of_K = 3 # Set the number of datasets revealed for free in the known set K
# Multiplier passed to scale_sets to derive the budget B.
# NOTE(review): this parameter is used by the test-bench but had no definition;
# 1 is a placeholder value -- confirm the intended setting.
budget_scale = 1
risk = 0.2 # Set the sacrifice hyperparameter for the probabilistic price-based algorithm
gamma = 1 # Set the gamma hyperparameter for the deterministic buyer-insight algorithm
runs = 100 # Set the number of runs to run the probabilistic algorithms to average the performance
epsilon = 0.9 # Set the epsilon hyperparameter passed to the epsilon-greedy bandit algorithm

In [None]:
# Performance grids: one row per revelation value, one column per iteration.
offline = np.zeros((len(R_values), iterations))
greedy_performance = np.zeros_like(offline)
inv_greedy_performance = np.zeros_like(offline)
probabilistic_pricebased_performance = np.zeros_like(offline)
blind_performance = np.zeros_like(offline)
epsilon_performance = np.zeros_like(offline)
buyer_insight_performance = np.zeros_like(offline)

# Exploration-count grids, laid out exactly like the performance grids.
greedy_counts = np.zeros_like(offline)
inv_greedy_counts = np.zeros_like(offline)
probabilistic_pricebased_counts = np.zeros_like(offline)
blind_counts = np.zeros_like(offline)
epsilon_counts = np.zeros_like(offline)
buyer_insight_counts = np.zeros_like(offline)


#Execute the functions and run the test-bench
# Build the catalogue once: row 0 of `original` holds the accuracies and
# row 1 the prices of every individual dataset and combination.
individual_accs, combination_accs = generate_datasets(beta, size_of_V)
full_accuracies = np.hstack((individual_accs, combination_accs))

full_prices, keys_K, keys_u, keys = generate_prices(individual_accs, combination_accs, price_cor)
original = np.vstack((full_accuracies, full_prices))

# Destination arrays listed in the same order as the values returned by process_task.
result_arrays = (
    offline,
    greedy_performance,
    inv_greedy_performance,
    probabilistic_pricebased_performance,
    blind_performance,
    epsilon_performance,
    buyer_insight_performance,
    greedy_counts,
    inv_greedy_counts,
    probabilistic_pricebased_counts,
    blind_counts,
    epsilon_counts,
    buyer_insight_counts,
)

# NOTE: ProcessPoolExecutor needs the submitted function to be importable by the
# worker processes; functions defined interactively may fail to run on platforms
# that start workers with "spawn". `budget_scale` is expected to be defined
# together with the other test-run parameters.
with ProcessPoolExecutor() as executor:
    futures = {}
    for y in range(iterations):
        K, U, keys_K, keys_U = create_U_and_K_sets(original, size_of_K, size_of_V, keys)
        K, U, keys_K, keys_U, B = scale_sets(K, U, budget_scale, keys_K, keys_U)
        for x, R_fraction in enumerate(R_values):
            # Revelation values are expressed relative to the budget.
            R = R_fraction * B
            future = executor.submit(process_task, B, R, K, U, keys, keys_K, keys_U, risk, gamma, epsilon, runs)
            futures[future] = (x, y)

    # Collect each result exactly once, after every task has been submitted.
    for future in as_completed(futures):
        x, y = futures[future]
        result = future.result()
        for target, value in zip(result_arrays, result):
            target[x, y] = value
                