In [13]:
import pandas as pd
import numpy as np
from openpyxl import load_workbook

In [14]:
# Function to import the data from the Excel file
def load_datasets(benchmark_file='Benchmark.csv', target_file='Target.csv'):
    df_benchmark = pd.read_csv(benchmark_file, delimiter=';', decimal=',')
    df_target = pd.read_csv(target_file, delimiter=';', decimal=',')

    # Convert all columns to numeric, coercing errors to NaN
    df_benchmark = df_benchmark.apply(pd.to_numeric, errors='coerce')
    df_target = df_target.apply(pd.to_numeric, errors='coerce')

    return df_benchmark, df_target

In [15]:
# Function to calculate the necessary metrics for both datasets
def calculate_metrics(df, target_size, weights):
    metrics = {}

    # Handle 'collection' and 'payments' separately
    if 'Payments' in df.columns:
        total_paid = df['Payments'].sum()  # Total paid
        total_debt = df['Debt'].sum()  # Total debt
        metrics['collection'] = total_paid / (total_debt + total_paid) if total_debt + total_paid != 0 else 0  # % of collection
        metrics['payments'] = df['Payments'].notnull().sum() / len(df)
    elif 'LP' in df.columns:
        metrics['collection'] = None
        metrics['payments'] = df['LP'].notnull().sum() / len(df)
    else:
        metrics['collection'] = None
        metrics['payments'] = None

    # Calculate mean for each metric in weights if the column exists
    for column in weights.keys():
        if column in ['collection', 'payments']:
            continue  # already handled
        metrics[column] = df[column].mean(skipna=True) if column in df.columns else None   # Calculate averages for all columns, skipping empty cells

    return metrics

In [16]:
# Objective function to minimize the difference between benchmark and target metrics
def objective_function(selected_indices, df_benchmark, df_target, target_metrics, weights):
    sample = df_benchmark.iloc[selected_indices]  # Select rows based on indices
    sample_metrics = calculate_metrics(sample, len(selected_indices), weights)  # Calculate metrics for sample

    # Calculate the weighted percentage difference
    percentage_diff = 0
    for key in weights.keys():
        percentage_diff += weights[key] * abs((sample_metrics[key] - target_metrics[key]) / target_metrics[key]) * 100

    return percentage_diff

In [17]:
# Beam Search with Stochastic Pruning
def beam_search_stochastic_pruning(df_benchmark, df_target, weights, target_size, beam_width=24, top_k=5):
    target_metrics = calculate_metrics(df_target, target_size, weights)
    candidates = [[]]  # Start with an empty candidate list

    for _ in range(target_size):
        new_candidates = []

        for candidate in candidates:
            available_indices = [i for i in range(len(df_benchmark)) if i not in candidate]

            # Probabilities for stochastic sampling
            probabilities = [1.0 / (i + 1) for i in range(len(available_indices))]
            probabilities = np.array(probabilities) / sum(probabilities)

            sampled_indices = np.random.choice(available_indices, size=min(beam_width, len(available_indices)), replace=False, p=probabilities)

            for idx in sampled_indices:
                new_candidate = candidate + [idx]
                if len(new_candidate) == len(set(new_candidate)):
                    new_candidates.append(new_candidate)

        # Sort candidates by objective value and keep top beam_width
        new_candidates = sorted(new_candidates, key=lambda x: objective_function(x, df_benchmark, df_target, target_metrics, weights))
        candidates = new_candidates[:beam_width]

    # Return top_k solutions
    top_candidates = sorted(candidates, key=lambda x: objective_function(x, df_benchmark, df_target, target_metrics, weights))[:top_k]
    return top_candidates

def beam_search_stochastic_pruning(df_benchmark, df_target, weights, target_size, beam_width=24, top_k=5):
    target_metrics = calculate_metrics(df_target, target_size, weights)
    candidates = [[]]  # Start with an empty candidate list
    score_cache = {}  # Cache to store scores for each candidate

    for _ in range(target_size):
        new_candidates = []

        for candidate in candidates:
            available_indices = [i for i in range(len(df_benchmark)) if i not in candidate]

            # Probabilities for stochastic sampling (bias toward earlier indices)
            for idx in sampled_indices:
                new_candidate = candidate + [idx]
                if len(new_candidate) == len(set(new_candidate)):  # Ensure uniqueness
                    new_candidates.append(new_candidate)

        # Score candidates using caching
        scored_candidates = []
        for cand in new_candidates:
            key = tuple(sorted(cand))
            if key in score_cache:
                score = score_cache[key]
            else:
                score = objective_function(cand, df_benchmark, df_target, target_metrics, weights)
                score_cache[key] = score
            scored_candidates.append((cand, score))

        # Sort candidates by cached scores and keep top ones
        scored_candidates.sort(key=lambda x: x[1])
        candidates = [x[0] for x in scored_candidates[:beam_width]]

    # Return top_k final candidates based on cached scores
    final_candidates = sorted(candidates, key=lambda x: score_cache[tuple(sorted(x))])[:top_k]
    return final_candidates


In [18]:
def write_results_to_csv(df_benchmark, list_of_selected_indices, benchmark_file='Benchmark.csv'):
    # Add a column for each candidate solution
    for i, selected_indices in enumerate(list_of_selected_indices, start=1):
        sample_col = f'Sample {i}'
        df_benchmark[sample_col] = 0
        df_benchmark.loc[selected_indices, sample_col] = 1

    # Write to CSV
    df_benchmark.to_csv(benchmark_file, sep=';', index=False)

In [19]:
# Main function to run the process
def main():
    df_benchmark, df_target =  load_datasets()

    # Define the column weights
    weights = {
        'Debt': 0.20,
        'GR': 0.05,
        'LP': 0.10,        
        'DPD': 0.10,
        'GRDPD': 0.10,
        'GRLP': 0.10,
        'collection': 0.15,
        'payments': 0.10
    }


    # Perform Beam Search with Stochastic Pruning to select the best rows
    target_size = 120

    top_candidates = beam_search_stochastic_pruning(df_benchmark, df_target, weights, target_size)

    # Write the results back to the Excel file, adding a new sample column
    write_results_to_csv(df_benchmark, top_candidates)

In [20]:
if __name__ == "__main__":
    main()

NameError: name 'sampled_indices' is not defined