In [23]:
import pandas as pd
import numpy as np

# Function to import the data from the Excel file
def load_datasets(filename='benchmark_target.xlsx'):
    """Read the 'Benchmark' and 'Target' sheets of an Excel workbook.

    Args:
        filename: Path of the workbook to read.

    Returns:
        A ``(df_benchmark, df_target)`` tuple with one DataFrame per sheet.
    """
    # A list of sheet names makes pandas return a dict keyed by sheet name.
    sheets = pd.read_excel(filename, sheet_name=['Benchmark', 'Target'])
    return sheets['Benchmark'], sheets['Target']

# Function to calculate the necessary metrics for both datasets
def calculate_metrics(df, target_size, weights):
    """Summarise a dataset as a dict of comparison metrics.

    Args:
        df: DataFrame to summarise. Every column is optional.
        target_size: Accepted for interface compatibility; not used here.
        weights: Accepted for interface compatibility; not used here.

    Returns:
        A dict with the keys 'Debt', 'DPD', 'GR', 'LP', 'GRDPD', 'GRLP',
        'collection' and 'payments'. A metric whose source column is absent
        is left as ``None``.
    """
    averaged_columns = ('Debt', 'DPD', 'GR', 'LP', 'GRDPD', 'GRLP')

    # Column means where the column exists, None otherwise.
    metrics = {
        name: (df[name].mean() if name in df.columns else None)
        for name in averaged_columns
    }
    metrics['collection'] = None
    metrics['payments'] = None

    row_count = len(df)

    if 'Payments' in df.columns:
        paid = df['Payments'].sum()
        owed = df['Debt'].sum() if 'Debt' in df.columns else 0
        denominator = owed + paid
        # Share of (debt + payments) that has been paid; 0 when there is nothing at all.
        if denominator != 0:
            metrics['collection'] = paid / denominator
        else:
            metrics['collection'] = 0
        # Fraction of rows that carry a payment value.
        metrics['payments'] = df['Payments'].notnull().sum() / row_count
    elif 'LP' in df.columns:
        # Without a Payments column, a non-null LP is used as the payment indicator.
        metrics['payments'] = df['LP'].notnull().sum() / row_count

    return metrics

# Objective function to minimize the difference between benchmark and target metrics
def objective_function(selected_indices, df_benchmark, df_target, target_metrics, weights):
    """Score a candidate sample by its weighted deviation from the target metrics.

    Args:
        selected_indices: Positional row indices into ``df_benchmark``.
        df_benchmark: DataFrame the sample rows are taken from (via ``.iloc``).
        df_target: Not used here; kept so existing callers keep working.
        target_metrics: Metrics dict of the target dataset, as produced by
            ``calculate_metrics``.
        weights: Mapping of metric name to its weight in the score.

    Returns:
        The sum over all weighted metrics of
        ``weight * |sample - target| / |target| * 100``. Lower is better.
        Metrics that are ``None`` on either side are skipped. When the target
        value is 0 the relative deviation is undefined, so the absolute
        deviation is used for that metric instead.

    Raises:
        KeyError: If ``weights`` names a metric that is missing from either
            metrics dict.
    """
    sample = df_benchmark.iloc[selected_indices]  # Select rows based on indices
    sample_metrics = calculate_metrics(sample, len(selected_indices), weights)

    percentage_diff = 0
    for key, weight in weights.items():
        sample_value = sample_metrics[key]
        target_value = target_metrics[key]

        # calculate_metrics reports None when the source column is absent;
        # such a metric cannot be compared, so it contributes nothing.
        if sample_value is None or target_value is None:
            continue

        deviation = abs(sample_value - target_value)
        # Guard against division by zero: fall back to the absolute deviation.
        if target_value != 0:
            deviation = deviation / abs(target_value)

        percentage_diff += weight * deviation * 100

    return percentage_diff

# Beam Search with Stochastic Pruning
def beam_search_stochastic_pruning(df_benchmark, df_target, weights, target_size, beam_width=50):
    """Pick ``target_size`` benchmark rows whose metrics best match the target.

    The search grows candidates one row at a time. At each step every
    candidate in the beam is extended with up to ``beam_width`` randomly
    sampled unused rows, the extensions are scored with
    ``objective_function``, and the ``beam_width`` lowest-scoring ones are kept.

    Sampling uses ``np.random.choice`` (NumPy's global random state), so
    results differ between runs unless the caller seeds that state.

    Args:
        df_benchmark: DataFrame to select rows from.
        df_target: DataFrame whose metrics the sample should match.
        weights: Mapping of metric name to weight, passed to the objective.
        target_size: Number of rows to select.
        beam_width: Maximum number of candidates kept per step, and maximum
            number of extensions sampled per candidate.

    Returns:
        A list of positional row indices into ``df_benchmark``. The list is
        empty when ``target_size`` is not positive.

    Raises:
        ValueError: If ``target_size`` exceeds the number of benchmark rows,
            or if ``beam_width`` is below 1 while rows must be selected.
    """
    n_rows = len(df_benchmark)
    if target_size > n_rows:
        raise ValueError(
            f"target_size ({target_size}) exceeds the number of benchmark rows ({n_rows})"
        )
    if target_size > 0 and beam_width < 1:
        raise ValueError(f"beam_width must be at least 1, got {beam_width}")

    target_metrics = calculate_metrics(df_target, target_size, weights)
    candidates = [[]]  # Start with a single empty candidate

    for step in range(target_size):
        # Every candidate in the beam holds exactly `step` distinct rows, so
        # the number of unused rows and the sampling weights are the same for all.
        remaining = n_rows - step
        sample_size = min(beam_width, remaining)

        # Weights proportional to 1 / (position + 1) among the unused rows:
        # rows that appear earlier in the benchmark are sampled more often.
        probabilities = 1.0 / np.arange(1, remaining + 1)
        probabilities /= probabilities.sum()

        new_candidates = []
        seen = set()  # Row sets already produced in this step, order-insensitive

        for candidate in candidates:
            chosen = set(candidate)
            available_indices = [i for i in range(n_rows) if i not in chosen]

            sampled_indices = np.random.choice(
                available_indices, size=sample_size, replace=False, p=probabilities
            )

            for idx in sampled_indices:
                idx = int(idx)
                # The objective depends only on which rows are selected, not on
                # their order, so permutations of one row set are duplicates.
                signature = frozenset(chosen | {idx})
                if signature in seen:
                    continue
                seen.add(signature)
                new_candidates.append(candidate + [idx])

        # Keep the beam_width candidates with the lowest objective value.
        new_candidates.sort(
            key=lambda rows: objective_function(rows, df_benchmark, df_target, target_metrics, weights)
        )
        candidates = new_candidates[:beam_width]

    # The beam is sorted by score, so the first entry is the best candidate.
    return candidates[0]

# Function to write results to Excel, updated to take both df_benchmark and df_target
def write_results_to_excel(benchmark_df, target_df, selected_indices, filename='benchmark_target.xlsx'):
    """Flag the selected benchmark rows and write both sheets back to the workbook.

    A new 0/1 column named ``'Sample <n>'`` is added to ``benchmark_df`` in
    place, where ``<n>`` is the column count before the addition minus one.
    The 'Benchmark' and 'Target' sheets of the existing workbook are then
    replaced.

    Args:
        benchmark_df: Benchmark DataFrame; modified in place.
        target_df: Target DataFrame, written unchanged.
        selected_indices: Positional row indices into ``benchmark_df`` to
            flag with 1.
        filename: Path of an existing workbook (opened in append mode).

    Raises:
        IndexError: If a selected index is out of range for ``benchmark_df``.
    """
    new_sample_col = f'Sample {len(benchmark_df.columns) - 1}'  # For a new sample column

    # The indices are positions (the search selects rows with .iloc), so the
    # flags are set positionally rather than through the label-based .loc.
    flags = np.zeros(len(benchmark_df), dtype='int64')
    flags[np.asarray(selected_indices, dtype='int64')] = 1
    benchmark_df[new_sample_col] = flags

    # The data is fully prepared before the workbook is opened for writing.
    with pd.ExcelWriter(filename, mode='a', if_sheet_exists='replace') as writer:
        benchmark_df.to_excel(writer, sheet_name='Benchmark', index=False)
        target_df.to_excel(writer, sheet_name='Target', index=False)

# Main function to run the process
def main():
    """Load the workbook, select a matching benchmark sample and save the result."""
    # Run configuration: relative importance of each metric and the sample size.
    metric_weights = {
        'Debt': 0.2,
        'GR': 0.2,
        'DPD': 0.2,
        'LP': 0.2,
        'collection': 0.1,
        'payments': 0.1,
    }
    sample_size = 95  # Adjust as needed

    benchmark, target = load_datasets('benchmark_target.xlsx')

    # Beam search with stochastic pruning picks the benchmark rows.
    chosen_rows = beam_search_stochastic_pruning(benchmark, target, metric_weights, sample_size)

    # Store the selection as a new sample column in the same workbook.
    write_results_to_excel(benchmark, target, chosen_rows)

# Run the pipeline only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    main()