# In [3]:  (Jupyter notebook cell marker, kept as a comment so the file parses as Python)
import pandas as pd
import numpy as np
from openpyxl import load_workbook

# Function to import the data from the Excel file
def load_datasets(filename='benchmark_target.xlsx'):
    """Load the benchmark and target tables from a single Excel workbook.

    Args:
        filename: Path of the workbook. It must contain sheets named
            'Benchmark' and 'Target'.

    Returns:
        A ``(df_benchmark, df_target)`` tuple of DataFrames, one per sheet.
    """
    # Asking for both sheets in one call parses the workbook once instead of
    # re-opening the file for every sheet; the result is a dict keyed by
    # sheet name.
    sheets = pd.read_excel(filename, sheet_name=['Benchmark', 'Target'])
    return sheets['Benchmark'], sheets['Target']

# Function to calculate the necessary metrics for both datasets
def calculate_metrics(df, target_size, weights):
    """Summarise a dataset into the metrics compared by the search.

    Args:
        df: DataFrame with 'Deb', 'GR', 'DPD', 'LP' and 'Payments' columns.
            'Deb', 'GR', 'DPD' and 'LP' must be averageable; any other
            column (names, ids, flags, ...) is ignored.
        target_size: Unused; kept so existing callers keep working.
        weights: Unused; kept so existing callers keep working.

    Returns:
        A dict with:
            'Deb', 'GR', 'DPD', 'LP': column means with empty cells skipped.
            'collection': total 'Payments' divided by total 'Deb' plus total
                'Payments' (0 when that denominator is 0).
            'payments': share of rows whose 'LP' cell is not null
                (0.0 for an empty frame).
    """
    row_count = len(df)
    if row_count:
        # Share of rows with a non-null 'LP' value (% of clients with payments).
        clients_with_payments = df['LP'].notnull().sum() / row_count
    else:
        # Avoid 0 / 0 on an empty frame.
        clients_with_payments = 0.0

    total_paid = df['Payments'].sum()
    total_debt = df['Deb'].sum()
    denominator = total_debt + total_paid
    collection_percentage = total_paid / denominator if denominator != 0 else 0

    # Average only the columns that are reported. Averaging the whole frame
    # raises TypeError on pandas >= 2.0 as soon as any unrelated column holds
    # non-numeric data, and wastes work on columns nobody reads.
    averages = df[['Deb', 'GR', 'DPD', 'LP']].mean(skipna=True)

    return {
        'Deb': averages['Deb'],
        'GR': averages['GR'],
        'DPD': averages['DPD'],
        'LP': averages['LP'],
        'collection': collection_percentage,
        'payments': clients_with_payments,
    }

# Objective function to calculate the weighted difference from target metrics
def calculate_difference(selected_indices, df_benchmark, target_metrics, weights):
    """Score a candidate selection against the target metrics (lower is better).

    Args:
        selected_indices: Positional row indices into ``df_benchmark``.
        df_benchmark: DataFrame the sample rows are taken from.
        target_metrics: Metric dict as returned by ``calculate_metrics``.
        weights: Mapping of metric name to weight; only these metrics are
            scored.

    Returns:
        The weighted sum of percentage deviations between the sample's
        metrics and the target's. ``inf`` is returned when the score is
        undefined (NaN), so such candidates always rank last.
    """
    sample = df_benchmark.iloc[selected_indices]
    sample_metrics = calculate_metrics(sample, len(selected_indices), weights)

    total = 0.0
    for key, weight in weights.items():
        target = target_metrics[key]
        gap = abs(sample_metrics[key] - target)
        # A relative deviation is undefined for a zero target; fall back to
        # the absolute gap instead of dividing by zero.
        scale = abs(target) if target != 0 else 1
        total += weight * (gap / scale) * 100

    # NaN (e.g. the mean of an all-empty column) compares False against
    # everything and would make the caller's sort order meaningless.
    return float('inf') if np.isnan(total) else total

# Beam Search function to select rows
def beam_search(df_benchmark, df_target, weights, target_size, beam_width=5):
    """Pick rows of ``df_benchmark`` whose metrics best match ``df_target``.

    The selection grows one row per step. After each step only the
    ``beam_width`` lowest-scoring distinct row sets are kept.

    Args:
        df_benchmark: DataFrame to select rows from.
        df_target: DataFrame whose metrics are the goal.
        weights: Mapping of metric name to weight, passed to the scorer.
        target_size: Number of rows to select. Capped at the number of rows
            in ``df_benchmark``.
        beam_width: Number of partial selections kept after each step.

    Returns:
        A list of positional row indices for the best selection found.

    Raises:
        ValueError: If ``beam_width`` is smaller than 1.
    """
    if beam_width < 1:
        raise ValueError('beam_width must be at least 1')

    target_metrics = calculate_metrics(df_target, target_size, weights)
    row_count = len(df_benchmark)
    # There are only row_count distinct rows; asking for more used to empty
    # the beam and crash on beam[0].
    steps = min(target_size, row_count)

    beam = [[]]  # Start with an empty selection
    for _ in range(steps):
        candidates = []
        for partial_selection in beam:
            already_chosen = set(partial_selection)  # O(1) membership tests
            for i in range(row_count):
                if i in already_chosen:
                    continue
                new_selection = partial_selection + [i]
                diff = calculate_difference(new_selection, df_benchmark, target_metrics, weights)
                candidates.append((new_selection, diff))

        candidates.sort(key=lambda x: x[1])

        # Keep the best `beam_width` *distinct* row sets. The same set can be
        # reached in several orders ([a, b] and [b, a]); without this check
        # the beam fills up with permutations of one selection.
        beam = []
        kept_sets = set()
        for selection, _ in candidates:
            members = frozenset(selection)
            if members in kept_sets:
                continue
            kept_sets.add(members)
            beam.append(selection)
            if len(beam) == beam_width:
                break

    # Return the best selection from the final beam
    return beam[0]

# Function to write results to Excel
def write_results_to_excel(benchmark_df, target_df, selected_indices, filename='benchmark_target.xlsx'):
    """Flag the selected rows and write both tables back to the workbook.

    A new 0/1 column named ``'Sample <n>'`` (``n`` = current column count
    minus one) is added to ``benchmark_df`` in place, with 1 on the selected
    rows. The 'Benchmark' and 'Target' sheets of ``filename`` are then
    replaced with the two frames.

    Args:
        benchmark_df: Benchmark DataFrame; modified in place.
        target_df: Target DataFrame, written unchanged.
        selected_indices: Positional row indices of the selected rows.
        filename: Existing workbook to append to.
    """
    # Flag the rows before opening the writer so a failure here never
    # touches the workbook.
    new_sample_col = f'Sample {len(benchmark_df.columns) - 1}'
    benchmark_df[new_sample_col] = 0
    # The indices are positions (the search uses .iloc), so set the flags by
    # position as well; label-based .loc only matches for a default index.
    flag_position = benchmark_df.columns.get_loc(new_sample_col)
    benchmark_df.iloc[selected_indices, flag_position] = 1

    with pd.ExcelWriter(filename, mode='a', if_sheet_exists='replace') as writer:
        benchmark_df.to_excel(writer, sheet_name='Benchmark', index=False)
        target_df.to_excel(writer, sheet_name='Target', index=False)

# Main function to run the process
def main(filename='benchmark_target.xlsx', target_size=300, beam_width=5):
    """Run the whole process: load, search, write the result back.

    Args:
        filename: Workbook that is both read and updated.
        target_size: Number of benchmark rows to select.
        beam_width: Beam width passed to ``beam_search``.
    """
    df_benchmark, df_target = load_datasets(filename)
    # Relative importance of each metric in the score.
    weights = {
        'Deb': 0.2,
        'GR': 0.2,
        'DPD': 0.2,
        'LP': 0.2,
        'collection': 0.1,
        'payments': 0.1,
    }
    selected_indices = beam_search(
        df_benchmark, df_target, weights, target_size, beam_width=beam_width
    )
    # Write to the same workbook that was read, rather than relying on two
    # separately defaulted file names agreeing.
    write_results_to_excel(df_benchmark, df_target, selected_indices, filename)

# Run the process only when this file is executed as a script; importing it
# as a module defines the functions without touching the workbook.
if __name__ == "__main__":
    main()