# In [13]:  (Jupyter notebook cell marker)
import pandas as pd
import numpy as np
import random
from openpyxl import load_workbook

# Read the benchmark and target tables from one Excel workbook
def load_datasets(filename='benchmark_target.xlsx'):
    """Load the ``Benchmark`` and ``Target`` sheets of an Excel workbook.

    Args:
        filename: Path of the workbook to read.

    Returns:
        A ``(df_benchmark, df_target)`` tuple holding one DataFrame per sheet,
        in that order.
    """
    benchmark_frame, target_frame = (
        pd.read_excel(filename, sheet_name=sheet)
        for sheet in ('Benchmark', 'Target')
    )
    return benchmark_frame, target_frame

# Summarise a dataset into the metrics that the sampler compares
def calculate_metrics(df, target_size, weights):
    """Compute the summary metrics of a dataset.

    Args:
        df: DataFrame containing at least the columns ``Deb``, ``GR``,
            ``DPD``, ``LP`` and ``Payments``. Any other columns are ignored.
        target_size: Unused; kept so existing callers keep working.
        weights: Unused; kept so existing callers keep working.

    Returns:
        A dict with the keys:
            ``'Deb'``, ``'GR'``, ``'DPD'``, ``'LP'``: column means, skipping
            empty cells;
            ``'collection'``: total ``Payments`` divided by
            (total ``Deb`` + total ``Payments``), or 0 when that sum is 0;
            ``'payments'``: fraction of rows whose ``LP`` is not null, or 0
            for an empty frame.
    """
    row_count = len(df)
    # Share of rows with a non-null LP (treated as "clients with payments").
    # An empty frame has no such rows, so avoid the 0 / 0 division.
    clients_with_payments = df['LP'].notnull().sum() / row_count if row_count else 0

    total_paid = df['Payments'].sum()  # Total paid
    total_debt = df['Deb'].sum()  # Total debt
    denominator = total_debt + total_paid
    collection_percentage = total_paid / denominator if denominator != 0 else 0  # % of collection

    # Average only the columns that are reported. Averaging the whole frame
    # raises in pandas >= 2.0 as soon as it holds a non-numeric column
    # (e.g. a client name), even though that column is never used.
    averages = {
        column: df[column].mean(skipna=True)
        for column in ('Deb', 'GR', 'DPD', 'LP')
    }

    return {
        'Deb': averages['Deb'],
        'GR': averages['GR'],
        'DPD': averages['DPD'],
        'LP': averages['LP'],  # Average of LP column
        'collection': collection_percentage,  # % of collection
        'payments': clients_with_payments,  # % of clients with payments
    }

# Objective function to minimize the difference between benchmark and target metrics
def objective_function(x, df_benchmark, df_target, target_metrics, weights):
    """Score a candidate sample of benchmark rows; lower is better.

    Args:
        x: Iterable of row positions into ``df_benchmark`` (converted with
            ``int`` and used with ``.iloc``).
        df_benchmark: DataFrame the candidate rows are drawn from.
        df_target: Unused; kept so existing callers keep working.
        target_metrics: Metrics dict of the target dataset, as returned by
            ``calculate_metrics``.
        weights: Mapping of metric name to its weight in the score.

    Returns:
        The weighted sum, over the metrics named in ``weights``, of the
        absolute percentage difference between the sample metric and the
        target metric. Returns ``float('inf')`` when the score is NaN.
    """
    selected_indices = [int(i) for i in x]  # Convert row numbers to integers
    sample = df_benchmark.iloc[selected_indices]  # Select rows based on indices
    sample_metrics = calculate_metrics(sample, len(selected_indices), weights)

    percentage_diff = 0
    for key, weight in weights.items():
        target_value = target_metrics[key]
        gap = abs(sample_metrics[key] - target_value)
        if target_value == 0:
            # A relative gap is undefined for a zero target (and dividing
            # would raise or yield inf/NaN), so fall back to the absolute
            # gap: an exact match still costs nothing.
            relative_gap = gap
        else:
            relative_gap = gap / abs(target_value)
        percentage_diff += weight * relative_gap * 100

    # NaN (e.g. a metric averaged over only empty cells) compares False with
    # everything, which silently breaks sorted()/min() in the caller. Report
    # such a candidate as the worst possible score instead.
    if percentage_diff != percentage_diff:
        return float('inf')
    return percentage_diff

# Genetic Algorithm (GA) with uniqueness constraints for row selection
def genetic_sampling(df_benchmark, df_target, weights, target_size, population_size=100, generations=200, mutation_rate=0.15):
    """Search for ``target_size`` benchmark rows whose metrics match the target.

    Each individual is a list of ``target_size`` distinct row positions of
    ``df_benchmark``. Every generation keeps the two best individuals
    unchanged and fills the rest of the population with children produced by
    tournament selection (up to 5 contestants), crossover and mutation.

    Args:
        df_benchmark: DataFrame the rows are selected from.
        df_target: DataFrame whose metrics the selection should reproduce.
        weights: Mapping of metric name to weight, passed to
            ``objective_function``.
        target_size: Number of rows to select.
        population_size: Number of individuals per generation.
        generations: Number of generations to run.
        mutation_rate: Probability that a child has one index replaced.

    Returns:
        The best individual of the final population: a list of
        ``target_size`` distinct row positions.

    Raises:
        ValueError: If ``target_size`` is not between 1 and the number of
            benchmark rows, or if ``population_size`` is less than 1.
    """
    row_count = len(df_benchmark)
    if not 1 <= target_size <= row_count:
        raise ValueError(
            f'target_size must be between 1 and {row_count}, got {target_size}'
        )
    if population_size < 1:
        raise ValueError(f'population_size must be at least 1, got {population_size}')

    target_metrics = calculate_metrics(df_target, target_size, weights)
    half = target_size // 2
    # A tournament cannot draw more contestants than the population holds.
    tournament_size = min(5, population_size)

    def evaluate(individual):
        """Return the fitness of an individual (lower is better)."""
        score = objective_function(individual, df_benchmark, df_target, target_metrics, weights)
        # NaN is not orderable and would corrupt the ranking; treat it as worst.
        return float('inf') if score != score else score

    def rank(population):
        """Return the population sorted best-first, scoring each individual once."""
        scored = [(evaluate(individual), individual) for individual in population]
        scored.sort(key=lambda pair: pair[0])
        return [individual for _, individual in scored]

    def select_parent(ranked):
        """Tournament selection on an already ranked population."""
        # The population is sorted best-first, so the tournament winner is
        # simply the contestant with the lowest position; no fitness needs
        # to be recomputed.
        contestants = random.sample(range(len(ranked)), tournament_size)
        return ranked[min(contestants)]

    def crossover(parent1, parent2):
        """Build a child from the first half of parent1, topped up from parent2."""
        head = parent1[:half]
        head_genes = set(head)
        child = head + [gene for gene in parent2 if gene not in head_genes]
        return child[:target_size]  # Exactly target_size distinct indices

    def mutate(individual):
        """With probability mutation_rate, swap one index for an unused one."""
        if random.random() < mutation_rate:
            used = set(individual)
            unused = [i for i in range(row_count) if i not in used]
            # When every row is already selected there is nothing to swap in.
            if unused:
                swap_index = random.randrange(target_size)
                individual[swap_index] = random.choice(unused)
        return individual

    # Initial population: random selections of distinct row positions.
    population = [
        random.sample(range(row_count), target_size)
        for _ in range(population_size)
    ]

    for _ in range(generations):
        ranked = rank(population)
        next_population = ranked[:2]  # Elitism: carry over top 2 individuals directly

        while len(next_population) < population_size:
            child = crossover(select_parent(ranked), select_parent(ranked))
            next_population.append(mutate(child))

        population = next_population

    # Return the best solution from the final population
    return min(population, key=evaluate)

# Function to write results to Excel, updated to take both df_benchmark and df_target
def write_results_to_excel(benchmark_df, target_df, selected_indices, filename='benchmark_target.xlsx'):
    """Flag the selected benchmark rows and write both sheets back to Excel.

    A new 0/1 column named ``'Sample <k>'`` (``k`` being the current number
    of benchmark columns minus one) is added to ``benchmark_df`` in place,
    with 1 on the selected rows. The ``Benchmark`` and ``Target`` sheets of
    the existing workbook are then replaced with the two DataFrames.

    Args:
        benchmark_df: Benchmark DataFrame; modified in place.
        target_df: Target DataFrame, written unchanged.
        selected_indices: Row positions (0-based) of the selected rows.
        filename: Path of an existing workbook, opened in append mode.
    """
    new_sample_col = f'Sample {len(benchmark_df.columns) - 1}'  # For a new sample column

    # The sampler addresses rows by position (.iloc), so flag them by position
    # too; label-based .loc only matches when the index is 0..n-1.
    # Building the column before opening the workbook means a bad index fails
    # without the file being touched.
    flags = np.zeros(len(benchmark_df), dtype='int64')
    flags[np.asarray(list(selected_indices), dtype='int64')] = 1
    benchmark_df[new_sample_col] = flags

    with pd.ExcelWriter(filename, mode='a', if_sheet_exists='replace') as writer:
        # Write both DataFrames to separate sheets in the same Excel file
        benchmark_df.to_excel(writer, sheet_name='Benchmark', index=False)
        target_df.to_excel(writer, sheet_name='Target', index=False)

# Entry point: load the workbook, run the sampler and store the selection
def main():
    """Run the whole sampling pipeline on ``benchmark_target.xlsx``.

    Loads the benchmark and target sheets, selects 95 benchmark rows with the
    genetic sampler using fixed metric weights, and writes the selection back
    to the workbook as a new sample column.
    """
    benchmark, target = load_datasets('benchmark_target.xlsx')

    # Metric weights: the four column averages count 0.2 each, the two
    # ratio metrics 0.1 each.
    metric_weights = {
        **dict.fromkeys(('Deb', 'GR', 'DPD', 'LP'), 0.2),
        **dict.fromkeys(('collection', 'payments'), 0.1),
    }

    sample_size = 95  # Adjust the target size as needed
    chosen_rows = genetic_sampling(benchmark, target, metric_weights, sample_size)

    # Persist the result, adding a new sample column to the benchmark sheet
    write_results_to_excel(benchmark, target, chosen_rows)

# Run the pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


# Notebook output captured with the cell: KeyboardInterrupt