In [None]:
import itertools
import pandas as pd
import numpy as np

import pulp

# Reading constraints and preferences

In [None]:
def restructure_voorkeuren(df):
    """
    Restructure the input from wide to long format and do basic cleaning

    Every column of the wide frame is a preference type. Repeated column names
    carry a numeric ``.N`` suffix (e.g. ``Graag met.1``); that suffix becomes
    the ``Nr`` index level, and columns without a suffix get ``Nr == 0``.

    Parameters
    ----------
    df: pd.DataFrame
        Wide frame with one row per leerling and one column per preference

    Returns
    -------
    pd.DataFrame
        A frame with a single ``Waarde`` column; the original index is extended
        with the levels ``Type`` and ``Nr``. Empty cells are dropped.
    """
    long_df = df.melt(ignore_index=False, var_name='Type', value_name='Waarde').dropna()

    # Split "<Type>.<Nr>" on the last dot only. The reindex guarantees that the
    # suffix column exists even when no column name contains a dot at all.
    parts = long_df['Type'].str.rsplit('.', n=1, expand=True).reindex(columns=[0, 1])
    nr = pd.to_numeric(parts[1]).fillna(0).astype(int)

    # Assign raw arrays: the index contains every leerling several times, so
    # label-based alignment must be avoided here.
    long_df = long_df.assign(Type=parts[0].to_numpy(), Nr=nr.to_numpy())
    return long_df.set_index(['Type', 'Nr'], append=True)

In [None]:
# Load the wide preference sheet (first column is the index) and drop the two
# GRAAG_MET columns, then convert what is left to the long format.
leerlingen = (
    pd.read_excel('voorkeuren.xlsx', index_col=0)
    .drop(columns=['GRAAG_MET', 'GRAAG_MET.1'])
)
voorkeuren = restructure_voorkeuren(leerlingen)

# Setting up the problem

In [None]:
# Big-M constant: solve_problem divides a preference count by M so that the
# right-hand side of the "at least i preferences satisfied" constraint stays below 1
M = 1_000_000 # A very big number, so that constraints are never larger than 1
# Subtracted in the same constraint so that its right-hand side is strictly
# negative when exactly i - 1 preferences are satisfied
EPS = 0.01 # A small number to correct for numerical inaccuracies

# Names of the groups the leerlingen can be assigned to (default for solve_problem)
MBGROEPEN = ['Blauw', 'Groen', 'Oranje', 'Geel']

In [None]:
# Largest number of entries any leerling has for a single preference type
n_wishes_max = voorkeuren.groupby(['Leerling', "Type"]).size().max()
# Value of the i-th accommodated preference: halves with every extra preference
preference_value = dict((rank, 0.5 ** rank) for rank in range(1, n_wishes_max + 1))
def get_satisfaction(i, preference_value=preference_value):
    """
    Score the satisfaction of a leerling who gets i preferences granted

    The score is the cumulative value of the 1st up to and including the i-th
    accommodated preference.

    Parameters
    ----------
    i: int
        The nr of accommodated preferences
    preference_value: dict
        Maps k to the value of having the k-th preference accommodated

    Returns
    -------
    The summed value; 0 when i is 0
    """
    total = 0
    for rank in range(1, i + 1):
        total += preference_value[rank]
    return total

# Creating and solving the model

In [None]:
def solve_problem(voorkeuren: pd.DataFrame, leerlingen=leerlingen.index, groepen=MBGROEPEN, max_students_per_group=5, optimize_evenly=True):
    """
    Create and solve a problem to distribute leerlingen over groepen

    Besides its arguments the model reads the module-level names
    ``n_wishes_max``, ``preference_value``, ``M`` and ``EPS``.

    Parameters
    ----------
    voorkeuren: pd.DataFrame
        A DataFrame with a MultiIndex (Leerling, Type, Nr) and a column Waarde.
        Leerling is the name, Type is either "Graag met", "Niet in" or "Liever niet".
        Waarde is either a Leerling or a Group name; in combination with "Niet in"
        only a Group name is allowed. Only the types "Niet in" and "Graag met"
        are used by this model.
    leerlingen: Iterable
        An Iterable containing all names of the leerlingen
    groepen: Iterable
        An Iterable containing all names of the groepen
    max_students_per_group: int (default = 5)
        The number of leerlingen that can go to the same group
    optimize_evenly: bool (default = True)
        Whether to optimize evenly across leerlingen (so getting 1 preference for
        each student is more valuable than 3 for one student and 0 for someone
        else). The alternative is accommodating as many preferences as possible,
        without regard to whose preferences they are.

    Returns
    -------
        pulp.LpProblem - the solved problem

    Raises
    ------
    RuntimeError
        If problem is unsolvable

    """
    # Both collections are traversed several times, so materialise them once
    leerlingen = list(leerlingen)
    groepen = list(groepen)

    prob = pulp.LpProblem('leerlingindeling', pulp.LpMaximize)

    # Binary representations make more sense than a single integer per student which is basically an index
    in_group = pulp.LpVariable.dicts('group', itertools.product(leerlingen, groepen), cat='Binary')

    # Every student must be in exactly one group
    for ll in leerlingen:
        prob += pulp.lpSum([in_group[(ll, gr)] for gr in groepen]) == 1

    # Every group can hold at most max_students_per_group students (no cliques)
    for gr in groepen:
        prob += pulp.lpSum([in_group[(ll, gr)] for ll in leerlingen]) <= max_students_per_group

    # Some students can not move into certain groups (e.g. a brother/sister is already there)
    for i, row in voorkeuren.query('Type == "Niet in"').iterrows():
        ll, _, _ = i
        gr = row['Waarde']
        prob += in_group[(ll, gr)] == 0

    # Now it's really starting: who prefers to be with whom - this we want to optimize
    graag_met = voorkeuren.xs('Graag met', level='Type')
    satisfied = pulp.LpVariable.dicts("Satisfied", graag_met.index.to_list(), cat="Binary")
    # Helper variables: per group, are both students on the same side (both in or both out)?
    pref_per_group = [(ll, nr, gr) for ll, nr in graag_met.index for gr in groepen]
    satisfied_per_group = pulp.LpVariable.dicts("Satisfied_per_group", pref_per_group, cat="Binary")
    for i, row in graag_met.iterrows():
        ll, nr = i
        if row['Waarde'] not in groepen:
            # The preference names another student
            other_ll = row['Waarde']
            for gr in groepen:
                # Matching preferences are an XNOR problem, see https://yetanothermathprogrammingconsultant.blogspot.com/2022/06/xnor-as-linear-inequalities.html
                prob += satisfied_per_group[(ll, nr, gr)] >= 1 - in_group[(ll, gr)] - in_group[(other_ll, gr)]  # Neither is in this group ==> satisfied = 1
                prob += satisfied_per_group[(ll, nr, gr)] <= 1 + in_group[(ll, gr)] - in_group[(other_ll, gr)]  # ll not in group, other is ==> satisfied = 0
                prob += satisfied_per_group[(ll, nr, gr)] <= 1 - in_group[(ll, gr)] + in_group[(other_ll, gr)]  # ll in group, other is not ==> satisfied = 0
                prob += satisfied_per_group[(ll, nr, gr)] >= in_group[(ll, gr)] + in_group[(other_ll, gr)] - 1  # Both are in this group ==> satisfied = 1

                # The total preference is only satisfied if it is at least correct for this group
                # AND definition: see https://yetanothermathprogrammingconsultant.blogspot.com/2022/06/xnor-as-linear-inequalities.html
                prob += satisfied[i] <= satisfied_per_group[(ll, nr, gr)]
            # The preference is satisfied if it is correct for every group
            prob += satisfied[i] >= pulp.lpSum([satisfied_per_group[(ll, nr, gr)] for gr in groepen]) - len(groepen) + 1
        else:
            # The preference names a group: satisfied exactly when ll is placed in it.
            # Both directions are stated so the value does not depend on the objective.
            gr = row['Waarde']
            prob += in_group[(ll, gr)] >= satisfied[i]
            prob += satisfied[i] >= in_group[(ll, gr)]

    # Collect the preference variables per student; this does not rely on the
    # preference numbers being contiguous
    prefs_per_ll = {}
    for (pref_ll, _), var in satisfied.items():
        prefs_per_ll.setdefault(pref_ll, []).append(var)

    # We do not want to optimize the number of matches: at least 1 match for a student is more valuable than the third
    satisfaction_per_ll = {}
    # Per ll whether at least i preferences are satisfied
    n_satisfied_per_ll = pulp.LpVariable.dicts("llassignedprefs", itertools.product(leerlingen, range(1, n_wishes_max + 1)), cat='Binary')
    for ll in leerlingen:
        n_satisfied = pulp.lpSum(prefs_per_ll.get(ll, []))

        for i in range(1, n_wishes_max + 1):
            # The first constraint forces the indicator to 0 if less than `i` preferences are satisfied
            prob += n_satisfied_per_ll[(ll, i)] <= n_satisfied / i  # The indicator is binary, so a right-hand side below 1 means 0
            # The second constraint forces the indicator to 1 if at least `i` preferences are satisfied
            prob += n_satisfied_per_ll[(ll, i)] >= (n_satisfied - (i - 1) - EPS) / M  # M ensures the right-hand side is never larger than 1

        satisfaction_per_ll[ll] = sum(val * n_satisfied_per_ll[(ll, i)] for i, val in preference_value.items())

    if optimize_evenly:
        prob += pulp.lpSum(list(satisfaction_per_ll.values()))
    else:
        prob += pulp.lpSum(list(satisfied.values()))

    prob.solve()
    if pulp.LpStatus[prob.status] != 'Optimal':
        raise RuntimeError(f'Could not solve LP-problem, status {pulp.LpStatus[prob.status]!r}')
    return prob

In [None]:
# Solve with the defaults of solve_problem: at most 5 leerlingen per group and
# satisfaction spread evenly over the leerlingen
prob = solve_problem(voorkeuren)

In [None]:
# Ad-hoc inspection: show the class of the first variable of the solved problem
type(prob.variables()[0])

# Get the outcome

In [None]:
def get_outcome(vars_) -> pd.DataFrame:
    """
    Restructure the Problem Variables in a nice DataFrame

    Only variables whose name starts with ``group`` and whose value is 1
    (after rounding, to absorb solver tolerance) are kept.

    Parameters
    ----------
    vars_: list of pulp.LpVariables
        The result of prob.variables()

    Returns
    -------
    pd.DataFrame
        One row per selected variable with the columns ``Naam`` and ``Group``,
        both parsed from the variable name
    """
    chosen_groups = []
    for var in vars_:
        if not var.name.startswith('group'):
            continue
        value = var.value()
        # Solvers may report 0.9999999 for a binary that is 1; None means "no value"
        if value is not None and round(value) == 1:
            chosen_groups.append(var.name)

    # Explicit dtype keeps the .str accessor available when nothing was selected
    outcome = pd.Series(chosen_groups, dtype='object').str.extract(r"group_\('(.*)',_'(.*)'\)")
    outcome.columns = ['Naam', 'Group']
    return outcome

def display_outcome_nicely(df):
    """
    Transform DataFrame so that leerlingen are grouped by the group in which they are placed

    Parameters
    ----------
    df: pd.DataFrame
        A frame with the columns ``Naam`` and ``Group``

    Returns
    -------
    pd.DataFrame
        One column per group listing its leerlingen; the rows are numbered from 1
        and shorter groups are padded with empty strings
    """
    # Number the leerlingen within each group so they can be laid out as rows
    numbered = df.assign(nr=lambda frame: frame.groupby('Group').cumcount().add(1))
    return (numbered.set_index(['Group', 'nr'])
                    ['Naam']
                    .unstack('Group', fill_value='')
            )

# Extract the chosen group per leerling and show it as one column per group
outcome = get_outcome(prob.variables())
display_outcome_nicely(outcome)

# Check solution

In [None]:
def calculate_satisfied_constraints(vars_):
    """
    Calculate which constraints and for whom are accommodated

    Only variables whose name starts with ``Satisfied`` and does not contain
    ``per_group`` are used.

    Parameters
    ----------
    vars_: list of pulp.LpVariables
        The result of prob.variables()

    Returns
    -------
    pd.DataFrame
        A frame with one nullable-boolean column ``Satisfied`` and a MultiIndex
        (ll, Nr) parsed from the variable names; ``Nr`` is kept as text
    """
    constraints = {v.name: v.value() for v in vars_ if v.name.startswith('Satisfied') and 'per_group' not in v.name}
    # Round first: a solver may report 0.9999999 for a binary, which cannot be
    # cast to boolean directly
    satisfied_constraints = (pd.Series(constraints, dtype='float64')
                             .round()
                             .astype('boolean')
                             .to_frame(name='Satisfied')
                             )
    ix = (satisfied_constraints.index.to_series()
            .str.extract(r"Satisfied_\('(?P<ll>.*)',_(?P<Nr>.*)\)")
            .set_index(['ll', 'Nr'])
            .index
            )

    satisfied_constraints.index = ix
    return satisfied_constraints


def calculate_performance_per_leerling(satisfied_constraints):
    """
    Summarise per leerling how well the preferences were accommodated

    A higher score means that more of the preferences were accommodated.

    Parameters
    ----------
    satisfied_constraints: pd.DataFrame
        The output of calculate_satisfied_constraints

    Returns
    -------
    pd.DataFrame
        One row per leerling with the number of preferences, the number and
        fraction that were accommodated, and the reached satisfaction relative
        to the satisfaction of having every preference accommodated
    """
    satisfied_by_ll = satisfied_constraints.groupby('ll')['Satisfied']
    summary = satisfied_by_ll.agg(NrPreferences = 'count',
                                  AccountedPreferences = 'sum',
                                  PctAccounted = 'mean')

    # Score when every listed preference would be granted vs. the score reached
    max_scores = np.array([get_satisfaction(n_listed) for n_listed in summary['NrPreferences']])
    reached_scores = np.array([get_satisfaction(n_granted) for n_granted in summary['AccountedPreferences']])

    return summary.assign(RelativeSatisfaction = reached_scores / max_scores)

def calculate_solution_performance(ll_performance):
    """
    Calculate the performance of the general model

    Parameters
    ----------
    ll_performance: pd.DataFrame
        The output of calculate_performance_per_leerling

    Returns
    -------
    dict
        With the keys ``NrPreferences`` and ``AccountedPreferences`` (totals over
        all leerlingen), ``PctAccountedPReferences`` (their ratio) and
        ``Satisfaction`` (reached satisfaction divided by the satisfaction of
        granting every leerling all preferences)
    """
    n_preferences = ll_performance['NrPreferences'].sum()
    n_accounted = ll_performance['AccountedPreferences'].sum()

    # Summing the best possible score per leerling equals weighting each score
    # by the number of leerlingen with that many preferences, and does not rely
    # on the column naming of value_counts (which differs between pandas versions)
    max_attainable_satisfaction = ll_performance['NrPreferences'].map(get_satisfaction).sum()
    reached_satisfaction = ll_performance['AccountedPreferences'].map(get_satisfaction).sum()

    nan = float('nan')
    return {
        'NrPreferences': n_preferences,
        'AccountedPreferences': n_accounted,
        # The key spelling is kept as-is: callers and result tables rely on it
        'PctAccountedPReferences': n_accounted / n_preferences if n_preferences else nan,
        'Satisfaction': reached_satisfaction / max_attainable_satisfaction if max_attainable_satisfaction else nan,
    }

# Evaluate the solved problem: per preference, per leerling and overall
satisfied_constraints = calculate_satisfied_constraints(prob.variables())
ll_performance = satisfied_constraints.pipe(calculate_performance_per_leerling)
solution_performance = ll_performance.pipe(calculate_solution_performance)
print(solution_performance)
display(ll_performance)

# Analysis

In [None]:
def get_solution_performance(voorkeuren, n_students_max, optimize_evenly):
    """
    Convenience function which combines problem solving to get all metrics

    Parameters
    ----------
    voorkeuren: pd.DataFrame
        The preferences, passed on to solve_problem
    n_students_max: int
        Passed to solve_problem as max_students_per_group
    optimize_evenly: bool
        Passed to solve_problem as optimize_evenly

    Returns
    -------
    The output of calculate_solution_performance for the solved problem
    """
    prob = solve_problem(voorkeuren, max_students_per_group=n_students_max, optimize_evenly=optimize_evenly)
    satisfied_constraints = calculate_satisfied_constraints(prob.variables())
    ll_performance = calculate_performance_per_leerling(satisfied_constraints)
    return calculate_solution_performance(ll_performance)

# Solve once per combination of objective and maximum group size (4, 5 and 6)
solution_performance_overview = {}
for optimize_evenly, n_students_max in itertools.product((True, False), range(4, 7)):
    solution_performance_overview[(optimize_evenly, n_students_max)] = get_solution_performance(voorkeuren, n_students_max, optimize_evenly)

In [None]:
# One row per (optimize_evenly, n_students_max) combination, one column per metric
pd.DataFrame.from_dict(solution_performance_overview, orient='index')

Optimaliseer naar leerlingtevredenheid:
* Max 6 per groep: 23 wensen vervuld, 86.2% (niemand naar Oranje)
* Max 5 per groep: 22 wensen vervuld, 85.0% (Ro 3e wens niet)
* Max 4 per groep: 21 wensen vervuld, 82.6% 

Optimaliseer naar aantal wensen vervuld:
* Max 6 per groep: 25 wensen vervuld, 83.8% (El, Su geen wensen)
* Max 5 per groep: 23 wensen vervuld, 80.3% (El, Fl geen wensen)
* Max 4 per groep: 21 wensen vervuld, 79.1% (Fl geen wensen)