In [1]:
from queue import Queue
import heapq

In [2]:
class PriorityQueue:
    def __init__(self, maxsize):
        self.maxsize = maxsize
        self.data = []

    def insert_with_priority(self, item, priority):
        heapq.heappush(self.data, (-priority, item))
        if len(self.data) > self.maxsize:
            heapq.heappop(self.data)

    def get_front_element(self):
        if self.data:
            return heapq.heappop(self.data)[1]
        return None

    def __len__(self):
        return len(self.data)

    def is_empty(self):
        return len(self.data) == 0

In [3]:
def beam_search_EMM(dataset, quality_measure, refinement_operator, beam_width, beam_depth, result_set_size, constraints):
    candidateQueue = Queue()
    candidateQueue.put({})
    resultSet = PriorityQueue(result_set_size)

    for level in range(1, beam_depth + 1):
        beam = PriorityQueue(beam_width)
        while not candidateQueue.empty():
            seed = candidateQueue.get()
            desc_set = refinement_operator(seed)
            for desc in desc_set:
                quality = quality_measure(desc, dataset)
                if all(constraint(desc) for constraint in constraints):
                    resultSet.insert_with_priority(desc, quality)
                    beam.insert_with_priority(desc, quality)
        while not beam.is_empty():
            candidateQueue.put(beam.get_front_element())
    return [item for _, item in sorted(resultSet.data, reverse=True)]

In [10]:
import pandas as pd
import numpy as np  

student_data = pd.read_csv("./kaggle/input/studentInfo.csv")

In [8]:
student_data

Unnamed: 0,code_module,code_presentation,id_student,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result
0,AAA,2013J,11391,M,East Anglian Region,HE Qualification,90-100%,55<=,0,240,N,Pass
1,AAA,2013J,28400,F,Scotland,HE Qualification,20-30%,35-55,0,60,N,Pass
2,AAA,2013J,30268,F,North Western Region,A Level or Equivalent,30-40%,35-55,0,60,Y,Withdrawn
3,AAA,2013J,31604,F,South East Region,A Level or Equivalent,50-60%,35-55,0,60,N,Pass
4,AAA,2013J,32885,F,West Midlands Region,Lower Than A Level,50-60%,0-35,0,60,N,Pass
...,...,...,...,...,...,...,...,...,...,...,...,...
32588,GGG,2014J,2640965,F,Wales,Lower Than A Level,10-20,0-35,0,30,N,Fail
32589,GGG,2014J,2645731,F,East Anglian Region,Lower Than A Level,40-50%,35-55,0,30,N,Distinction
32590,GGG,2014J,2648187,F,South Region,A Level or Equivalent,20-30%,0-35,0,30,Y,Pass
32591,GGG,2014J,2679821,F,South East Region,Lower Than A Level,90-100%,35-55,0,30,N,Withdrawn


In [7]:
final_result = student_data["final_result"]

In [13]:
def quality_measure(description, dataset):
    # Example quality measure: proportion of students with 'pass' in the described subset
    if not description:
        return 0
    mask = np.ones(len(dataset), dtype=bool)
    for attr, value in description.items():
        mask &= (dataset[attr] == value)
    subset = dataset[mask]
    if len(subset) == 0:
        return 0
    return np.mean(subset["final_result"] == "pass")

In [15]:
description_columns = ["gender", "age_band", "highest_education"]

results = beam_search_EMM(
    student_data,
    quality_measure,
    lambda seed: [
        {**seed, col: val}
        for col in description_columns
        for val in student_data[col].unique()
        if col not in seed
    ],
    beam_width=5,
    beam_depth=3,
    result_set_size=10,
    constraints=[]
)
results

TypeError: '<' not supported between instances of 'dict' and 'dict'