In [1]:
# 1. import packages

import pandas as pd
import numpy as np
#from pulp import *
import pulp

In [2]:
# load the student and school input tables from the data folder
students_csv = 'data/example_subject_students.csv'
schools_csv = 'data/example_subject_schools.csv'
students_df = pd.read_csv(students_csv)
schools_df = pd.read_csv(schools_csv)


In [None]:
# do pre-processing
# from https://github.com/UCL/ioe-student-school-allocation
# install using pip install -e .
# version pip-23.1.2 or greater needed
# upgrade pip using
# pip install --upgrade pip

# you will also need a TfL API key, available from https://api-portal.tfl.gov.uk/
# the key is set on the line `export TFL_APP_KEY=` in .envrc_sample
# Copy the sample file with `cp .envrc_sample .envrc`, put your key in .envrc, run `source .envrc`, and then re-run.
# You can check that it has worked by running `echo $TFL_APP_KEY`.
# Also run `export N_CORES=1` (use a larger value to run on more cores; see the timings below).
# You will need to delete example_subject_student_school_journeys.csv and example_subject_student_school_failures.csv first, because existing files are not overwritten automatically.

# run using
#tfl data example_subject

# took about 25 min on 1 core for 10 students, 70 schools, 3 failures
# took about 14 min on 4 cores for 20 students, 70 schools
# took about 12 min on 8 cores for 20 students, 70 schools, 10 failures
# took about 6 min on 16 cores for 19 students, 70 schools, 2 failures

# it is recommended that you use the provided data file example_subject_student_school_journeys.csv for this example


In [3]:
# load the provided student-to-school journeys table
journeys_csv = 'data/example_subject_student_school_journeys.csv'
example_subject_time = pd.read_csv(journeys_csv)

In [4]:
# Build the student-by-school cost matrix from the long-format journeys table.
# Rows are keyed by the "student" column, columns by the "school" column, and
# the cells hold the "time" values; missing cells are filled with 10000.
# NOTE(review): sort=False is passed so the labels are not sorted. Later cells
# appear to assume the row/column order lines up with the cleaned student and
# school dataframes -- confirm.
journey_pivot = example_subject_time.pivot_table(
    values="time",
    index="student",
    columns="school",
    fill_value=10000,
    sort=False,
)
# keep only the integer values; the student/school labels are dropped here
example_subject_time_table = journey_pivot.astype(int).values

In [5]:
# 2. clean data for the model

# define the function to clean school and student dataframe
# to only keep the students and schools which have successful journeys
def data_clean(df, id_col, time_col, time):
    """Keep only the rows of ``df`` whose ID appears in the journeys table.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to filter (the schools table or the students table).
    id_col : str
        Name of the column in ``df`` that holds the IDs.
    time_col : str
        Name of the column in ``time`` that holds the matching IDs.
    time : pandas.DataFrame
        Journeys table; only IDs present in ``time[time_col]`` are kept.

    Returns
    -------
    pandas.DataFrame
        The filtered rows of ``df`` with a fresh 0..n-1 index. ``df`` itself
        is not modified and all of its columns are preserved.
    """
    ids_to_remove = set(df[id_col]) - set(time[time_col].unique())
    keep_mask = ~df[id_col].isin(ids_to_remove)
    # drop=True discards the old index directly. The previous
    # reset_index().drop('index', axis=1) failed with a KeyError when the index
    # was named, and silently removed a real column called 'index' if one existed.
    return df[keep_mask].reset_index(drop=True)

# filter both tables down to the IDs that occur in the journeys table
students_df_clean = data_clean(
    df=students_df,
    id_col='ST: ID',
    time_col='student',
    time=example_subject_time,
)
schools_df_clean = data_clean(
    df=schools_df,
    id_col='SE2 PP: Code',
    time_col='school',
    time=example_subject_time,
)

In [6]:
# check that the cleaned tables and the cost matrix have matching sizes
# (rows of the matrix = students, columns = schools)
# NOTE(review): only the sizes are compared here; this does not verify that the
# row/column ORDER of the matrix matches the row order of the cleaned tables.
# Reading .shape (instead of indexing row 0) also works when the matrix has no rows.
n_matrix_students, n_matrix_schools = example_subject_time_table.shape
assert len(schools_df_clean) == n_matrix_schools, (
    f"{len(schools_df_clean)} cleaned schools but {n_matrix_schools} columns in the cost matrix"
)
assert len(students_df_clean) == n_matrix_students, (
    f"{len(students_df_clean)} cleaned students but {n_matrix_students} rows in the cost matrix"
)

In [None]:
# use spopt code
# Remember: for now we need to use the version of spopt that includes Rongbo's developments,
# because they have not been included in the main spopt package yet
# To install the package you can do: `python3 -m pip install spopt@git+https://github.com/rongboxu/spopt`.
# If you are running this in a Jupyter notebook, remember to run the install in a terminal inside the correct environment (notebook environment / docker image) and then restart the kernel

In [7]:
# Print the installed spopt version so it can be checked before running the model.
# if the version of spopt is 0.5.0 this is not the correct version
# the version should be 0+untagged.952.g67aaaff or similar
# NOTE(review): presumably 0.5.0 lacks the predefined_facilities_arr /
# facility_capacities / fulfill_predefined_fac arguments that the model cell
# passes to PMedian.from_cost_matrix -- TODO confirm.
import spopt
print(spopt.__version__)



0.5.0


In [8]:
from spopt.locate import PMedian

In [9]:
# prepare the model inputs

# a. demand of each demand point (student): in the IOE case every student counts as 1
n_students = len(students_df_clean)
demand = np.ones(n_students)

# b. predefined facilities: index labels of the priority 1 schools
# note that the name of the priority column varies by subject, e.g. 'MAT priority' for maths
is_priority_1 = schools_df_clean['MAT priority'] == 1
schools_priority_1 = schools_df_clean.loc[is_priority_1].index.tolist()
schools_priority_1_arr = np.array(schools_priority_1)

# c. facility capacities, taken from the 'Count' column of the schools table
capacities_arr = np.array(schools_df_clean['Count'])

In [11]:
# run the model


# if you get this error:
# Problem is infeasible. The predefined facilities can't be 
# fulfilled, because their capacity is larger than the total 
# demand 10.0.
# This happens when the predefined facilities (the priority 1 schools in this notebook) have more total capacity than there are students who need placements. You need more students, or fewer priority schools (or less capacity at them).
                        
# fulfill_predefined_fac must be True: it is used to guarantee that the priority 1 schools will be fulfilled
# pick the CBC solver shipped with pulp
solver = pulp.PULP_CBC_CMD()

# keyword options for the p-median model: number of facilities to site,
# the predefined (priority 1) schools, the school capacities, and the flag
# requiring the predefined schools to be fulfilled
model_options = dict(
    p_facilities=len(students_df_clean),
    predefined_facilities_arr=schools_priority_1_arr,
    facility_capacities=capacities_arr,
    fulfill_predefined_fac=True,
)

# build the model from the student-by-school cost matrix and the demand vector
pmedian_from_cost_matrix = PMedian.from_cost_matrix(
    example_subject_time_table,
    demand,
    **model_options,
)

# solve it; the result is bound back to the same name, which the later cells read
pmedian_from_cost_matrix = pmedian_from_cost_matrix.solve(solver)

In [13]:
# build the match result: one allocated school code per student
# (nothing is written to disk in this cell)

# Work on a copy so that students_df_clean is not modified as a side effect;
# a plain assignment would only create a second name for the same dataframe.
match_df = students_df_clean.copy()

# cli2fac[i] holds the school index/indices assigned to student i; the first
# entry is used, as in the original loop.
allocated_school_rows = []
for i in range(len(match_df)):
    assigned = pmedian_from_cost_matrix.cli2fac[i]
    if len(assigned) == 0:
        # fail with a clear message instead of an opaque IndexError on assigned[0]
        raise ValueError(f"student at row {i} was not allocated to any school")
    allocated_school_rows.append(assigned[0])

# Assign the whole column in one step. Filling a new column cell by cell via
# .loc first creates it full of NaN, which can turn integer school codes into floats.
# .values strips the school index so the codes are assigned by position.
match_df['allocation_school_id'] = (
    schools_df_clean.loc[allocated_school_rows, 'SE2 PP: Code'].values
)

In [49]:
# match_df.to_csv('./data/example_subject_matches.csv')

In [55]:
# run the map creation Python script for the "example_subject" data set
# the html file of the map will appear in the 'plot' folder
# This has been run once already, so both the match file and the map html already exist
# NOTE(review): the script presumably reads the saved match file; the to_csv call
# in the previous cell is commented out, so confirm that file exists before running.
!python scripts/create_allocation_map.py "example_subject"