C = set of points we want to cluster
metric space (X,d)
integer parameter k
F ⊆ X = set of possible cluster center locations
F and C do not need to be disjoint
for a set S ⊆ X and a point x ∈ X: d(x,S) = min over y ∈ S of d(x,y), i.e. the distance from x to its closest point in S
[n] = the set {1, 2, 3, ..., n}

l = number of groups of C (sensitive groups)
Δ (Delta) = maximum number of groups a point can be a part of

we also have 2 fairness vectors alpha, beta ∈ [0,1]^l (one upper bound alpha_i and one lower bound beta_i per group i)

a cluster assignment has to satisfy these 2 additional fairness constraints for every cluster and every group i:
the number of elements of group i in the cluster has to be at most alpha_i * (number of all elements of the cluster)
the number of elements of group i in the cluster has to be at least beta_i * (number of all elements of the cluster)

In [2]:
from helper import data_loader, k_center
from cplex import Cplex
from scipy.spatial.distance import cdist
import numpy as np
import time

In [3]:
# Build the project data loader from the dataset configuration file and read
# the parsed configuration; `config` is indexed by dataset name in later cells.
loader = data_loader.data_loader('./config/dataset_config.json')
config = loader.get_config()


# Key of the dataset to work on; used to index `config` and passed to the loader calls.
dataset_name = "adult"
# load_dataset returns two values; the second one is discarded here.
X,_ = loader.load_dataset(dataset_name)
# NOTE(review): prepared_dataset is not referenced again in the visible cells —
# confirm whether this call is still needed.
prepared_dataset = loader.prepare_dataset(X, dataset_name, complexity="extended")

In [4]:
# Number of clusters; passed to kCenterInstance.fit(...) in the vanilla clustering cell.
k = 5

In [5]:
# Names of the sensitive attribute columns and, per column, the list of
# values that are kept (positions in these lists become the integer codes).
sensitive_columns = config[dataset_name]['sensitive_column']
sensitive_values = config[dataset_name]['sensitive_values']

# Restrict the raw frame to the sensitive columns plus the distance columns.
dataset = X[sensitive_columns + config[dataset_name]['distance_columns']]

# Keep only rows whose sensitive value is one of the configured values.
for i, column in enumerate(sensitive_columns):
    dataset = dataset[dataset[column].isin(sensitive_values[i])]

# Take an explicit copy before writing columns: assigning into a filtered
# selection is what triggers pandas' SettingWithCopyWarning.
dataset = dataset.copy()

# Replace every sensitive value by its position in the configured value list.
for i, column in enumerate(sensitive_columns):
    value_to_code = {}
    for code, value in enumerate(sensitive_values[i]):
        # First occurrence wins, which matches list.index() on duplicates.
        value_to_code.setdefault(value, code)
    # A single dict lookup per row instead of a linear list.index() scan.
    dataset[column] = dataset[column].map(value_to_code)

dataset

Unnamed: 0,sex,race,age,fnlwgt,education-num,capital-gain,hours-per-week
0,0,0,39,77516,13,2174,40
1,0,0,50,83311,13,0,13
2,0,0,38,215646,9,0,40
3,0,4,53,234721,7,0,40
4,1,4,28,338409,13,0,40
...,...,...,...,...,...,...,...
48837,1,0,39,215419,13,0,36
48838,0,4,64,321403,9,0,40
48839,0,0,38,374983,13,0,50
48840,0,1,44,83891,13,5455,40


In [7]:
#sample dataset
# Draw the working sample through the project loader; all later cells operate on it.
# NOTE(review): 42 is presumably a random seed — confirm against data_loader.sample_data.

sampled_data = loader.sample_data(dataset, dataset_name, 42)
sampled_data

Unnamed: 0,sex,race,age,fnlwgt,education-num,capital-gain,hours-per-week
7762,0,0,18,423024,9,0,20
23881,1,0,17,178953,8,0,20
30507,0,4,25,348986,9,0,40
28911,1,0,20,218215,10,0,30
19484,0,2,47,244025,9,0,56
...,...,...,...,...,...,...,...
14475,1,4,17,222618,7,0,30
4817,1,0,27,116531,13,0,40
5553,0,0,18,703067,7,0,20
36869,0,0,22,310197,4,0,40


In [11]:
#do vanilla clustering
# Run the project's k-center implementation on the sample with k clusters,
# without any fairness constraints. The keys of the returned cluster_mapping
# are used as the cluster centers in a later cell.
kCenterInstance = k_center.k_center(dataset_name, loader)
kCenterInstance.fit(sampled_data, k)
costs, cluster_mapping = kCenterInstance.get_results()

In [105]:
# colormapping_dict:           attribute -> one list per color class holding the
#                              positions (within sampled_data) of the points of that class
# colormapping_per_point_dict: attribute -> color class of every point, in row order
colormapping_dict = {}
colormapping_per_point_dict = {}

for sens_attr_idx, sensitive_attribute in enumerate(sensitive_columns):
    colorlist = sampled_data[sensitive_attribute].tolist()
    # One color class per configured sensitive value of this attribute.
    num_color_classes = len(config[dataset_name]['sensitive_values'][sens_attr_idx])

    colormapping_dict[sensitive_attribute] = [
        [position for position, color in enumerate(colorlist) if color == color_class]
        for color_class in range(num_color_classes)
    ]
    colormapping_per_point_dict[sensitive_attribute] = colorlist

In [108]:
#get balances
# For every sensitive attribute: the share of the sample that falls into each
# color class (class size divided by the number of sampled points).
sample_size = len(sampled_data)
balances = {
    sensitive_attribute: [len(members) / sample_size for members in color_classes]
    for sensitive_attribute, color_classes in colormapping_dict.items()
}

{'sex': [0.6533333333333333, 0.3466666666666667],
 'race': [0.865,
  0.04,
  0.008333333333333333,
  0.0033333333333333335,
  0.08333333333333333]}

In [125]:
#get alpha beta
deltas = [0.2, 0.2]


def fairness_bounds(balances, delta):
    """Derive the per-class fairness bounds from the class shares.

    Parameters
    ----------
    balances : dict
        attribute -> list with the share of each color class in the sample.
    delta : float
        Relaxation parameter. Upper bounds are share / (1 - delta) and
        lower bounds are share * (1 - delta).

    Returns
    -------
    (alpha, beta) : tuple of dict
        Each maps attribute -> {color class index: bound}; alpha holds the
        upper bounds, beta the lower bounds.
    """
    alpha_val, beta_val = 1 / (1 - delta), 1 - delta
    alpha = {
        attr: {color_idx: alpha_val * share for color_idx, share in enumerate(shares)}
        for attr, shares in balances.items()
    }
    beta = {
        attr: {color_idx: beta_val * share for color_idx, share in enumerate(shares)}
        for attr, shares in balances.items()
    }
    return alpha, beta


# Keep the bounds of every delta instead of overwriting them on each pass.
bounds_per_delta = []
for delta in deltas:
    bounds_per_delta.append(fairness_bounds(balances, delta))

# The LP cells consume a single (alpha, beta) pair. As before, that is the
# pair belonging to the last delta in `deltas`.
alpha, beta = bounds_per_delta[-1]

In [None]:
#get cluster centers
# The keys of cluster_mapping are taken as the chosen centers and the matching
# rows of the sample are selected.
# NOTE(review): .iloc selects by position, while the displayed sample keeps its
# original index labels (e.g. 7762). This assumes the keys are row positions in
# sampled_data, not index labels — confirm against k_center.get_results().
cluster_centers = list(cluster_mapping.keys())
centers = sampled_data.iloc[cluster_centers]

In [51]:
# One LP variable x_j_i per (point j, center i) pair, laid out point-major so
# that the names line up with the row-major flattening of the cost matrix below.
num_points, num_centers = len(sampled_data), len(centers)
total_variables = num_points * num_centers

variable_names = [f"x_{j}_{i}" for j in range(num_points) for i in range(num_centers)]

# Every variable is bounded to the interval [0, 1].
lower_bounds = [0] * total_variables
upper_bounds = [1] * total_variables

# Euclidean distance of every sampled point to every center, then flattened
# into the objective coefficients.
# NOTE(review): the distance is taken over all columns of sampled_data, which
# per the displayed frame include the encoded sensitive columns — confirm this
# is intended.
# NOTE(review): `costs` rebinds the name returned by kCenterInstance.get_results().
costs_per_point = cdist(sampled_data.values, centers.values, 'euclidean')
costs = costs_per_point.ravel().tolist()

In [54]:
#requires dataset, centers, number of clusters alpha beta
# Set up the CPLEX model: a minimisation problem with one variable per entry of
# variable_names, objective coefficient taken from `costs` and bounds taken
# from lower_bounds / upper_bounds.

problemSolver = Cplex()
problemSolver.objective.set_sense(problemSolver.objective.sense.minimize)
problemSolver.variables.add(obj=costs, lb=lower_bounds, ub=upper_bounds, names=variable_names)


range(0, 3000)

In [148]:
def add_color_constraint(color_mapping_per_point, var_beta, var_alpha, n_points=None, n_centers=None):
    """Build the fairness constraint rows of one sensitive attribute.

    For every center i and every color class two rows over the variables
    x_j_i (j = point index) are produced, each with right-hand side 0:

    * beta row:  coefficient ``beta - 1`` for points of the class and ``beta``
      for all other points.
    * alpha row: coefficient ``round(1 - alpha, 3)`` for points of the class
      and ``-alpha`` for all other points.

    Parameters
    ----------
    color_mapping_per_point : sequence
        Color class of every point, indexed by point position.
    var_beta : dict
        color class -> lower bound. Its keys define the classes (and their
        order) for which rows are generated.
    var_alpha : dict
        color class -> upper bound. Must contain every key of ``var_beta``.
    n_points, n_centers : int, optional
        Problem dimensions. When omitted, the notebook globals ``num_points``
        and ``num_centers`` are used, as before.

    Returns
    -------
    (constraints, rhs) : tuple
        ``constraints`` is a list of ``[variable_names, coefficients]`` pairs,
        all beta rows first (center-major, then class), followed by all alpha
        rows in the same order. ``rhs`` is a list of zeros, one per row.
    """
    if n_points is None:
        n_points = num_points
    if n_centers is None:
        n_centers = num_centers

    colors = list(var_beta)

    def row_variables(center):
        # Names of the assignment variables of all points for one center.
        return ["x_{}_{}".format(j, center) for j in range(n_points)]

    beta_constraints = [
        [row_variables(i),
         [var_beta[color] - 1 if color_mapping_per_point[j] == color else var_beta[color]
          for j in range(n_points)]]
        for i in range(n_centers) for color in colors
    ]

    # NOTE(review): only the in-class coefficient is rounded to 3 decimals while
    # the out-of-class coefficient is not; kept as in the original — confirm intent.
    alpha_constraints = [
        [row_variables(i),
         [np.round(1 - var_alpha[color], decimals=3) if color_mapping_per_point[j] == color
          else (-1) * var_alpha[color]
          for j in range(n_points)]]
        for i in range(n_centers) for color in colors
    ]

    constraints = beta_constraints + alpha_constraints
    # One zero per generated row, so rhs can never disagree with the row count.
    rhs = [0] * len(constraints)

    return constraints, rhs

In [145]:
num_points

600

In [149]:
# Assignment rows: for every point j the variables x_j_i over all centers i,
# each with coefficient 1 and right-hand side 1.
constraints_row = [
    [[f"x_{j}_{i}" for i in range(num_centers)], [1] * num_centers]
    for j in range(num_points)
]
print(len(constraints_row))
rhs = [1] * num_points
sum_const_len = len(rhs)

# Append the fairness rows of every sensitive attribute.
for var, color_mapping_per_point in colormapping_per_point_dict.items():
    var_alpha, var_beta = alpha[var], beta[var]
    color_constraint, color_rhs = add_color_constraint(color_mapping_per_point, var_beta, var_alpha)
    constraints_row += color_constraint
    rhs += color_rhs

# Assignment rows are equalities ("E"); every row added afterwards is "<=" ("L").
senses = ["E"] * sum_const_len + ["L"] * (len(rhs) - sum_const_len)

constraint_names = [f"c_{i}" for i in range(len(rhs))]

600
sex
race


In [152]:
# Register all rows in one call: the first sum_const_len rows are the per-point
# equality rows, the remaining ones are the "<=" fairness rows (see `senses`).
problemSolver.linear_constraints.add(lin_expr=constraints_row, senses=senses, rhs=rhs, names=constraint_names)

range(0, 670)

In [153]:
# NOTE(review): in the recorded run this call raised CPLEX Error 1016
# ("Community Edition. Problem size limits exceeded"). The model holds
# num_points * num_centers variables (the recorded output of variables.add is
# range(0, 3000)), so it presumably needs a full CPLEX licence or a smaller
# sample — confirm the size limits of the installed edition.
problemSolver.solve()

Using size restricted mode (Could not find directory for cpxchecklic).
CPLEX Error  1016: Community Edition. Problem size limits exceeded. Purchase at http://ibm.biz/error1016.


CplexSolverError: CPLEX Error  1016: Community Edition. Problem size limits exceeded. Purchase at http://ibm.biz/error1016.