In [1]:
import sys
import time
from sklearn_extra.cluster import KMedoids
import pandas as pd
import os

from failets import *

### PARAMETERS ###
topic = 'Xbox'

blues = 1
reds = 1
k = 3
input_csv_filename = "input_docs/embedding/"+ topic + ".csv"
remove_label = "Hisp"

### MAIN ###
label_dict = {'White': 0, 'Hisp': 1, 'AA': 2}
df_temp = pd.read_csv(input_csv_filename)
df_temp = df_temp[df_temp['label'] != label_dict[remove_label]]
df_temp = df_temp.reset_index(drop=True)
if remove_label == "White":
    df_temp.replace({'label': {1: 0, 2: 1}}, inplace=True)
elif remove_label == "Hisp":
    df_temp.replace({'label': {0: 0, 2: 1}}, inplace=True)
elif remove_label == "AA":
    df_temp.replace({'label': {0: 0, 1: 1}}, inplace=True)
else:
    print("Invalid label_dict")


temp_address = 'df_temp.csv'
df_temp.to_csv(temp_address, index=False)
try:
    p = min(blues, reds)
    q = max(blues, reds)
except:
    print("First two parameters must be non-negative integers that specify the target balance; terminating")
    sys.exit(0)

# Parse input file in CSV format, first column is colors, other columns are coordinates
print("Loading data from input CSV file")
colors = []
points = []
i = 0
skipped_lines = 0
for line in open(temp_address).readlines():
    if len(line.strip()) == 0:
        skipped_lines += 1
        continue
    tokens = line[:-1].split(",")
    try:
        color = int(tokens[0])
    except:
        print("Invalid color label in line", i, ", skipping")
        skipped_lines += 1
        continue
    try:
        point = [float(x) for x in tokens[1:]]
    except:
        print("Invalid point coordinates in line", i, ", skipping")
        skipped_lines += 1
        continue
    colors.append(color)
    points.append(point)
    i += 1

n_points = len(points)
if  n_points == 0:
    print("No successfully parsed points in input file, terminating")
    sys.exit(0)
dimension = len(points[0])

dataset = np.zeros((n_points, dimension))
for i in range(n_points):
    if len(points[i]) < dimension:
        print("Insufficient dimension in line", i+skipped_lines, ", terminating")
        sys.exit(0)
    for j in range(dimension):
        dataset[i,j] = points[i][j]

print("Number of data points:", n_points)
print("Dimension:", dimension)
print("Balance:", p, q)

print("Constructing tree...")
fairlet_s = time.time()
root = build_quadtree(dataset)

print("Doing fair clustering...")
cost = tree_fairlet_decomposition(p, q, root, dataset, colors)
fairlet_e = time.time()

print("Fairlet decomposition cost:", cost)

print("Doing k-median clustering on fairlet centers...")
fairlet_center_idx = [dataset[index] for index in FAIRLET_CENTERS]
fairlet_center_pt = np.array([np.array(xi) for xi in fairlet_center_idx])

# Run k-medoids clustering
cluster_s = time.time()

kmedoids = KMedoids(n_clusters=k, metric='euclidean')
kmedoids.fit(fairlet_center_pt)

# Get the cluster medoid locations
C = kmedoids.cluster_centers_
midx = kmedoids.medoid_indices_

# Convert medoid indices to a column vector
midx = np.array(midx).reshape(-1, 1)

# Calculate the sum of distances for each sample to its closest medoid
distances = kmedoids.transform(fairlet_center_pt)
sumd = np.min(distances, axis=1)

cluster_e = time.time()
#np_idx = (np.array(idx._data)).flatten()

# compute the indices of centers returned by Matlab in its input matrix
# which is mat_matrix or fairlet_center_pt
np_midx = (np.array(midx)).flatten()
c_idx_matrix = np_midx.astype(int)
#in matlab, arrays are numbered from 1
c_idx_matrix[:] = [index - 1 for index in c_idx_matrix]

# indices of center points in dataset
centroids = [FAIRLET_CENTERS[index] for index in c_idx_matrix]

print("Computing fair k-median cost...")
kmedian_cost = fair_kmedian_cost(centroids, dataset)
print("Fairlet decomposition cost:", cost)
print("k-Median cost:", kmedian_cost)

def find_fairlet_medoids(lst, list_of_lists):
    med_fairlet = []
    for sublist in list_of_lists:
        if any(element in sublist for element in lst):
            med_fairlet.append(sublist)
    return med_fairlet

med_fairlet = find_fairlet_medoids(centroids, FAIRLETS)

df_docs = pd.read_csv("input_docs/docs/" + topic + ".csv")
df_docs = df_docs[df_docs['label'] != remove_label]
df_docs = df_docs.reset_index(drop=True)

df_output = pd.DataFrame(columns=['label', 'text', 'cluster'])
for i in range(len(med_fairlet)):
    cluster_data = df_docs.loc[med_fairlet[i]]
    cluster_data['cluster'] = i + 1
    df_output = pd.concat([df_output, cluster_data])

df_output.to_csv("output/" + topic + ".csv", index=False)
os.remove(temp_address)

Loading data from input CSV file
Invalid color label in line 0 , skipping
Number of data points: 60
Dimension: 768
Balance: 1 1
Constructing tree...
Doing fair clustering...
Fairlet decomposition cost: 204.25336095191972
Doing k-median clustering on fairlet centers...
Computing fair k-median cost...
Fairlet decomposition cost: 204.25336095191972
k-Median cost: 312.27324521212574
