In [1]:
import os
import json
from PIL import Image
import numpy as np
from sklearn.cluster import KMeans
from collections import Counter
import pickle

In [2]:
# The directory of the dataset
read_directory = "./leaf/data/femnist/data/all_data/"

# The user in user_block_list will be removed
user_block_list = []

# The target class to be clustered
target_class = 4

# The number of kmeans clusters
n_clusters = 7

# Kmeans clustering result will be written into this directory
write_directory = "./femnist_by_cluster_{}".format(target_class)

# User assignment will be saved to this directory
assignment_save_directory = "./kmeans_user_assignment"

In [3]:
# Read raw dataset and keep only images of target class
# All target images will be stored in target_image_list for clustering

target_image_list = []
user_list = []

for json_file_name in os.listdir(read_directory):
    with open(os.path.join(read_directory, json_file_name), "r") as f_read:
        print("processing file: {}".format(json_file_name))
        data = json.load(f_read)
        
        for user_name in data["users"]:
            # Remove user in user_block_list
            if user_name in user_block_list:
                continue
            
            # Save all images of this user with the target class
            user_data = data["user_data"][user_name]
            for index, image_class in enumerate(user_data["y"]):
                if image_class == target_class:
                    image = user_data["x"][index]
                    target_image_list.append(image)
                    user_list.append(user_name)

print("There are {} images to be clustered.".format(len(target_image_list)))

processing file: all_data_5.json
processing file: all_data_4.json
processing file: all_data_34.json
processing file: all_data_3.json
processing file: all_data_27.json
processing file: all_data_10.json
processing file: all_data_16.json
processing file: all_data_13.json
processing file: all_data_19.json
processing file: all_data_31.json
processing file: all_data_21.json
processing file: all_data_18.json
processing file: all_data_15.json
processing file: all_data_0.json
processing file: all_data_1.json
processing file: all_data_6.json
processing file: all_data_20.json
processing file: all_data_25.json
processing file: all_data_7.json
processing file: all_data_14.json
processing file: all_data_17.json
processing file: all_data_2.json
processing file: all_data_24.json
processing file: all_data_22.json
processing file: all_data_12.json
processing file: all_data_23.json
processing file: all_data_8.json
processing file: all_data_32.json
processing file: all_data_11.json
processing file: all_da

In [4]:
# Perform kmeans clustering
target_image_arr = np.array(target_image_list)
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(target_image_arr)
cluster_labels = kmeans.labels_
print("The number image in each cluster: ", Counter(cluster_labels))

# Save kmeans clustering result
pickle_save_directory = "./kmeans_pickle_save"
with open(os.path.join(pickle_save_directory, "{}.pickle".format(target_class)), "wb") as f_write:
    pickle.dump(kmeans, f_write)

The number image in each cluster:  Counter({3: 6778, 6: 6388, 2: 5920, 1: 5328, 4: 4630, 5: 4629, 0: 4466})


In [5]:
# Save images into separate directories according to their cluster

# Create directory
if not os.path.isdir(write_directory):
    os.makedirs(write_directory)

for i in range(n_clusters):
    cur_dir = os.path.join(write_directory, str(i))
    if not os.path.isdir(cur_dir):
        os.makedirs(cur_dir)
        os.makedirs(os.path.join(cur_dir, "all"))

# Save images into separate directories according to their cluster_index
count = 0
for image, user_name, cluster_index in zip(target_image_list, user_list, cluster_labels):
    image = np.array([pixel * 255 for pixel in image], dtype=np.uint8)
    image = np.reshape(image, (28, 28))
    image = Image.fromarray(image, 'L')
    count += 1
    image.save(os.path.join(write_directory, str(cluster_index), "all", "{}_{}.png".format(user_name, count)))

In [6]:
# Get cluster assignment of each user.
# For instance, user_i is with 100 images, 
# 55 of them are assigned to cluster 1, 40 of them are assigned to cluster 2,
# 5 of them are assigned to cluster 3.
# Then user_i will be assigned to cluster 1, as most of its images belongs to cluster 1

# Count the images beloning to each cluster for each user
user_cluster_cnt = {}
for user_name, cluster_index in zip(user_list, cluster_labels):
    if user_name not in user_cluster_cnt:
        user_cluster_cnt[user_name] = [0 for _ in range(n_clusters)]
        
    user_cluster_cnt[user_name][cluster_index] += 1

# Assign user to the cluster with most of its images
user_cluster_assignment = {i: [] for i in range(n_clusters)}
for user_name, cluster_cnt in user_cluster_cnt.items():
    max_cnt, max_index = cluster_cnt[0], 0
    for cluster_index, cnt in enumerate(cluster_cnt):
        if cnt > max_cnt:
            max_cnt, max_index = cnt, cluster_index
            
    user_cluster_assignment[max_index].append(user_name)

# Save the result of user assignment
with open(os.path.join(assignment_save_directory, "assignment_{}.json".format(target_class)), "w") as f:
    json.dump(user_cluster_assignment, f)