# C-mean Clustering
Fuzzy c-Means Clustering} (FMC) is a clustering method which, unlike Hard k-Means Clustering, allows a data sample to more or less belong to one or more clusters. In this notebook, we will try to develop a semi-supervised C-mean Clustering.

## Loading the packages

In [50]:
import numpy as np
import matplotlib.pyplot as plt

import random
import operator
import math
import copy
colors = ['b', 'orange', 'g', 'r', 'c', 'm', 'y', 'k', 'Brown', 'ForestGreen']

PATH_TO_DATASET = '' # Not Given Yet 

%matplotlib inline

## The dataset
The following script allows you to create a 2D dataset by using the mouse. The fift first click adds points belonging to class A (blue), the 5th following click adds points belonging to class B (red) and the next click add unclassified points. You can create as many points as you desire. The final dataset will contain hence three values per point: x coordinate (-1 ≤ x ≤ 1), y coordinate (-1 ≤ y ≤ 1) and the class ∈ {0,1,2} where 0 represent unclassified points.

In [63]:
%matplotlib notebook

fig = plt.figure(figsize=(6,6))
plt.title("Input Dataset")
plt.xlim((-1.2,1.2))
plt.ylim((-1.2,1.2))

dataset = []

i = 0
def onclick(event):
    global dataset
    global i
    cx = event.xdata
    cy = event.ydata
    classe = 1 if(i<10) else 2 if(i<20) else 0
    i=i+1
    dataset.append((cx, cy, classe))

    plt.scatter(cx, cy, c=['k', 'b', 'r'][classe], s=100, lw=0)
    plt.grid(True)

cid = fig.canvas.mpl_connect('button_press_event', onclick)


<IPython.core.display.Javascript object>

In [64]:
dataset = np.array(dataset)

## Membership Matrix
The Membership matrix lists all the data samples membership grades to a particular cluster. Here we first randomly initialize it.

In [26]:
print(dataset)

def initializeMembershipMatrix(dataset, k):
    membership_mat = list()
    for i in range(len(dataset)):
        random_num_list = [random.random() for i in range(k)]
        summation = sum(random_num_list)
        temp_list = [x/summation for x in random_num_list]
        membership_mat.append(temp_list)
    return membership_mat

[[-1.01075273  0.96102538  1.        ]
 [-0.81720434  0.8065      1.        ]
 [-0.58781366  0.85800846  1.        ]
 [-0.65232979  1.0198922   1.        ]
 [-0.79569897  0.79178329  1.        ]
 [-0.71684592  0.65197461  1.        ]
 [-0.63082442  0.40179065  1.        ]
 [-0.89605739  0.54159933  1.        ]
 [-1.06093194  0.57103274  1.        ]
 [-0.90322585  0.29877373  1.        ]
 [ 0.82437272 -0.97422112  2.        ]
 [ 0.95340497 -0.78290398  2.        ]
 [ 0.69534046 -0.66517035  2.        ]
 [ 0.78853042 -0.8859209   2.        ]
 [ 0.69534046 -1.0845964   2.        ]
 [ 0.61648741 -0.92271266  2.        ]
 [ 0.5663082  -0.81233738  2.        ]
 [ 0.5663082  -0.99629618  2.        ]
 [ 0.27956985 -0.67988705  2.        ]
 [ 0.7741935  -0.70932046  2.        ]
 [ 0.37992827  0.33556549  0.        ]
 [ 0.53046591  0.6225412   0.        ]
 [ 0.74551967  0.89480022  0.        ]
 [ 0.98924727  1.01253385  0.        ]
 [ 0.9318996   0.66669131  0.        ]
 [ 0.83154117  0.82857505

## Cluster Center
Here we calculate each cluster center depending on the membership grades.

In [5]:
def calculateClusterCenter(dataset, membership_mat, k, fuzzy_param):
    cluster_mem_val = list(zip(*membership_mat))
    cluster_centers = list()
    for j in range(k):
        x = cluster_mem_val[j]
        xraised = [e ** fuzzy_param for e in x]
        denominator = sum(xraised)
        temp_num = list()
        for i in range(len(dataset)):
            data_point = dataset[i]
            prod = [xraised[i] * val for val in data_point]
            temp_num.append(prod)
        numerator = map(sum, zip(*temp_num))
        center = [z/denominator for z in numerator]
        cluster_centers.append(center)
    return cluster_centers

## Semi-supervised C-Mean

In [6]:
def evalConvergence(mb, previous_mb, threshold):
    return (np.absolute(np.array(mb) - np.array(previous_mb))).max() < threshold
        

In [45]:
def initializeMembershipMatrixSupervised(dataset, k):
    membership_mat_supervised = list()
    for i in range(len(dataset)):
        if (dataset[2] == 0):
            random_num_list = [0 for i in range(k)]
        else:
            random_num_list = [1 if((i+1)==dataset[2]) else 0 for i in range(k)]
        summation = sum(random_num_list)
        temp_list = [x/summation for x in random_num_list]
        membership_mat_supervised.append(temp_list)
    return membership_mat_supervised

In [46]:
def updateMembershipMatrixSupervised(dataset, k):
    pass

In [None]:
# Evaluate J_m, used for convergence detection
def evaluateObjectiveFunc(V, U, X, m):
    norm_vals = np.zeros((U.shape[0], U.shape[1]))
    for i in range(len(V)):
        norm_vals[:,i] = np.power(np.linalg.norm(X-V[i], axis=1), 2)
    
    fitness_val = (np.power(U,m)*norm_vals).sum()
    return fitness_val;

### Cluster Validity Indices

In [79]:
import numpy as np
import scipy.spatial

def pairwise_squared_distances(A, B):
    return scipy.spatial.distance.cdist(A, B)**2

def calculate_covariances(x, u, v, m):
    c, n = np.array(u).shape
    d = np.array(v).shape[1]
    
    um = np.array(u)**m

    covariances = np.zeros((c, d, d))

    for i in range(c):
        xv = x - v[i]
        uxv = um[i, :, np.newaxis]*xv
        covariances[i] = np.einsum('ni,nj->ij', uxv, xv)/np.sum(um[i])
    
    return covariances

# Partition Coefficient
def pc(x, u, v, m):
    c, n = np.array(u).shape
    return np.square(np.array(u)).sum()/n

# Fuzzy Hyperbolic Volume
def fhv(x, u, v, m):
    covariances = calculate_covariances(x, u, v, m)
    return sum(np.sqrt(np.linalg.det(cov)) for cov in covariances)

# Xie-Beni Index
def xb(x, u, v, m):
    n = np.array(x).shape[0]
    c = np.array(v).shape[0]

    um = np.array(u)**m
    
    d2 = pairwise_squared_distances(x, v)
    v2 = pairwise_squared_distances(v, v)
    
    v2[v2 == 0.0] = np.inf

    return np.sum(um.T*d2)/(n*np.min(v2))

def checkKnownEntries(x, labels, c):
    
    # 4 columns for each cluster :
    # |# class0 |# class1 | %0 | %1 |
    min_c = min(labels)
    res = [[0 for i in range(4)] for j in range(c)]
    for i in range(len(x)):
        # If the class is known (i.e. not 0)
        if(x[i][-1]!= 0):
            print(str(x[i][-1]))
            res[labels[i] - min_c][int(x[i][-1])] = res[labels[i] - min_c][int(x[i][-1])] + 1
            
    for i in range(c):
        res[i][2] = res[i][0] / (res[i][0] + res[i][1])        
        res[i][3] = res[i][1] / (res[i][0] + res[i][1])
        
    return res

## Main Execution 
blablabla

In [80]:
def updateMembershipValue(dataset, membership_mat, cluster_centers, fuzzy_param, k):
    p = float(2/(fuzzy_param-1))
    for i in range(len(dataset)):
        x = dataset[i]
        distances = [np.linalg.norm(list(map(operator.sub, x, cluster_centers[j]))) for j in range(k)]
        for j in range(k):
            den = sum([math.pow(float(distances[j]/distances[c]), p) for c in range(k)])
            membership_mat[i][j] = float(1/den)       
    return membership_mat


def getClusters(dataset, membership_mat):
    cluster_labels = list()
    for i in range(len(dataset)):
        max_val, idx = max((val, idx) for (idx, val) in enumerate(membership_mat[i]))
        cluster_labels.append(idx)
    return cluster_labels


def fuzzyCMeansClustering(data, k = 3, threshold = 0.1, fuzzy_param = 2.00, maxSteps = 10000):
#    print(str(data))
    # Membership Matrix
    membership_mat = initializeMembershipMatrix(data, k)
    previous_mb = np.zeros((len(data), k))
    steps = 0
    while not evalConvergence(membership_mat, previous_mb, threshold) or steps >= maxSteps:
        cluster_centers = calculateClusterCenter(data, membership_mat, k, fuzzy_param)
        membership_mat = updateMembershipValue(data, membership_mat, cluster_centers, fuzzy_param, k)
        cluster_labels = getClusters(data, membership_mat)
        previous_mb = membership_mat
        steps += 1
    return cluster_labels, cluster_centers, membership_mat

# TOFIX : KEEP TRACK OF INDEX, CLUSTER LABELS IN RESULT..
def clusterizzeee(input_data):
    result_labels = [[]]
    result_mb     = [[]]
    temp_data = copy.deepcopy(input_data)
    c = 2
    done = 0
    while(done == 0): 
        labels, centers, mb = fuzzyCMeansClustering(temp_data[:,:-1], k = c)
        
        # First, check supervised values
        # TOFIX : Bordel with cluster indexes, won't work.
        sup_verif = checkKnownEntries(input_data, labels, c)
        cluster_ok = []
        for i in range(c):
            if(sup_verif[i][2] < 0.05 or sup_verif[i][2] > 0.95):
                cluster_ok.append(i)

        if(len(cluster_ok) == 0):
            # If no cluster is good enough (with known values) start with more clusters
            c =c + 1
            continue
        else:
            # Check our values aren't just luckily in same clusters using unsup CVI
            fhv_s = fhv(x = temp_data[:,:-1], v = centers, u = mb, m =2)
            pc_s  = pc(temp_data[:,:-1], mb, centers, 2)
            xb_s  = xb(x = temp_data[:,:-1], u = mb, v = centers, m = 2)
            # If values aren't good enough, our know entries were "luckily" grouped
            if(fhv_s > 0.1 or pc_s < 0.9 or xb_s > 0.1):
                c = c + 1
                continue
            # Else, we have to keep labeled data and remove it  
            temp_data = [[]]
            for i in range(len(input_data)):
                if(data[i][-1] in cluster_ok):
                    result_labels.append(input_data[i].append(labels[i]))
                    result_mb.append(mb[i])
                else:
                    temp_data.append(input_data[i])
                    
            # Keep same number of pixels (equivalent to split remaining clusters in +1)
            continue
            
    return result_labels

labels = clusterizzeee(dataset)


print(str(labels))
centers = np.array(centers)
# Visualize the test data
fig2 = plt.figure(figsize=(6,6))
plt.title("Output Dataset")

plt.scatter(dataset[:,0], dataset[:,1], c=[(['r', 'b', 'orange', 'brown'])[int(label)] for label in labels])
plt.scatter(centers[:,0], centers[:,1], c='y')
plt.grid()
plt.show()

1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0


TypeError: unsupported operand type(s) for ** or pow(): 'list' and 'int'

## Results and Analyse

In [14]:
from sklearn.metrics import confusion_matrix
import itertools
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    labels_names = ["Woman", "Man"]
    tick_marks = np.arange(len(labels_names))
    plt.xticks(tick_marks, labels_names, rotation=45)
    plt.yticks(tick_marks, labels_names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        pl.text(j, i, format(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
plot_confusion_matrix(conf_mat)

NameError: name 'conf_mat' is not defined

In [None]:
precision = (conf_mat[0][0]/(conf_mat[0][0] + conf_mat[1][0]) + conf_mat[1][1]/(conf_mat[1][1] + conf_mat[0][1]))/2
print("Precision: ", precision)
recall = (conf_mat[0][0]/(conf_mat[0][0] + conf_mat[0][1]) + conf_mat[1][1]/(conf_mat[1][1] + conf_mat[1][0]))/2
print("Recall: ", recall)
f1score = 2*(precision*recall)/(precision+recall)
print("F1-Score: ", f1score)