# Introduction

This is fuzzy c-mean clustering using scikit-fuzzy. 

References:
https://pythonhosted.org/scikit-fuzzy/auto_examples/plot_cmeans.html#example-plot-cmeans-py

In [306]:
from __future__ import division, print_function
import numpy as np
import matplotlib.pyplot as plt
import skfuzzy as fuzz
import pandas as pd

In [307]:
dataset = pd.read_csv('balanced_diabetes.csv')

In [308]:
dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [309]:
#Remove this comment if want to select attribute manually
labels = ["Pregnancies","Glucose","BloodPressure","SkinThickness","Insulin","BMI","DiabetesPedigreeFunction","Age"]

X = dataset[labels]
y = dataset['Outcome']

In [310]:
#from sklearn.model_selection import train_test_split

# Separate the dataset become 70 percent trainning data and 30 percent testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

In [311]:
X_train

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
69,4,146,85,27,100,28.9,0.189,27
382,1,109,60,8,182,25.4,0.947,21
195,5,158,84,41,210,39.4,0.395,29
84,5,137,108,0,0,48.8,0.227,37
442,4,117,64,27,120,33.2,0.230,24
...,...,...,...,...,...,...,...,...
645,2,157,74,35,440,39.4,0.134,30
715,7,187,50,33,392,33.9,0.826,34
72,13,126,90,0,0,43.4,0.583,42
235,4,171,72,0,0,43.6,0.479,26


## Parameters:	

data : 2d array, size (S, N)

    Data to be clustered. N is the number of data sets; S is the number of features within each sample vector.

c : int

    Desired number of clusters or classes.

m : float

    Array exponentiation applied to the membership function u_old at each iteration, where U_new = u_old ** m.

error : float

    Stopping criterion; stop early if the norm of (u[p] - u[p-1]) < error.

maxiter : int

    Maximum number of iterations allowed.

init : 2d array, size (S, N)

    Initial fuzzy c-partitioned matrix. If none provided, algorithm is randomly initialized.

seed : int

    If provided, sets random seed of init. No effect if init is provided. Mainly for debug/testing purposes.


## Returns:	

cntr : 2d array, size (S, c)

    Cluster centers. Data for each center along each feature provided for every cluster (of the c requested clusters).

u : 2d array, (S, N)

    Final fuzzy c-partitioned matrix.

u0 : 2d array, (S, N)

    Initial guess at fuzzy c-partitioned matrix (either provided init or random guess used if init was not provided).

d : 2d array, (S, N)

    Final Euclidian distance matrix.

jm : 1d array, length P

    Objective function history.

p : int

    Number of iterations run.

fpc : float

    Final fuzzy partition coefficient.

In [316]:
#(data, c, m, error, maxiter, init=None, seed=None)
num_of_cluster = 4
cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(dataset.T, num_of_cluster, 2, error=0.005, maxiter=1000)

In [317]:
cntr

array([[3.56618762e+00, 1.36124749e+02, 7.27624027e+01, 3.11612642e+01,
        1.97293371e+02, 3.48025684e+01, 5.86121416e-01, 3.32217613e+01,
        4.93380271e-01],
       [3.49026919e+00, 1.61224857e+02, 7.19807188e+01, 3.39332483e+01,
        4.87960090e+02, 3.58617168e+01, 5.91356900e-01, 3.31952757e+01,
        6.18115884e-01],
       [4.34486776e+00, 1.17305897e+02, 6.99684192e+01, 1.17447605e+01,
        2.60124822e+00, 3.08918013e+01, 4.14775214e-01, 3.51059805e+01,
        3.39351733e-01],
       [2.95594338e+00, 1.07554423e+02, 6.75106744e+01, 2.61683524e+01,
        8.23909447e+01, 3.13733200e+01, 4.82271964e-01, 2.88162879e+01,
        1.88046079e-01]])

## Parameters:	

test_data : 2d array, size (S, N)

    New, independent data set to be predicted based on trained c-means from cmeans. N is the number of data sets; S is the number of features within each sample vector.

cntr_trained : 2d array, size (S, c)

    Location of trained centers from prior training c-means.

m : float

    Array exponentiation applied to the membership function u_old at each iteration, where U_new = u_old ** m.

error : float

    Stopping criterion; stop early if the norm of (u[p] - u[p-1]) < error.

maxiter : int

    Maximum number of iterations allowed.

init : 2d array, size (S, N)

    Initial fuzzy c-partitioned matrix. If none provided, algorithm is randomly initialized.

seed : int

    If provided, sets random seed of init. No effect if init is provided. Mainly for debug/testing purposes.

## Returns:	

u : 2d array, (S, N)

    Final fuzzy c-partitioned matrix.

u0 : 2d array, (S, N)

    Initial guess at fuzzy c-partitioned matrix (either provided init or random guess used if init was not provided).

d : 2d array, (S, N)

    Final Euclidian distance matrix.

jm : 1d array, length P

    Objective function history.

p : int

    Number of iterations run.

fpc : float

    Final fuzzy partition coefficient.


In [318]:
# (test_data, cntr_trained, m, error, maxiter, init=None, seed=None)
test_data = [[111, 110, 111, 987, 4, 1, 33.6, 80,12]]

#test_data = np.array([
   # [111, 110, 111, 987, 4 , 1, 33.6 ,0, 12] 
#])
test_data = pd.DataFrame(test_data) 

In [319]:
test_data
#dataset.shape

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,111,110,111,987,4,1,33.6,80,12


In [320]:
u, u0, d, jm, p, fpc = fuzz.cluster.cmeans_predict(test_data.T, cntr, 2, error=0.005, maxiter=1000)

In [322]:
print("********************************************")
print("Available cluster:")
print("cntr: " + str(cntr))
print("********************************************")

print("Final fuzzy c-partitioned matrix(u): " + str(u))
print("")
print("Initial guess at fuzzy c-partitioned matrix (either provided init or random guess used if init was not provided)(u0): " + str(u0))
print("")
print("Final Euclidian distance matrix(d): " + str(d))
print("")
print("Objective function history(jm): " + str(jm))
print("")
print("Number of iterations run(p): " + str(p))
print("")
print("Final fuzzy partition coefficient(fpc): " + str(fpc))

********************************************
Available cluster:
cntr: [[3.56618762e+00 1.36124749e+02 7.27624027e+01 3.11612642e+01
  1.97293371e+02 3.48025684e+01 5.86121416e-01 3.32217613e+01
  4.93380271e-01]
 [3.49026919e+00 1.61224857e+02 7.19807188e+01 3.39332483e+01
  4.87960090e+02 3.58617168e+01 5.91356900e-01 3.31952757e+01
  6.18115884e-01]
 [4.34486776e+00 1.17305897e+02 6.99684192e+01 1.17447605e+01
  2.60124822e+00 3.08918013e+01 4.14775214e-01 3.51059805e+01
  3.39351733e-01]
 [2.95594338e+00 1.07554423e+02 6.75106744e+01 2.61683524e+01
  8.23909447e+01 3.13733200e+01 4.82271964e-01 2.88162879e+01
  1.88046079e-01]]
********************************************
Final fuzzy c-partitioned matrix(u): [[0.25926404]
 [0.21608907]
 [0.25949068]
 [0.26515621]]

Initial guess at fuzzy c-partitioned matrix (either provided init or random guess used if init was not provided)(u0): [[0.28438238]
 [0.44855246]
 [0.16087304]
 [0.10619212]]

Final Euclidian distance matrix(d): [[ 984.49