#1. 1D kClass Continuous - Iris/Breast-Cancer Datasets

In [67]:
from pylab import *
%matplotlib inline
from sklearn.cross_validation import KFold
from sklearn.preprocessing import *
from sklearn.linear_model import LinearRegression
from __future__ import division
t=np.seterr(divide='ignore', invalid='ignore')

### Getting Dataset Ready for the experiment

In [68]:
def getData(fileName, classCount=2):
    dim = 1 #Only one dimensional feature vector required
    if fileName == "Iris":
        from sklearn import datasets
        data = datasets.load_iris() 
        #data = datasets.load_boston() 
        #data = datasets.load_diabetes()
        X = data['data']
        Y = data['target']
        if dim==1:
            #X = X[:,1].reshape(len(data['data']),1) #Sepal Width
            X = X[:,2].reshape(len(data['data']),1) #Petal Length
        Y = Y.reshape(len(data['target']), 1)
        dataset = np.hstack((X, Y))

    idx_k=[]
    if classCount == 2:
        (m, n) = dataset.shape
        class_k = np.unique(dataset[:,n-1])
        k = len(class_k)
        for i in range(classCount):
            idx_k.append(filter((lambda x : dataset[:,n-1][x] == class_k[i]), range(m)))
        idx_k = np.concatenate((idx_k[0], idx_k[1]), axis=0)
        dataset = dataset[idx_k]
        
    return dataset

###Computing the Confusion Matrix

In [69]:
def getConfusionMatrix(actual, prediction, class_k):
    k = len(class_k)
    m = len(actual)
    cm = np.zeros((k, k))
    for i in range(k):
        actual_i = filter(lambda x: actual[x] == class_k[i], range(m))
        for j in range(k):
            predicted_j = filter(lambda x: prediction[x] == class_k[j], range(m))
            cm[i,j] = len(np.intersect1d(actual_i, predicted_j))
    return cm 

###Computing Performance Metrics

In [70]:
def getAccuracy(matrix):
    return np.trace(matrix)/np.sum(matrix)

def getPrecision(matrix):
    k = len(matrix)
    precision = np.zeros(k)
    for i in range(k):
        actual_positives = np.sum(matrix[i])
        precision[i] = matrix[i,i]/actual_positives
    return precision

def getRecall(matrix):
    k = len(matrix)
    recall = np.zeros(k)
    for i in range(k):
        predicted_positives = np.sum(matrix[:,i])
        recall[i] = matrix[i,i]/predicted_positives
    return recall

def getFMeasure(precision, recall):
    k = len(precision)
    fm = np.zeros(k)
    for i in range(k):
        fm[i] = 2* precision[i]*recall[i]/(precision[i]+recall[i])
    return fm

###Classifying based on Membership Function

In [71]:
def classify(g, class_k):
    (m, n) = g.shape
    estimate = np.zeros((m,1))
    
    for i in range(m):
            estimate[i] = class_k[np.argmax(g[i])]
            
    return estimate

###Computing Membership Function

In [72]:
def computeMembership(X, modelParam):
    (m, n) = X.shape
    mean_k = modelParam['mean']
    sigma_k = modelParam['sigma']
    alpha_k = modelParam['alpha']
    m_k = modelParam['m']
    k = len(m_k)
    
    g = np.zeros((m,k))
    for j in range(k):
        if m_k[j] is not 0:
            det_sigma_k = np.linalg.det(sigma_k[j])
            sigma_inv = np.linalg.inv(sigma_k[j])
            for i in range(m):
                X_mu = X[i] - mean_k[j]

                g[i,j] = -np.log(det_sigma_k) + np.log(alpha_k[j]) - (X_mu**2)/(2 * sigma_k[j]**2)
        else:
            g[:,j] = -inf * np.ones(m)
            
    return g

###Training the Classifier

In [75]:
def train(data, train_idx, test_idx, class_k):
    (m, n) = data.shape
    k = len(class_k)
    training_data = data[train_idx]
    m_train = len(training_data)
    X_train = training_data[:,:n-1].reshape((m_train,n-1))
    Y_train = training_data[:,n-1].reshape((m_train,1))
    
    
    #Distribution Model = Gaussian
    mean_k = {}
    covariance_k = {}
    m_train_k = {}
    X_train_k = {}
    train_idx_k = {}
    alpha_k = {}
    
    #Determining Model Parameters
    for i in range(k):
        train_idx_k[i] = filter((lambda x : data[:,n-1][x] == class_k[i]), train_idx)
        X_train_k[i] = data[:, :n-1][train_idx_k[i]]
        m_train_k[i] = len(X_train_k[i])
        alpha_k[i] = m_train_k[i]/m_train
        if m_train_k[i] is not 0:
            mean_k[i] = np.mean(X_train_k[i],axis=0)
            X_mu_k = X_train_k[i]-mean_k[i]
            covariance_k[i] = np.dot(X_mu_k.T,(X_mu_k))/m_train_k[i]
    
    #Computed Model Parameters
    modelParam = {}
    modelParam['mean'] = mean_k
    modelParam['sigma'] = covariance_k
    modelParam['alpha'] = alpha_k
    modelParam['m'] = m_train_k
    
    #Compute Membership Function
    g_train = computeMembership(X_train, modelParam)
    
    #Classification
    Y_train_hat = classify(g_train, class_k)
    
    #For Debug pupose: Computing the Training Error
    '''
    result = {}
    confMatrix = getConfusionMatrix(Y_train, Y_train_hat, class_k)    
    precision = getPrecision(confMatrix)
    accuracy = getAccuracy(confMatrix)
    recall = getRecall(confMatrix)
    FMeasure = getFMeasure(precision, recall) 
    
    result['ConfusionMatrix'] = confMatrix
    result['precision'] = precision
    result['accuracy'] = accuracy
    result['recall'] = recall
    result['FMeasure'] = FMeasure
    '''
    test_data = data[test_idx]
    m_test = len(test_data)
    X_test = test_data[:,:n-1].reshape((m_test,n-1))
    Y_test = test_data[:,n-1].reshape((m_test,1))
    
    #Compute Membership Function
    g_test = computeMembership(X_test, modelParam)

    #Classification
    Y_test_hat = classify(g_test, class_k)
    
    #Computing Performance Metrics
    result = {}
    confMatrix = getConfusionMatrix(Y_test, Y_test_hat, class_k)    
    precision = getPrecision(confMatrix)
    accuracy = getAccuracy(confMatrix)
    recall = getRecall(confMatrix)
    FMeasure = getFMeasure(precision, recall)
    
    #Preparing the results
    result['ConfusionMatrix'] = confMatrix
    result['precision'] = precision
    result['accuracy'] = accuracy
    result['recall'] = recall
    result['FMeasure'] = FMeasure
    
    return result

###Running the experiment - Main Function

In [80]:
def run(fileName, K):
    dataset = getData("Iris")
    (rows, cols) = dataset.shape
    class_k = np.unique(dataset[:,cols-1])
    CV_idx = KFold(len(dataset), n_folds=K)
    
    i = 0
    result = []
    for train_idx, test_idx in CV_idx:
        result.append(train(dataset, train_idx, test_idx, class_k))
        i+=1
    
    #Interpreting the Results obtained
    k = len(class_k)
    conf_mat = np.zeros((k, k))
    precision = np.empty((1,k))
    recall = np.empty((1,k))
    accuracy=0
    
    for j in range(i):
        conf_mat = np.add(conf_mat, result[j]['ConfusionMatrix'])
        precision = np.vstack((precision, result[j]['precision']))
        recall = np.vstack((recall, result[j]['recall']))
        accuracy += result[j]['accuracy']
    
    precision = np.nanmean(precision[1:,:], axis=0)
    recall = np.nanmean(recall[1:,:], axis=0)
    fMeasure = getFMeasure(precision, recall)
    accuracy/=K
    
    print "Confusion Matrix: "
    print conf_mat
    print "Accuracy\t:\t", accuracy
    print "Precision\t:\t", precision[0]
    print "Recall\t\t:\t", recall[0]
    print "F-Measure\t:\t", fMeasure[0]
        
if __name__ == '__main__':
    fileName = "Iris"
    run(fileName, 10)

Confusion Matrix: 
[[ 47.   3.]
 [  0.  50.]]
Accuracy	:	0.97
Precision	:	0.94
Recall		:	1.0
F-Measure	:	0.969072164948
