#5. nD 2Class Discrete - Spambase

In [132]:
from pylab import *
%matplotlib inline
from sklearn.cross_validation import KFold
from sklearn.preprocessing import *
from sklearn.linear_model import LinearRegression
from __future__ import division
t=np.seterr(divide='ignore', invalid='ignore')

###Preparing the dataset for the experiment

In [133]:
def getData():
    with open('spambase.data','r') as f:
        records=[]
        for line in f:
            records.append(line.split(','))
    data = np.array(records).astype(np.float)
    
    (m, n) = data.shape
    Y = data[:, n-1].reshape(m,1)
    P = data[:,n-2].reshape(m,1)
    X = data[:, :n-4].reshape(m,n-4)
    for i in range(m):
        for j in range(n-4):
            if X[i,j] > 0:
                X[i,j] *= P[i]/100.0
    data = np.hstack((X,P,Y))
    data = data.astype(np.int)
    
    return data    

###Computing the Confusion Matrix

In [72]:
def getConfusionMatrix(actual, prediction, class_k):
    k = len(class_k)
    m = len(actual)
    cm = np.zeros((k, k))
    for i in range(k):
        actual_i = filter(lambda x: actual[x] == class_k[i], range(m))
        for j in range(k):
            predicted_j = filter(lambda x: prediction[x] == class_k[j], range(m))
            cm[i,j] = len(np.intersect1d(actual_i, predicted_j))
    return cm 

###Calculating the Performance Metrics

In [73]:
def getAccuracy(matrix):
    return np.trace(matrix)/np.sum(matrix)

def getPrecision(matrix):
    k = len(matrix)
    precision = np.zeros(k)
    for i in range(k):
        actual_positives = np.sum(matrix[i])
        precision[i] = matrix[i,i]/actual_positives
    return precision

def getRecall(matrix):
    k = len(matrix)
    recall = np.zeros(k)
    for i in range(k):
        predicted_positives = np.sum(matrix[:,i])
        recall[i] = matrix[i,i]/predicted_positives
    return recall

def getFMeasure(precision, recall):
    k = len(precision)
    fm = np.zeros(k)
    for i in range(k):
        fm[i] = 2* precision[i]*recall[i]/(precision[i]+recall[i])
    return fm

###Classifying based on Membership Function

In [74]:
def classify(g, class_k):
    (m, n) = g.shape
    estimate = np.zeros((m,1))
    
    for i in range(m):
            estimate[i] = class_k[np.argmax(g[i])]
            
    return estimate

###Computing the Membership Function

In [128]:
def computeMembership(data, modelParam):
    from scipy.special import comb
    (m, n) = data.shape
    X = data[:,:n-1]
    P = data[:, n-1]
    condProb_k = modelParam['condProb']
    alpha_k = modelParam['prior']
    m_k = modelParam['m']
    
    k = len(m_k)    
    g = np.zeros((m,k))
    for mail in range(m):
        for j in range(k):
            g[mail, j] = np.log(alpha_k[j])
            for i in range(n-1):
                if X[mail,i] <> 0:
                    g[mail,j] += np.log(comb(P[mail],X[mail,i], exact=False)) 
                    g[mail,j] += (X[mail,i]*np.log(condProb_k[i,j]))
                g[mail,j] += (P[mail]*np.log(1-condProb_k[i,j]))
    return g

###Training the Classifier and Testing the Performance

In [136]:
def train(data, train_idx, test_idx, class_k):
    (m, n) = data.shape
    k = len(class_k)
    training_data = data[train_idx]
    m_train = len(training_data)
    X_train = training_data[:,:n-2].reshape((m_train,n-2))
    P_train = training_data[:,n-2].reshape((m_train,1))
    Y_train = training_data[:,n-1].reshape((m_train,1))
    
    
    #Distribution Model: Naive Bayes-Binomial
    condProb_k = np.zeros((n-2, k))
    m_train_k = {}
    X_train_k = {}
    train_idx_k = {}
    alpha_k = {}
    P_train_k = {}
    
    #Laplacian Smoothing
    epsilon = 0.01
    
    #Determining Model Parameters
    for i in range(k):
        train_idx_k[i] = filter((lambda x : data[:,n-1][x] == class_k[i]), train_idx)
        X_train_k[i] = data[:, :n-2][train_idx_k[i]]
        P_train_k[i] = data[:,n-2][train_idx_k[i]]
        m_train_k[i] = len(X_train_k[i])
        alpha_k[i] = m_train_k[i]/m_train

        for j in range(n-2):
            m_n_k = 0
            for x in range(m_train_k[i]):
                m_n_k += X_train_k[i][x,j]
            condProb_k[j,i] = (m_n_k+epsilon)/(np.sum(P_train_k[i])+(k*epsilon))
    
    #Consolidating the Model Parameters
    modelParam = {}
    modelParam['condProb'] = condProb_k
    modelParam['prior'] = alpha_k
    modelParam['m'] = m_train_k

    #Compute Membership Function
    input_data = np.hstack((X_train,P_train))
    g_train = computeMembership(input_data, modelParam)
    
    #Classification
    Y_train_hat = classify(g_train, class_k)
    
    #For Debug Pupose Only: Computing Training Performance
    '''
    result = {}
    confMatrix = getConfusionMatrix(Y_train, Y_train_hat, class_k)    
    precision = getPrecision(confMatrix)
    accuracy = getAccuracy(confMatrix)
    recall = getRecall(confMatrix)
    FMeasure = getFMeasure(precision, recall)
    
    result['ConfusionMatrix'] = confMatrix
    result['precision'] = precision
    result['accuracy'] = accuracy
    result['recall'] = recall
    result['FMeasure'] = FMeasure
    '''
    
    #Testing
    test_data = data[test_idx]
    m_test = len(test_data)
    X_test = test_data[:,:n-2].reshape((m_test,n-2))
    P_test = test_data[:,n-2].reshape((m_test,1))
    Y_test = test_data[:,n-1].reshape((m_test,1))
    input_data = np.hstack((X_test,P_test))
    
    #Compute Membership Function
    g_test = computeMembership(input_data, modelParam)

    #Classification
    Y_test_hat = classify(g_test, class_k)
    
    result = {}
    confMatrix = getConfusionMatrix(Y_test, Y_test_hat, class_k)    
    precision = getPrecision(confMatrix)
    accuracy = getAccuracy(confMatrix)
    recall = getRecall(confMatrix)
    FMeasure = getFMeasure(precision, recall)
    
    #Consolidating the Results
    result['ConfusionMatrix'] = confMatrix
    result['precision'] = precision
    result['accuracy'] = accuracy
    result['recall'] = recall
    result['FMeasure'] = FMeasure
    
    return result

In [139]:
def run(K):
    dataset = getData()
    (rows, cols) = dataset.shape
    class_k = np.unique(dataset[:,cols-1])
    CV_idx = KFold(len(dataset), n_folds=K)
    i = 0
    result = []
    
    #K-Fold Classification
    for train_idx, test_idx in CV_idx:
        result.append(train(dataset, train_idx, test_idx, class_k))
        i+=1
    
    #Interpreting Results
    k = len(class_k)
    conf_mat = np.zeros((k, k))
    precision = np.empty((1,k))
    recall = np.empty((1,k))
    accuracy=0
    
    for j in range(i):
        conf_mat = np.add(conf_mat, result[j]['ConfusionMatrix'])
        precision = np.vstack((precision, result[j]['precision']))
        recall = np.vstack((recall, result[j]['recall']))
        accuracy += result[j]['accuracy']
    
    precision = np.nanmean(precision[1:,:], axis=0)
    recall = np.nanmean(recall[1:,:], axis=0)
    fMeasure = getFMeasure(precision, recall)
    accuracy/=K
    
    print "Confusion Matrix"
    print conf_mat
    print "Accuracy\t:\t", accuracy
    print "Precision\t:\t", precision
    print "Recall\t\t:\t", recall
    print "F-Measure\t:\t", fMeasure
    
if __name__ == '__main__':
    run(10)

Confusion Matrix
[[ 2562.   226.]
 [  479.  1334.]]
Accuracy	:	0.846809865133
Precision	:	[ 0.92981366  0.7357646 ]
Recall		:	[ 0.61944444  0.4       ]
F-Measure	:	[ 0.74354029  0.51825148]
