##1.b. Using a Non-Linear Combination of Inputs for the Classifier

In [23]:
from pylab import *
%matplotlib inline
from sklearn.cross_validation import KFold
from sklearn.cross_validation import StratifiedKFold
from sklearn.preprocessing import *
from sklearn.linear_model import LinearRegression
from __future__ import division
import random
t=np.seterr(divide='ignore', invalid='ignore')

###Get the input dataset

In [24]:
def _readRecords(path, dtype):
    """Read a comma-separated text file into a 2-D array of the given dtype."""
    with open(path, 'r') as f:
        records = [line.split(',') for line in f]
    # The first two records are discarded, exactly as the original code did.
    # NOTE(review): confirm these files really start with two non-data lines;
    # if they do not, two valid samples are being thrown away here.
    records = records[2:]
    return np.array(records).astype(dtype)


def getData(fileName):
    """Load one of the supported datasets as a 2-D NumPy array.

    The class label is stored in the last column of the returned array and
    the features in all preceding columns.

    Parameters
    ----------
    fileName : str
        Dataset selector:
        'Breast-Cancer' reads 'breast-cancer-wisconsin.data' (values cast to int),
        'Iris' loads scikit-learn's iris data restricted to its first two classes,
        'Bank' reads 'data_banknote_authentication.txt' (values cast to float).

    Returns
    -------
    numpy.ndarray
        Array of shape (samples, features + 1).

    Raises
    ------
    ValueError
        If ``fileName`` is not one of the names listed above (previously this
        surfaced as an UnboundLocalError on the return statement).
    """
    if fileName == 'Breast-Cancer':
        # Built-in int instead of the np.int alias, which newer NumPy no longer provides.
        dataset = _readRecords('breast-cancer-wisconsin.data', int)
    elif fileName == 'Iris':
        classCount = 2  # Only two classes required
        from sklearn import datasets
        data = datasets.load_iris()
        X = data['data']
        Y = data['target'].reshape(len(data['target']), 1)
        dataset = np.hstack((X, Y))

        # Keep the rows of the first `classCount` classes, grouped class by class.
        # Boolean masks are used instead of filter(), which only yields a list on Python 2.
        labels = dataset[:, -1]
        class_k = np.unique(labels)
        keep = np.concatenate([np.flatnonzero(labels == c) for c in class_k[:classCount]])
        dataset = dataset[keep]
    elif fileName == "Bank":
        dataset = _readRecords('data_banknote_authentication.txt', float)
    else:
        raise ValueError("Unknown dataset name: %r (expected 'Breast-Cancer', 'Iris' or 'Bank')"
                         % (fileName,))

    return dataset

###Normalize the input dataset

In [25]:
def normalize(data):
    """Standardize every feature column to zero mean and unit standard deviation.

    The last column is treated as the class label and is copied through
    unchanged (only cast to float). The input array is not modified.

    Parameters
    ----------
    data : array-like, shape (nrows, ncols)
        Features in columns 0 .. ncols-2, label in column ncols-1.

    Returns
    -------
    numpy.ndarray of float64, shape (nrows, ncols)
        A new array with standardized feature columns. A constant feature
        column (standard deviation 0) becomes all zeros instead of NaN.
    """
    normalized_data = np.array(data, dtype=np.float64)  # always a fresh copy

    # View onto the feature columns; in-place updates below write into normalized_data.
    features = normalized_data[:, :-1]
    col_mean = features.mean(axis=0)
    col_sd = features.std(axis=0)

    # Avoid 0/0 for constant columns: after centering they are already all zeros,
    # so dividing by 1 leaves them at zero.
    col_sd[col_sd == 0] = 1.0

    features -= col_mean
    features /= col_sd
    return normalized_data

###Calculate the Confusion Matrix

In [26]:
def getConfusionMatrix(actual, prediction, class_k):
    """Build the confusion matrix of a classification result.

    Parameters
    ----------
    actual : array-like
        True labels, one per sample; shape (m,) or (m, 1).
    prediction : array-like
        Predicted labels, same number of samples as ``actual``.
    class_k : sequence
        The class labels; entry ``i`` defines row/column ``i`` of the matrix.

    Returns
    -------
    numpy.ndarray of float, shape (k, k)
        ``cm[i, j]`` is the number of samples whose true label is
        ``class_k[i]`` and whose predicted label is ``class_k[j]``
        (rows = actual, columns = predicted).
    """
    # Flatten so that (m, 1) and (m,) inputs compare element-wise instead of broadcasting.
    actual = np.asarray(actual).ravel()
    prediction = np.asarray(prediction).ravel()
    k = len(class_k)

    # One membership mask per class, computed once rather than inside the double loop.
    actual_masks = [actual == label for label in class_k]
    predicted_masks = [prediction == label for label in class_k]

    cm = np.zeros((k, k))
    for i in range(k):
        for j in range(k):
            cm[i, j] = np.count_nonzero(actual_masks[i] & predicted_masks[j])
    return cm

###Calculate the performance metrics

In [27]:
def getAccuracy(matrix):
    """Return the fraction of samples on the diagonal of a confusion matrix."""
    correct = np.trace(matrix)
    total = np.sum(matrix)
    return correct/total

def getPrecision(matrix):
    """Per-class precision from a confusion matrix.

    The matrix is expected in the layout produced by getConfusionMatrix:
    rows are actual classes, columns are predicted classes. Precision of
    class ``i`` is therefore ``matrix[i, i]`` divided by the sum of column
    ``i`` (everything that was predicted as class ``i``). The previous
    implementation divided by the row sum, which is recall.

    Parameters
    ----------
    matrix : array-like, shape (k, k)

    Returns
    -------
    numpy.ndarray of float, shape (k,)
        Entry ``i`` is NaN when class ``i`` was never predicted (0/0).
    """
    matrix = np.asarray(matrix, dtype=float)
    true_positives = np.diag(matrix)
    predicted_positives = matrix.sum(axis=0)
    # 0/0 deliberately yields NaN (no exception, no warning) so callers can skip it.
    with np.errstate(divide='ignore', invalid='ignore'):
        return true_positives / predicted_positives

def getRecall(matrix):
    """Per-class recall from a confusion matrix.

    The matrix is expected in the layout produced by getConfusionMatrix:
    rows are actual classes, columns are predicted classes. Recall of class
    ``i`` is therefore ``matrix[i, i]`` divided by the sum of row ``i``
    (everything that truly belongs to class ``i``). The previous
    implementation divided by the column sum, which is precision.

    Parameters
    ----------
    matrix : array-like, shape (k, k)

    Returns
    -------
    numpy.ndarray of float, shape (k,)
        Entry ``i`` is NaN when no sample of class ``i`` is present (0/0).
    """
    matrix = np.asarray(matrix, dtype=float)
    true_positives = np.diag(matrix)
    actual_positives = matrix.sum(axis=1)
    # 0/0 deliberately yields NaN (no exception, no warning) so callers can skip it.
    with np.errstate(divide='ignore', invalid='ignore'):
        return true_positives / actual_positives

def getFMeasure(precision, recall):
    """Return the per-class F-measure, 2*P*R/(P+R), for paired precision/recall values."""
    n_classes = len(precision)
    scores = np.zeros(n_classes)
    for idx in range(n_classes):
        p = precision[idx]
        r = recall[idx]
        scores[idx] = 2* p*r/(p+r)
    return scores

###Perform Classification

In [28]:
def classify(g, class_k):
    """Turn scores into class labels using a 0.5 threshold.

    A score below 0.5 is mapped to class_k[0]; anything else is mapped to
    class_k[1]. The labels are returned as a float column vector of shape
    (len(g), 1).
    """
    labels = np.zeros((len(g), 1))
    for row, score in enumerate(g):
        labels[row] = class_k[0] if score <0.5 else class_k[1]
    return labels

###Calculate the Hypothesis value

In [29]:
def hypothesis(X, theta):
    """Logistic hypothesis: the sigmoid of the dot product of theta and X."""
    z = np.dot(theta.T, X)
    denominator = 1.0 + np.exp(-1*z)
    return 1/denominator

###Calculate the Indicator function output

In [30]:
def identity_func(label, actual_label):
    """Indicator function: 1 when the two labels are equal, otherwise 0."""
    return 1 if label == actual_label else 0

###Perform Training and calculate error

In [68]:
def train(data, train_idx, test_idx, class_k, degree, theta0, learning_rate, iterations):
    """Fit a logistic classifier on polynomial features and score it on a test split.

    The weights are learned with per-sample (stochastic) gradient descent on
    the rows selected by ``train_idx``; the rows selected by ``test_idx`` are
    then classified and compared with their true labels.

    Parameters
    ----------
    data : numpy.ndarray, shape (m, n)
        Features in columns 0 .. n-2, class label in column n-1.
    train_idx, test_idx : index arrays
        Row indices of the training and test samples.
    class_k : sequence of two labels
        ``class_k[1]`` is the class encoded as target 1, ``class_k[0]`` as target 0.
    degree : int
        Degree passed to PolynomialFeatures for the non-linear mapping.
    theta0 : float
        Initial value of every weight.
    learning_rate : float
        Step size of each gradient update.
    iterations : int
        Maximum number of passes over the training samples.

    Returns
    -------
    dict
        Keys 'ConfusionMatrix', 'precision', 'accuracy', 'recall' and
        'FMeasure', all computed on the test split.
    """
    n = data.shape[1]

    # ---- Training split ----
    training_data = data[train_idx]
    m_train = len(training_data)
    X_train = training_data[:, :n-1].reshape((m_train, n-1))
    Y_train = training_data[:, n-1].reshape((m_train, 1))

    # Non-linear mapping of the inputs
    poly = PolynomialFeatures(degree)
    Z_train = poly.fit_transform(X_train)

    # Visit the training samples in random order
    randomIdx = np.arange(m_train)
    random.shuffle(randomIdx)
    Z_train = Z_train[randomIdx]
    Y_train = Y_train[randomIdx]
    (m_train, nz) = Z_train.shape

    # Target is 1 for samples of class_k[1] and 0 otherwise
    indicator = (Y_train.ravel() == class_k[1]).astype(float)

    # Start from theta0 so theta is well defined even if no update is performed
    theta = np.zeros(nz)
    theta.fill(theta0)
    tolerance = 0.0000001

    for z in range(iterations):
        largest_step = 0.0
        for j in range(m_train):  # training samples
            h_theta = hypothesis(Z_train[j], theta)
            step = learning_rate * (h_theta - indicator[j]) * Z_train[j]
            theta = theta - step
            # Magnitude of the change: the signed difference used previously
            # treated any update that increased every weight as "converged".
            largest_step = max(largest_step, np.max(np.abs(step)))
        # Stop only after a complete pass in which no weight moved noticeably,
        # rather than cutting the current pass short and skipping its remaining samples.
        if largest_step <= tolerance:
            break

    # ---- Testing ----
    test_data = data[test_idx]
    m_test = len(test_data)
    X_test = test_data[:, :n-1].reshape((m_test, n-1))
    Y_test = test_data[:, n-1].reshape((m_test, 1))

    # Same non-linear mapping as for training (reuse the fitted transformer)
    Z_test = poly.transform(X_test)

    # Compute Hypothesis
    h_test = np.zeros(m_test)
    for i in range(m_test):
        h_test[i] = hypothesis(Z_test[i], theta)

    # Classification
    Y_test_hat = classify(h_test, class_k)

    confMatrix = getConfusionMatrix(Y_test, Y_test_hat, class_k)
    precision = getPrecision(confMatrix)
    accuracy = getAccuracy(confMatrix)
    recall = getRecall(confMatrix)
    FMeasure = getFMeasure(precision, recall)

    # Consolidating the results
    result = {}
    result['ConfusionMatrix'] = confMatrix
    result['precision'] = precision
    result['accuracy'] = accuracy
    result['recall'] = recall
    result['FMeasure'] = FMeasure

    return result

###Runs the algorithm

In [69]:
def run(fileName, K):
    """Evaluate the classifier with K-fold cross-validation and print a summary.

    Loads the dataset named by ``fileName``, normalizes it, trains and tests
    once per fold, then prints the summed confusion matrix, the mean accuracy
    and the NaN-ignoring mean precision/recall (plus the F-measure derived
    from those means) of the first class.
    """
    dataset = getData(fileName)
    label_col = dataset.shape[1] - 1
    class_k = np.unique(dataset[:, label_col])
    k = len(class_k)

    # NOTE(review): KFold is built without any shuffling argument; if the rows
    # are grouped by class, individual test folds may contain a single class - confirm.
    CV_idx = KFold(len(dataset), n_folds=K)

    normalized_dataset = normalize(dataset.reshape((len(dataset),len(dataset[0]))))

    # Training hyper-parameters
    theta0 = 0.01
    learning_rate = 0.2
    iterations = 1000
    degree = 3

    # One result dictionary per fold
    result = [train(normalized_dataset, train_idx, test_idx, class_k, degree, theta0, learning_rate, iterations)
              for train_idx, test_idx in CV_idx]

    # Aggregate the per-fold results
    conf_mat = np.zeros((k, k))
    fold_precision = []
    fold_recall = []
    accuracy = 0
    for fold in result:
        conf_mat = np.add(conf_mat, fold['ConfusionMatrix'])
        fold_precision.append(fold['precision'])
        fold_recall.append(fold['recall'])
        accuracy += fold['accuracy']

    # Folds in which a metric is undefined (NaN) are ignored by nanmean
    precision = np.nanmean(np.vstack(fold_precision), axis=0)
    recall = np.nanmean(np.vstack(fold_recall), axis=0)
    fMeasure = getFMeasure(precision, recall)
    accuracy/=K

    print("Confusion Matrix")
    print(conf_mat)
    print("Accuracy\t:\t" + str(accuracy))
    print("Precision\t:\t" + str(precision[0]))
    print("Recall\t\t:\t" + str(recall[0]))
    print("F-Measure\t:\t" + str(fMeasure[0]))
    
if __name__ == '__main__':
    # Dataset selector passed to getData: 'Iris', "Bank" or 'Breast-Cancer'.
    fileName = 'Iris'
    # Second argument is the number of cross-validation folds.
    run(fileName, 10)

Confusion Matrix
[[ 50.   0.]
 [  0.  50.]]
Accuracy	:	1.0
Precision	:	1.0
Recall		:	1.0
F-Measure	:	1.0
