##1.c. Logistic Regression for K-class discrimination

In [35]:
from pylab import *
%matplotlib inline
from sklearn.cross_validation import KFold
from sklearn.cross_validation import StratifiedKFold
from sklearn.preprocessing import *
from sklearn.linear_model import LinearRegression
from __future__ import division
import random
t=np.seterr(divide='ignore', invalid='ignore')

###Get the input dataset

In [84]:
def getData(fileName):
    if fileName == 'Breast-Cancer':
        with open('breast-cancer-wisconsin.data','r') as f:
            records=[]
            for line in f:
                records.append(line.split(','))
        records = records[2:]
        dataset = np.array(records).astype(np.int)
        dataset = normalize(dataset)
        
    elif fileName == 'Iris':
        from sklearn import datasets
        data = datasets.load_iris() 
        X = data['data']
        Y = data['target']
        Y = Y.reshape(len(data['target']), 1)
        dataset = np.hstack((X, Y))
        dataset = normalize(dataset)
        
    elif fileName=='mnist':
        from sklearn.datasets import fetch_mldata
        data = fetch_mldata('mnist-original', data_home='C:\\Users\\admin\\Anaconda\\CS584')
        randIdx = np.zeros(1000).astype(int) #Take only 1000 random rows
        for i in range(1000):
            randIdx[i] = random.randint(0, 69999)
            
        Y = data['target']
        X = data['data']/255. #Normalization
        Y = Y.reshape(len(Y), 1)
        dataset = np.hstack((X, Y))
        dataset = dataset[randIdx]
    return dataset

###Normalize the given dataset

In [73]:
def normalize(data):
    
    [nrows, ncols] = data.shape
    col_mean = np.zeros(ncols-1).reshape(ncols-1, 1)
    col_sd = np.zeros(ncols-1).reshape(ncols-1, 1)
    normalized_data = data.astype(float64)
    
    for i in range(ncols-1):
        col_mean[i] = np.mean(data[:,i])
        col_sd = np.std(data[:,i])
        normalized_data[:,i] = [(data[:,i][j] - col_mean[i])/col_sd for j in range(nrows)]
        
    return normalized_data

###Calculate the Confusion Matrix

In [38]:
def getConfusionMatrix(actual, prediction, class_k):
    k = len(class_k)
    m = len(actual)
    cm = np.zeros((k, k))
    for i in range(k):
        actual_i = filter(lambda x: actual[x] == class_k[i], range(m))
        for j in range(k):
            predicted_j = filter(lambda x: prediction[x] == class_k[j], range(m))
            cm[i,j] = len(np.intersect1d(actual_i, predicted_j))
    return cm 

###Calulate the Performance Metrics

In [39]:
def getAccuracy(matrix):
    return np.trace(matrix)/np.sum(matrix)

def getPrecision(matrix):
    k = len(matrix)
    precision = np.zeros(k)
    for i in range(k):
        actual_positives = np.sum(matrix[i])
        precision[i] = matrix[i,i]/actual_positives
    return precision

def getRecall(matrix):
    k = len(matrix)
    recall = np.zeros(k)
    for i in range(k):
        predicted_positives = np.sum(matrix[:,i])
        recall[i] = matrix[i,i]/predicted_positives
    return recall

def getFMeasure(precision, recall):
    k = len(precision)
    fm = np.zeros(k)
    for i in range(k):
        fm[i] = 2* precision[i]*recall[i]/(precision[i]+recall[i])
    return fm

###Perform Classification

In [40]:
def classify(g, class_k):
    (m, n) = g.shape
    estimate = np.zeros((m,1))
    
    for i in range(m):
            estimate[i] = class_k[np.argmax(g[i])]
            
    return estimate

###Calculate the Hyposthesis Value

In [41]:
def hypothesis(X, theta):
    import math
    scores = np.dot(theta.T, X)
    softMax = np.exp(scores)/np.sum(np.exp(scores))
    return softMax

###Calculate the Indicator function Output

In [42]:
def identity(label, actual_label):
    if label == actual_label:
        return 1
    return 0

###Train and calculate the error

In [74]:
def train(data, train_idx, test_idx, class_k, theta0, learning_rate, iterations):
    
    (m, n) = data.shape
    
    k = len(class_k)
    training_data = data[train_idx]
    m_train = len(training_data)
    X_train = training_data[:,:n-1].reshape((m_train,n-1))
    Y_train = training_data[:,n-1].reshape((m_train,1))
    X_ones = np.ones((m_train, 1))
    X_train = np.hstack((X_ones,X_train))
    
    randomIdx = np.arange(m_train)
    random.shuffle(randomIdx)
    X_train = X_train[randomIdx]
    Y_train = Y_train[randomIdx]
    theta_old = np.zeros((n,k))
    theta_old.fill(theta0)
    theta = np.zeros((n,k))
    indicator = np.zeros((m_train, k))
    eps = np.ones(class_k.shape)
    
    for i in range(k):
        indicator[:,i] = [identity(class_k[i],x) for x in Y_train]
    
    for z in range(iterations):
        for j in range(m_train): #training samples
            h_theta = hypothesis(X_train[j],theta_old)
            for i in range(k):                  
                gradient = learning_rate * (h_theta[i] - indicator[j,i]) * X_train[j].T
                theta[:,i] = theta_old[:,i] - gradient
                eps[i] = np.max(theta_old[:,i] - theta[:,i])
                theta_old[:,i] = theta[:,i]
                if eps[i] <= 0.000001:
                    break
    
    #Compute Hypothesis
    h_train = np.zeros((m_train,k))
    for i in range(m_train):
        h_train[i] = hypothesis(X_train[i], theta)
    
    #Classification
    Y_train_hat = classify(h_train, class_k)

    #Testing
    test_data = data[test_idx]
    m_test = len(test_data)
    X_test = test_data[:,:n-1].reshape((m_test,n-1))
    Y_test = test_data[:,n-1].reshape((m_test,1))
    X_ones = np.ones((m_test, 1))
    X_test = np.hstack((X_ones,X_test))
    result = {}
    
    #Compute Hypothesis
    h_test = np.zeros((m_test,k))
    for i in range(m_test):
        h_test[i] = np.dot(theta.T, X_test[i])

    #Classification
    Y_test_hat = classify(h_test, class_k)
    
    confMatrix = getConfusionMatrix(Y_test, Y_test_hat, class_k) 
    precision = getPrecision(confMatrix)
    accuracy = getAccuracy(confMatrix)
    recall = getRecall(confMatrix)
    FMeasure = getFMeasure(precision, recall)
    
    #Consolidating the results
    result['ConfusionMatrix'] = confMatrix
    result['precision'] = precision
    result['accuracy'] = accuracy
    result['recall'] = recall
    result['FMeasure'] = FMeasure
    
    return result

###Run the algorithm

In [85]:
def run(fileName, K):
    dataset = getData(fileName)

    (rows, cols) = dataset.shape
    class_k = np.unique(dataset[:,cols-1])
    k = len(class_k)
    
    CV_idx = KFold(len(dataset), n_folds=K)
    i = 0
    result = []
    
    theta0 = 0.01
    learning_rate = 0.1
    iterations = 1000
    
    for train_idx, test_idx in CV_idx:
        result.append(train(dataset, train_idx, test_idx, class_k, theta0, learning_rate, iterations))
        i+=1
    
    
    conf_mat = np.zeros((k, k))
    precision = np.empty((1,k))
    recall = np.empty((1,k))
    accuracy=0
    
    for j in range(i):
        conf_mat = np.add(conf_mat, result[j]['ConfusionMatrix'])
        precision = np.vstack((precision, result[j]['precision']))
        recall = np.vstack((recall, result[j]['recall']))
        accuracy += result[j]['accuracy']
    
    precision = np.nanmean(precision[1:,:], axis=0)
    recall = np.nanmean(recall[1:,:], axis=0)
    fMeasure = getFMeasure(precision, recall)
    accuracy/=K
    
    print "Confusion Matrix"
    print conf_mat
    print "Accuracy\t:\t", accuracy
    print "Precision\t:\t", precision
    print "Recall\t\t:\t", recall
    print "F-Measure\t:\t", fMeasure
    
if __name__ == '__main__':
    #fileName='Iris'
    #fileName='Breast-Cancer'
    fileName = 'mnist'
    run(fileName,10)

Confusion Matrix
[[ 99.   0.   0.   0.   0.   5.   1.   0.   1.   0.]
 [  0.  97.   2.   1.   0.   0.   1.   0.  25.   0.]
 [  2.   0.  67.   2.   1.   3.   9.   0.  10.   1.]
 [  0.   0.   2.  66.   0.  14.   0.   0.  14.   1.]
 [  1.   0.   0.   1.  72.  10.   0.   0.   3.  16.]
 [  4.   0.   2.   6.   1.  76.   4.   0.   7.   1.]
 [  2.   0.   3.   0.   1.   5.  70.   0.   2.   0.]
 [  2.   1.   1.   1.   1.   9.   0.  53.   4.  10.]
 [  1.   1.   3.   1.   1.   9.   0.   0.  85.   0.]
 [  1.   0.   4.   3.  17.   6.   0.   4.   6.  65.]]
Accuracy	:	0.75
Precision	:	[ 0.93457209  0.79722976  0.699993    0.70571678  0.68974525  0.75808081
  0.85809969  0.65025974  0.83571429  0.63283951]
Recall		:	[ 0.88143107  0.97888889  0.80822039  0.8165898   0.78837302  0.56304813
  0.835       0.9520202   0.54433582  0.76392136]
F-Measure	:	[ 0.90722406  0.8787694   0.75022357  0.75711573  0.73576866  0.64616854
  0.84639227  0.77272441  0.65926479  0.69222961]
