In [None]:
# Load plotting libraries.
%pylab inline
import numpy
import sys
import math
from matplotlib import pylab as plt

# Parse train.csv a single time. Passing the path (instead of an open() handle
# that is never closed) lets numpy open and close the file itself.
# Column 0 becomes `label`; columns 1..784 become the rows of `matrix`.
_trainData = numpy.loadtxt("train.csv", delimiter=",", skiprows=1, usecols=range(0,785), ndmin=2)
matrix = _trainData[:, 1:]
label = _trainData[:, 0]

# Plot a single digit.
def displayDigit(row):
    """Show training sample `row` as a 28x28 grayscale image.

    Reads the module-level `matrix` and reshapes the selected row to 28x28
    before plotting it.
    """
    # Go through the explicitly imported `plt` module: the bare name `matshow`
    # only exists when `%pylab inline` has star-imported it into the namespace.
    plt.matshow(matrix[row].reshape(28,28), cmap='gray')
    return

# Plot a histogram for training data label.
def showLabelDistribution():
    """Draw a histogram of the module-level `label` values.

    Bin edges are the integers 0 through 10, so each bin is one unit wide.
    """
    binEdges = range(0, 11)
    plt.hist(label, bins=binEdges)
    # Axis text and title are independent of each other.
    plt.ylabel("Frequency")
    plt.xlabel("Value")
    plt.title("Training Data Label Histogram")

def classify(row):
    """Return the label of the training sample closest to sample `row`.

    Prints the true label of `row`, displays that sample, then finds the
    nearest *other* row of the module-level `matrix` by Euclidean distance,
    displays it and returns its entry from `label`. When several rows are
    equally close the one with the lowest index wins.
    """
    print(label[row])
    displayDigit(row)
    samples = numpy.asarray(matrix)
    # Distance from every training sample to the query sample in one call.
    dist = numpy.linalg.norm(samples - samples[row], axis=1)
    # Avoid matching to itself. Infinity can never be the minimum, whereas the
    # previous dist[0] + dist[1] placeholder collapsed to dist[1] for row 0 and
    # could let the sample win the tie against its true nearest neighbour.
    dist[row] = numpy.inf
    index = int(numpy.argmin(dist))
    displayDigit(index)
    return label[index]

def _withinClassDistances(samples):
    """Return the Euclidean distance of every unordered pair of rows in `samples`."""
    # Row i is compared with all earlier rows at once, which yields the same
    # pairs (and the same order) as a nested `for i / for j in range(i)` loop.
    parts = [numpy.linalg.norm(samples[:i] - samples[i], axis=1)
             for i in range(1, len(samples))]
    if not parts:
        return numpy.empty(0)
    return numpy.concatenate(parts)


def _betweenClassDistances(first, second):
    """Return the Euclidean distance from every row of `first` to every row of `second`."""
    parts = [numpy.linalg.norm(second - sample, axis=1) for sample in first]
    if not parts:
        return numpy.empty(0)
    return numpy.concatenate(parts)


def genuineAndImpostor(binWidth=15):
    """Plot genuine/impostor distance histograms for labels 0 and 1, then the ROC curve.

    Genuine distances are between two samples carrying the same label (0-0 and
    1-1 pairs); impostor distances are between a sample labelled 0 and one
    labelled 1. Samples come from the module-level `matrix` and `label`.

    Parameters:
        binWidth: width of the histogram bins, in distance units.

    Raises:
        ValueError: if the data cannot form at least one genuine and one
            impostor pair.
    """
    samples = numpy.asarray(matrix)
    labels = numpy.asarray(label)
    zeros = samples[labels == 0]
    ones = samples[labels == 1]

    genuine = numpy.concatenate((_withinClassDistances(zeros),
                                 _withinClassDistances(ones)))
    impostor = _betweenClassDistances(zeros, ones)
    if len(genuine) == 0 or len(impostor) == 0:
        raise ValueError("not enough samples labelled 0 and 1 to form genuine and impostor pairs")

    # Each sample contributes its share of 100%, so bar heights are percentages.
    xweights = 100 * numpy.ones_like(genuine) / len(genuine)
    yweights = 100 * numpy.ones_like(impostor) / len(impostor)

    fig, ax = plt.subplots()
    fig.set_size_inches(12, 7)
    # Span both populations and run one bin past the maximum, so no distance
    # falls outside the edges and silently disappears from the histogram.
    lowest = int(min(genuine.min(), impostor.min()))
    highest = int(max(genuine.max(), impostor.max()))
    binRange = range(lowest, highest + binWidth + 1, binWidth)
    ax.hist(genuine, weights=xweights, bins=binRange, color="lightblue", alpha=0.5)
    ax.hist(impostor, weights=yweights, bins=binRange, color="salmon", alpha=0.5)
    ax.set(title="Genuine:lightblue, Impostor:salmon", xlabel="Distance", ylabel="% of Dataset in Bin")
    plt.show()

    # Arrays (not lists) are handed over because rocCurve compares them
    # element-wise against a threshold.
    rocCurve(genuine, impostor)

def rocCurve(genuine, impostor, step=50000):
    """Plot a ROC curve from genuine and impostor distance scores.

    Every `step`-th sorted impostor distance is used as a threshold. For each
    threshold the false positive rate is the percentage of impostor distances
    strictly below it and the true positive rate is the percentage of genuine
    distances strictly below it. Thresholds where the truncated false negative
    and false positive percentages coincide are printed as "Equal Error".

    Parameters:
        genuine: sequence or array of same-class distances.
        impostor: sequence or array of different-class distances.
        step: stride through the sorted impostor distances when picking
            thresholds.

    Raises:
        ValueError: if either input is empty or `step` is smaller than 1.
    """
    print("ROC")
    if step < 1:
        raise ValueError("step must be at least 1, got %r" % (step,))
    # Sorted array copies: plain lists do not support `values < threshold`,
    # and copying leaves the caller's data in its original order.
    genuine = numpy.sort(numpy.asarray(genuine, dtype=float))
    impostor = numpy.sort(numpy.asarray(impostor, dtype=float))
    if len(genuine) == 0 or len(impostor) == 0:
        raise ValueError("rocCurve needs at least one genuine and one impostor distance")
    print(len(impostor))

    thresholds = impostor[::step]
    # In a sorted array, the left insertion point of t equals the number of
    # elements strictly smaller than t.
    axisX = (100.0 * numpy.searchsorted(impostor, thresholds, side="left") / float(len(impostor))).tolist()
    axisY = (100.0 * numpy.searchsorted(genuine, thresholds, side="left") / float(len(genuine))).tolist()

    for fpr, tpr in zip(axisX, axisY):
        fnr = 100 - tpr
        if (int(fnr) == int(fpr)):
            print("Equal Error: " + repr(fpr))
    print("loopend")
    plt.plot(axisX, axisY)
    plt.title("ROC Curve")
    plt.xlabel("False Positive")
    plt.ylabel("True Positive")
    plt.show()
    return

#genuineAndImpostor()
#showLabelDistribution()
#print(classify(28))

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy


ROC
19354288


In [8]:
import math
import operator
import numpy
import sys
from sklearn.metrics import confusion_matrix

def classify(trainingSet, label, testInstance, k):
    """Predict the label of `testInstance` by majority vote of its k nearest neighbours.

    Parameters:
        trainingSet: array or sequence of training samples.
        label: sequence with one label per training sample, in the same order.
        testInstance: the sample to classify, with the same shape as one
            training sample.
        k: number of nearest neighbours (Euclidean distance) that vote.

    Returns:
        The label with the most votes. On a tie, the tied label whose first
        vote came from the closest neighbour wins.

    Raises:
        ValueError: if `k` is smaller than 1 or larger than the training set.
    """
    samples = numpy.asarray(trainingSet, dtype=float)
    total = len(samples)
    if k < 1 or k > total:
        raise ValueError("k must be between 1 and the number of training samples (%d), got %r" % (total, k))

    # Flatten each sample so a single row-wise norm covers any sample shape.
    samples = samples.reshape(total, -1)
    query = numpy.asarray(testInstance, dtype=float).reshape(-1)
    distances = numpy.linalg.norm(samples - query, axis=1)

    # A stable sort keeps equally distant samples in index order.
    neighbors = numpy.argsort(distances, kind="stable")[:k]

    # Get votes.
    votes = {}
    for idx in neighbors:
        response = label[int(idx)]
        votes[response] = votes.get(response, 0) + 1
    # max() returns the first label to reach the highest count.
    return max(votes, key=votes.get)

def computeAccuracy(testLabel, predictions):
    """Return the percentage (0.0-100.0) of positions where the prediction equals the true label.

    Positions are compared index by index over the length of `testLabel`.
    """
    total = len(testLabel)
    hits = sum(1 for idx in range(total) if testLabel[idx] == predictions[idx])
    return (hits / float(total)) * 100.0

def getPredictions(trainingSet, trainingLabel, testSet, k):
    """Classify every sample of `testSet` and return the predictions as a list.

    Each sample is passed to `classify` together with the training data and
    `k`. The running index is printed every 500 samples as a progress marker.
    """
    predictions = []
    for position, sample in enumerate(testSet):
        if not position % 500:
            print(position)
        predictions.append(classify(trainingSet, trainingLabel, sample, k))
    return predictions
        
def crossValidation(trainingSet, trainingLabel, testSet, testLabel, k):
    """Predict `testSet` from the training data, print the accuracy and return it.

    The accuracy is the percentage produced by `computeAccuracy` for the
    predictions that `getPredictions` returns.
    """
    accuracy = computeAccuracy(
        testLabel,
        getPredictions(trainingSet, trainingLabel, testSet, k),
    )
    report = "Accuracy: " + repr(accuracy) + "%"
    print(report)
    return accuracy

def confusionMatrix(k, path="train.csv"):
    """Print and return the confusion matrix of a k-NN classifier on a 2/3-1/3 split.

    The first two thirds of the rows in `path` are used for training and the
    remaining rows for testing.

    Parameters:
        k: number of neighbours passed on to `getPredictions`.
        path: CSV file with a header row, the label in column 0 and the
            features in columns 1..784.

    Returns:
        Whatever `confusion_matrix(testLabel, predictions)` returns.
    """
    # One parse of the file, by path, so numpy opens and closes it itself
    # rather than leaving two open() handles behind.
    data = numpy.loadtxt(path, delimiter=",", skiprows=1, usecols=range(0,785), ndmin=2)
    totalLabel = data[:, 0]
    totalTrainingSet = data[:, 1:]

    split = int(len(totalTrainingSet)*2/3)
    trainingSet = totalTrainingSet[0:split]
    trainingLabel = totalLabel[0:split]
    testSet = totalTrainingSet[split:]
    testLabel = totalLabel[split:]
    print("Train set: " + repr(len(trainingSet)))
    print("Test set: " + repr(len(testSet)))
    predictions = getPredictions(trainingSet, trainingLabel, testSet, k)
    result = confusion_matrix(testLabel, predictions)
    print(result)
    return result
    
def threefold(k, path="train.csv"):
    """Run three-fold cross validation of the k-NN classifier and print the mean accuracy.

    The rows of `path` are cut into three consecutive thirds. Each third is
    held out once as the test set while the other two thirds form the training
    set. The folds run in this order: last third, first third, middle third.

    Parameters:
        k: number of neighbours passed on to `crossValidation`.
        path: CSV file with a header row, the label in column 0 and the
            features in columns 1..784.

    Returns:
        The mean of the three fold accuracies (also printed).

    Raises:
        ValueError: if the file has fewer than three rows.
    """
    # One parse of the file, by path, so numpy opens and closes it itself.
    data = numpy.loadtxt(path, delimiter=",", skiprows=1, usecols=range(0,785), ndmin=2)
    totalLabel = data[:, 0]
    totalTrainingSet = data[:, 1:]
    length = len(totalTrainingSet)
    if length < 3:
        raise ValueError("three-fold cross validation needs at least 3 rows, got %d" % length)

    oneThird = int(length/3)
    twoThirds = int(length*2/3)
    # (start, stop) row range held out as the test set of each fold.
    heldOut = [(twoThirds, length), (0, oneThird), (oneThird, twoThirds)]

    accuracies = []
    for start, stop in heldOut:
        # Training data is everything outside the held-out range. Features and
        # labels are cut with the same indices so they always stay aligned
        # (the middle fold used to append the first third of features twice).
        trainingSet = numpy.concatenate((totalTrainingSet[:start], totalTrainingSet[stop:]))
        trainingLabel = numpy.concatenate((totalLabel[:start], totalLabel[stop:]))
        testSet = totalTrainingSet[start:stop]
        testLabel = totalLabel[start:stop]
        print("Train set: " + repr(len(trainingSet)))
        print("Test set: " + repr(len(testSet)))
        accuracies.append(crossValidation(trainingSet, trainingLabel, testSet, testLabel, k))

    avg = sum(accuracies)/3
    print("Average: " + repr(avg))
    return avg

def kaggle(k, trainPath="train.csv", testPath="test.csv", outPath="kaggle.csv"):
    """Classify every row of the test file with k-NN and save the predictions.

    Parameters:
        k: number of neighbours passed on to `getPredictions`.
        trainPath: CSV with a header row, the label in column 0 and the
            features in columns 1..784.
        testPath: CSV with a header row and one unlabelled sample per row.
        outPath: file the predictions are written to with `numpy.savetxt`.
    """
    # Files are given by path so numpy opens and closes them itself; the
    # training file is parsed once and split into labels and features.
    data = numpy.loadtxt(trainPath, delimiter=",", skiprows=1, usecols=range(0,785), ndmin=2)
    label = data[:, 0]
    trainingSet = data[:, 1:]
    testSet = numpy.loadtxt(testPath, delimiter=",", skiprows=1, ndmin=2)

    # Honour the caller's k (it used to be overridden by a hard-coded 3).
    pred = getPredictions(trainingSet, label, testSet, k)
    # NOTE(review): savetxt writes one float per line without a header;
    # confirm this is the layout the consumer of the file expects.
    numpy.savetxt(outPath, pred, delimiter=",")

#threefold(3)
#confusionMatrix(3)
#kaggle(3)