In [2]:
import pandas as pd
import numpy as np
import operator
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
import csv

In [2]:
def load_train_data(path='data/train.csv'):
    """Load the labelled training CSV and binarise its feature columns.

    The first CSV column is returned as the labels; every remaining column is
    treated as a feature and reduced to 0/1 (any non-zero value becomes 1).

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to ``'data/train.csv'``.

    Returns
    -------
    data : numpy.ndarray, shape (n_rows, n_columns - 1)
        Binarised feature matrix, same dtype as the parsed columns.
    labels : numpy.ndarray, shape (n_rows, 1)
        First column of the file, kept two-dimensional.
    """
    raw_data = pd.read_csv(path)
    features = np.array(raw_data.iloc[:, 1:])
    # Build the 0/1 matrix as a new array instead of overwriting in place.
    data = (features != 0).astype(features.dtype)
    labels = np.array(raw_data.iloc[:, :1])
    return data, labels

def load_test_data(path='data/test.csv', binarize=True):
    """Load the unlabelled test CSV as a feature matrix.

    Every column of the file is treated as a feature. By default the values
    are reduced to 0/1 (any non-zero value becomes 1), which is the same
    encoding ``load_train_data`` applies to the training features; without it
    the distances computed in ``predict`` would compare raw values against
    0/1 values.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to ``'data/test.csv'``.
    binarize : bool, optional
        When true (default) map every non-zero entry to 1. Pass ``False`` to
        get the values exactly as parsed from the file.

    Returns
    -------
    numpy.ndarray, shape (n_rows, n_columns)
        The feature matrix.
    """
    raw_data = pd.read_csv(path)
    data = np.array(raw_data)
    if binarize:
        data = (data != 0).astype(data.dtype)
    return data

def predict(in_x, train_data, train_labels, k):
    """Classify one sample by majority vote of its k nearest training rows.

    Distance is Euclidean. Rows at equal distance are ranked by their position
    in ``train_data`` (a stable sort is used). If several labels share the top
    vote count, the one that comes first when iterating the vote dictionary
    is returned.

    Parameters
    ----------
    in_x : array-like, shape (n_features,) or (1, n_features)
        The sample to classify.
    train_data : array-like, shape (n_samples, n_features)
        Training feature matrix.
    train_labels : array-like, shape (n_samples,) or (n_samples, 1)
        Label of each training row.
    k : int
        Number of neighbours that vote. Values larger than ``n_samples`` are
        clamped to ``n_samples``.

    Returns
    -------
    The winning label, as stored in ``train_labels``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1, the training set is empty, or the number
        of labels does not match the number of training rows.
    """
    # Work in float so unsigned integer inputs cannot wrap on subtraction.
    samples = np.atleast_2d(np.asarray(train_data, dtype=float))
    query = np.asarray(in_x, dtype=float).reshape(1, -1)
    labels = np.asarray(train_labels).reshape(-1)

    n_samples = samples.shape[0]
    if n_samples == 0:
        raise ValueError("train_data must contain at least one row")
    if labels.shape[0] != n_samples:
        raise ValueError("train_labels must have one entry per training row")
    if k < 1:
        raise ValueError("k must be at least 1")
    k = min(k, n_samples)

    # Broadcasting subtracts the query from every training row.
    distances = np.sqrt(((samples - query) ** 2).sum(axis=1))
    # 'mergesort' is numpy's stable kind, so equal distances keep row order.
    nearest = distances.argsort(kind='mergesort')[:k]

    votes = {}
    for row in nearest:
        label = labels[row]
        votes[label] = votes.get(label, 0) + 1
    # .items() exists on both Python 2 and 3, unlike .iteritems().
    return max(votes.items(), key=operator.itemgetter(1))[0]

def write_result(result, path='result.csv'):
    """Write predictions to a two-column CSV file.

    The file starts with the header ``ImageId,Label`` followed by one row per
    prediction, where ``ImageId`` counts up from 1 in the order of ``result``.

    Parameters
    ----------
    result : sequence
        Predicted labels, in output order.
    path : str, optional
        Destination file. Defaults to ``'result.csv'``; an existing file is
        overwritten.
    """
    import sys

    # The csv module wants a binary handle on Python 2 but a text handle
    # opened with newline='' on Python 3; using 'wb' on Python 3 makes
    # writerow() raise TypeError.
    if sys.version_info[0] >= 3:
        outcsv = open(path, 'w', newline='')
    else:
        outcsv = open(path, 'wb')
    with outcsv:
        writer = csv.writer(outcsv)
        writer.writerow(['ImageId', 'Label'])
        for image_id, label in enumerate(result, 1):
            writer.writerow([image_id, label])

In [12]:
# Shuffled k-fold cross-validation of the k-NN classifier on the training set.
# Prints the accuracy of each fold, the mean accuracy, and the confusion
# matrix accumulated over all folds.
N_SPLITS = 3        # number of folds
K_NEIGHBORS = 10    # neighbours that vote in each prediction
CV_SEED = 0         # fixes the shuffle so reruns produce the same folds

# Reuse the loader rather than repeating its read-and-binarise steps here.
data, labels = load_train_data()
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=CV_SEED)
fold_scores = []
y_pred = []
y_true = []
for train_index, test_index in kf.split(data):
    # Index arrays pick all rows of a fold in one step. Stacking the rows one
    # at a time re-copies the growing array on every iteration.
    print("Generating train_data and train_labels")
    train_data = data[train_index]
    train_labels = labels[train_index]
    print("Generating train_data and train_labels done!")
    print("Generating test_data and test_labels")
    test_data = data[test_index]
    test_labels = labels[test_index]
    print("Generating test_data and test_labels done!")
    correct = 0
    print("Start CrossValidation")
    for i in range(len(test_data)):
        prediction = predict(test_data[i], train_data, train_labels, K_NEIGHBORS)
        truth = test_labels[i][0]
        y_pred.append(prediction)
        y_true.append(truth)
        if prediction == truth:
            correct += 1
    # Multiply by 1.0 so the division is not truncated on Python 2.
    score = correct * 1.0 / len(test_data)
    print("CrossValidation done!")
    print("The score of this epoch is: %s" % score)
    fold_scores.append(score)
total_score = sum(fold_scores)
# Average over the folds actually run instead of a hard-coded count.
print("Final score is:  %s" % (total_score / len(fold_scores)))
print("Confusion_matrix is:")
print(confusion_matrix(y_true, y_pred))

Generating train_data and train_labels
Generating train_data and train_labels done!
Generating test_data and test_labels
Generating test_data and test_labels done!
Start CrossValidation
CrossValidation done!
The score of this epoch is: 0.954571428571
Generating train_data and train_labels
Generating train_data and train_labels done!
Generating test_data and test_labels
Generating test_data and test_labels done!
Start CrossValidation
CrossValidation done!
The score of this epoch is: 0.956785714286
Generating train_data and train_labels
Generating train_data and train_labels done!
Generating test_data and test_labels
Generating test_data and test_labels done!
Start CrossValidation
CrossValidation done!
The score of this epoch is: 0.956857142857
Final score is:  0.956071428571
Confusion_matrix is:
[[4094    7    3    1    0    4   14    0    7    2]
 [   0 4646   10    4    4    0    5    7    2    6]
 [  38   52 3977   17    3    0    4   47   33    6]
 [  12   39   26 4078    0   30    

In [None]:
# Classify every test row against the full training set (10 neighbours per
# prediction) and write the predictions to the result file.
train_data, train_labels = load_train_data()
test_data = load_test_data()
result = [
    predict(sample, train_data, train_labels, 10)
    for sample in test_data
]
write_result(result)