In [25]:
from __future__ import division
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import scipy.stats

# Read the training and test CSV files (in that order) into two matrices.
traindata, testdata = (
    np.loadtxt(csv_path, delimiter=',')
    for csv_path in ('IDSWeedCropTrain.csv', 'IDSWeedCropTest.csv')
)

# The last column of each matrix is used as the label vector; every column
# before it is used as the feature matrix.
train_data, train_labels = traindata[:, :-1], traindata[:, -1]
test_data, test_labels = testdata[:, :-1], testdata[:, -1]

In [26]:
#Exercise 1
from sklearn.neighbors import KNeighborsClassifier

# Build a 1-nearest-neighbour classifier and fit it on the training split.
# fit() hands back the estimator itself, so construction and fitting are chained.
kNN = KNeighborsClassifier(n_neighbors=1).fit(train_data, train_labels)

# Predict labels for the test split first, then for the training split itself.
test_prediction, train_prediction = (
    kNN.predict(split) for split in (test_data, train_data)
)

#Calculating accuracy
def classification_error(prediction, labels):
    """Return the fraction of predictions that equal their labels.

    Despite its name, this function returns the *accuracy* (share of correct
    predictions), not the error rate. The name is kept so that existing
    callers keep working.

    Parameters
    ----------
    prediction : sequence
        Predicted labels.
    labels : sequence
        Reference labels; must have the same length as ``prediction``.

    Returns
    -------
    float or int
        Number of matching positions divided by the number of predictions.
        Returns 0 (after printing a message) when the two inputs differ in
        length, and 0.0 when both inputs are empty.
    """
    if len(prediction) != len(labels):
        print("Prediction and labels should have the same dimension.")
        return 0

    total = len(prediction)
    if total == 0:
        # Nothing to score; avoid a ZeroDivisionError on empty input.
        return 0.0

    # Pairwise comparison by position.
    correct = sum(1 for predicted, actual in zip(prediction, labels)
                  if predicted == actual)

    # float() forces true division even if the file-level
    # `from __future__ import division` is not in effect.
    return float(correct) / total

# Score the fitted model on the training split, report it, then do the same
# for the test split.
train_accuracy = classification_error(train_prediction, train_labels)
print("Train accuracy: " + str(train_accuracy))
test_accuracy = classification_error(test_prediction, test_labels)
print("Test accuracy: " + str(test_accuracy))

Train accuracy: 1.0
Test accuracy: 0.945993031358885


In [27]:
#Exercise 2
from sklearn.model_selection import KFold

def cross_validater(classifier, train_data, labels=None):
    """Estimate a classifier's accuracy with 5-fold cross-validation.

    The rows of ``train_data`` are split with ``KFold(n_splits=5)``. For each
    split the classifier is refitted on the training part and scored on the
    held-out part with ``classification_error`` (which returns an accuracy).

    Parameters
    ----------
    classifier : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is refitted in
        place, so after the call it is left fitted on the last split.
    train_data : array
        Feature matrix, indexable by the index arrays KFold produces.
    labels : array, optional
        Labels aligned row-by-row with ``train_data``. When omitted, the
        notebook-level ``train_labels`` is used, which matches the previous
        behaviour of this function.

    Returns
    -------
    float
        Mean of the per-fold accuracies.
    """
    if labels is None:
        # Backward-compatible default: fall back to the global training labels.
        labels = train_labels

    fold_accuracies = []
    for fit_idx, holdout_idx in KFold(n_splits=5).split(train_data):
        classifier.fit(train_data[fit_idx], labels[fit_idx])
        holdout_prediction = classifier.predict(train_data[holdout_idx])
        fold_accuracies.append(
            classification_error(holdout_prediction, labels[holdout_idx]))
    return np.mean(fold_accuracies)

def best_k(k_array, train_data):
    """Return the entry of ``k_array`` with the highest cross-validated score.

    Every candidate k gets a fresh ``KNeighborsClassifier`` that is scored by
    ``cross_validater`` on ``train_data``. If several candidates share the
    top score, the one appearing first in ``k_array`` is returned.
    """
    scores = [
        cross_validater(KNeighborsClassifier(n_neighbors=k), train_data)
        for k in k_array
    ]
    # Position of the first maximal score; max() keeps the earliest on ties.
    winner = max(range(len(scores)), key=scores.__getitem__)
    return k_array[winner]

# Cross-validate each candidate neighbour count on the training features
# and report the winner.
selected_k = best_k([1,3,5,7,9,11], train_data)
print("Best k is: k=" + str(selected_k))

Best k is: k=3


In [28]:
#Exercise 3
# Refit on the whole training split with k=3, the value printed by the
# cross-validation cell above (hard-coded here, not read from a variable).
kNN = KNeighborsClassifier(n_neighbors=3).fit(train_data, train_labels)

# Predict labels for the test split first, then for the training split.
test_prediction, train_prediction = (
    kNN.predict(split) for split in (test_data, train_data)
)

# Report accuracy on the training split, then on the test split.
train_accuracy = classification_error(train_prediction, train_labels)
print("New train accuracy: " + str(train_accuracy))
test_accuracy = classification_error(test_prediction, test_labels)
print("New test accuracy: " + str(test_accuracy))

New train accuracy: 0.971
New test accuracy: 0.9494773519163763


In [29]:
#Exercise 4
from sklearn import preprocessing
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so the test data never influences the scaling.
scaler = preprocessing.StandardScaler().fit(train_data)
train_dataN = scaler.transform(train_data)
test_dataN = scaler.transform(test_data)

# Mean and variance here are taken over every entry of the scaled training
# matrix at once (np.mean / np.var without an axis), not per feature.
print("The transformed dataset now has mean: " + str(np.mean(train_dataN)) + ", variance: " + str(np.var(train_dataN)))

# NOTE(review): the scaler was fitted on the full training split before the
# cross-validation below, so each fold's held-out rows contributed to the
# scaling statistics. Consider fitting the scaler inside each fold.
best_k_n = best_k([1,3,5,7,9,11], train_dataN)
print("Best k is: k=" + str(best_k_n))

#Training the model with our best_k hyperparameter
# Use the cross-validated value instead of a hard-coded 3, so the model
# always matches the k reported above.
kNN = KNeighborsClassifier(n_neighbors=best_k_n)
kNN.fit(train_dataN,train_labels)

#Predicting test and train data
test_prediction = kNN.predict(test_dataN)
train_prediction = kNN.predict(train_dataN)

print("New train accuracy: " + str(classification_error(train_prediction, train_labels)))
print("New test accuracy: " + str(classification_error(test_prediction, test_labels)))

The transformed dataset now has mean: -6.5588560224e-18, variance: 1.0
Best k is: k=3
New train accuracy: 0.972
New test accuracy: 0.9599303135888502
