# "Solving the Problem of the K Parameter in the KNN Classifier Using an Ensemble Learning Approach"

# Ideea principala a acestui articol este utilizarea algoritmului KNN fara a specifica parametrul k in mod empiric.


# Metoda propusa in acest articol a fost asamblarea clasificatoarelor KNN cu k=1, 3, 5, 7 ... n (unde n reprezinta radacina patrata a dimensiunii setului de date) intr-un singur clasificator care va clasifica in urma deciziei majoritare 

# Pasul 1: importam librariile necesare

In [3]:
#import subprocess
import pandas as pd
import numpy as np

from numpy import mean
from numpy import std
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score
from sklearn.utils import shuffle
from matplotlib import pyplot

from sklearn.ensemble import VotingClassifier
import math


#  Pasul 2: definim metoda de instantiere a clasificatorului asamblat

In [4]:
# Build a majority-vote ensemble of k-NN classifiers
def get_voting(n):
    """Return a hard-voting ensemble of k-NN classifiers with odd k.

    One ``KNeighborsClassifier`` is created for k = 1, 3, 5, ... and the
    sequence stops at the first odd k that is greater than or equal to
    ``n`` (so an even ``n`` yields a largest k of ``n + 1``). Each
    estimator is registered under the name ``"<k>-NN"``.

    Args:
        n: Upper bound for k. The cells in this file pass an odd integer.

    Returns:
        An unfitted ``VotingClassifier`` built with ``voting='hard'``.
    """
    estimators = []
    k = 1
    # "k - 2 < n" keeps the original rule: advance by two for as long as
    # the previously used k is still below n.
    while k - 2 < n:
        estimators.append((str(k) + "-NN", KNeighborsClassifier(n_neighbors=k)))
        k += 2
    return VotingClassifier(estimators=estimators, voting='hard')

#  Pasul 3: vom crea o lista cu clasificatorii care vor fi evaluati, aceasta lista contine clasificatorii 1NN, 3NN, 5NN.... nNN (unde n reprezinta radacina patrata a dimensiunii setului de date), si clasificatorul care asambleaza toti clasificatorii mentionati anterior

In [5]:
# Build the dictionary of models to evaluate
def get_models(n):
    """Return the individual k-NN models plus their voting ensemble.

    The dictionary holds one ``KNeighborsClassifier`` for k = 1, 3, 5, ...
    (stopping at the first odd k that is greater than or equal to ``n``)
    and, under the key ``'ensemble'``, the classifier returned by
    ``get_voting(n)``.

    Keys have the form ``"<k>-NN"`` with k right-aligned to three
    characters, so that sorting the keys as strings lists the models in
    increasing k (for k below 1000).

    Args:
        n: Upper bound for k. The cells in this file pass an odd integer.

    Returns:
        dict mapping model name to an unfitted classifier.
    """
    models = dict()
    k = 1
    # Same stopping rule as get_voting: advance by two while the
    # previously used k is still below n.
    while k - 2 < n:
        # rjust(3) pads every k uniformly instead of special-casing ranges.
        models[str(k).rjust(3) + "-NN"] = KNeighborsClassifier(n_neighbors=k)
        k += 2

    models['ensemble'] = get_voting(n)
    return models

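# Pasul 4: vom crea o metoda care va evalua fiecare model individual, metrica de interes fiind acuratetea. Evaluarea se face prin validare incrucisata stratificata cu 5 fold-uri (la fiecare pas 80% din date sunt folosite pentru antrenare si 20% pentru testare), nu printr-o singura impartire in 70% date de antrenare si 30% date de testare cum a specificat autorul documentului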

In [6]:
# Evaluate a given model using cross-validation
def evaluate_model(model):
    """Return the per-fold accuracy scores of ``model``.

    The model is scored with 5-fold stratified cross-validation (a single
    repeat, ``random_state=1``) on the module-level ``X`` and ``y``, which
    must be assigned before this function is called.

    Args:
        model: Estimator to pass to ``cross_val_score``.

    Returns:
        The array of accuracy scores returned by ``cross_val_score``.
    """
    splitter = RepeatedStratifiedKFold(n_repeats=1, n_splits=5, random_state=1)
    return cross_val_score(
        model, X, y, cv=splitter, scoring='accuracy', n_jobs=-1
    )

# Un exemplu propus de autor foloseste setul de date QSAR.csv care contine 43 de feature-uri, din care primele 42 sunt date de intrare, iar al 43-lea feature reprezinta clasa din care face parte obiectul interogat. 
# Dimensiunea setului de date este de 1055 de unde tragem concluzia ca vom utiliza clasificatorii 1NN, 3NN, 5NN, 7NN, 9NN, 11NN, 13NN, 15NN, 17NN, 19NN, 21NN, 23NN, 25NN, 27NN, 29NN, 31NN(deoarece 31 este cel mai apropiat numar impar de radical(1055)) in cadrul clasificatorului asamblat. 

In [7]:
input_file = "QSAR .csv"

# Load the dataset; the first CSV row holds the column names.
data = pd.read_csv(input_file, header=0)

# F43 is the class label; every other column is an input feature.
feature_columns = data.columns.drop('F43')
X = data[feature_columns]
y = data['F43']

# Largest k: the integer part of sqrt(1055), lowered by one when it is even
# so that the final k is odd.
n = int(math.sqrt(1055))
if n % 2 == 0:
    n -= 1

models = get_models(n)

# Datorita unui bug modelele sunt analizate intr-o ordine aleatoare, motiv pentru care voi introduce o sortare alfabetica a numelor clasificatorilor care va ordona indirect si lista performantelor obtinute


In [8]:
# Evaluate the models and store the results (in dictionary order, unsorted)
results, names = list(), list()

for name, model in models.items():
    results.append(evaluate_model(model))
    names.append(name)

# Print the mean cross-validation accuracy of every model.
for name, scores in zip(names, results):
    print('%s %.4f ' % (name, mean(scores)))

  1-NN 0.7981 
  3-NN 0.8190 
  5-NN 0.8038 
  7-NN 0.8057 
  9-NN 0.8095 
 11-NN 0.8076 
 13-NN 0.8009 
 15-NN 0.7943 
 17-NN 0.7886 
 19-NN 0.7839 
 21-NN 0.7754 
 23-NN 0.7735 
 25-NN 0.7773 
 27-NN 0.7744 
 29-NN 0.7687 
 31-NN 0.7640 
ensemble 0.7972 


In [9]:
# Evaluate the models and store the results (sorted by model name)
scored = [(name, evaluate_model(model)) for name, model in models.items()]

# Sort on the name only, so the score arrays are never compared.
names, results = zip(*sorted(scored, key=lambda pair: pair[0]))

# Print the mean cross-validation accuracy of every model.
for name, scores in zip(names, results):
    print('%s %.4f ' % (name, mean(scores)))

  1-NN 0.7981 
  3-NN 0.8190 
  5-NN 0.8038 
  7-NN 0.8057 
  9-NN 0.8095 
 11-NN 0.8076 
 13-NN 0.8009 
 15-NN 0.7943 
 17-NN 0.7886 
 19-NN 0.7839 
 21-NN 0.7754 
 23-NN 0.7735 
 25-NN 0.7773 
 27-NN 0.7744 
 29-NN 0.7687 
 31-NN 0.7640 
ensemble 0.7972 


# Australian data set contine 690 randuri de date, 15 feature-uri, feature-ul pe care il vom clasifica este F15 care are 2 posibile clase 

In [10]:
print('Evaluate Australian dataset')
input_file = "australian.csv"

# Load the dataset; the first CSV row holds the column names.
data = pd.read_csv(input_file, header=0)

# F15 is the class label; every other column is an input feature.
X, y = data[data.columns.drop('F15')], data['F15']

# Largest k: the integer part of sqrt(690), lowered by one when it is even.
# NOTE(review): the dataset size is hard-coded; confirm it matches the file.
n = int(math.sqrt(690))
if n % 2 == 0:
    n = n - 1

models = get_models(n)

# Cross-validate every model, then order the (name, scores) pairs by name.
# Sorting on the name only means the score arrays are never compared.
scored = [(name, evaluate_model(model)) for name, model in models.items()]
names, results = zip(*sorted(scored, key=lambda pair: pair[0]))

# Report each mean accuracy and keep the first model that reaches the maximum.
bestName = "1NN"
bestAccuracy = 0
for name, scores in zip(names, results):
    accuracy = mean(scores)
    print('%s %.4f ' % (name, accuracy))
    if accuracy > bestAccuracy:
        bestName = name
        bestAccuracy = accuracy
print('Best accuracy :%s with accuracy %.4f ' % (bestName, bestAccuracy))

Evaluate Australian dataset
  1-NN 0.6681 
  3-NN 0.6855 
  5-NN 0.6841 
  7-NN 0.6928 
  9-NN 0.6884 
 11-NN 0.7000 
 13-NN 0.6957 
 15-NN 0.6725 
 17-NN 0.6884 
 19-NN 0.6913 
 21-NN 0.6957 
 23-NN 0.6928 
 25-NN 0.6899 
ensemble 0.6928 
Best accuracy : 11-NN with accuracy 0.7000 


# Balance data set contine 625 randuri de date, 4 feature-uri, feature-ul pe care il vom clasifica este F1 care are 3 posibile clase 

In [11]:
print('Evaluate Balance dataset')
input_file = "balance.csv"

# Load the dataset; the first CSV row holds the column names.
data = pd.read_csv(input_file, header=0)

# F1 is the class label; every other column is an input feature.
X, y = data[data.columns.drop('F1')], data['F1']

# Largest k: the integer part of sqrt(625), lowered by one when it is even.
# NOTE(review): the dataset size is hard-coded; confirm it matches the file.
n = int(math.sqrt(625))
if n % 2 == 0:
    n = n - 1

models = get_models(n)

# Cross-validate every model, then order the (name, scores) pairs by name.
# Sorting on the name only means the score arrays are never compared.
scored = [(name, evaluate_model(model)) for name, model in models.items()]
names, results = zip(*sorted(scored, key=lambda pair: pair[0]))

# Report each mean accuracy and keep the first model that reaches the maximum.
bestName = "1NN"
bestAccuracy = 0
for name, scores in zip(names, results):
    accuracy = mean(scores)
    print('%s %.4f ' % (name, accuracy))
    if accuracy > bestAccuracy:
        bestName = name
        bestAccuracy = accuracy
print('Best accuracy :%s with accuracy %.4f ' % (bestName, bestAccuracy))

Evaluate Balance dataset
  1-NN 0.7888 
  3-NN 0.7792 
  5-NN 0.8176 
  7-NN 0.8592 
  9-NN 0.8688 
 11-NN 0.8768 
 13-NN 0.8736 
 15-NN 0.8864 
 17-NN 0.8912 
 19-NN 0.8944 
 21-NN 0.8944 
 23-NN 0.8928 
 25-NN 0.8880 
ensemble 0.8896 
Best accuracy : 19-NN with accuracy 0.8944 


# Banknote data set contine 1372 randuri de date, 5 feature-uri, feature-ul pe care il vom clasifica este F5 care are 2 posibile clase 

In [12]:
print('Evaluate Banknote dataset')
input_file = "banknote.csv"

# Load the dataset; the first CSV row holds the column names.
data = pd.read_csv(input_file, header=0)

# F5 is the class label; every other column is an input feature.
X, y = data[data.columns.drop('F5')], data['F5']

# Largest k: the integer part of sqrt(1372), lowered by one when it is even.
# NOTE(review): the dataset size is hard-coded; confirm it matches the file.
n = int(math.sqrt(1372))
if n % 2 == 0:
    n = n - 1

models = get_models(n)

# Cross-validate every model, then order the (name, scores) pairs by name.
# Sorting on the name only means the score arrays are never compared.
scored = [(name, evaluate_model(model)) for name, model in models.items()]
names, results = zip(*sorted(scored, key=lambda pair: pair[0]))

# Report each mean accuracy and keep the first model that reaches the maximum.
bestName = "1NN"
bestAccuracy = 0
for name, scores in zip(names, results):
    accuracy = mean(scores)
    print('%s %.4f ' % (name, accuracy))
    if accuracy > bestAccuracy:
        bestName = name
        bestAccuracy = accuracy
print('Best accuracy :%s with accuracy %.4f ' % (bestName, bestAccuracy))

Evaluate Banknote dataset
  1-NN 0.9993 
  3-NN 0.9993 
  5-NN 1.0000 
  7-NN 1.0000 
  9-NN 1.0000 
 11-NN 1.0000 
 13-NN 1.0000 
 15-NN 0.9978 
 17-NN 0.9942 
 19-NN 0.9934 
 21-NN 0.9927 
 23-NN 0.9927 
 25-NN 0.9927 
 27-NN 0.9927 
 29-NN 0.9927 
 31-NN 0.9927 
 33-NN 0.9927 
 35-NN 0.9898 
 37-NN 0.9891 
ensemble 0.9934 
Best accuracy :  5-NN with accuracy 1.0000 


# Haberman data set contine 306 randuri de date, 4 feature-uri, feature-ul pe care il vom clasifica este F4 care are 2 posibile clase

In [13]:
print('Evaluate Haberman dataset')
input_file = "haberman.csv"

# Load the dataset; the first CSV row holds the column names.
data = pd.read_csv(input_file, header=0)

# F4 is the class label; every other column is an input feature.
X, y = data[data.columns.drop('F4')], data['F4']

# Largest k: the integer part of sqrt(306), lowered by one when it is even.
# NOTE(review): the dataset size is hard-coded; confirm it matches the file.
n = int(math.sqrt(306))
if n % 2 == 0:
    n = n - 1

models = get_models(n)

# Cross-validate every model, then order the (name, scores) pairs by name.
# Sorting on the name only means the score arrays are never compared.
scored = [(name, evaluate_model(model)) for name, model in models.items()]
names, results = zip(*sorted(scored, key=lambda pair: pair[0]))

# Report each mean accuracy and keep the first model that reaches the maximum.
bestName = "1NN"
bestAccuracy = 0
for name, scores in zip(names, results):
    accuracy = mean(scores)
    print('%s %.4f ' % (name, accuracy))
    if accuracy > bestAccuracy:
        bestName = name
        bestAccuracy = accuracy
print('Best accuracy :%s with accuracy %.4f ' % (bestName, bestAccuracy))

Evaluate Haberman dataset
  1-NN 0.6636 
  3-NN 0.6960 
  5-NN 0.7024 
  7-NN 0.7319 
  9-NN 0.7614 
 11-NN 0.7616 
 13-NN 0.7681 
 15-NN 0.7452 
 17-NN 0.7452 
ensemble 0.7680 
Best accuracy : 13-NN with accuracy 0.7681 


# Heart data set contine 271 randuri de date, 14 feature-uri, feature-ul pe care il vom clasifica este F14 care are 2 posibile clase

In [14]:
print('Evaluate Heart dataset')
input_file = "heart.csv"

# Load the dataset; the first CSV row holds the column names.
data = pd.read_csv(input_file, header=0)

# F14 is the class label; every other column is an input feature.
X, y = data[data.columns.drop('F14')], data['F14']

# Largest k: the integer part of sqrt(271), lowered by one when it is even.
# NOTE(review): the dataset size is hard-coded; confirm it matches the file.
n = int(math.sqrt(271))
if n % 2 == 0:
    n = n - 1

models = get_models(n)

# Cross-validate every model, then order the (name, scores) pairs by name.
# Sorting on the name only means the score arrays are never compared.
scored = [(name, evaluate_model(model)) for name, model in models.items()]
names, results = zip(*sorted(scored, key=lambda pair: pair[0]))

# Report each mean accuracy and keep the first model that reaches the maximum.
bestName = "1NN"
bestAccuracy = 0
for name, scores in zip(names, results):
    accuracy = mean(scores)
    print('%s %.4f ' % (name, accuracy))
    if accuracy > bestAccuracy:
        bestName = name
        bestAccuracy = accuracy
print('Best accuracy :%s with accuracy %.4f ' % (bestName, bestAccuracy))

Evaluate Heart dataset
  1-NN 0.6148 
  3-NN 0.6519 
  5-NN 0.6481 
  7-NN 0.6704 
  9-NN 0.6778 
 11-NN 0.6704 
 13-NN 0.6704 
 15-NN 0.6741 
ensemble 0.6704 
Best accuracy :  9-NN with accuracy 0.6778 


# Ionosphere data set contine 351 randuri de date, 35 feature-uri, feature-ul pe care il vom clasifica este F35 care are 2 posibile clase

In [16]:
print('Evaluate Ionosphere dataset')
input_file = "ionosphere.csv"

# Load the dataset; the first CSV row holds the column names.
data = pd.read_csv(input_file, header=0)

# F35 is the class label; every other column is an input feature.
X, y = data[data.columns.drop('F35')], data['F35']

# Largest k: the integer part of sqrt(351), lowered by one when it is even.
# NOTE(review): the dataset size is hard-coded; confirm it matches the file.
n = int(math.sqrt(351))
if n % 2 == 0:
    n = n - 1

models = get_models(n)

# Cross-validate every model, then order the (name, scores) pairs by name.
# Sorting on the name only means the score arrays are never compared.
scored = [(name, evaluate_model(model)) for name, model in models.items()]
names, results = zip(*sorted(scored, key=lambda pair: pair[0]))

# Report each mean accuracy and keep the first model that reaches the maximum.
bestName = "1NN"
bestAccuracy = 0
for name, scores in zip(names, results):
    accuracy = mean(scores)
    print('%s %.4f ' % (name, accuracy))
    if accuracy > bestAccuracy:
        bestName = name
        bestAccuracy = accuracy
print('Best accuracy :%s with accuracy %.4f ' % (bestName, bestAccuracy))

Evaluate Ionosphere dataset
  1-NN 0.8657 
  3-NN 0.8457 
  5-NN 0.8457 
  7-NN 0.8288 
  9-NN 0.8288 
 11-NN 0.8345 
 13-NN 0.8347 
 15-NN 0.8316 
 17-NN 0.8375 
ensemble 0.8288 
Best accuracy :  1-NN with accuracy 0.8657 


# Iris data set contine 151 randuri de date, 5 feature-uri, feature-ul pe care il vom clasifica este F5 care are 3 posibile clase

In [18]:
print('Evaluate Iris dataset')
input_file = "iris.csv"

# Load the dataset; the first CSV row holds the column names.
data = pd.read_csv(input_file, header=0)

# F5 is the class label; every other column is an input feature.
X, y = data[data.columns.drop('F5')], data['F5']

# Largest k: the integer part of sqrt(151), lowered by one when it is even.
# NOTE(review): the dataset size is hard-coded; confirm it matches the file.
n = int(math.sqrt(151))
if n % 2 == 0:
    n = n - 1

models = get_models(n)

# Cross-validate every model, then order the (name, scores) pairs by name.
# Sorting on the name only means the score arrays are never compared.
scored = [(name, evaluate_model(model)) for name, model in models.items()]
names, results = zip(*sorted(scored, key=lambda pair: pair[0]))

# Report each mean accuracy and keep the first model that reaches the maximum.
bestName = "1NN"
bestAccuracy = 0
for name, scores in zip(names, results):
    accuracy = mean(scores)
    print('%s %.4f ' % (name, accuracy))
    if accuracy > bestAccuracy:
        bestName = name
        bestAccuracy = accuracy
print('Best accuracy :%s with accuracy %.4f ' % (bestName, bestAccuracy))

Evaluate Iris dataset
  1-NN 0.9600 
  3-NN 0.9600 
  5-NN 0.9667 
  7-NN 0.9667 
  9-NN 0.9667 
 11-NN 0.9733 
ensemble 0.9667 
Best accuracy : 11-NN with accuracy 0.9733 


# Liver data set contine 345 randuri de date, 7 feature-uri, feature-ul pe care il vom clasifica este F7 care are 2 posibile clase

In [27]:
print('Evaluate Liver dataset')
input_file = "liver.csv"

# Load the dataset; the first CSV row holds the column names.
data = pd.read_csv(input_file, header=0)

# F7 is the class label; every other column is an input feature.
X, y = data[data.columns.drop('F7')], data['F7']

# Largest k: the integer part of sqrt(345), lowered by one when it is even.
# NOTE(review): the dataset size is hard-coded; confirm it matches the file.
n = int(math.sqrt(345))
if n % 2 == 0:
    n = n - 1

models = get_models(n)

# Cross-validate every model, then order the (name, scores) pairs by name.
# Sorting on the name only means the score arrays are never compared.
scored = [(name, evaluate_model(model)) for name, model in models.items()]
names, results = zip(*sorted(scored, key=lambda pair: pair[0]))

# Report each mean accuracy and keep the first model that reaches the maximum.
bestName = "1NN"
bestAccuracy = 0
for name, scores in zip(names, results):
    accuracy = mean(scores)
    print('%s %.4f ' % (name, accuracy))
    if accuracy > bestAccuracy:
        bestName = name
        bestAccuracy = accuracy
print('Best accuracy :%s with accuracy %.4f ' % (bestName, bestAccuracy))


Evaluate Liver dataset
  1-NN 0.6143 
  3-NN 0.6422 
  5-NN 0.6604 
  7-NN 0.6925 
  9-NN 0.7044 
 11-NN 0.6875 
 13-NN 0.6873 
 15-NN 0.6932 
 17-NN 0.6873 
ensemble 0.6901 
Best accuracy :  9-NN with accuracy 0.7044 


# Parkinson data set contine 1040 randuri de date, 27 feature-uri, feature-ul pe care il vom clasifica este F1 care are 2 posibile clase

In [30]:
print('Evaluate Parkinson dataset')
input_file = "parkinson.csv"

# Load the dataset; the first CSV row holds the column names.
data = pd.read_csv(input_file, header=0)

# F1 is the class label; every other column is an input feature.
X, y = data[data.columns.drop('F1')], data['F1']

# Largest k: the integer part of sqrt(168), lowered by one when it is even.
# NOTE(review): 168 is hard-coded here, while the description of this
# dataset mentions 1040 rows - confirm which size is intended.
n = int(math.sqrt(168))
if n % 2 == 0:
    n = n - 1

models = get_models(n)

# Cross-validate every model, then order the (name, scores) pairs by name.
# Sorting on the name only means the score arrays are never compared.
scored = [(name, evaluate_model(model)) for name, model in models.items()]
names, results = zip(*sorted(scored, key=lambda pair: pair[0]))

# Report each mean accuracy and keep the first model that reaches the maximum.
bestName = "1NN"
bestAccuracy = 0
for name, scores in zip(names, results):
    accuracy = mean(scores)
    print('%s %.4f ' % (name, accuracy))
    if accuracy > bestAccuracy:
        bestName = name
        bestAccuracy = accuracy
print('Best accuracy :%s with accuracy %.4f ' % (bestName, bestAccuracy))

Evaluate Parkinson dataset
  1-NN 0.6062 
  3-NN 0.5888 
  5-NN 0.5111 
  7-NN 0.4939 
  9-NN 0.4405 
 11-NN 0.3631 
ensemble 0.4938 
Best accuracy :  1-NN with accuracy 0.6062 


# Sonar data set contine 209 randuri de date, 61 feature-uri, feature-ul pe care il vom clasifica este F61 care are 2 posibile clase

In [15]:
print('Evaluate Sonar dataset')
input_file = "sonar.csv"

# Load the dataset; the first CSV row holds the column names.
data = pd.read_csv(input_file, header=0)

# F61 is the class label; every other column is an input feature.
X, y = data[data.columns.drop('F61')], data['F61']

# Largest k: the integer part of sqrt(209), lowered by one when it is even.
# NOTE(review): the dataset size is hard-coded; confirm it matches the file.
n = int(math.sqrt(209))
if n % 2 == 0:
    n = n - 1

models = get_models(n)

# Cross-validate every model, then order the (name, scores) pairs by name.
# Sorting on the name only means the score arrays are never compared.
scored = [(name, evaluate_model(model)) for name, model in models.items()]
names, results = zip(*sorted(scored, key=lambda pair: pair[0]))

# Report each mean accuracy and keep the first model that reaches the maximum.
bestName = "1NN"
bestAccuracy = 0
for name, scores in zip(names, results):
    accuracy = mean(scores)
    print('%s %.4f ' % (name, accuracy))
    if accuracy > bestAccuracy:
        bestName = name
        bestAccuracy = accuracy
print('Best accuracy :%s with accuracy %.4f ' % (bestName, bestAccuracy))

Evaluate Sonar dataset
  1-NN 0.8074 
  3-NN 0.8075 
  5-NN 0.7784 
  7-NN 0.7204 
  9-NN 0.6725 
 11-NN 0.6725 
 13-NN 0.6533 
ensemble 0.7398 
Best accuracy :  3-NN with accuracy 0.8075 


# Wine data set contine 179 randuri de date, 13 feature-uri, feature-ul pe care il vom clasifica este F1 care are 3 posibile clase

In [16]:
print('Evaluate Wine dataset')
input_file = "wine.csv"

# Load the dataset; the first CSV row holds the column names.
data = pd.read_csv(input_file, header=0)

# F1 is the class label; every other column is an input feature.
X, y = data[data.columns.drop('F1')], data['F1']

# Largest k: the integer part of sqrt(179), lowered by one when it is even.
# NOTE(review): the dataset size is hard-coded; confirm it matches the file.
n = int(math.sqrt(179))
if n % 2 == 0:
    n = n - 1

models = get_models(n)

# Cross-validate every model, then order the (name, scores) pairs by name.
# Sorting on the name only means the score arrays are never compared.
scored = [(name, evaluate_model(model)) for name, model in models.items()]
names, results = zip(*sorted(scored, key=lambda pair: pair[0]))

# Report each mean accuracy and keep the first model that reaches the maximum.
bestName = "1NN"
bestAccuracy = 0
for name, scores in zip(names, results):
    accuracy = mean(scores)
    print('%s %.4f ' % (name, accuracy))
    if accuracy > bestAccuracy:
        bestName = name
        bestAccuracy = accuracy
print('Best accuracy :%s with accuracy %.4f ' % (bestName, bestAccuracy))

Evaluate Wine dataset
  1-NN 0.7525 
  3-NN 0.7143 
  5-NN 0.7198 
  7-NN 0.7083 
  9-NN 0.6970 
 11-NN 0.6802 
 13-NN 0.6856 
ensemble 0.7252 
Best accuracy :  1-NN with accuracy 0.7525 


# EEG data set contine 14980 randuri de date, 15 feature-uri, feature-ul pe care il vom clasifica este F15 care are 2 posibile clase (loading time > 5 minutes)

In [18]:
print('Evaluate EEG dataset')
input_file = "EEG.csv"

# Load the dataset; the first CSV row holds the column names.
data = pd.read_csv(input_file, header=0)

# F15 is the class label; every other column is an input feature.
X, y = data[data.columns.drop('F15')], data['F15']

# Largest k: the integer part of sqrt(14980), lowered by one when it is even.
# NOTE(review): the dataset size is hard-coded; confirm it matches the file.
n = int(math.sqrt(14980))
if n % 2 == 0:
    n = n - 1

models = get_models(n)

# Cross-validate every model, then order the (name, scores) pairs by name.
# Sorting on the name only means the score arrays are never compared.
scored = [(name, evaluate_model(model)) for name, model in models.items()]
names, results = zip(*sorted(scored, key=lambda pair: pair[0]))

# Report each mean accuracy and keep the first model that reaches the maximum.
bestName = "1NN"
bestAccuracy = 0
for name, scores in zip(names, results):
    accuracy = mean(scores)
    print('%s %.4f ' % (name, accuracy))
    if accuracy > bestAccuracy:
        bestName = name
        bestAccuracy = accuracy
print('Best accuracy :%s with accuracy %.4f ' % (bestName, bestAccuracy))


Evaluate EEG dataset
  1-NN 0.9768 
  3-NN 0.9726 
  5-NN 0.9653 
  7-NN 0.9613 
  9-NN 0.9573 
 11-NN 0.9536 
 13-NN 0.9519 
 15-NN 0.9493 
 17-NN 0.9449 
 19-NN 0.9419 
 21-NN 0.9393 
 23-NN 0.9366 
 25-NN 0.9334 
 27-NN 0.9310 
 29-NN 0.9290 
 31-NN 0.9271 
 33-NN 0.9245 
 35-NN 0.9225 
 37-NN 0.9202 
 39-NN 0.9184 
 41-NN 0.9166 
 43-NN 0.9145 
 45-NN 0.9131 
 47-NN 0.9113 
 49-NN 0.9105 
 51-NN 0.9089 
 53-NN 0.9070 
 55-NN 0.9051 
 57-NN 0.9041 
 59-NN 0.9031 
 61-NN 0.9017 
 63-NN 0.9012 
 65-NN 0.9005 
 67-NN 0.8989 
 69-NN 0.8972 
 71-NN 0.8959 
 73-NN 0.8953 
 75-NN 0.8940 
 77-NN 0.8927 
 79-NN 0.8921 
 81-NN 0.8915 
 83-NN 0.8905 
 85-NN 0.8889 
 87-NN 0.8880 
 89-NN 0.8863 
 91-NN 0.8852 
 93-NN 0.8849 
 95-NN 0.8835 
 97-NN 0.8823 
 99-NN 0.8802 
101-NN 0.8787 
103-NN 0.8784 
105-NN 0.8771 
107-NN 0.8757 
109-NN 0.8748 
111-NN 0.8737 
113-NN 0.8724 
115-NN 0.8711 
117-NN 0.8707 
119-NN 0.8685 
121-NN 0.8680 
ensemble 0.9052 
Best accuracy :  1-NN with accuracy 0.9768 


# Letter recognition data set contine 20000 randuri de date, 16 feature-uri, feature-ul pe care il vom clasifica este F1 care are 26 posibile clase (loading time > 5 minutes)

In [19]:
print('Evaluate Letter-Recognition dataset')
input_file = "letter-recognition.csv"

# Load the dataset; the first CSV row holds the column names.
data = pd.read_csv(input_file, header=0)

# F1 is the class label; every other column is an input feature.
X, y = data[data.columns.drop('F1')], data['F1']

# Largest k: the integer part of sqrt(20000), lowered by one when it is even.
# NOTE(review): the dataset size is hard-coded; confirm it matches the file.
n = int(math.sqrt(20000))
if n % 2 == 0:
    n = n - 1

models = get_models(n)

# Cross-validate every model, then order the (name, scores) pairs by name.
# Sorting on the name only means the score arrays are never compared.
scored = [(name, evaluate_model(model)) for name, model in models.items()]
names, results = zip(*sorted(scored, key=lambda pair: pair[0]))

# Report each mean accuracy and keep the first model that reaches the maximum.
bestName = "1NN"
bestAccuracy = 0
for name, scores in zip(names, results):
    accuracy = mean(scores)
    print('%s %.4f ' % (name, accuracy))
    if accuracy > bestAccuracy:
        bestName = name
        bestAccuracy = accuracy
print('Best accuracy :%s with accuracy %.4f ' % (bestName, bestAccuracy))


Evaluate Letter-Recognition dataset
  1-NN 0.9570 
  3-NN 0.9531 
  5-NN 0.9503 
  7-NN 0.9482 
  9-NN 0.9465 
 11-NN 0.9431 
 13-NN 0.9410 
 15-NN 0.9370 
 17-NN 0.9347 
 19-NN 0.9312 
 21-NN 0.9278 
 23-NN 0.9250 
 25-NN 0.9219 
 27-NN 0.9189 
 29-NN 0.9163 
 31-NN 0.9149 
 33-NN 0.9118 
 35-NN 0.9090 
 37-NN 0.9069 
 39-NN 0.9044 
 41-NN 0.9033 
 43-NN 0.8995 
 45-NN 0.8964 
 47-NN 0.8942 
 49-NN 0.8911 
 51-NN 0.8857 
 53-NN 0.8836 
 55-NN 0.8822 
 57-NN 0.8792 
 59-NN 0.8765 
 61-NN 0.8736 
 63-NN 0.8704 
 65-NN 0.8665 
 67-NN 0.8638 
 69-NN 0.8614 
 71-NN 0.8584 
 73-NN 0.8575 
 75-NN 0.8549 
 77-NN 0.8529 
 79-NN 0.8509 
 81-NN 0.8484 
 83-NN 0.8462 
 85-NN 0.8434 
 87-NN 0.8417 
 89-NN 0.8400 
 91-NN 0.8377 
 93-NN 0.8350 
 95-NN 0.8341 
 97-NN 0.8317 
 99-NN 0.8294 
101-NN 0.8275 
103-NN 0.8248 
105-NN 0.8236 
107-NN 0.8224 
109-NN 0.8206 
111-NN 0.8190 
113-NN 0.8168 
115-NN 0.8155 
117-NN 0.8134 
119-NN 0.8111 
121-NN 0.8093 
123-NN 0.8077 
125-NN 0.8070 
127-NN 0.8061 
129-

# Concluzii: 
# Toate seturile de date evaluate anterior au fost evaluate si in articolul ales de mine, restul seturilor de date care sunt prezentate in articol si nu sunt regasite mai sus nu mai sunt disponibile pe site-ul din bibliografie. 

# In urma experimentelor am remarcat ca, desi clasificatorul asamblat descris in articol nu depaseste performanta celui mai bun clasificator KNN din ansamblul sau, performanta ansamblului este foarte apropiata de cea mai buna performanta, scutindu-ne de cautarea parametrului k care ar avea cea mai buna performanta. 

# De asemenea am remarcat ca performantele obtinute ruland codul python din terminal (folosind versiunea 2.7.3) si cele obtinute din acest notebook (care foloseste versiunea 3) sunt diferite