# "Solving the Problem of the K Parameter in the KNN
# Classifier Using an Ensemble Learning Approach"

# The main idea of this article is to use the KNN algorithm without having to choose the k parameter empirically.


# The method proposed in the article combines the KNN classifiers with k = 1, 3, 5, 7, ..., n (where n is the square root of the dataset size) into a single classifier that predicts by majority vote.

# Step 1: import the required libraries

In [1]:
#import subprocess
import pandas as pd
import numpy as np

from numpy import mean
from numpy import std
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score
from sklearn.utils import shuffle
from matplotlib import pyplot

from sklearn.ensemble import VotingClassifier
import math


# Step 2: define the function that instantiates the ensemble classifier

In [2]:
# get a voting ensemble of models
def get_voting(n):
    """Return a majority-vote ensemble of k-NN classifiers for k = 1, 3, 5, ...

    One ``KNeighborsClassifier`` is created for every odd ``k``, starting at 1
    and stopping at the first odd value that is greater than or equal to
    ``n``. For an odd ``n`` the largest ``k`` is therefore ``n`` itself; for
    an even ``n`` it is ``n + 1``, so pass an odd ``n`` to cap the ensemble
    at ``n`` neighbours. No estimator is created when ``n <= -1``.

    Args:
        n: Upper bound for the number of neighbours.

    Returns:
        An unfitted ``VotingClassifier`` using ``voting='hard'`` whose
        estimators are named ``"<k>-NN"``.
    """
    estimators = []
    k = 1
    # ``k - 2`` is the previous odd value: keep adding classifiers for as long
    # as that previous value was still below ``n``.
    while k - 2 < n:
        estimators.append(('%d-NN' % k, KNeighborsClassifier(n_neighbors=k)))
        k += 2
    return VotingClassifier(estimators=estimators, voting='hard')

# Step 3: build the set of classifiers to evaluate: 1NN, 3NN, 5NN, ..., nNN (where n is the square root of the dataset size), plus the classifier that ensembles all of the above.

In [3]:
# get a list of models to evaluate
def get_models(n):
    """Return the models to evaluate, keyed by display name.

    The dictionary holds one ``KNeighborsClassifier`` per odd ``k`` (1, 3, 5,
    ... up to the first odd value greater than or equal to ``n``), followed
    by an ``'ensemble'`` entry built by ``get_voting(n)``.

    Args:
        n: Upper bound for the number of neighbours.

    Returns:
        A dict mapping names such as ``' 1-NN'``, ``'11-NN'`` and
        ``'ensemble'`` to unfitted estimators.
    """
    models = {}
    k = 1
    # ``k - 2`` is the previous odd value: keep adding classifiers for as long
    # as that previous value was still below ``n``.
    while k - 2 < n:
        # Width-2 formatting left-pads single-digit k with a space, so an
        # alphabetical sort of the names also orders them by k (for k < 100)
        # and places 'ensemble' last.
        models['%2d-NN' % k] = KNeighborsClassifier(n_neighbors=k)
        k += 2

    models['ensemble'] = get_voting(n)
    return models

# Step 4: define a function that evaluates each model individually, the metric of interest being accuracy. The author of the article specifies a split of 70% training data and 30% test data. NOTE: the function below does not perform that split; it estimates accuracy with stratified 25-fold cross-validation instead.

In [4]:
# evaluate a given model using cross-validation
def evaluate_model(model):
    """Score ``model`` with stratified 25-fold cross-validation.

    The data is not passed in: the function reads the module-level ``X``
    (features) and ``y`` (labels), which must be defined before it is called.
    The split uses a single repetition with ``random_state=1``, and
    ``n_jobs=-1`` asks scikit-learn to use all available processors.

    Args:
        model: The estimator to evaluate.

    Returns:
        The accuracy scores returned by ``cross_val_score``, one per fold.
    """
    folds = RepeatedStratifiedKFold(n_splits=25, n_repeats=1, random_state=1)
    return cross_val_score(model, X, y, scoring='accuracy', cv=folds, n_jobs=-1)

# An example proposed by the author uses the QSAR.csv dataset, which contains 43 features: the first 42 are the inputs, and the 43rd is the class of the queried object.
# The dataset has 1055 rows, so the ensemble classifier uses the classifiers 1NN, 3NN, 5NN, 7NN, 9NN, 11NN, 13NN, 15NN, 17NN, 19NN, 21NN, 23NN, 25NN, 27NN, 29NN, 31NN (31 being the largest odd number that does not exceed sqrt(1055) ≈ 32.48).

In [5]:
# Load the dataset; the first row of the CSV holds the column names.
# NOTE(review): the file name contains a space before ".csv" -- confirm that
# it matches the file on disk.
input_file = "QSAR .csv"

data = pd.read_csv(input_file, header=0)

# Column 'F43' is the class label; every other column is an input feature.
X, y = data[data.columns.drop('F43')], data['F43']

# Largest k used by the classifiers: the integer part of the square root of
# the number of rows actually loaded (instead of a hard-coded row count),
# lowered by one when even so that it is an odd number.
n = int(math.sqrt(len(data)))
if n % 2 == 0:
    n -= 1

models = get_models(n)

# Because of a bug the models are evaluated in an arbitrary order, so I introduce an alphabetical sort of the classifier names, which also reorders the list of obtained scores accordingly.


In [8]:
# Evaluate every model and keep the per-fold scores, in the iteration order
# of ``models`` (unsorted).
results, names = [], []

for name, model in models.items():
    results.append(evaluate_model(model))
    names.append(name)

# Print each model name with the mean of its cross-validation scores.
for name, scores in zip(names, results):
    print('%s %.4f ' % (name, mean(scores)))

 1-NN 0.8029 
 3-NN 0.8276 
 5-NN 0.8199 
 7-NN 0.8256 
 9-NN 0.8226 
11-NN 0.8066 
13-NN 0.8095 
15-NN 0.8113 
17-NN 0.8018 
19-NN 0.7923 
21-NN 0.7876 
23-NN 0.7839 
25-NN 0.7811 
27-NN 0.7716 
29-NN 0.7716 
31-NN 0.7726 
ensemble 0.8085 


In [6]:
# Evaluate every model and report the results sorted by model name.
results, names = [], []

for name, model in models.items():
    results.append(evaluate_model(model))
    names.append(name)

# Pair names with scores once, after the loop, and sort on the name only so
# that the score arrays themselves are never compared with each other.
ranked = sorted(zip(names, results), key=lambda pair: pair[0])
names, results = zip(*ranked)

# Print each model name with the mean of its cross-validation scores.
for name, scores in ranked:
    print('%s %.4f ' % (name, mean(scores)))

 1-NN 0.8029 
 3-NN 0.8276 
 5-NN 0.8199 
 7-NN 0.8256 
 9-NN 0.8226 
11-NN 0.8066 
13-NN 0.8095 
15-NN 0.8113 
17-NN 0.8018 
19-NN 0.7923 
21-NN 0.7876 
23-NN 0.7839 
25-NN 0.7811 
27-NN 0.7716 
29-NN 0.7716 
31-NN 0.7726 
ensemble 0.8085 
