In [13]:
import sys
sys.path.append('../')

import csv
from sources.distances import DistanceCalculator as dist
from random import shuffle

# Selector codes for the `distanceMethod` argument of knn(); knn() compares
# its argument against these names to choose which DistanceCalculator
# function to call.
euclidean = 1  # -> dist.euclidean_distance
manhattan = 2  # -> dist.manhattan_distance
minkowski = 3  # -> dist.minkowski_distance (also receives `distanceOrder`)

In [14]:
def fixDataSet(sourcePath = '../dataset/ionosphere.data', outputPath = "ionosphere.csv.teste"):
    """Copy a CSV dataset to a new file with duplicated rows removed.

    When a row occurs more than once, only its LAST occurrence is kept; the
    surviving rows keep their relative order from the source file.

    Args:
        sourcePath: path of the comma-separated file to read.
        outputPath: path of the de-duplicated file to write (overwritten if
            it already exists).

    Returns:
        None. The result is written to `outputPath`.
    """
    # newline='' is what the csv module expects for files it reads/writes.
    # The default quote character is used on purpose: a quotechar equal to
    # the delimiter makes an empty field look like the start of a quoted one.
    with open(sourcePath, 'r', newline='') as File:
        rows = list(csv.reader(File, delimiter=','))

    # Map every distinct row to the index of its last occurrence, then keep a
    # row only at that index. This is linear instead of scanning the rest of
    # the list for every row.
    last_index = {tuple(row): index for index, row in enumerate(rows)}
    unique_rows = [row for index, row in enumerate(rows)
                   if last_index[tuple(row)] == index]

    with open(outputPath, "w", newline='') as File:
        writer = csv.writer(File, lineterminator='\n')
        writer.writerows(unique_rows)
        

In [15]:
def num(s):
    """Convert `s` to a float when possible, otherwise hand it back unchanged.

    Args:
        s: value to convert (typically one CSV field as a string).

    Returns:
        float(s) if the conversion succeeds; the original `s` if float()
        raises ValueError (e.g. a class label such as 'g').
    """
    try:
        parsed = float(s)
    except ValueError:
        # Non-numeric text: keep the raw value.
        return s
    else:
        return parsed
    

In [16]:
def getData(datasetName = 'mycsv.csv', percentToTraining = 60, randomize = True, verbose = True, seed = None):
    """Load a CSV dataset and split it into training and test samples.

    Every field is passed through num(), so numeric fields become floats and
    anything else (such as the trailing class label) stays a string.

    Args:
        datasetName: path of the comma-separated file to load.
        percentToTraining: percentage of the rows placed in the training set;
            the remaining rows form the test set.
        randomize: shuffle the rows before splitting when True.
        verbose: print a summary of the class counts and the split when True.
        seed: optional seed for the shuffle. None (the default) keeps using
            the module-level random state; any other value gives a
            reproducible split.

    Returns:
        A tuple (training_data, test_data), each a list of converted rows.
    """
    # newline='' is what the csv module expects. The default quote character
    # is used on purpose: a quotechar equal to the delimiter misparses empty
    # fields. Blank lines come back from the reader as empty rows and are
    # dropped, otherwise `row[-1]` below would raise IndexError.
    with open(datasetName, 'r', newline='') as File:
        rows = [row for row in csv.reader(File, delimiter=',') if row]

    limit = int(len(rows) * (percentToTraining/100))

    if randomize:
        if seed is None:
            shuffle(rows)
        else:
            # Private generator so the global random state is left untouched.
            from random import Random
            Random(seed).shuffle(rows)

    # Rows whose last field is "g" are the "Good" class; the rest are "Bad".
    group_g = sum(1 for row in rows if row[-1] == "g")

    training_data = []
    test_data = []
    for position, row in enumerate(rows):
        converted = [num(i) for i in row]
        if position < limit:
            training_data.append(converted)
        else:
            test_data.append(converted)

    if verbose:
        print("Total de amostras: %d:" %len(rows))
        print("    - %d amostra do tipo \"Good\"" % group_g)
        print("    - %d amostra do tipo \"Bad\"" % (len(rows) - group_g))
        print("")
        print("%d %% das amostras separadas para treino." %percentToTraining)
        print("    - %d amostras para treino" % (len(training_data)))
        print("    - %d amostras para teste" % (len(test_data)))

    return training_data, test_data


In [20]:
def knn(training, test, k, distanceMethod = euclidean, distanceOrder = 0.5 ):
    """Classify `test` with k-nearest-neighbours and print the accuracy.

    Each sample is a sequence whose last element is the class label
    ('g' or anything else, which is treated as 'b'). A test sample is
    labelled 'g' only when strictly more than half of its k nearest training
    samples are 'g'; ties go to 'b'.

    Args:
        training: labelled samples used as neighbours.
        test: labelled samples to classify and score.
        k: number of neighbours to consult (must be >= 1).
        distanceMethod: one of the module constants `euclidean`, `manhattan`
            or `minkowski`.
        distanceOrder: order forwarded to dist.minkowski_distance; ignored by
            the other two metrics.

    Returns:
        None. The hit percentage is printed.

    Raises:
        ValueError: if `k` is smaller than 1, `test` is empty, or
            `distanceMethod` is not one of the known constants.
    """
    if k < 1:
        raise ValueError("k must be at least 1, got %r" % (k,))
    if len(test) == 0:
        raise ValueError("test set is empty; accuracy is undefined")

    # Resolve the metric once instead of re-testing it for every pair. An
    # unknown selector used to leave the distance table empty, which silently
    # classified every sample as 'b'.
    if distanceMethod == euclidean:
        measure = dist.euclidean_distance
    elif distanceMethod == manhattan:
        measure = dist.manhattan_distance
    elif distanceMethod == minkowski:
        def measure(a, b):
            return dist.minkowski_distance(a, b, distanceOrder)
    else:
        raise ValueError("unknown distanceMethod: %r" % (distanceMethod,))

    # NOTE(review): whole rows, label included, are handed to the distance
    # functions; presumably DistanceCalculator skips the trailing label --
    # confirm in sources.distances.
    result = []
    for sample in test:
        distances = [measure(sample, candidate) for candidate in training]

        # Indices of the k closest training samples; sorted() is stable, so
        # equal distances keep training order.
        k_neighbors = sorted(range(len(training)), key=distances.__getitem__)[:k]

        g_count = sum(1 for index in k_neighbors if training[index][-1] == 'g')
        b_count = len(k_neighbors) - g_count

        result.append('g' if g_count > b_count else 'b')

    # Count predictions that match the true label stored in each test row.
    acertos = sum(1 for predicted, sample in zip(result, test)
                  if predicted == sample[-1])

    print("Porcentagem de acertos: %.4f %%" % (100*acertos/len(test)))
    

In [27]:
# Write the de-duplicated copy of the raw dataset, then load and split the data.
# NOTE(review): fixDataSet() writes "ionosphere.csv.teste" in the working
# directory, while the split below reads '../dataset/ionosphere.csv' --
# confirm that this is the intended input file.
fixDataSet()
training_data, test_data = getData(
    datasetName='../dataset/ionosphere.csv',
    percentToTraining=60,
    randomize=True,
    verbose=False,
)

In [35]:
# Run the same k=13 classification once per distance metric. Each entry is
# (heading to print, metric selector, extra keyword arguments for knn).
for heading, method, extra in (
    ("\nEuclidean distance:", euclidean, {}),
    ("\nManhattan distance:", manhattan, {}),
    ("\nMinkowski distance:", minkowski, {"distanceOrder": 0.5}),
):
    print(heading)
    knn(training_data, test_data, k=13, distanceMethod=method, **extra)


Euclidean distance:
Porcentagem de acertos: 88.5714 %

Manhattan distance:
Porcentagem de acertos: 89.2857 %

Minkowski distance:
Porcentagem de acertos: 94.2857 %


In [12]:
#
# Find and print the duplicated rows of a CSV file
#

# newline='' is what the csv module expects. The default quote character is
# used on purpose: a quotechar equal to the delimiter misparses empty fields.
with open('../dataset/ionosphere.csv', 'r', newline='') as File:
    l_reader = list(csv.reader(File, delimiter=','))

# A row is printed once for every later copy of it that exists in the file,
# i.e. whenever the current index is not the row's last position.
last_position = {tuple(row): index for index, row in enumerate(l_reader)}
for index, row in enumerate(l_reader):
    if last_position[tuple(row)] != index:
        print(row)