In [170]:
# knn solution for iris data set by U. Kiekheben
import numpy as np
import math
from sklearn import datasets
from sklearn import preprocessing
# Load the iris dataset bundled with scikit-learn and keep a handle on its
# feature matrix; the last line displays the keys the dataset object exposes.
iris = datasets.load_iris()
iris_data = iris["data"]
iris.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names'])

In [171]:
# Sample to classify, in feature order:
# sepal length, sepal width, petal length, petal width.
unknown_entry = [
    4.8,
    2.5,
    5.3,
    2.4,
]

In [172]:
# Split the feature matrix into one single-column 2-D slice per measurement.
# The last slice is open-ended, i.e. it takes everything from column 3 onwards.
sepal_length, sepal_width, petal_length, petal_width = (
    iris["data"][:, columns]
    for columns in (slice(0, 1), slice(1, 2), slice(2, 3), slice(3, None))
)

In [173]:
# Smallest observed value of each feature, in the order
# sepal length, sepal width, petal length, petal width.
min_values = [
    feature_column.min()
    for feature_column in (sepal_length, sepal_width, petal_length, petal_width)
]
min_values

[4.3, 2.0, 1.0, 0.1]

In [174]:
# Largest observed value of each feature, in the order
# sepal length, sepal width, petal length, petal width.
max_values = [
    feature_column.max()
    for feature_column in (sepal_length, sepal_width, petal_length, petal_width)
]
max_values

[7.9, 4.4, 6.9, 2.5]

In [175]:
def normalize_entry(entry):
    """Min-max scale one sample against the per-feature extremes.

    Each value is mapped with ``(value - min) / (max - min)``, where ``min``
    and ``max`` are read from the module-level ``min_values`` and
    ``max_values`` lists at the same position as the value.

    Parameters
    ----------
    entry : sequence of numbers
        One sample; its values must be in the same order as ``min_values``
        and ``max_values``. It is NOT modified.

    Returns
    -------
    list or numpy.ndarray
        A new object holding the scaled values: a float ``ndarray`` when
        ``entry`` is an ``ndarray``, otherwise a plain list.

    Raises
    ------
    IndexError
        If ``entry`` has more values than ``min_values`` / ``max_values``.
    """
    scaled = []
    for i in range(len(entry)):
        span = max_values[i] - min_values[i]
        # A constant feature has no spread; map it to 0.0 rather than
        # dividing by zero.
        scaled.append((entry[i] - min_values[i]) / span if span != 0 else 0.0)
    # Build a fresh container instead of writing into ``entry`` so callers no
    # longer have to copy their data defensively before normalizing it.
    if isinstance(entry, np.ndarray):
        return np.array(scaled, dtype=float)
    return scaled

In [176]:
def normalize_data(data):
    """Normalize every row of ``data`` and return ``data``.

    Each row is passed through ``normalize_entry`` and the result is written
    back to the same position, so the object handed in is the one that gets
    updated and returned.
    """
    for row_index, row in enumerate(data):
        data[row_index] = normalize_entry(row)
    return data

In [177]:
# Normalize a copy of the iris features so the raw matrix stays untouched.
iris_copy1 = iris_data.copy()
normalized_data = normalize_data(data=iris_copy1)
normalized_data

array([[0.22222222, 0.625     , 0.06779661, 0.04166667],
       [0.16666667, 0.41666667, 0.06779661, 0.04166667],
       [0.11111111, 0.5       , 0.05084746, 0.04166667],
       [0.08333333, 0.45833333, 0.08474576, 0.04166667],
       [0.19444444, 0.66666667, 0.06779661, 0.04166667],
       [0.30555556, 0.79166667, 0.11864407, 0.125     ],
       [0.08333333, 0.58333333, 0.06779661, 0.08333333],
       [0.19444444, 0.58333333, 0.08474576, 0.04166667],
       [0.02777778, 0.375     , 0.06779661, 0.04166667],
       [0.16666667, 0.45833333, 0.08474576, 0.        ],
       [0.30555556, 0.70833333, 0.08474576, 0.04166667],
       [0.13888889, 0.58333333, 0.10169492, 0.04166667],
       [0.13888889, 0.41666667, 0.06779661, 0.        ],
       [0.        , 0.41666667, 0.01694915, 0.        ],
       [0.41666667, 0.83333333, 0.03389831, 0.04166667],
       [0.38888889, 1.        , 0.08474576, 0.125     ],
       [0.30555556, 0.79166667, 0.05084746, 0.125     ],
       [0.22222222, 0.625     ,

In [178]:
# Normalize a copy of the unknown sample with the same per-feature min/max
# that is used for the dataset.
unknown_entry1 = list(unknown_entry)
normentry = normalize_entry(entry=unknown_entry1)
normentry

[0.13888888888888887,
 0.20833333333333331,
 0.7288135593220338,
 0.9583333333333333]

In [179]:
def set_distances(data, entry):
    """Append each row's Euclidean distance to ``entry`` as an extra column.

    Parameters
    ----------
    data : array-like, shape (n_samples, n_features)
        Samples to measure against ``entry``. Not modified.
    entry : array-like, shape (n_features,)
        The reference point.

    Returns
    -------
    numpy.ndarray, shape (n_samples, n_features + 1)
        A new float array: the rows of ``data`` followed by one column that
        holds the distance of that row to ``entry``. Empty ``data`` yields an
        array with zero rows and ``len(entry) + 1`` columns.

    Raises
    ------
    ValueError
        If ``data`` is not two-dimensional or its number of columns differs
        from the length of ``entry``.
    """
    samples = np.asarray(data, dtype=float)
    point = np.asarray(entry, dtype=float).ravel()

    if samples.size == 0:
        # Nothing to measure; still return a table that has the distance column.
        return np.empty((0, point.size + 1))
    if samples.ndim != 2 or samples.shape[1] != point.size:
        # Fail loudly instead of silently measuring over a subset of features.
        raise ValueError(
            "data must have shape (n_samples, %d) to match entry, got %s"
            % (point.size, samples.shape)
        )

    # One vectorized pass replaces the row-by-row np.append calls, which
    # re-allocated the whole result on every iteration and only worked for
    # exactly four features.
    distances = np.sqrt(np.sum((samples - point) ** 2, axis=1))
    return np.column_stack((samples, distances))

In [180]:
# Distance of the normalized unknown sample to every normalized iris row,
# stored as an extra trailing column.
data_with_distance = set_distances(data=normalized_data, entry=normentry)
data_with_distance

array([[0.22222222, 0.625     , 0.06779661, 0.04166667, 1.20738426],
       [0.16666667, 0.41666667, 0.06779661, 0.04166667, 1.14951971],
       [0.11111111, 0.5       , 0.05084746, 0.04166667, 1.17718175],
       [0.08333333, 0.45833333, 0.08474576, 0.04166667, 1.1492117 ],
       [0.19444444, 0.66666667, 0.06779661, 0.04166667, 1.22081   ],
       [0.30555556, 0.79166667, 0.11864407, 0.125     , 1.19783422],
       [0.08333333, 0.58333333, 0.06779661, 0.08333333, 1.16029299],
       [0.19444444, 0.58333333, 0.08474576, 0.04166667, 1.18271405],
       [0.02777778, 0.375     , 0.06779661, 0.04166667, 1.14775635],
       [0.16666667, 0.45833333, 0.08474576, 0.        , 1.18173504],
       [0.30555556, 0.70833333, 0.08474576, 0.04166667, 1.23809486],
       [0.13888889, 0.58333333, 0.10169492, 0.04166667, 1.17225448],
       [0.13888889, 0.41666667, 0.06779661, 0.        , 1.18268718],
       [0.        , 0.41666667, 0.01694915, 0.        , 1.21977318],
       [0.41666667, 0.83333333, 0.

In [181]:
# Attach the class label of every iris row as one more trailing column, next
# to the distance column computed in the previous step.
iris_target = iris.target.copy()
data_with_distances_and_targets = np.hstack(
    (data_with_distance, iris_target[:, np.newaxis])
)
data_with_distances_and_targets

array([[0.22222222, 0.625     , 0.06779661, 0.04166667, 1.20738426,
        0.        ],
       [0.16666667, 0.41666667, 0.06779661, 0.04166667, 1.14951971,
        0.        ],
       [0.11111111, 0.5       , 0.05084746, 0.04166667, 1.17718175,
        0.        ],
       [0.08333333, 0.45833333, 0.08474576, 0.04166667, 1.1492117 ,
        0.        ],
       [0.19444444, 0.66666667, 0.06779661, 0.04166667, 1.22081   ,
        0.        ],
       [0.30555556, 0.79166667, 0.11864407, 0.125     , 1.19783422,
        0.        ],
       [0.08333333, 0.58333333, 0.06779661, 0.08333333, 1.16029299,
        0.        ],
       [0.19444444, 0.58333333, 0.08474576, 0.04166667, 1.18271405,
        0.        ],
       [0.02777778, 0.375     , 0.06779661, 0.04166667, 1.14775635,
        0.        ],
       [0.16666667, 0.45833333, 0.08474576, 0.        , 1.18173504,
        0.        ],
       [0.30555556, 0.70833333, 0.08474576, 0.04166667, 1.23809486,
        0.        ],
       [0.13888889, 0

In [182]:
def find_knn(data, k, distance_column=4):
    """Return the ``k`` rows of ``data`` with the smallest distance value.

    Parameters
    ----------
    data : array-like, shape (n_rows, n_columns)
        Table in which one column holds a distance. Not modified.
    k : int
        Number of rows to return; if ``k`` exceeds the number of rows, all
        rows are returned.
    distance_column : int, optional
        Index of the column to sort by. Defaults to 4, the position of the
        distance when four features precede it.

    Returns
    -------
    numpy.ndarray
        A new array with the selected rows in ascending order of distance.
        Rows with equal distance keep their original relative order.
    """
    rows = np.asarray(data)
    # A stable argsort keeps tied rows in input order, like the built-in
    # sorted(), without sorting array rows one Python object at a time.
    order = np.argsort(rows[:, distance_column], kind="stable")
    return rows[order[:k]]

In [198]:
# Find the 7 nearest neighbors of the unknown sample in the iris data.
nearest_neighbors = find_knn(data_with_distances_and_targets, k=7)
nearest_neighbors

array([[0.38888889, 0.20833333, 0.6779661 , 0.79166667, 0.30473471,
        2.        ],
       [0.41666667, 0.33333333, 0.69491525, 0.95833333, 0.3064875 ,
        2.        ],
       [0.36111111, 0.33333333, 0.66101695, 0.79166667, 0.31206069,
        2.        ],
       [0.16666667, 0.20833333, 0.59322034, 0.66666667, 0.3228414 ,
        2.        ],
       [0.41666667, 0.29166667, 0.69491525, 0.75      , 0.35868762,
        2.        ],
       [0.41666667, 0.29166667, 0.69491525, 0.75      , 0.35868762,
        2.        ],
       [0.44444444, 0.41666667, 0.69491525, 0.70833333, 0.44767853,
        2.        ]])

In [200]:
def find_target(data):
    """Return the species name that occurs most often among the given rows.

    Every value from column index 5 onwards is read as a class label, cast to
    ``int`` and tallied. The label with the highest count (the lowest label
    on a tie) is looked up in ``iris.target_names``. The labels are also
    printed before counting.
    """
    labels = data[:, 5:].ravel().astype(int)
    print(labels)
    votes = np.bincount(labels)
    # Earlier attempt, deliberately kept because it was part of the process:
    #setosa = np.isclose(data, 0).sum()
    #versicolor = np.isclose(data, 1).sum()
    #virginica = np.isclose(data, 2).sum()
    #values = np.array([["setosa", setosa],["versicolor", versicolor],["virginica",virginica]])
    #sorted_data = sorted(values,key=lambda x: x[1])
    #max_value = np.array(sorted_data)[2:3,:]
    winner = votes.argmax()
    return iris.target_names[winner]

In [201]:
# Majority vote among the nearest neighbors gives the predicted species.
unknown_flower = find_target(data=nearest_neighbors)
unknown_flower

[2 2 2 2 2 2 2]


'virginica'

The unknown iris is a Virginica

I also tested different values of k, up to 25, and always got the same result. However, the number of versicolor entries among the neighbors' targets rises.