In [454]:
import itertools
from typing import Callable

import numpy as np
from numpy.random import shuffle
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split


In [455]:
iris = load_iris()

data = np.array(iris['data'])
target = np.array(iris['target'])

data = data[:,:]
X_train, X_dev, y_train, y_dev =  train_test_split(data,target, test_size=0.4, train_size=0.6, random_state=42)

In [456]:
def toIrisName(nb: int)-> str:
    return iris["target_names"][nb]

In [457]:
def distance(x1: np.ndarray, x2: np.ndarray):
    return np.linalg.norm(x1 - x2)

In [458]:
def KNN_Algo(datas: np.array,target: np.array,k, distance: Callable,new_data :np.ndarray ):
    
    dicDist = {}
    for ii,data in enumerate(datas):
        dicDist[ii] = distance(data, new_data)
    #sorted by value
    listDist = dict(sorted(dicDist.items(), key=lambda item: item[1]))
    #get k first
    splitKNear = dict(itertools.islice(listDist.items(), k))
        
    #get the major cluster
    _, counts = np.unique(target, return_counts=True)
    listTarget = [0]*len(counts)
    for key in splitKNear: 
        listTarget[target[key]] +=1
    return np.argmax(listTarget)


In [459]:
def accuracyKNN(data: np.array, target: np.array, k: int) -> float:
    nbGoodAnswer = 0
    nbAnswer = data.shape[0]
    
    for ii in range(nbAnswer):
        res = KNN_Algo(X_train,y_train, k, distance, data[ii])
        if res == target[ii]:
            nbGoodAnswer+=1
    return nbGoodAnswer/nbAnswer

In [460]:
def zeroR(target: np.array, new_data: np.array) -> str:
    maxi = [0]*(np.max(target)+1)
    for ii,val in enumerate(target):
        maxi[val] += 1
    return np.argmax(maxi)


In [461]:
def AccuracyZeroR(data: np.array, target: np.array)-> float:
    accuracy = 0
    for indiceName in target :
        if zeroR(target,np.array([0])) == indiceName:
            accuracy += 1
    return accuracy/len(target)

In [462]:
def nbUniqueTarget(target: np.array) -> int:
    _, counts = np.unique(target, return_counts=True)
    return len(counts)

In [463]:
def discretizeValue(data: np.array, target: np.array) -> np.array:
    allDiscretInterval = []
    for ii in range(data.shape[1]):
        #Interval discretize
        discretisation = np.array([])

        
        feature_i = data[:,ii]

        #get the number of unique value
        uniqueTarget = nbUniqueTarget(target)

        
        pas = feature_i.mean()/uniqueTarget
        maxi = np.max(feature_i)
        mini = np.min(feature_i)
        discretisation = []
        while (mini+pas <= maxi):
            
            discretisation.append([mini,mini+pas])
            mini += pas #new min not the same as the ancient max 
        discretisation.append(np.array([mini,maxi]))
        allDiscretInterval.append(discretisation)
    
    return allDiscretInterval

In [464]:
def ProbaOneR(count: np.array, nbTarget: int) -> float:
    return np.divide(count,count.sum(axis=1)[:,np.newaxis])

In [474]:
from copy import deepcopy

def predictFeatureOneR(data: np.array, target: np.array):
    discretInterval = discretizeValue(data, target)
    
    countLabel = deepcopy(list(discretInterval))

    
    #Replace all value by zero
    UniqueTarget = nbUniqueTarget(target)
    for ii,c in enumerate(countLabel):#for each label interval list
        for i in range(len(c)):#for each interval
            c[i] = [0]*UniqueTarget
        countLabel[ii] = c
    for ii,features in enumerate(data):
         for jj,valueFeature in enumerate(features):
             for indiceInterval,interval in enumerate(discretInterval[jj]):
                 bornInf = interval[0]
                 bornSup = interval[1]
                 
                 if bornInf < valueFeature < bornSup:
                     x = 5
                     countLabel[jj][indiceInterval][target[ii]] += 1

    #calculation of probality of each feature
    selectFeature = -1
    maxiProb = -np.inf
    for iiSelectFeature,f in enumerate(countLabel):
        nbValue = ProbaOneR(np.array(f), UniqueTarget)
        
        prob = np.divide(nbValue.max(axis=1).sum(),len(nbValue))
        
        
        if prob > maxiProb:
            maxiProb = prob
            selectFeature = iiSelectFeature
    #set the feature to predict 
    nbValue = ProbaOneR(np.array(countLabel[selectFeature]), UniqueTarget)

    
    featurePredictList = np.argmax(nbValue, axis=1)
    print(nbValue)
    print(featurePredictList)
    return selectFeature, discretInterval[selectFeature], featurePredictList
predictFeatureOneR(X_train,y_train)

[[1.         0.         0.        ]
 [0.         1.         0.        ]
 [0.         0.83333333 0.16666667]
 [0.         0.07407407 0.92592593]
 [0.         0.         1.        ]]
[0 1 1 2 2]


(2,
 [[1.1, 2.3814814814814813],
  [2.3814814814814813, 3.662962962962963],
  [3.662962962962963, 4.944444444444445],
  [4.944444444444445, 6.225925925925926],
  array([6.22592593, 6.7       ])],
 array([0, 1, 1, 2, 2]))

In [475]:
def oneR(selectFeature: np.array, discretInterval: np.array, featurePredictList: np.array, new_data: np.array):
    for ii,interval in enumerate(discretInterval):
        if interval[0] < new_data[selectFeature] and new_data[selectFeature] < interval[1]:
            return featurePredictList[ii]

In [480]:
def AccuracyOneR(dataTrain: np.array, targetTrain: np.array, dataDev: np.array, targetDev: np.array)-> float:
    selectFeature, discretInterval, featurePredictList = predictFeature(dataTrain, targetTrain)
    accuracy = 0
    for ii,indiceName in enumerate(targetDev) :
        prediction = oneR(selectFeature,discretInterval,featurePredictList, dataDev[ii])
        if prediction == indiceName:
            accuracy += 1
    return accuracy/len(targetDev)

In [481]:
print(f"Accuracy of KNN with k=3 ,{accuracyKNN(X_train,y_train,3)}")
print(f"Accuracy of ZeroR ,{AccuracyZeroR(X_train, y_dev)}")
print(f"Accuracy of OneR,{AccuracyOneR(X_train,y_train,X_dev,y_dev)}")



Accuracy of KNN with k=3 ,0.9555555555555556
Accuracy of ZeroR ,0.38333333333333336
Accuracy of OneR,0.5166666666666667
