# Mini Project

## Using k-NN on the datasets

### 25 Datasets

In [1]:
from ucimlrepo import fetch_ucirepo 
from sklearn.preprocessing import MinMaxScaler

In [2]:
#25 datasets
dataset_ids =   {
                "parkinsons": 174,
                "page-blocks": 78,
                "optical": 80,
                "musk2": 75,
                "bc-wisc-diag": 17,
                "students": 697,
                "wine": 109,
                "magic": 159,
                "balance-scale": 12,
                "glass": 42,
                "zoo": 111,
                "waveform": 107,
                "image-segmentation": 50,
                "blood": 176,
                "spect": 95,
                "yeast": 110,
                "monk": 70,
                "ecoli": 39,
                "iris": 53,
                "contraception": 30,
                "fertility": 244,
                "conn-bench-sonar":  151,
                "landsat": 146,
                "ionosphere": 52,
                "letter": 59,
                }

In [3]:
def load_dataset(id, show_info = False):  
    # fetch dataset 
    dataset = fetch_ucirepo(id=id) 
    
    # data (as pandas dataframes) 
    X = dataset.data.features 
    y =dataset.data.targets 
    
    # dictionary gthering infos about the metadata (url, abstract, ... etc.)
    metadata_infos_dict = dataset.metadata
    
    if show_info:
        print('data url:\n', metadata_infos_dict['data_url'])
    
    # variable information
    var_infos = dataset.variables.to_numpy()
    
    data_vectors = X.to_numpy() #instance vectors with features
    features_names = X.columns.to_numpy() #getting the names of each feature
    
    data_labels = y.to_numpy() #output labels for each instance
    label_name = y.columns.to_numpy() # name of the output label
    
    return data_vectors, features_names, data_labels, label_name

In [4]:
def load_all_datasets(dataset_ids):
    
    datasets = {}
    for i, dataset_name in enumerate(dataset_ids.keys()):
        
        scaler_minmax = MinMaxScaler()
        
        X, X_names, y, y_name = load_dataset(id = dataset_ids[dataset_name])
        datasets[dataset_name] = {}
        datasets[dataset_name]["X"] = scaler_minmax.fit_transform(X)
        datasets[dataset_name]["X_names"] = X_names
        datasets[dataset_name]["y"] = y
        datasets[dataset_name]["y_name"] = y_name
        
        print(f"'{dataset_name}' dataset loaded ({i+1}/{len(dataset_ids.keys())})")
    
    return datasets

In [5]:
datasets = load_all_datasets(dataset_ids)

'parkinsons' dataset loaded (1/25)
'page-blocks' dataset loaded (2/25)
'optical' dataset loaded (3/25)
'musk2' dataset loaded (4/25)
'bc-wisc-diag' dataset loaded (5/25)
'students' dataset loaded (6/25)
'wine' dataset loaded (7/25)
'magic' dataset loaded (8/25)
'balance-scale' dataset loaded (9/25)
'glass' dataset loaded (10/25)
'zoo' dataset loaded (11/25)
'waveform' dataset loaded (12/25)
'image-segmentation' dataset loaded (13/25)
'blood' dataset loaded (14/25)
'spect' dataset loaded (15/25)
'yeast' dataset loaded (16/25)
'monk' dataset loaded (17/25)
'ecoli' dataset loaded (18/25)
'iris' dataset loaded (19/25)
'contraception' dataset loaded (20/25)
'fertility' dataset loaded (21/25)
'conn-bench-sonar' dataset loaded (22/25)
'landsat' dataset loaded (23/25)
'ionosphere' dataset loaded (24/25)
'letter' dataset loaded (25/25)


### Implementation of k-NN Classifier

#### Useful libraries

In [6]:
# Importation des bibliothèques nécessaires

import numpy as np
# from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

#### k-NN classifier

In [7]:
def predict_kNN(Xtr, ytr, Xtst, k: int = 1):
    knn = KNeighborsClassifier(n_neighbors=k)
    if len(ytr.shape) > 1:
        ytr = ytr[:, 0]
    knn.fit(Xtr, ytr)
    ypred = knn.predict(Xtst)
    return ypred

### Cross-validation methods

In [8]:
def data_split(Xtr, ytr, n_fold, num_sample = None, seed: int = 42):
    if n_fold >= 2:
        if num_sample is None:
            num_sample = len(Xtr)
            
        Tr_set = Xtr[:num_sample]
        Ltr_set = ytr[:num_sample]
        
        np.random.seed(seed)
        avg_len = num_sample//n_fold
        remainder = num_sample%n_fold
        
        extra_list = np.zeros(n_fold)
        extra_folders = np.random.choice(range(n_fold), size=remainder, replace=False)
        extra_list[extra_folders] = 1
        
        train_folders = []
        train_labels = []
        start_ind = 0
        end_ind = None
        for i in range(n_fold):
            folder_size = avg_len + int(extra_list[i])
            end_ind = start_ind + folder_size
            folder = Tr_set[start_ind:end_ind]
            labels = Ltr_set[start_ind:end_ind]

            train_folders.append(folder)
            train_labels.append(labels)
                
            start_ind = end_ind
            
        return train_folders, train_labels
    else:
        print("n_fold must be atleast 2")

def get_mean_accuracy(Tr_set_list, Ltr_set_list, k):
    n_fold = len(Tr_set_list)
    total_accuracy = 0
    
    for i in range(n_fold):
        Val_set = Tr_set_list[i]
        Lval_set = Ltr_set_list[i]
        
        indexes = list(range(n_fold)).pop(i)
        Tr_set = np.vstack(Tr_set_list[indexes])
        Ltr_set = np.vstack(Ltr_set_list[indexes])[:,0]
        
        Labels_predicted = predict_kNN(Tr_set, Ltr_set, Val_set, k)
        accuracy = accuracy_score(Labels_predicted, Lval_set)
        
        total_accuracy += accuracy
        
    mean_accuracy = total_accuracy/n_fold
    
    return mean_accuracy
        
def get_best_k(Tr_set_list, Ltr_set_list, k_list):
    max_accuracy = 0
    best_k_list = []
    for k in k_list:
        mean_accuracy = get_mean_accuracy(Tr_set_list, Ltr_set_list, k)
               
        if mean_accuracy > max_accuracy:
            best_k_list = []
            best_k_list.append(k)
            max_accuracy = mean_accuracy
        elif mean_accuracy == max_accuracy:
            best_k_list.append(k)
        
    return best_k_list

### Find best k value for each dataset and getting accuracy

In [9]:
def get_x_y(datasets, dataset_name):
    X = datasets[dataset_name]["X"]
    y = datasets[dataset_name]["y"]
    
    return X, y
    
def split_train_test(X, y, train_ratio = 0.9, seed = 42):
    np.random.seed(seed)
    
    n_instances = len(X)
    ntr = round(n_instances*train_ratio)
    
    indices = np.arange(n_instances)
    np.random.shuffle(indices)
    
    Xtr = X[indices[:ntr]]
    ytr = y[indices[:ntr]]
    
    Xtst = X[indices[ntr:]]
    ytst = y[indices[ntr:]]
    
    return Xtr, ytr, Xtst, ytst   

In [10]:
def apply_knn(datasets, dataset_name, n_fold, k_list):
    
    #Getting the data vectors
    X, y = get_x_y(datasets, dataset_name)
    Xtr, ytr, Xtst, ytst = split_train_test(X, y) #splitting train and test parts
    
    #Searching for the best k with cross-validation method
    Tr_set_list, Ltr_set_list = data_split(Xtr, ytr, n_fold)
    best_k_list = get_best_k(Tr_set_list, Ltr_set_list, k_list)
    best_k = best_k_list[-1]
    
    ypred = predict_kNN(Xtr, ytr, Xtst, k = best_k)

    #computing the accuracy
    accuracy = accuracy_score(ytst, ypred)
    
    return best_k, accuracy
    

def apply_knn_on_all(datasets, n_fold = 3, k_list = range(1,10)):
    
    for i, dataset_name in enumerate(datasets.keys()):
        print(f"Dataset {i+1}/{len(datasets.keys())}: {dataset_name}")
        
        best_k, accuracy = apply_knn(datasets, dataset_name, n_fold, k_list)
        
        print(f"Best k value: {best_k}, accuracy: {100*accuracy}%") 
        print('')
        
apply_knn_on_all(datasets)


Dataset 1/25: parkinsons
Best k value: 1, accuracy: 94.73684210526315%

Dataset 2/25: page-blocks
Best k value: 1, accuracy: 97.44058500914076%

Dataset 3/25: optical
Best k value: 1, accuracy: 99.11032028469751%

Dataset 4/25: musk2
Best k value: 1, accuracy: 95.45454545454545%

Dataset 5/25: bc-wisc-diag
Best k value: 1, accuracy: 96.49122807017544%

Dataset 6/25: students
Best k value: 1, accuracy: 64.02714932126696%

Dataset 7/25: wine
Best k value: 1, accuracy: 88.88888888888889%

Dataset 8/25: magic
Best k value: 1, accuracy: 80.91482649842271%

Dataset 9/25: balance-scale
Best k value: 1, accuracy: 76.19047619047619%

Dataset 10/25: glass
Best k value: 1, accuracy: 52.38095238095239%

Dataset 11/25: zoo
Best k value: 1, accuracy: 100.0%

Dataset 12/25: waveform
Best k value: 1, accuracy: 78.8%

Dataset 13/25: image-segmentation
Best k value: 1, accuracy: 85.71428571428571%

Dataset 14/25: blood
Best k value: 1, accuracy: 70.66666666666667%

Dataset 15/25: spect
Best k value: 1, 