# Function

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Commonly used helper functions
def min_max_normalization(data):
    """Rescale ``data`` to the [0, 1] interval along axis 0.

    Parameters
    ----------
    data : array-like
        Values to rescale. The minimum and maximum are taken along axis 0,
        so a 2-D input is scaled column by column.

    Returns
    -------
    Same kind of object as ``data - min`` produces
        ``(data - min) / (max - min)``. A column whose values are all equal
        (zero range) is mapped to 0 instead of producing NaN.
    """
    col_min = np.min(data, axis = 0)
    col_range = np.max(data, axis = 0) - col_min
    # A constant column has max == min; dividing by that range would be 0/0.
    # Its numerator is already 0, so dividing by 1 yields a clean 0.
    safe_range = np.where(col_range == 0, 1, col_range)
    return (data - col_min) / safe_range

def concatenate_two_1D_array(X1,X2):
    """Place two equal-length arrays side by side as the columns of an (n, 2) array.

    Each input is first reshaped into a single column of ``len(input)`` rows;
    the two columns are then joined horizontally, ``X1`` on the left.
    """
    first_column = X1.reshape((len(X1), 1))
    second_column = X2.reshape((len(X2), 1))
    return np.hstack((first_column, second_column))

def sort_by_row_in_2D_array_descending(X,row_number):
    """Reorder the rows of ``X`` so one column is in descending order.

    Despite its name, ``row_number`` is used as a column index: the values
    ``X[:, row_number]`` are the sort keys. Returns a new array holding the
    rows of ``X`` arranged from the largest key to the smallest.
    """
    # Sorting the negated keys ascending gives the descending order of the keys.
    descending_order = (-X[:, row_number]).argsort()
    return X[descending_order]

def select_feature(X,selected_feature):
    """Return the columns of ``X`` picked out by ``selected_feature``, keeping every row."""
    every_row = slice(None)
    return X[every_row, selected_feature]

def accuracy(y,y_pred):
    """Fraction of predictions that equal the true labels.

    Parameters
    ----------
    y : array-like
        True labels.
    y_pred : array-like
        Predicted labels; must contain as many elements as ``y``.

    Returns
    -------
    numpy.float64
        Number of matching positions divided by the number of labels.

    Raises
    ------
    ValueError
        If the inputs hold a different number of elements, or are empty.
    """
    # Flatten both sides first: comparing an (n, 1) column with an (n,) vector
    # would otherwise broadcast to an n-by-n matrix and give a meaningless score.
    y_true = np.asarray(y).ravel()
    y_hat = np.asarray(y_pred).ravel()
    if y_true.size != y_hat.size:
        raise ValueError(
            "y and y_pred must have the same number of elements, got {} and {}".format(
                y_true.size, y_hat.size))
    if y_true.size == 0:
        raise ValueError("accuracy is undefined for empty input")
    return np.mean(y_true == y_hat)

def save_ndarray_to_csv(X,filename):
    """Write ``X`` to ``<filename>.csv`` through a pandas DataFrame.

    ``filename`` is given without extension; ".csv" is appended. The
    DataFrame's index is written as the first column of the file.
    """
    target_path = "{}.csv".format(filename)
    pd.DataFrame(X).to_csv(target_path, index=True)

In [3]:
# Separate categorical features from continuous features
# Decide, column by column, whether a feature is continuous
def Is_Continuous(X_train, unique_value_threshold):
    """Flag the columns of ``X_train`` that hold many distinct values.

    A column is flagged (True) when its number of unique values is strictly
    greater than ``unique_value_threshold``. Returns one flag per column, in
    column order, as a NumPy array.
    """
    flags = []
    for column_index in range(X_train.shape[1]):
        n_distinct = len(np.unique(X_train[:, column_index]))
        flags.append(n_distinct > unique_value_threshold)
    return np.array(flags)

# check len unique
def check_len_unique(X_train):
    """Count the distinct values in every column of ``X_train``.

    Returns a float array with one entry per column, in column order.
    """
    n_columns = X_train.shape[1]
    distinct_counts = [len(np.unique(X_train[:, col])) for col in range(n_columns)]
    # float dtype matches the zero-initialised buffer this function has always returned
    return np.array(distinct_counts, dtype=float)

In [4]:
# RELIEF-F
from sklearn.neighbors import NearestNeighbors

def diff(x_1,x_2,is_categorical):
    """Distance between two values of a single feature, as used by RELIEF-F.

    Parameters
    ----------
    x_1, x_2 : scalar
        The two feature values to compare.
    is_categorical : bool-like
        Truthy -> the feature is categorical, falsy -> it is continuous.

    Returns
    -------
    number
        Categorical: 0 when the values are equal, 1 otherwise.
        Continuous: the absolute difference ``|x_1 - x_2|``.
    """
    # Plain truthiness (instead of `== True` / `== False`) guarantees that one
    # of the two branches always runs, so the function never returns None.
    if is_categorical:
        return 0 if x_1 == x_2 else 1
    return np.abs(x_1 - x_2)

def _diff_all_features(x_1, x_2, is_categorical):
    """Per-feature distance between two samples: 0/1 mismatch for categorical
    features, absolute difference for continuous ones (same rule as diff())."""
    return np.where(is_categorical, x_1 != x_2, np.abs(x_1 - x_2))


def reliefF(X_train, y_train, n_neighbors, is_categorical):
    """Score every feature with the RELIEF-F algorithm.

    For each sample, its ``n_neighbors`` nearest same-class neighbours (hits)
    and its ``n_neighbors`` nearest neighbours from every other class found
    nearby (misses) are looked up with an L1 nearest-neighbour search. A
    feature loses weight when it differs between the sample and its hits and
    gains weight when it differs between the sample and its misses; each miss
    class is weighted by ``P(class) / (1 - P(class of the sample))``.

    Parameters
    ----------
    X_train : numpy.ndarray, shape (n_samples, n_features)
        Numeric feature matrix.
    y_train : numpy.ndarray, shape (n_samples,)
        Non-negative integer class labels (they are passed to ``np.bincount``
        and used as indices into the class-frequency table).
    n_neighbors : int
        Number of hits, and of misses per miss class, used for each sample.
        Must be at least 1.
    is_categorical : sequence of bool, length n_features
        True where the feature is categorical, False where it is continuous.

    Returns
    -------
    numpy.ndarray, shape (n_features,)
        Accumulated feature weights. The sum over samples is not divided by
        ``n_samples``.

    Raises
    ------
    ValueError
        If ``n_neighbors`` is smaller than 1, or if some sample has fewer than
        ``n_neighbors`` hits, or fewer than ``n_neighbors`` misses of a class
        that occurs among its candidate neighbours.

    Notes
    -----
    A miss class that does not occur at all among a sample's candidate
    neighbours contributes nothing for that sample and raises no error.
    """
    if n_neighbors < 1:
        raise ValueError("n_neighbors must be at least 1")

    # prepare
    n_samples, n_features = X_train.shape
    is_categorical = np.asarray(is_categorical, dtype=bool)
    weight_of_feature = np.zeros(n_features)

    # get the frequency of each label in y_train
    counts = np.bincount(y_train)
    frequency = counts / len(y_train)

    # Ask scikit-learn for a generous pool of candidate neighbours per sample.
    # The pool can never be larger than the number of *other* samples.
    n_multiple = 10 # With lab5's ball-like bad data, 10x is enough
    n_candidates = min(n_neighbors * len(counts) * n_multiple, n_samples - 1)
    if n_candidates < n_neighbors:
        raise ValueError("length of hit_indices is less than n_neighbors")
    neigh = NearestNeighbors(n_neighbors = n_candidates, metric='minkowski', p = 1)
    neigh.fit(X_train)
    # Called without X, kneighbors() returns the neighbours of every training
    # sample and never reports a sample as its own neighbour, which stays
    # correct even when the data contains exact duplicates.
    hit_miss_indices = neigh.kneighbors(return_distance=False)

    # for each sample i
    for i in range(n_samples):
        candidates = hit_miss_indices[i]          # ordered nearest first
        candidate_labels = y_train[candidates]
        is_hit = candidate_labels == y_train[i]

        # hits: the n_neighbors nearest candidates of the same class
        hit_indices = candidates[is_hit]
        if len(hit_indices) < n_neighbors:
            raise ValueError("length of hit_indices is less than n_neighbors")
        hit_indices = hit_indices[:n_neighbors]

        # misses: the n_neighbors nearest candidates of every other class
        # present in the pool, kept in nearest-first order
        miss_candidates = candidates[~is_hit]
        miss_labels = candidate_labels[~is_hit]
        keep = np.zeros(len(miss_candidates), dtype=bool)
        for miss_class in np.unique(miss_labels):
            positions = np.flatnonzero(miss_labels == miss_class)
            if len(positions) < n_neighbors:
                raise ValueError("length of miss_indices is less than n_neighbors")
            keep[positions[:n_neighbors]] = True
        miss_indices = miss_candidates[keep]

        # update all feature weights at once, one neighbour at a time
        for hit_idx in hit_indices:
            weight_of_feature -= _diff_all_features(X_train[i], X_train[hit_idx], is_categorical) / n_neighbors
        for miss_idx in miss_indices:
            weight_of_miss = frequency[y_train[miss_idx]]/(1 - frequency[y_train[i]]) # calculate weighted average of miss classes
            weight_of_feature += _diff_all_features(X_train[i], X_train[miss_idx], is_categorical) / n_neighbors * weight_of_miss

    return weight_of_feature

# Load Dataset

In [5]:
# Read the preprocessed feature tables and the training labels.
train_feature = pd.read_csv('train_feature_preprocessed.csv')
test_feature = pd.read_csv('test_feature_preprocessed.csv')
train_label = pd.read_csv('Dataset/train_label.csv')

# Drop the 'Unnamed: 0' column (presumably the index saved with the feature
# tables) and hand everything on as plain NumPy arrays.
train_feature = train_feature.drop(columns='Unnamed: 0').to_numpy()
test_feature = test_feature.drop(columns='Unnamed: 0').to_numpy()
train_label = train_label.to_numpy()

# Split

In [6]:
from sklearn.model_selection import StratifiedKFold

# Grid search with stratified k-fold cross-validation: for every fold, rank
# the features with RELIEF-F, then score a KNN classifier for each
# (number of top features, number of KNN neighbours) pair.
X = train_feature
y = train_label
k = 5 # a hyper-parameter: number of folds
stratified_kf = StratifiedKFold(n_splits = k,shuffle = True,random_state = 0)
n_neigh_RELIEF = 10 # a hyper-parameter
number_of_features_choosed_by_RELIEF = 10 # a hyper-parameter
threshold_of_unique_value = 23 # a hyper-parameter
# one slot per (feature count, KNN neighbours) pair, accumulated over folds
result_sum = np.zeros((number_of_features_choosed_by_RELIEF*n_neigh_RELIEF,))
for train_index, val_index in stratified_kf.split(X, y):

    # split
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # flatten the (n, 1) label columns to 1-D vectors
    y_train = y_train.reshape(len(y_train),)
    y_val = y_val.reshape(len(y_val),)

    # number of samples and features
    n_sample = X_train.shape[0]
    n_feature = X_train.shape[1]

    # min-max normalization: the scaling is learned on the training fold only
    # and then applied unchanged to the validation fold, so both folds live on
    # the same scale and no validation statistics leak into the model
    train_min = np.min(X_train, axis = 0)
    train_range = np.max(X_train, axis = 0) - train_min
    train_range = np.where(train_range == 0, 1, train_range) # constant column -> 0, not NaN
    X_train = (X_train - train_min) / train_range
    X_val = (X_val - train_min) / train_range

    # split features into continuous and categorical
    is_continuous = Is_Continuous(X_train, threshold_of_unique_value)
    is_categorical = ~is_continuous

    # RELIEF-F
    weight_of_feature = reliefF(X_train, y_train, n_neigh_RELIEF, is_categorical)

    # rank the features once per fold: column 0 = weight, column 1 = feature index
    weight_of_feature_concatenate = concatenate_two_1D_array(weight_of_feature,np.arange(n_feature))
    weight_of_feature_concatenate = sort_by_row_in_2D_array_descending(weight_of_feature_concatenate,0)

    # knn
    result = np.zeros((number_of_features_choosed_by_RELIEF*n_neigh_RELIEF,3))
    count_knn = 0
    # i runs from 1 up to and including number_of_features_choosed_by_RELIEF,
    # so every row of `result` is filled
    for i in range(1,number_of_features_choosed_by_RELIEF + 1,1):

        # select top i features
        selected_feature = weight_of_feature_concatenate[0:i,1]
        selected_feature = selected_feature.astype(int)
        X_train_RELIEF = select_feature(X_train,selected_feature)
        X_val_RELIEF = select_feature(X_val,selected_feature)

        # perform knn
        for n_neigh_knn in np.arange(1,n_neigh_RELIEF + 1,1): # a hyper-parameter
            knn = KNeighborsClassifier(n_neighbors=n_neigh_knn,p = 1,metric = 'minkowski')
            knn.fit(X_train_RELIEF, y_train)
            y_val_pred = knn.predict(X_val_RELIEF)
            result[count_knn,0] = i
            result[count_knn,1] = n_neigh_knn
            result[count_knn,2] = accuracy(y_val,y_val_pred)
            count_knn += 1
    result_sum += result[:,2]
# average the accumulated accuracies over the folds
result_sum = result_sum / k

In [7]:
# Write the fold-averaged validation accuracies to 'result_5_new.csv'.
save_ndarray_to_csv(result_sum,'result_5_new')