In [181]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from algos.classical import *
import matplotlib.pyplot as plt
import scipy.io as sio    
import glob
from sklearn.model_selection import KFold
from statistics import mean, stdev, variance
from ReliefF import ReliefF
import time
import scipy
from sklearn.preprocessing import StandardScaler
from sklearn import cluster
from scipy.stats import pearsonr
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import MiniBatchKMeans

In [146]:
Accuracy_Table = pd.read_csv('./Filter/Accuracy.csv', index_col=0)
Features_Table = pd.read_csv('./Filter/Features.csv', index_col=0)
Time_Table = pd.read_csv('./Filter/Time.csv', index_col=0)
Fitness_Table = pd.read_csv('./Filter/Fitness.csv', index_col=0)
Curve_Table = pd.read_csv('./Filter/Convergence.csv', index_col=0)
Tables = [Accuracy_Table, Features_Table, Time_Table, Fitness_Table]

In [147]:
def Pearson(X, y, nfeat, sel_features = []):
    
    variables = []
    for col in range(X.shape[1]):
        ans = pearsonr(X[:, col], y.reshape(-1))
        variables.append({'col' : col, 'var' : abs(ans[0])})

    sortedvars = sorted(variables, key=lambda d: d['var'], reverse= True) 

    self_pearson = []

    X_new = np.empty((X.shape[0], 0))
    for i in range(min(X.shape[1], nfeat)):
        self_pearson.append(sortedvars[i]['col'])
        X_new = np.concatenate((X_new, X[:, sortedvars[i]['col']].reshape(-1, 1)), 1)

    Xp = X_new
    sel_features.append(self_pearson)
    return Xp

In [148]:
def Relief(X, y, nfeat, sel_features = []):

    n = np.size(X, 0)
    y_temp = y.reshape(n)        
    fs = ReliefF(n_neighbors=10, n_features_to_keep = nfeat)
    Xr = fs.fit_transform(X, y_temp)
    sel_features.append(fs.top_features[:nfeat])
    return Xr

In [149]:
def Fval(X, y, nfeat, sel_features = []):

    mf = SelectKBest(f_classif, k=nfeat)
    Xf = mf.fit_transform(X, y.reshape(-1))

    variables = []
    for col in range(X.shape[1]):
        variables.append({'col' : col, 'var' : mf.scores_[col]})

    sortedvars = sorted(variables, key=lambda d: d['var'], reverse= True) 

    self_fclass = []

    for i in range(min(X.shape[1], nfeat)):
        self_fclass.append(sortedvars[i]['col'])
    sel_features.append(self_fclass)
    return Xf

In [150]:
def MutualInfo(X, y, nfeat, sel_features = []):

    mf = SelectKBest(mutual_info_classif, k=nfeat)
    Xm = mf.fit_transform(X, y.reshape(-1))

    variables = []
    for col in range(X.shape[1]):
        variables.append({'col' : col, 'var' : mf.scores_[col]})

    sortedvars = sorted(variables, key=lambda d: d['var'], reverse= True) 

    self_mclass = []

    for i in range(min(X.shape[1], 100)):
        self_mclass.append(sortedvars[i]['col'])
    sel_features.append(self_mclass)
    return Xm

In [151]:
def Union1(X, y, nfeat):
    BaseMethods = [Pearson, Relief, Fval, MutualInfo]
    sel_features = []
    
    for method in BaseMethods:
        Xtemp = method(X, nfeat, sel_features)
    
    sel_features = np.asarray(sel_features)
    sel_features = [i[:50] for i in sel_features]
    union_array = np.asarray([])
    
    for i in range(len(sel_features)):
        union_array = np.union1d(union_array, sel_features[i])
    
    X_new = np.empty((X.shape[0], 0))
    for i in range(len(union_array)):
        X_new = np.concatenate((X_new, X[:, int(union_array[i])].reshape(-1, 1)), 1)
    
    return X_new

In [152]:
def Union2(X, y, nfeat):
    BaseMethods = [Pearson, Relief, Fval, MutualInfo]
    sel_features = []
    
    for method in BaseMethods:
        Xtemp = method(X, nfeat, sel_features)
    
    sel_features = np.asarray(sel_features)
    union_array = np.asarray([])
    
    for i in range(sel_features.shape[1]):
        for j in range(sel_features.shape[0]):
            union_array = np.union1d(union_array, np.asarray([sel_features[j][i]]))
        if len(union_array) >= 100:
            break
    
    X_new = np.empty((X.shape[0], 0))
    for i in range(len(union_array)):
        X_new = np.concatenate((X_new, X[:, int(union_array[i])].reshape(-1, 1)), 1)
    
    return X_new

In [153]:
def normalize(x):
    x = np.asarray(x)
    return (x - x.min()) / (np.ptp(x))


def UnionRankNorm(X, y, nfeat):
    
    pearson_score = []
    for col in range(X.shape[1]):
        ans = pearsonr(X[:, col], y.reshape(-1))
        pearson_score.append(abs(ans[0]))
    
    pearson_score = np.asarray(pearson_score)
    mms = MinMaxScaler()
    ps_norm = normalize(pearson_score)
    
    n = np.size(X, 0)
    y_temp = y.reshape(n)        
    fs = ReliefF(n_neighbors=10, n_features_to_keep = nfeat)
    Xr = fs.fit_transform(X, y_temp)
    relief_score = fs.feature_scores
    rs_norm = normalize(relief_score)
    
    mf = SelectKBest(f_classif, k=nfeat)
    Xf = mf.fit_transform(X, y.reshape(-1))
    f_score = mf.scores_
    fs_norm = normalize(f_score)
    
    mf = SelectKBest(mutual_info_classif, k=nfeat)
    Xm = mf.fit_transform(X, y.reshape(-1))
    mi_score = mf.scores_
    mi_norm  = normalize(mi_score)
    
    
    variables = []
    for col in range(X.shape[1]):
        avg_score = (ps_norm[col] + rs_norm[col] + fs_norm[col] + mi_norm[col])/4
        variables.append({'col' : col, 'var' : avg_score})

    sortedvars = sorted(variables, key=lambda d: d['var'], reverse= True) 
    
    X_new = np.empty((X.shape[0], 0))
    for i in range(min(X.shape[1], nfeat)):
        X_new = np.concatenate((X_new, X[:, sortedvars[i]['col']].reshape(-1, 1)), 1)

    return X_new
    
    
    

In [158]:
def UnionCluster(X, y, nfeat):
    
    pearson_score = []
    for col in range(X.shape[1]):
        ans = pearsonr(X[:, col], y.reshape(-1))
        pearson_score.append(abs(ans[0]))
    
    pearson_score = np.asarray(pearson_score)
    ps_std = (pearson_score - pearson_score.mean())/(pearson_score.std())
    
    n = np.size(X, 0)
    y_temp = y.reshape(n)        
    fs = ReliefF(n_neighbors=10, n_features_to_keep = nfeat)
    Xr = fs.fit_transform(X, y_temp)
    relief_score = fs.feature_scores
    rs_std = (relief_score - relief_score.mean())/(relief_score.std())
    
    mf = SelectKBest(f_classif, k=nfeat)
    Xf = mf.fit_transform(X, y.reshape(-1))
    f_score = mf.scores_
    fs_std = (f_score - f_score.mean())/(f_score.std())
    
    mf = SelectKBest(mutual_info_classif, k=nfeat)
    Xm = mf.fit_transform(X, y.reshape(-1))
    mi_score = mf.scores_
    ms_std = (mi_score - mi_score.mean())/(mi_score.std())
    
    matrix = np.asarray([ps_std, rs_std, fs_std, ms_std]).transpose()
    
    kmeans = KMeans(n_clusters = 3).fit(matrix)
    centres = kmeans.cluster_centers_
    scores = []
    for centre in centres:
        mean_score = np.mean(centre)
        scores.append(mean_score)
    
    max_score_index = 0
    for i in range(len(scores)):
        if scores[i] > scores[max_score_index]:
            max_score_index = i
    
    X_new = np.empty((X.shape[0], 0))
    
    labels = kmeans.labels_
    
    for i in range(len(labels)):
        if labels[i] == max_score_index:
            X_new = np.concatenate((X_new, X[:, i].reshape(-1, 1)), 1)

    return X_new

In [159]:
FilterMethods = [UnionCluster]

for method in FilterMethods:
    Filter_name = method.__name__.upper()
    
    for i in range(len(Tables)):
        if not Filter_name in Tables[i].columns:
            Tables[i][Filter_name] = ""
    
    if not Filter_name in Curve_Table.columns:
            Curve_Table[Filter_name] = ""
    
    files = glob.glob("datasets/*.mat")
    files = list(files)
    files.sort()
    
    for file in files:
        
        dataset_name = file[9:-4].upper()
        print(dataset_name)
        data  = sio.loadmat(file)
        X  = data['X']
        if type(X) is scipy.sparse.csc.csc_matrix:
            X = X.toarray()
        else:
            X = np.asarray(X)
        y = np.asarray(data['Y'])
        
        variables = []
        for col in range(X.shape[1]):
            variables.append({'col' : col, 'var' : np.var(X[:, col])})
        
        sortedvars = sorted(variables, key=lambda d: d['var'], reverse= True) 
        
        X_new = np.empty((X.shape[0], 0))
        for i in range(min(X.shape[1], 5000)):
            X_new = np.concatenate((X_new, X[:, sortedvars[i]['col']].reshape(-1, 1)), 1)
        
        X = X_new
        X_new = None
        variables = None
        sortedvars = None
        
        print('Low Variance Features removed')
        X = StandardScaler().fit_transform(X)
        print('Standardized')
        
        X = method(X, y, 100)
        print(f'{Filter_name} Filter approach applied')
        
        runs = 10
        Accuracy_Score = []
        Fitness_Score = []
        Selected_Features = []
        Convergence_curve = []
        Time = []
        Scores = [Accuracy_Score, Selected_Features, Time, Fitness_Score]
    
        k    = 5     # k-value in KNN
        N    = 20    # number of particles
        T    = 100   # maximum number of iterations

        opts = {'k':k, 'N':N, 'T':T}

        for i in range(runs):
            start_time = time.time()
            fmdl = pso(X, y, opts)
            time_taken = time.time() - start_time
            
            Acc       = fmdl['acc']
            num_feat = float(fmdl['num_feat'])
            curve   = fmdl['c']
            curve   = curve.reshape(np.size(curve,1))
            fitness = fmdl['fitness']

            print(f' Run {i+1}')
            print("Accuracy:", 100 * Acc)
            print("Feature Size:", num_feat)
            print("Fitness:", fitness)
            print("Time:", time_taken)
            print('------------------------------------')

            Accuracy_Score.append(Acc)
            Fitness_Score.append(fitness)
            Selected_Features.append(num_feat)
            Convergence_curve = curve
            Time.append(time_taken)

        print(f'Filter Approach : {Filter_name} Dataset : {dataset_name}')
        print("Accuracy:", 100*mean(Accuracy_Score))
        print("Feature Size:", mean(Selected_Features))
        print("Fitness:", mean(Fitness_Score))
        print("Time:", mean(Time))

        x = np.arange(0, opts['T'], 1.0) + 1.0
        fig, ax = plt.subplots()
        ax.plot(x, Convergence_curve, 'o-')
        ax.set_xlabel('Number of Iterations')
        ax.set_ylabel('Fitness')
        ax.set_title(f'{Filter_name}')
        ax.grid()
        plt.show()
        
        if not (Accuracy_Table['Dataset'] == dataset_name).any():
            arr1 = [dataset_name, 'All']
            arr2 = [dataset_name, 'Avg']
            arr3 = [dataset_name, 'Std']
            arr4 = [dataset_name]
            
            for i in range(len(Tables)):
                df1 = pd.DataFrame([arr1 + [np.nan]*(len(Tables[i].columns)-2)], columns=Tables[i].columns)
                df2 = pd.DataFrame([arr2 + [np.nan]*(len(Tables[i].columns)-2)], columns=Tables[i].columns)
                df3 = pd.DataFrame([arr3 + [np.nan]*(len(Tables[i].columns)-2)], columns=Tables[i].columns)
                Tables[i] = Tables[i].append(df1, ignore_index=True)
                Tables[i] = Tables[i].append(df2, ignore_index=True)
                Tables[i] = Tables[i].append(df3, ignore_index=True)
            
            df4 = pd.DataFrame([arr4 + [np.nan]*(len(Curve_Table.columns)-1)], columns=Curve_Table.columns)
            Curve_Table = Curve_Table.append(df4, ignore_index=True)
        
        for i in range(len(Tables)):
            index = Tables[i].index
            condition = (Tables[i]['Dataset'] == dataset_name) & (Tables[i]['Metric'] == 'All')
            index = index[condition].tolist()[0]
            Tables[i].at[index, Filter_name] = Scores[i]
            
            index = Tables[i].index
            condition = (Tables[i]['Dataset'] == dataset_name) & (Tables[i]['Metric'] == 'Avg')
            index = index[condition].tolist()[0]
            Tables[i].at[index, Filter_name] = mean(Scores[i])
            
            index = Tables[i].index
            condition = (Tables[i]['Dataset'] == dataset_name) & (Tables[i]['Metric'] == 'Std')
            index = index[condition].tolist()[0]
            Tables[i].at[index, Filter_name] = stdev(Scores[i])
        
        index = Curve_Table.index
        condition = (Curve_Table['Dataset'] == dataset_name)
        index = index[condition].tolist()[0]
        Curve_Table[Filter_name] = Curve_Table[Filter_name].astype(object)
        Curve_Table.at[index, Filter_name] = Convergence_curve
        
        file_name = './Filter/Convergence.csv'
        Curve_Table.to_csv(file_name)
        file_name = './Filter/Accuracy.csv'
        Tables[0].to_csv(file_name)
        file_name = './Filter/Features.csv'
        Tables[1].to_csv(file_name)
        file_name = './Filter/Time.csv'
        Tables[2].to_csv(file_name)
        file_name = './Filter/Fitness.csv'
        Tables[3].to_csv(file_name)


11_TUMORS
Low Variance Features removed
Standardized


ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 174 and the array at index 1 has size 4

Unnamed: 0_level_0,Metric,PEARSON
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1


In [127]:
Accuracy_Table = pd.DataFrame(columns=['Dataset', 'Metric'])

In [128]:
Features_Table = pd.DataFrame(columns=['Dataset', 'Metric'])

In [129]:
Time_Table = pd.DataFrame(columns=['Dataset', 'Metric'])

In [130]:
Fitness_Table = pd.DataFrame(columns=['Dataset', 'Metric'])

In [131]:
Curve_Table = pd.DataFrame(columns=['Dataset'])

In [132]:
Tables = [Accuracy_Table, Features_Table, Fitness_Table, Fitness_Table]

In [133]:
file_name = './Filter/Convergence.csv'
Curve_Table.to_csv(file_name)
file_name = './Filter/Accuracy.csv'
Tables[0].to_csv(file_name)
file_name = './Filter/Features.csv'
Tables[1].to_csv(file_name)
file_name = './Filter/Time.csv'
Tables[2].to_csv(file_name)
file_name = './Filter/Fitness.csv'
Tables[3].to_csv(file_name)

In [162]:
nfeat = 100

In [182]:
pearson_score = []
for col in range(X.shape[1]):
    ans = pearsonr(X[:, col], y.reshape(-1))
    pearson_score.append(abs(ans[0]))

pearson_score = np.asarray(pearson_score)
ps_std = (pearson_score - pearson_score.mean())/(pearson_score.std())

n = np.size(X, 0)
y_temp = y.reshape(n)        
fs = ReliefF(n_neighbors=10, n_features_to_keep = nfeat)
Xr = fs.fit_transform(X, y_temp)
relief_score = fs.feature_scores
rs_std = (relief_score - relief_score.mean())/(relief_score.std())

mf = SelectKBest(f_classif, k=nfeat)
Xf = mf.fit_transform(X, y.reshape(-1))
f_score = mf.scores_
fs_std = (f_score - f_score.mean())/(f_score.std())

mf = SelectKBest(mutual_info_classif, k=nfeat)
Xm = mf.fit_transform(X, y.reshape(-1))
mi_score = mf.scores_
ms_std = (mi_score - mi_score.mean())/(mi_score.std())

matrix = np.asarray([ps_std, rs_std, fs_std, ms_std]).transpose()

kmeans = MiniBatchKMeans(n_clusters = 3).fit(matrix)
centres = kmeans.cluster_centers_
scores = []
for centre in centres:
    mean_score = np.mean(centre)
    scores.append(mean_score)

max_score_index = 0
for i in range(len(scores)):
    if scores[i] > scores[max_score_index]:
        max_score_index = i

X_new = np.empty((X.shape[0], 0))

labels = kmeans.labels_

for i in range(len(labels)):
    if labels[i] == max_score_index:
        X_new = np.concatenate((X_new, X[:,i].reshape(-1, 1)), 1)

In [175]:
X[:,0].shape

(174,)

In [183]:
X_new.shape

(174, 817)