<a href="https://colab.research.google.com/github/Ofir408/ml-feature-selection/blob/main/ofir_final_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Ofir Ben Shoham


Implementation of the paper: "A two-stage gene selection scheme utilizing MRMR filter and GA wrapper"

Link: https://link.springer.com/content/pdf/10.1007/s10115-010-0288-x.pdf

Data Preprocessing notebook: https://github.com/Ofir408/ml-feature-selection/blob/main/data_preprocessing.ipynb

## Setup

In [None]:
!pip install numpy Cython --quiet
!pip install -U pymrmr --quiet
#!pip install mrmr --quiet
!pip install sklearn-genetic --quiet
!pip install skfeature-chappers --quiet
#!pip install -U sklearn-features --quiet

[?25l[K     |████▊                           | 10 kB 28.7 MB/s eta 0:00:01[K     |█████████▍                      | 20 kB 26.5 MB/s eta 0:00:01[K     |██████████████▏                 | 30 kB 33.6 MB/s eta 0:00:01[K     |██████████████████▉             | 40 kB 16.0 MB/s eta 0:00:01[K     |███████████████████████▋        | 51 kB 14.7 MB/s eta 0:00:01[K     |████████████████████████████▎   | 61 kB 17.1 MB/s eta 0:00:01[K     |████████████████████████████████| 69 kB 6.3 MB/s 
[?25h  Building wheel for pymrmr (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 139 kB 15.5 MB/s 
[K     |████████████████████████████████| 115 kB 59.8 MB/s 
[K     |████████████████████████████████| 66 kB 4.4 MB/s 
[?25h

In [None]:
!pip install git+https://github.com/jundongl/scikit-feature.git


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/jundongl/scikit-feature.git
  Cloning https://github.com/jundongl/scikit-feature.git to /tmp/pip-req-build-exa4s1nv
  Running command git clone -q https://github.com/jundongl/scikit-feature.git /tmp/pip-req-build-exa4s1nv
Building wheels for collected packages: skfeature
  Building wheel for skfeature (setup.py) ... [?25l[?25hdone
  Created wheel for skfeature: filename=skfeature-1.0.0-py3-none-any.whl size=61510 sha256=be91ba62de7dacc7a6bdef79671da1b62fc3157ead6249d82491a209ecfe612f
  Stored in directory: /tmp/pip-ephem-wheel-cache-4ewwnuuy/wheels/be/18/8d/fd247ee221428efa6071101e0e322224b42a33b65b130ace7c
Successfully built skfeature
Installing collected packages: skfeature
Successfully installed skfeature-1.0.0


Connect to drive

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import pandas as pd
import pymrmr
import numpy as np
from sklearn.naive_bayes import GaussianNB
from genetic_selection import GeneticSelectionCV


Get the paths of all the datasets.

In [None]:
import glob
ds_paths = glob.glob("/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/*/*.csv", recursive=True)


In [None]:
# lets see the first dataset path.
print(f'first ds path={ds_paths[0]}')
print(f'number of datasets: {len(ds_paths)}')

first ds path=/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/scikit-feature/arcene.csv
number of datasets: 20


20 datasets as required.

## Toy Dataset

In [168]:
toy_df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/spect/SPECTF.train',
                     delimiter = ",", header=None, names=[str(x) for x in range(45)])

In [None]:
toy_df.columns

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24',
       '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36',
       '37', '38', '39', '40', '41', '42', '43', '44'],
      dtype='object')

In [169]:
target_column = '0'

## Paper model implementation

In [177]:
def paper_algo_toy_sample(X, y, k):
  """Two-stage feature selection: an MRMR filter followed by a GA wrapper.

  Args:
    X: feature DataFrame; its column names are used to report the selection.
    y: target labels.
    k: number of features kept by the MRMR (SelectKBest) stage.

  Returns:
    np.ndarray with one entry per column of X, in column order:
    1 if the feature survived both stages, 0 otherwise.
  """
  all_columns = list(X.columns)

  # Stage 1: keep the k features that score best under run_mrmr.
  mrmr_filter = SelectKBest(score_func=run_mrmr, k=k)
  mrmr_filter.fit(X, y)
  filtered_X = X[mrmr_filter.get_feature_names_out()]

  # Stage 2: genetic search over the filtered features, scored by the
  # cross-validated accuracy of a Gaussian naive Bayes classifier.
  ga_selector = GeneticSelectionCV(
      estimator=GaussianNB(), scoring='accuracy', n_generations=20,
      crossover_proba=0.8, mutation_independent_proba=0.1, n_population=100,
      n_jobs=-1, caching=True)
  ga_selector = ga_selector.fit(filtered_X, y)
  survivors = filtered_X.columns[ga_selector.support_]

  return np.array([int(column in survivors) for column in all_columns])

X = toy_df.drop(columns=[target_column])
y = toy_df[target_column]


In [203]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import random 
from random import randrange

def toy_dataset_runner(X, y, run_count):
  """Run the paper algorithm on the toy dataset with an increasing MRMR k.

  Run number i (1-based) keeps the top-i features in the MRMR stage, so the
  first three runs use k = 1, 2, 3. Earlier versions indexed a fixed
  three-element list and raised IndexError for run_count > 3; now any
  run_count is accepted (k must still not exceed the number of columns in X).

  Args:
    X: feature DataFrame.
    y: target labels.
    run_count: number of runs to perform.

  Returns:
    List with one 0/1 selection array (see paper_algo_toy_sample) per run.
  """
  selected_features_list = []
  for k in range(1, run_count + 1):
    # Re-seed Python's RNG per run so each k is repeatable
    # (presumably the GA draws from `random` -- TODO confirm).
    random.seed(10 * k)
    selected_features_list.append(paper_algo_toy_sample(X, y, k=k))
  return selected_features_list
  

In [204]:
toy_dataset_runner(X, y, 3)

[array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1])]

## General Settings

In [None]:
# define the k options
top_k_features = [1,2,3,4,5,10,15,20,25,30,50,100]

In [None]:
from sklearn.model_selection import LeavePOut, LeaveOneOut, StratifiedKFold

def get_cv_method(samples_num):
  """Pick the cross-validation splitter according to the dataset size.

  - fewer than 50 samples:                 LeavePOut(p=2)
  - 50 to 100 samples (inclusive):         LeaveOneOut()
  - more than 100 and fewer than 1000:     StratifiedKFold(n_splits=10)
  - 1000 samples or more:                  StratifiedKFold(n_splits=5)

  Also prints the sample count.
  """
  print(f'samples_num={samples_num}')
  if samples_num < 50:
    splitter = LeavePOut(p=2)
  elif samples_num <= 100:
    splitter = LeaveOneOut()
  elif samples_num < 1000:
    splitter = StratifiedKFold(n_splits=10)
  else:
    splitter = StratifiedKFold(n_splits=5)
  return splitter


## Feature selection methods

In [None]:
from skfeature.function.information_theoretical_based import MRMR

def run_mrmr(X, y, n_selected_features=200):
  """Score features with skfeature's MRMR, in the shape SelectKBest expects.

  Args:
    X: feature matrix of shape (n_samples, n_features).
    y: target labels.
    n_selected_features: upper bound on how many features MRMR is asked to
      pick (default 200, the value used throughout this notebook).

  Returns:
    np.ndarray with one score per feature. Features picked by MRMR carry the
    score MRMR reported for them (third return value of MRMR.mrmr); all other
    features get 0.
  """
  n_features = X.shape[1]
  # Never ask for more features than exist: the toy dataset in this notebook
  # has only 44 feature columns, fewer than the 200 requested by default.
  n_to_select = min(n_selected_features, n_features)
  selected_indexes, _, scores = MRMR.mrmr(X, y, n_selected_features=n_to_select)
  rank_scores = [0] * n_features
  for inx, score in zip(selected_indexes, scores):
    rank_scores[inx] = score
  return np.array(rank_scores)


In [None]:
from sklearn.feature_selection import SelectFdr

def run_f_classif(X, y):
  """Fit SelectFdr (alpha=0.1) on X, y and return its per-feature scores.

  SelectFdr is constructed with its default score function; only the
  `scores_` attribute of the fitted selector is used.

  Returns:
    np.ndarray of scores, one per feature.
  """
  fdr_selector = SelectFdr(alpha=0.1)
  fdr_selector.fit(X, y)
  feature_scores = np.array(fdr_selector.scores_)
  return feature_scores


In [None]:
from sklearn.svm import SVC, SVR, LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.feature_selection import RFE

def run_rfe(X, y):
  """Rank features with recursive feature elimination around a linear SVM.

  Args:
    X: feature matrix.
    y: target labels.

  Returns:
    np.ndarray with one score per feature where larger is better, so it can
    be used as a SelectKBest score function.
  """
  # Hinge loss gives a linear SVM. The seed is fixed so that repeated runs
  # produce the same ranking (SGD shuffles the data between epochs).
  estimator = SGDClassifier(loss='hinge', max_iter=1000, tol=1e-3, random_state=0)
  selector = RFE(estimator, n_features_to_select=1, step=5)
  selector.fit(X, y)
  # ranking_ looks like array([4, 3, 1, 2]): 1 is the best feature, then 2, etc.
  # Subtract from a large constant so that the best rank gets the highest score.
  return np.array([10000000 - rank for rank in selector.ranking_])


In [None]:
from skfeature.function.similarity_based import reliefF

def run_relieff(X, y):
  """Return the result of skfeature's reliefF.reliefF(X, y) unchanged.

  Used in this notebook as a SelectKBest score function.
  """
  relieff_scores = reliefF.reliefF(X, y)
  return relieff_scores

In [None]:
from sklearn.feature_selection import SelectKBest

def paper_algo(X, y):
  """Paper pipeline: keep the 200 best MRMR features, then refine with a GA.

  Args:
    X: feature DataFrame (column names are required).
    y: target labels.

  Returns:
    np.ndarray aligned with X.columns: 1 where the feature was selected by
    the genetic algorithm, 0 otherwise.
  """
  original_columns = list(X.columns)

  # Filter stage: SelectKBest driven by the MRMR scores.
  mrmr_stage = SelectKBest(score_func=run_mrmr, k=200)
  mrmr_stage.fit(X, y)
  kept_by_mrmr = mrmr_stage.get_feature_names_out()
  reduced_X = X[kept_by_mrmr]

  # Wrapper stage: genetic search scored by Gaussian naive Bayes accuracy.
  ga_stage = GeneticSelectionCV(
      estimator=GaussianNB(),
      scoring='accuracy',
      n_generations=20,
      crossover_proba=0.8,
      mutation_independent_proba=0.1,
      n_population=100,
      n_jobs=-1,
      caching=True,
  )
  ga_stage = ga_stage.fit(reduced_X, y)
  chosen = reduced_X.columns[ga_stage.support_]

  selection_mask = [int(name in chosen) for name in original_columns]
  return np.array(selection_mask)


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from operator import itemgetter

def paper_algo_improvement(X, y):
  """Variant of paper_algo: random-forest importances replace the MRMR filter.

  The 200 features with the highest importance of a shallow random forest
  are passed to the same genetic-algorithm wrapper used by paper_algo.

  Args:
    X: feature DataFrame (column names are required).
    y: target labels.

  Returns:
    np.ndarray aligned with X.columns: 1 where the feature was selected by
    the genetic algorithm, 0 otherwise.
  """
  original_columns = list(X.columns)

  # Filter stage: rank features by random-forest importance, keep the top 200.
  forest = RandomForestClassifier(max_depth=2, random_state=0)
  forest.fit(X, y)
  importance_by_feature = dict(zip(original_columns, forest.feature_importances_))
  ranked = sorted(importance_by_feature.items(), key=itemgetter(1), reverse=True)
  top_features = dict(ranked[:200])
  reduced_X = X[top_features.keys()]

  # Wrapper stage: genetic search scored by Gaussian naive Bayes accuracy.
  ga_stage = GeneticSelectionCV(
      estimator=GaussianNB(), scoring='accuracy', n_generations=20,
      crossover_proba=0.8, mutation_independent_proba=0.1, n_population=100,
      n_jobs=-1, caching=True)
  ga_stage = ga_stage.fit(reduced_X, y)
  chosen = reduced_X.columns[ga_stage.support_]

  selection_mask = [int(name in chosen) for name in original_columns]
  return np.array(selection_mask)


## Models

In [None]:
# taken from https://stackoverflow.com/questions/50285973/pipeline-multiple-classifiers
from sklearn.base import BaseEstimator

class ClfSwitcher(BaseEstimator):
  """Thin wrapper that lets any classifier be plugged into a Pipeline step.

  Every method delegates to the wrapped `estimator`.
  """

  def __init__(self, estimator):
      # estimator: the classifier instance that does the actual work.
      self.estimator = estimator

  def fit(self, X, y=None, **kwargs):
      """Fit the wrapped estimator and return this wrapper.

      Extra keyword arguments are forwarded to the estimator's `fit`
      (previously they were accepted but silently dropped).
      """
      self.estimator.fit(X, y, **kwargs)
      return self

  def predict(self, X, y=None):
      """Return the wrapped estimator's predictions for X (y is ignored)."""
      return self.estimator.predict(X)

  def predict_proba(self, X):
      """Return the wrapped estimator's class probabilities for X."""
      return self.estimator.predict_proba(X)

  def score(self, X, y):
      """Return the wrapped estimator's own score on (X, y)."""
      return self.estimator.score(X, y)


Run feature selection for each dataset and each cross-validation fold, and save the selected features to files.


In [None]:
from sklearn.feature_selection import f_classif
from timeit import default_timer
import json 
from sklearn.feature_selection import SelectKBest
import datetime

def calc_hash(ds_name, score_func_name):
  """Build the key '<ds_name>_<score_func_name>' used to count folds per dataset/method."""
  return '{}_{}'.format(ds_name, score_func_name)

counters = {}
c = 0

class FeatureSelectionSwitcher:
  """Runs several score functions through SelectKBest and stores each result.

  For every score function, the top `k` (200) features and their scores are
  written to a JSON file on Drive. The fold number in the file name comes
  from the module-level `counters` dict, which counts calls per
  (dataset, score function) pair.
  """

  def __init__(self, ds_name=""):
    self.ds_name = ds_name
    self.k = 200

  def fit(self, X, y, score_functions):
    """Select the top-k features with each score function and save them.

    Args:
      X: feature DataFrame (column names become the saved feature names).
      y: target labels.
      score_functions: callables usable as a SelectKBest `score_func`.

    Returns:
      self
    """
    for score_func in score_functions:
      started_at = default_timer()
      selector = SelectKBest(score_func, k=self.k)
      selector.fit(X, y)
      fs_time = round(default_timer() - started_at, 2)

      feature_names = selector.get_feature_names_out()
      kept_scores = selector.scores_[selector.get_support()]
      rounded_scores = np.array([round(score, 2) for score in kept_scores])

      # Order the (feature, score) pairs by ascending score before saving.
      score_by_feature = dict(zip(feature_names, rounded_scores))
      ordered_pairs = sorted(score_by_feature.items(), key=lambda item: item[1])
      selected_features = np.array([feature for feature, _ in ordered_pairs])
      selected_features_scores = np.array([score for _, score in ordered_pairs])
      self.write_to_file(selected_features, selected_features_scores, score_func, fs_time)
    return self

  def write_to_file(self, selected_features, selected_features_scores, score_func, fit_time):
    """Write one feature-selection result to its per-fold JSON file.

    Args:
      selected_features: array of selected feature names.
      selected_features_scores: array of the matching scores.
      score_func: the score function used; its name (without a 'run_'
        prefix) identifies the method in the file name and payload.
      fit_time: seconds spent on the selection.
    """
    global counters

    score_func_name = score_func.__name__.replace('run_', '')
    hash_key = calc_hash(self.ds_name, score_func_name)
    # The n-th call for this dataset/method is treated as fold n.
    counters[hash_key] = counters.get(hash_key, 0) + 1

    file_time = str(datetime.datetime.now().time())
    results = {
        'ds_name': self.ds_name,
        'score_function': score_func_name,
        'file_time': file_time,
        'fe_fit_time': fit_time,
        'selected_features': selected_features.tolist(),
        'selected_features_scores': selected_features_scores.tolist()
    }
    with open(f'/content/gdrive/MyDrive/ml-bgu/fs_results_try/ds={self.ds_name},fs_method={score_func_name},fold={counters[hash_key]}.json', 'w') as out_file:
      out_file.write(json.dumps(results))
    
class CustomFeatureSelectionSwitcher(FeatureSelectionSwitcher):
  """Switcher for score functions that return a 0/1 selection mask.

  Instead of going through SelectKBest, every feature whose score equals 1
  is treated as selected and saved with a score of 1.
  """

  def fit(self, X, y, score_functions):
    """Run each mask-producing score function on (X, y) and save its selection.

    Returns:
      self
    """
    for score_func in score_functions:
      started_at = default_timer()
      column_names = np.array(X.columns)
      mask_scores = score_func(X, y)
      is_selected = [score == 1 for score in mask_scores]
      fs_time = round(default_timer() - started_at, 2)

      selected_features = column_names[is_selected]
      selected_features_scores = np.array([1] * len(selected_features))
      self.write_to_file(selected_features, selected_features_scores, score_func, fs_time)
    return self



Define the runner for the feature selection part.

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from tempfile import mkdtemp
import os 
from shutil import rmtree

target_column = 'y'

def run_fs_per_ds_helper(ds_file_path, fs_class, fs_score_functions):
  """Run feature selection on every training fold of one dataset.

  The dataset is split with the CV method chosen by get_cv_method; for each
  fold, `fs_class` is fitted on the training part with the given score
  functions (the switcher classes write their results to files).

  Args:
    ds_file_path: path of the preprocessed dataset CSV.
    fs_class: switcher class accepting `ds_name=` and exposing
      `fit(X, y, score_functions)`.
    fs_score_functions: list of score functions to run in each fold.
  """
  ds_name = os.path.basename(ds_file_path)
  print(f'run for ds={ds_name}')
  df = pd.read_csv(ds_file_path)
  # Inspect the frame that was just loaded. The previous version re-read the
  # CSV from the global `ds_path`, i.e. possibly from a different dataset.
  drop_cols = [col for col in df.columns if 'Unnamed' in col]
  drop_cols.append(target_column)
  print(f'drop_cols={drop_cols}')
  X = df.drop(columns=drop_cols)
  y = df[target_column]
  cv_method = get_cv_method(y.shape[0])
  is_multi_class = len(set(y)) > 2
  print(f'is_multi_class={is_multi_class}')

  for train_index, _ in cv_method.split(X, y):
    pipeline = Pipeline([
        ('fs', fs_class(ds_name=ds_name))
    ])
    X_train, y_train = X.iloc[train_index, :], y.iloc[train_index]
    print(f'X_train.shape={X_train.shape}, y_train.shape={y_train.shape}')
    pipeline.fit(X_train, y_train, fs__score_functions=fs_score_functions)


def run_fs_per_ds(ds_file_path):
  """Run all feature-selection methods on one dataset.

  The four score-based filters go through FeatureSelectionSwitcher first;
  the two GA-based methods, which return a 0/1 mask, then go through
  CustomFeatureSelectionSwitcher.
  """
  stages = (
      (FeatureSelectionSwitcher, [run_f_classif, run_relieff, run_mrmr, run_rfe]),
      (CustomFeatureSelectionSwitcher, [paper_algo_improvement, paper_algo]),
  )
  for fs_class, score_functions in stages:
    run_fs_per_ds_helper(ds_file_path, fs_class, score_functions)

In [None]:
#run_fs_per_ds('/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/scikit-feature/Yale.csv')

import glob 
import os
np.seterr(divide='ignore', invalid='ignore')
import warnings
warnings.filterwarnings('ignore')

paths_to_run = [
                '/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/microbiome_data/kostic.csv',
                '/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/microbiome_data/turnbaugh.csv',
                '/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/scikit-feature/warpAR10P.csv',
                '/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/mAML_benchmark_datasets/Morgan2012_IBD.3.csv',
                '/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/mAML_benchmark_datasets/Gevers2014_IBD_ileum.csv',
                '/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/mAML_benchmark_datasets/Gevers2014_IBD_rectum.csv',
                '/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/mAML_benchmark_datasets/Ravel2011_Vaginal.csv',
                '/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/mAML_benchmark_datasets/Costello2009_Subject.7.csv',
                '/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/scikit-feature/Yale.csv',
                '/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/scikit-feature/Carcinom.csv',
                '/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/Datamicroarray/singh.csv',
                '/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/Datamicroarray/christensen.csv',
                '/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/Datamicroarray/yeoh.csv',
                '/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/Datamicroarray/sorlie.csv',
                '/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/scikit-feature/arcene.csv',
                '/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/Datamicroarray/gravier.csv',
                '/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/microbiome_data/claesson.csv',
                '/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/microbiome_data/bushman_cafe.csv',
                '/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/microbiome_data/david.csv',
                '/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/scikit-feature/orlraws10P.csv',
                ]

for ds_path in paths_to_run:
  run_fs_per_ds(ds_path)


In [None]:
# NB, SVM, LogisticsRegression, RandomForest, k-nearest neighbors (K-NN)
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

classifiers = [GaussianNB(), SVC(gamma='auto', random_state=0, probability=True),
       LogisticRegression(random_state=0, max_iter=1000), RandomForestClassifier(random_state=0),
       KNeighborsClassifier(n_neighbors=3)] 

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import average_precision_score
from sklearn import preprocessing

def mc_average_precision_score(y_true, y_score):
  """Micro-averaged average precision for multi-class labels.

  Both label vectors are one-hot encoded with a LabelBinarizer fitted on
  `y_true` before being passed to average_precision_score.

  Args:
    y_true: true class labels.
    y_score: predicted class labels (binarized the same way as y_true).

  Returns:
    The micro-averaged average precision, or np.nan when
    average_precision_score fails on this input.
  """
  binarizer = preprocessing.LabelBinarizer()
  binarizer.fit(y_true)
  y_true_bin = binarizer.transform(y_true)
  y_pred_bin = binarizer.transform(y_score)
  try:
    return average_precision_score(y_true=y_true_bin, y_score=y_pred_bin, average='micro')
  except Exception:
    # Best effort: an undefined metric is recorded as NaN. `Exception` (rather
    # than a bare except) lets KeyboardInterrupt/SystemExit propagate.
    return np.nan

def roc_auc_score_func(y_true, y_score, *, average="macro", multi_class="raise"):
  """ROC AUC that yields np.nan instead of raising ValueError.

  Arguments are passed straight through to roc_auc_score.
  """
  try:
    auc = roc_auc_score(y_true, y_score, average=average, multi_class=multi_class)
  except ValueError:
    auc = np.nan
  return auc



In [None]:
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

def get_scores_metrics(is_multi_class: bool):
  """Build the scorers used to evaluate a fitted classifier.

  Args:
    is_multi_class: True when the target has more than two classes.

  Returns:
    Dict mapping the metric name ('ACC', 'MCC', 'PR-AUC', 'AUC') to a scorer
    created with make_scorer.
  """
  if is_multi_class:
    pr_auc_func = mc_average_precision_score
    auc_multi_class = 'ovr'
  else:
    pr_auc_func = average_precision_score
    auc_multi_class = 'raise'

  # NOTE(review): needs_proba is only enabled for multi-class targets, so the
  # binary AUC appears to be computed from hard predictions -- confirm intended.
  return {
      'ACC': make_scorer(accuracy_score),
      'MCC': make_scorer(matthews_corrcoef),
      'PR-AUC': make_scorer(pr_auc_func),
      'AUC': make_scorer(roc_auc_score_func, multi_class=auc_multi_class, needs_proba=is_multi_class),
  }


##  Run classifiers

In [None]:
import json
import os.path

def get_features_and_scores(ds_name, score_func_name, fold_num):
  """Load the saved feature-selection result for one dataset/method/fold.

  The file is looked up in `fs_results_try` first; when it is not there,
  the `fs_results_parallel` directory is used instead.

  Returns:
    (selected_features, selected_features_scores): two lists ordered from
    the highest score to the lowest.
  """
  path = f'/content/gdrive/MyDrive/ml-bgu/fs_results_try/ds={ds_name},fs_method={score_func_name},fold={fold_num}.json'
  if not os.path.isfile(path):
    path = f'/content/gdrive/MyDrive/ml-bgu/fs_results_parallel/ds={ds_name},fs_method={score_func_name},fold={fold_num}.json'
  with open(path, 'r') as result_file:
    json_body = json.load(result_file)

  score_by_feature = dict(zip(json_body['selected_features'], json_body['selected_features_scores']))
  # Bigger scores are treated as better for every method.
  ranked = sorted(score_by_feature.items(), key=lambda item: item[1], reverse=True)
  selected_features = [feature for feature, _ in ranked]
  selected_features_scores = [score for _, score in ranked]
  return selected_features, selected_features_scores

def get_top_k(ds_name, score_func, fold_num, k):
  """Return the k best saved features (and scores) for a dataset/method/fold.

  Args:
    ds_name: dataset file name used when the results were saved.
    score_func: score function; its name without the 'run_' prefix
      identifies the method.
    fold_num: 1-based fold number of the saved result.
    k: number of features to keep, or 'all' to keep every saved feature.

  Returns:
    (features, scores) lists, best first.
  """
  method_name = score_func.__name__.replace('run_', '')
  features, scores = get_features_and_scores(ds_name, method_name, fold_num)
  if k != 'all':
    features, scores = features[:k], scores[:k]
  return features, scores
  

In [None]:
import time 

def calc_scores_metrics(clf, X_test, y_test, is_multi_class):
  """Evaluate a fitted classifier with every metric from get_scores_metrics.

  For each metric, a `predict_proba` call over the test set is timed to
  estimate the average per-sample inference time.

  Returns:
    Dict mapping the metric name to a tuple
    (score rounded to 3 decimals, average inference time rounded to 5 decimals).
  """
  n_test_samples = X_test.shape[0]
  scores = {}
  for metrics_name, scorer_func in get_scores_metrics(is_multi_class).items():
    timer_start = time.time()
    clf.predict_proba(X_test)  # result is discarded; the call is only timed.
    avg_inference_time = (time.time() - timer_start) / n_test_samples
    score_value = scorer_func(clf, X_test, y_test)
    scores[metrics_name] = (round(score_value, 3), round(avg_inference_time, 5))
  return scores

Choose the k best features, reading from the feature selection files for efficiency 

In [None]:
class FeatureSelection:
  """Pipeline step that selects features from saved feature-selection files.

  Nothing is computed here: `fit` looks up the k best features for the
  configured dataset, fold and score function via get_top_k, and
  `transform` keeps only those columns.
  """

  def __init__(self, ds_name, fold_num, score_func):
    self.ds_name = ds_name
    self.fold_num = fold_num
    self.score_func = score_func
    # Method name as used in the result files (score function name without 'run_').
    self.score_func_name = score_func.__name__.replace('run_', '')
    self.X = None
    self.selected_features = []

  def fit(self, X, y, k):
    """Load the names of the k best features; X and y are not used.

    k is given at fit time so that one step can be reused for several k values.
    """
    self.selected_features, _ = get_top_k(self.ds_name, self.score_func, self.fold_num, k)
    return self

  def transform(self, X):
    """Return X restricted to the selected feature columns."""
    return X[self.selected_features]

  def set_params(self, **parameters):
    """Set each given parameter as an attribute and return self."""
    for name in parameters:
      setattr(self, name, parameters[name])
    return self

  def get_params(self, deep=True):
    """Return the constructor arguments as a dict (`deep` is ignored)."""
    return dict(ds_name=self.ds_name, fold_num=self.fold_num, score_func=self.score_func)



class CustomFeatureSelection(FeatureSelection):
  """FeatureSelection variant that remembers a default k ('all' unless given).

  The previous constructor passed its arguments to the parent in the wrong
  positions (score_func as fold_num, k as score_func), which raised
  AttributeError on `'all'.__name__`. Arguments are now passed by keyword.
  """

  def __init__(self, ds_name, score_func, k='all', fold_num=1):
    # NOTE(review): fold_num defaults to 1 only because the old signature did
    # not take it -- pass the real fold number when reading per-fold files.
    super().__init__(ds_name=ds_name, fold_num=fold_num, score_func=score_func)
    self.k = k

  def fit(self, X, y, k=None):
    """Load the selected features; falls back to the stored k when none is given."""
    return super().fit(X, y, self.k if k is None else k)

  def get_params(self, deep=True):
    """Return the constructor arguments, including k, as a dict."""
    params = super().get_params(deep=deep)
    params['k'] = self.k
    return params


In [None]:
def get_best_auc_configuration(ds_name):
  """Return the best-scoring configuration row of a dataset as a dict.

  Reads the module-level `auc_results_per_k_df` and picks, among the rows
  whose 'dataset_name' equals `ds_name`, the one with the highest
  'measure_value'.
  """
  is_dataset = auc_results_per_k_df['dataset_name'] == ds_name
  dataset_rows = auc_results_per_k_df[is_dataset]
  best_row_label = dataset_rows['measure_value'].idxmax()
  return dataset_rows.loc[best_row_label].to_dict()


In [None]:
from shutil import rmtree
from sklearn.model_selection import cross_validate
import time
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
from dataclasses import dataclass
import pickle
simplefilter("ignore", category=ConvergenceWarning)
np.seterr(divide='ignore', invalid='ignore')
cachedir = mkdtemp()

# Result record: one row per (dataset, fold, filtering algorithm, classifier, k, metric).
@dataclass
class Result:
  """One evaluation measurement produced by run_experiment."""
  dataset_name: str  # dataset file name (basename of the CSV path)
  samples_num: int  # number of rows in the dataset
  original_features_num: int  # number of feature columns before selection
  filtering_algorithm: str  # feature-selection method name without the 'run_' prefix
  learning_algorithm: str  # class name of the classifier
  k: str  # number of features kept; in practice an int or the string 'all' is stored
  cv_method: str  # class name of the cross-validation splitter
  fold_num: int  # 1-based fold number
  measure_type: str  # metric name, e.g. 'ACC', 'MCC', 'PR-AUC', 'AUC'
  measure_value: float  # metric value (rounded by calc_scores_metrics)
  avg_inference_time: float  # average predict_proba time per test sample, in seconds
  fit_time: float  # wall-clock seconds spent in pipeline.fit
  

def run_experiment(ds_paths, k_options, clfs, fs_methods, part4_preprocessing = None):
  """Evaluate every (k, feature-selection method, classifier) combination.

  For each dataset and each CV fold, a pipeline is fitted on the training
  part and scored on the test part with all metrics of get_scores_metrics.
  Results collected so far are backed up after each dataset.

  Args:
    ds_paths: paths of the preprocessed dataset CSV files.
    k_options: numbers of top features to evaluate.
    clfs: classifier instances to evaluate.
    fs_methods: score functions identifying the saved feature-selection
      results; methods with 'paper' in their name always use k='all'.
    part4_preprocessing: optional callable
      (X_train, y_train, X_test, y_test, best_config) -> same four objects.
      When given, it replaces the feature-selection pipeline step.

  Returns:
    List of Result records, one per metric of every evaluated combination.

  Uses the module-level `target_column` and `cachedir`.
  """
  results = []
  for ds_path in ds_paths:
    ds_name = os.path.basename(ds_path)
    df = pd.read_csv(ds_path)
    # Reuse the loaded frame instead of reading the CSV a second time.
    drop_cols = [col for col in df.columns if 'Unnamed' in col]
    drop_cols.append(target_column)

    X = df.drop(columns=drop_cols)
    y = df[target_column]
    samples_num = y.shape[0]
    is_multi_class = len(set(y)) > 2
    cv = get_cv_method(samples_num)
    for fold_num, (train_index, test_index) in enumerate(cv.split(X, y)):
      X_train, y_train = X.iloc[train_index, :], y.iloc[train_index]
      X_test, y_test = X.iloc[test_index, :], y.iloc[test_index]
      if part4_preprocessing is not None:
        best_config = get_best_auc_configuration(ds_name)
        X_train, y_train, X_test, y_test = part4_preprocessing(X_train, y_train, X_test, y_test, best_config)

      for k in k_options:
        for fs_method in fs_methods:
          fs_method_name = fs_method.__name__
          # Keep the override local to this method. Assigning to the loop
          # variable `k` would leak 'all' into every method listed after a
          # 'paper' method for the current k.
          method_k = 'all' if 'paper' in fs_method_name else k
          for clf in clfs:
            if part4_preprocessing is None:
              pipeline = Pipeline([
                  ('fs', FeatureSelection(ds_name=ds_name, fold_num=fold_num+1, score_func=fs_method)),
                  ('clf', ClfSwitcher(clf)),
              ], memory=cachedir)
            else:
              pipeline = Pipeline([
                  ('clf', ClfSwitcher(clf)),
              ], memory=cachedir)

            start_fit_time = time.time()
            if part4_preprocessing is None:
              pipeline.fit(X_train, y_train, fs__k=method_k)
            else:
              pipeline.fit(X_train, y_train)
            total_fit_time = time.time() - start_fit_time

            scores = calc_scores_metrics(pipeline, X_test, y_test, is_multi_class)
            for measure_type, (measure_value, inference_time) in scores.items():
              result = Result(dataset_name=ds_name, samples_num=samples_num, original_features_num=X.shape[1],
                    filtering_algorithm=fs_method_name.replace('run_', ''),
                    learning_algorithm=type(clf).__name__, k=method_k, cv_method=type(cv).__name__,
                    fold_num=fold_num + 1, measure_type=measure_type, measure_value=measure_value,
                    avg_inference_time=inference_time, fit_time=total_fit_time)
              results.append(result)
    # Persist what has been gathered so far, once per dataset.
    backup_results(results)
  return results

def backup_results(current_results):
  """Pickle the results gathered so far to Drive.

  Called after each dataset; the backup file is overwritten on every call.
  """
  backup_path = '/content/gdrive/MyDrive/ml-bgu/results/pickles/clf_result.pkl'
  with open(backup_path, 'wb') as backup_file:
    pickle.dump(current_results, backup_file)


# NOTE(review): this runs as soon as the cell is executed, i.e. before
# run_experiment is called in a later cell, so the cache directory is removed
# before any pipeline uses it -- confirm this is intended.
rmtree(cachedir)


In [None]:
import random

fs_methods = [run_f_classif, run_relieff, run_rfe, run_mrmr, paper_algo, paper_algo_improvement]
results = run_experiment(ds_paths, top_k_features, classifiers, fs_methods)

In [None]:
results[0:4]

[Result(dataset_name='arcene.csv', samples_num=200, original_features_num=1000, filtering_algorithm='f_classif', learning_algorithm='GaussianNB', k=1, cv_method='StratifiedKFold', fold_num=1, measure_type='ACC', measure_value=0.7, avg_inference_time=0.0001, fit_time=0.04646563529968262),
 Result(dataset_name='arcene.csv', samples_num=200, original_features_num=1000, filtering_algorithm='f_classif', learning_algorithm='GaussianNB', k=1, cv_method='StratifiedKFold', fold_num=1, measure_type='MCC', measure_value=0.394, avg_inference_time=9e-05, fit_time=0.04646563529968262),
 Result(dataset_name='arcene.csv', samples_num=200, original_features_num=1000, filtering_algorithm='f_classif', learning_algorithm='GaussianNB', k=1, cv_method='StratifiedKFold', fold_num=1, measure_type='PR-AUC', measure_value=0.594, avg_inference_time=9e-05, fit_time=0.04646563529968262),
 Result(dataset_name='arcene.csv', samples_num=200, original_features_num=1000, filtering_algorithm='f_classif', learning_algori

In [None]:
results_df = pd.DataFrame(results)
results_df.to_csv('/content/gdrive/MyDrive/ml-bgu/results/clfs_part2_results.csv')
results_df.sample(5)


Unnamed: 0,dataset_name,samples_num,original_features_num,filtering_algorithm,learning_algorithm,k,cv_method,fold_num,measure_type,measure_value,avg_inference_time,fit_time
477191,warpAR10P.csv,130,1000,mrmr,LogisticRegression,5,StratifiedKFold,2,AUC,0.979,0.00013,0.30923
159571,orlraws10P.csv,100,1000,paper_algo,LogisticRegression,all,LeaveOneOut,91,AUC,,0.00183,0.30947
17869,Morgan2012_IBD.3.csv,128,2061,paper_algo_improvement,LogisticRegression,all,StratifiedKFold,3,MCC,-0.108,0.00019,0.03491
14693,Morgan2012_IBD.3.csv,128,2061,rfe,RandomForestClassifier,3,StratifiedKFold,1,MCC,-0.33,0.0009,0.148622
481479,warpAR10P.csv,130,1000,relieff,KNeighborsClassifier,5,StratifiedKFold,5,AUC,0.794,0.00018,0.010533


### Get feature selection results

In [None]:
def from_file_to_dict_result(file_path):
  """Load one feature-selection JSON file and attach its fold number.

  The fold number is parsed from the file name, which is expected to end
  with 'fold=<n>.json'. Only the base name is inspected, so a directory
  whose name contains 'fold' no longer breaks the parsing.

  Args:
    file_path: path of a result file written by the feature-selection step.

  Returns:
    The decoded JSON dict with an extra integer 'fold_num' entry.

  Raises:
    ValueError: if no fold number can be found in the file name.
  """
  import os
  import re

  fold_match = re.search(r'fold=?(\d+)\.json$', os.path.basename(file_path))
  if fold_match is None:
    raise ValueError(f'cannot parse the fold number from file name: {file_path}')
  with open(file_path, 'r') as f:
    json_fs = json.load(f)
  json_fs['fold_num'] = int(fold_match.group(1))
  return json_fs

In [None]:
import glob

all_fs_files = glob.glob("/content/gdrive/MyDrive/ml-bgu/fs_results_try/*.json")

In [None]:
# Accumulators filled by process_json_result, one entry per processed file.
selected_features_list = []
selectd_features_scores_list = []

def process_json_result(dict_result):
  """Move the feature/score lists of one result into the module-level accumulators.

  'selected_features' and 'selected_features_scores' are removed from
  `dict_result`, ordered from the highest score to the lowest, and appended
  to `selected_features_list` and `selectd_features_scores_list`.

  Returns:
    The same dict, now without the two list entries.
  """
  features = dict_result.pop('selected_features')
  scores = dict_result.pop('selected_features_scores')
  ranked = sorted(dict(zip(features, scores)).items(), key=lambda item: item[1], reverse=True)
  selected_features_list.append([feature for feature, _ in ranked])
  selectd_features_scores_list.append([score for _, score in ranked])
  return dict_result

# Load every feature-selection result file; the feature/score lists go to the
# module-level accumulators and the remaining metadata is collected here.
dict_results = []
for json_file_path in all_fs_files:
  dict_result = process_json_result(from_file_to_dict_result(json_file_path))
  dict_results.append(dict_result)


In [None]:
fs_df = pd.DataFrame(dict_results)
fs_df['selected_features'] = pd.Series(selected_features_list)
fs_df['selected_features_scores'] = pd.Series(selectd_features_scores_list)
fs_df = fs_df.sort_values(by=['ds_name', 'score_function'])
fs_df.sample(5)


Unnamed: 0,ds_name,score_function,file_time,fe_fit_time,fold_num,selected_features,selected_features_scores
1975,singh.csv,paper_algo_improvement,05:30:58.876399,5.68,18,"[Feature5908, Feature6184, Feature7066, Featur...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
463,orlraws10P.csv,f_classif,08:34:09.826528,0.01,17,"[Feature7511, Feature17815, Feature28233, Feat...","[116.29, 116.29, 114.43, 113.57, 113.57, 111.9..."
485,orlraws10P.csv,relieff,08:41:40.519374,0.1,22,"[Feature4613, Feature14917, Feature6854, Featu...","[451282.55, 451282.55, 448698.02, 448698.02, 4..."
1359,sorlie.csv,mrmr,12:33:39.669758,121.07,6,"[Feature60, Feature305, Feature761, Feature121...","[2.19, 2.19, 2.19, 2.19, 2.14, 2.14, 2.14, 2.1..."
801,orlraws10P.csv,paper_algo,10:43:31.278901,98.08,1,"[Feature8, Feature224, Feature226, Feature672,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [None]:
len(set(fs_df['ds_name']))

20

In [None]:
fs_df.to_csv('/content/gdrive/MyDrive/ml-bgu/results/fs_df.csv')

## Part 4 from the project

In [None]:
!pip install datomize --quiet

In [None]:
import pandas as pd
results_df = pd.read_csv('/content/gdrive/MyDrive/ml-bgu/results/clfs_part2_results.csv')

In [None]:
results_df

Unnamed: 0.1,Unnamed: 0,dataset_name,samples_num,original_features_num,filtering_algorithm,learning_algorithm,k,cv_method,fold_num,measure_type,measure_value,avg_inference_time,fit_time
0,0,arcene.csv,200,1000,f_classif,GaussianNB,1,StratifiedKFold,1,ACC,0.700,0.00010,0.046466
1,1,arcene.csv,200,1000,f_classif,GaussianNB,1,StratifiedKFold,1,MCC,0.394,0.00009,0.046466
2,2,arcene.csv,200,1000,f_classif,GaussianNB,1,StratifiedKFold,1,PR-AUC,0.594,0.00009,0.046466
3,3,arcene.csv,200,1000,f_classif,GaussianNB,1,StratifiedKFold,1,AUC,0.697,0.00009,0.046466
4,4,arcene.csv,200,1000,f_classif,SVC,1,StratifiedKFold,1,ACC,0.700,0.00011,0.019222
...,...,...,...,...,...,...,...,...,...,...,...,...,...
518395,518395,Yale.csv,165,3072,paper_algo_improvement,RandomForestClassifier,all,StratifiedKFold,10,AUC,0.984,0.00086,0.212608
518396,518396,Yale.csv,165,3072,paper_algo_improvement,KNeighborsClassifier,all,StratifiedKFold,10,ACC,0.688,0.00019,0.023805
518397,518397,Yale.csv,165,3072,paper_algo_improvement,KNeighborsClassifier,all,StratifiedKFold,10,MCC,0.687,0.00017,0.023805
518398,518398,Yale.csv,165,3072,paper_algo_improvement,KNeighborsClassifier,all,StratifiedKFold,10,PR-AUC,0.493,0.00021,0.023805


In [None]:
# Drop repeated measurements (rows identical in every listed column), keeping the last one.
part4_df = results_df.drop_duplicates(
    subset=['dataset_name', 'samples_num', 
            'original_features_num', 'filtering_algorithm', 
            'learning_algorithm', 'k', 'cv_method', 'fold_num', 'measure_type',
            'measure_value'], keep='last')
# Keep only the accuracy rows.
# NOTE(review): later names say "auc" but the filter is 'ACC' -- confirm which metric is intended.
part4_df = part4_df[part4_df['measure_type'] == 'ACC']
# Remove the columns that are not needed for the per-configuration average.
part4_df = part4_df.drop(columns=['samples_num', 'original_features_num', 'cv_method', 'avg_inference_time', 'fit_time'
, 'fold_num', 'Unnamed: 0'])

In [None]:
# Mean measure value per (dataset, filtering algorithm, classifier, k).
# numeric_only=True keeps the text column 'measure_type' out of the mean;
# recent pandas versions raise on non-numeric columns instead of dropping them.
auc_results_per_k_df = part4_df.groupby(by=['dataset_name', 'filtering_algorithm', 'learning_algorithm', 'k']).mean(numeric_only=True).reset_index()

In [None]:
auc_results_per_k_df

Unnamed: 0,dataset_name,filtering_algorithm,learning_algorithm,k,measure_value
0,Carcinom.csv,f_classif,GaussianNB,1,0.2994
1,Carcinom.csv,f_classif,GaussianNB,10,0.2683
2,Carcinom.csv,f_classif,GaussianNB,100,0.5385
3,Carcinom.csv,f_classif,GaussianNB,15,0.2858
4,Carcinom.csv,f_classif,GaussianNB,2,0.2994
...,...,...,...,...,...
4995,yeoh.csv,rfe,SVC,3,0.7580
4996,yeoh.csv,rfe,SVC,30,0.9798
4997,yeoh.csv,rfe,SVC,4,0.8507
4998,yeoh.csv,rfe,SVC,5,0.8987


In [None]:
from sklearn.decomposition import KernelPCA
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.feature_selection import SelectKBest
from sklearn.dummy import DummyClassifier
from collections import Counter
from operator import itemgetter
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import Normalizer, PowerTransformer
from sklearn.feature_selection import VarianceThreshold
def warn(*args, **kwargs):
    """No-op stand-in for `warnings.warn`: accepts any arguments and does nothing."""
    pass
import warnings
# Replace `warnings.warn` with the no-op above, so every later call to
# `warnings.warn` in this kernel is silently ignored.
warnings.warn = warn

# Name -> callable lookup for the feature-selection ("filtering") algorithms.
# The keys are the values stored in the `filtering_algorithm` results column.
fa_factory_dict = dict([
    ('f_classif', run_f_classif),
    ('relieff', run_relieff),
    ('rfe', run_rfe),
    ('mrmr', run_mrmr),
    ('paper_algo', paper_algo),
    ('paper_algo_improvement', paper_algo_improvement),
])

# Class name -> classifier instance lookup, built from the `classifiers` list.
# If two classifiers share a class name, the later one wins.
classifiers_factory_dict = dict()
for clf in classifiers:
  classifiers_factory_dict.update({type(clf).__name__: clf})


def get_filtering_algorithm(filtering_algorithm_name):
  """Return the feature-selection callable registered under the given name.

  Raises KeyError when the name is not a key of `fa_factory_dict`.
  """
  fa_function = fa_factory_dict[filtering_algorithm_name]
  return fa_function

def get_classifier(classifier_name):
  """Return the classifier instance registered under the given class name.

  Raises KeyError when the name is not a key of `classifiers_factory_dict`.
  """
  classifier = classifiers_factory_dict[classifier_name]
  return classifier


def run_smote_augmentation(X_train, y_train):
  """Oversample the training set with BorderlineSMOTE.

  Features and labels are concatenated column-wise (aligned on their index) and
  every row containing a NaN is dropped before resampling.

  Args:
    X_train: training features (pandas DataFrame).
    y_train: training labels; must end up as a column called 'y' after the
      concatenation (i.e. a Series named 'y').

  Returns:
    (X_train_aug, y_train_aug). When no rows are left, or the smallest class has
    fewer than two samples, SMOTE cannot pick a neighbour inside that class, so
    the NaN-free data is returned without augmentation instead of raising.
  """
  df = pd.concat([X_train, y_train], axis=1).dropna()
  X_train = df.drop(columns=['y'])
  y_train = df['y']
  y_train_counter = Counter(y_train)
  print(y_train_counter)
  # default=0 covers the "everything was dropped" case.
  smallest_class_size = min(y_train_counter.values(), default=0)
  if smallest_class_size < 2:
    return X_train, y_train
  # k_neighbors may not exceed the number of *other* samples in the smallest class.
  k_neighbors = min(5, smallest_class_size - 1)
  sm = BorderlineSMOTE(random_state=0, k_neighbors=k_neighbors)
  X_train_aug, y_train_aug = sm.fit_resample(X_train, y_train)
  return X_train_aug, y_train_aug

def part4_preprocessing(X_train, y_train, X_test, y_test, best_config):
  """Build the enriched, oversampled feature sets for the part-4 experiment.

  Steps:
    1. Keep the features chosen by the configured filtering algorithm
       (top-k via SelectKBest, or every feature scored 1 when k == 'all').
    2. Append the output of a linear KernelPCA followed by an RBF KernelPCA,
       fitted on the selected training features.
    3. Oversample the training set with `run_smote_augmentation`.
    4. Replace any remaining NaN with 0.

  All intermediate frames keep the row labels of the incoming X_train / X_test.
  Rebuilding them with a fresh RangeIndex would mis-align the index-based
  `join` below and the features/labels concat inside `run_smote_augmentation`,
  producing NaN rows that are then dropped or zero-filled.

  Args:
    X_train, X_test: feature matrices (the 'all' branch needs DataFrames).
    y_train, y_test: labels; y_test is returned untouched.
    best_config: mapping with the keys 'filtering_algorithm' and 'k'.

  Returns:
    (X_train, y_train, X_test, y_test) after the steps above.
  """
  fa_function = get_filtering_algorithm(best_config['filtering_algorithm'])
  k = best_config['k'] if best_config['k']=='all' else int(best_config['k'])

  print(f'k={k}, best_config={best_config}')
  part4_pipeline = Pipeline(
      [
       ('lin_kernel_pca', KernelPCA(kernel='linear', n_jobs=-1)),
       ('rbs_kernel_pca', KernelPCA(kernel='rbf', n_jobs=-1))
       ]
  )
  # None (plain arrays) makes pandas fall back to a default RangeIndex.
  train_index = getattr(X_train, 'index', None)
  test_index = getattr(X_test, 'index', None)

  if k == 'all':
    # The filtering algorithm marks each selected feature with a score of 1.
    columns = np.array(X_train.columns)
    scores = fa_function(X_train, y_train)
    support = [True if score == 1 else False for score in scores]
    selected_features = columns[support]
    x_train_k_best = X_train[selected_features]
    x_test_k_best = X_test[selected_features]
  else:
    select_k_best = SelectKBest(fa_function, k=k)
    train_selected = select_k_best.fit_transform(X_train, y_train)
    selected_features = select_k_best.get_feature_names_out()
    x_train_k_best = pd.DataFrame(train_selected, columns=selected_features, index=train_index)
    x_test_k_best = pd.DataFrame(select_k_best.transform(X_test), columns=selected_features, index=test_index)

  # Kernel-PCA components share the index of the frame they are joined to.
  train_components = pd.DataFrame(
      part4_pipeline.fit_transform(x_train_k_best, y_train), index=x_train_k_best.index)
  test_components = pd.DataFrame(
      part4_pipeline.transform(x_test_k_best), index=x_test_k_best.index)
  X_train = x_train_k_best.join(train_components)
  X_test = x_test_k_best.join(test_components)

  X_train, y_train = run_smote_augmentation(X_train, y_train)
  X_train = X_train.fillna(0)
  X_test = X_test.fillna(0)
  return X_train, y_train, X_test, y_test


def part4_runner(ds_paths, output_path='/content/gdrive/MyDrive/ml-bgu/results/part4_df.csv'):
  """Re-run each dataset with its best configuration plus the part-4 preprocessing.

  For every dataset path, the best configuration is looked up with
  `get_best_auc_configuration`, and `run_experiment` is called with that single
  classifier, filtering algorithm and k, using `part4_preprocessing`.

  Args:
    ds_paths: iterable of dataset file paths.
    output_path: CSV file the collected results are written to. Defaults to the
      location used by the rest of the notebook.

  Returns:
    DataFrame with the results collected from all datasets (the same frame that
    is written to `output_path`).
  """
  results = []
  for ds_path in ds_paths:
    print(f'ds_path={ds_path}')
    ds_name = os.path.basename(ds_path)
    best_config = get_best_auc_configuration(ds_name)
    clf = get_classifier(best_config['learning_algorithm'])
    fa_function = get_filtering_algorithm(best_config['filtering_algorithm'])
    # k is either the literal 'all' or a number stored as text.
    best_k_config = best_config['k'] if best_config['k']=='all' else int(best_config['k'])
    results.extend(
        run_experiment([ds_path], [best_k_config], [clf], fs_methods=[fa_function],
                       part4_preprocessing=part4_preprocessing))
  part4_result_df = pd.DataFrame(results)
  part4_result_df.to_csv(output_path)
  return part4_result_df



In [None]:
# Run the part-4 experiment for every path in `ds_paths` (defined outside this cell);
# part4_runner also writes the collected results to a CSV file.
part4_final_result = part4_runner(ds_paths)

## Friedman & Post-Hoc tests

In [None]:
!pip install scikit-posthocs --quiet

In [None]:
# get the data
import pandas as pd

def get_fs_auc_scores(auc_results_df, fs_name):
  """Return the `measure_value` column for the rows of one filtering algorithm.

  Args:
    auc_results_df: results frame with `filtering_algorithm` and `measure_value` columns.
    fs_name: filtering algorithm name to select.

  Returns:
    Series of scores that keeps the original row labels (empty if nothing matches).
  """
  is_requested_method = auc_results_df['filtering_algorithm'] == fs_name
  return auc_results_df.loc[is_requested_method, 'measure_value']

# Load the classification results and keep the AUC rows that have no missing values.
# The pickle file that used to be opened around this code was never read, so the
# cell no longer depends on it.
results_df1 = pd.read_csv('/content/gdrive/MyDrive/ml-bgu/results/clfs_part2_results.csv')
auc_results_df = results_df1[results_df1['measure_type'] == 'AUC'].dropna()
fa_names = set(auc_results_df['filtering_algorithm'])

# One Series of AUC scores per feature-selection method.
f_classif_scores = get_fs_auc_scores(auc_results_df, 'f_classif')
paper_algo_scores = get_fs_auc_scores(auc_results_df, 'paper_algo')
paper_algo_improved_scores = get_fs_auc_scores(auc_results_df, 'paper_algo_improvement')
relieff_scores = get_fs_auc_scores(auc_results_df, 'relieff')
mrmr_scores = get_fs_auc_scores(auc_results_df, 'mrmr')
rfe_scores = get_fs_auc_scores(auc_results_df, 'rfe')

print(f'names of feature selection= {fa_names}')


names of feature selection= {'mrmr', 'paper_algo_improvement', 'relieff', 'f_classif', 'paper_algo', 'rfe'}


In [None]:
auc_results_df.head()

Unnamed: 0.1,Unnamed: 0,dataset_name,samples_num,original_features_num,filtering_algorithm,learning_algorithm,k,cv_method,fold_num,measure_type,measure_value,avg_inference_time,fit_time
3,3,arcene.csv,200,1000,f_classif,GaussianNB,1,StratifiedKFold,1,AUC,0.697,9e-05,0.046466
7,7,arcene.csv,200,1000,f_classif,SVC,1,StratifiedKFold,1,AUC,0.697,9e-05,0.019222
11,11,arcene.csv,200,1000,f_classif,LogisticRegression,1,StratifiedKFold,1,AUC,0.697,9e-05,0.013545
15,15,arcene.csv,200,1000,f_classif,RandomForestClassifier,1,StratifiedKFold,1,AUC,0.404,0.00062,0.158133
19,19,arcene.csv,200,1000,f_classif,KNeighborsClassifier,1,StratifiedKFold,1,AUC,0.652,0.00011,0.01149


In [None]:
from scipy import stats
# Friedman test over the AUC scores of five feature-selection methods.
# NOTE(review): `f_classif_scores` is computed in the cell above but is not passed
# here - confirm the exclusion is intentional.
# NOTE(review): the samples are paired by position; this assumes every Series lists
# the same (dataset, classifier, k, fold) combinations in the same order - TODO confirm.
stats.friedmanchisquare(paper_algo_scores,
                        paper_algo_improved_scores, relieff_scores, mrmr_scores, rfe_scores)


FriedmanchisquareResult(statistic=2591.3608763016514, pvalue=0.0)

This p-value is lower than 0.05, so we can reject the null hypothesis.

This means that the differences between the AUC scores of the feature selection methods are statistically significant.

In [None]:
import scikit_posthocs as sp
# Nemenyi post-hoc test: pairwise p-values between the feature-selection methods.
# `names` follows the same order as the score Series stacked into `data`.
names = ['paper_algo', 'paper_algo_improved', 'relieff', 'mrmr', 'rfe']
data = np.array([
    paper_algo_scores,
    paper_algo_improved_scores,
    relieff_scores,
    mrmr_scores,
    rfe_scores,
])
# The test receives the transposed matrix: one column per method.
posthocs_df = sp.posthoc_nemenyi_friedman(np.transpose(data))
posthocs_df.columns = list(names)
posthocs_df.insert(0, 'names', names)
posthocs_df

Unnamed: 0,names,paper_algo,paper_algo_improved,relieff,mrmr,rfe
0,paper_algo,1.0,0.001,0.001,0.001,0.64369
1,paper_algo_improved,0.001,1.0,0.001,0.001,0.001
2,relieff,0.001,0.001,1.0,0.001,0.001
3,mrmr,0.001,0.001,0.001,1.0,0.001
4,rfe,0.64369,0.001,0.001,0.001,1.0


## Results

Calculate the ACC mean for each feature selection method in each dataset.

In [137]:
# Load the classification results (one row per dataset / method / classifier / k / fold / measure).
results_df1 = pd.read_csv('/content/gdrive/MyDrive/ml-bgu/results/clfs_part2_results.csv')

In [138]:
# Split the results by metric, keeping only the columns needed for the summaries.
acc_results = results_df1.loc[
    results_df1['measure_type'] == 'ACC',
    ['dataset_name', 'filtering_algorithm', 'measure_value'],
]
auc_results = results_df1.loc[
    results_df1['measure_type'] == 'AUC',
    ['dataset_name', 'filtering_algorithm', 'measure_value'],
]
# Mean accuracy per (dataset, feature-selection method).
acc_results.groupby(by=['dataset_name', 'filtering_algorithm']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,measure_value
dataset_name,filtering_algorithm,Unnamed: 2_level_1
Carcinom.csv,f_classif,0.365505
Carcinom.csv,mrmr,0.610045
Carcinom.csv,paper_algo,0.850100
Carcinom.csv,paper_algo_improvement,0.852440
Carcinom.csv,relieff,0.694700
...,...,...
yeoh.csv,mrmr,0.828305
yeoh.csv,paper_algo,0.959640
yeoh.csv,paper_algo_improvement,0.950040
yeoh.csv,relieff,0.817762


Calculate the ACC mean for each feature selection method.

In [139]:
# Mean accuracy per feature-selection method across all datasets.
# numeric_only=True skips the string `dataset_name` column, which pandas >= 2.0
# would otherwise try to average and raise on.
acc_results.groupby(by=['filtering_algorithm']).mean(numeric_only=True)

Unnamed: 0_level_0,measure_value
filtering_algorithm,Unnamed: 1_level_1
f_classif,0.576746
mrmr,0.620289
paper_algo,0.694622
paper_algo_improvement,0.771194
relieff,0.617844
rfe,0.692581


Calculate the AUC mean for each feature selection method.

In [140]:
# Mean AUC per feature-selection method across all datasets.
# numeric_only=True skips the string `dataset_name` column, which pandas >= 2.0
# would otherwise try to average and raise on.
auc_results.groupby(by=['filtering_algorithm']).mean(numeric_only=True)

Unnamed: 0_level_0,measure_value
filtering_algorithm,Unnamed: 1_level_1
f_classif,0.695983
mrmr,0.67723
paper_algo,0.726115
paper_algo_improvement,0.738618
relieff,0.6949
rfe,0.720909


## Save the final results to csv

In [223]:
import pandas as pd
# Classification results of the main experiment.
df = pd.read_csv('/content/gdrive/MyDrive/ml-bgu/results/clfs_part2_results.csv')
# Results of the part-4 run; this is the path part4_runner writes to.
# Note: this rebinds the name `part4_df`, which an earlier cell used for a different table.
part4_df = pd.read_csv('/content/gdrive/MyDrive/ml-bgu/results/part4_df.csv')
# Per-fold feature-selection output (selected features and their scores are used below).
feature_selection_df = pd.read_csv('/content/gdrive/MyDrive/ml-bgu/results/fs_df.csv')

In [207]:
df.head()

Unnamed: 0.1,Unnamed: 0,dataset_name,samples_num,original_features_num,filtering_algorithm,learning_algorithm,k,cv_method,fold_num,measure_type,measure_value,avg_inference_time,fit_time
0,0,arcene.csv,200,1000,f_classif,GaussianNB,1,StratifiedKFold,1,ACC,0.7,0.0001,0.046466
1,1,arcene.csv,200,1000,f_classif,GaussianNB,1,StratifiedKFold,1,MCC,0.394,9e-05,0.046466
2,2,arcene.csv,200,1000,f_classif,GaussianNB,1,StratifiedKFold,1,PR-AUC,0.594,9e-05,0.046466
3,3,arcene.csv,200,1000,f_classif,GaussianNB,1,StratifiedKFold,1,AUC,0.697,9e-05,0.046466
4,4,arcene.csv,200,1000,f_classif,SVC,1,StratifiedKFold,1,ACC,0.7,0.00011,0.019222


In [224]:
# Mark the part-4 rows by prefixing the classifier name with 'aug_'.
# Names that already carry the prefix are left alone, so re-running this cell
# does not produce 'aug_aug_...'.
part4_df['learning_algorithm'] = part4_df['learning_algorithm'].map(
    lambda name: name if name.startswith('aug_') else 'aug_' + name)

In [225]:
# Align the column names with the results tables so the frames can be merged on
# 'dataset_name' and 'filtering_algorithm'.
feature_selection_df = feature_selection_df.rename(columns={"ds_name": "dataset_name", "score_function": "filtering_algorithm"})

In [259]:
import json
import ast

def top_k(selected_features, k):
  """Return the first k entries of a feature (or score) list.

  Args:
    selected_features: the list itself, or its textual form as stored in a CSV
      cell (e.g. "['f1', 'f2']"), which is parsed with `ast.literal_eval`.
      Accepting an already-parsed list makes the function safe to apply twice
      to the same column (e.g. when the cell is re-run).
    k: number of entries to keep (int or numeric string), or 'all' to keep
      every entry.

  Returns:
    The parsed list, truncated to its first k entries unless k == 'all'.
  """
  if isinstance(selected_features, str):
    selected_features = ast.literal_eval(selected_features)
  if k == 'all':
    return selected_features
  return selected_features[:int(k)]

In [260]:
# Stack the main and part-4 results, then attach the feature-selection output of
# the matching dataset / filtering algorithm / fold.
results_df = pd.concat([df, part4_df])
full_with_feature_selection_df = (
    results_df
    .merge(feature_selection_df, on=['dataset_name', 'filtering_algorithm', 'fold_num'])
    .drop(columns=['file_time', 'Unnamed: 0_x', 'Unnamed: 0_y'])
)

# Keep only the first k selected features for each row ...
full_with_feature_selection_df['selected_features'] = (
    full_with_feature_selection_df.apply(
        lambda row: top_k(row['selected_features'], row['k']),
        axis=1,
    )
)
# ... and the scores that belong to them.
full_with_feature_selection_df['selected_features_scores'] = (
    full_with_feature_selection_df.apply(
        lambda row: top_k(row['selected_features_scores'], row['k']),
        axis=1,
    )
)


In [268]:
full_with_feature_selection_df.sample()

Unnamed: 0,dataset_name,samples_num,original_features_num,filtering_algorithm,learning_algorithm,k,cv_method,fold_num,measure_type,measure_value,avg_inference_time,fit_time,fe_fit_time,selected_features,selected_features_scores
481654,Gevers2014_IBD_ileum.csv,140,297,f_classif,KNeighborsClassifier,4,StratifiedKFold,8,PR-AUC,0.44,0.00016,0.006706,0.01,"[Feature258, Feature241, Feature60, Feature159]","[32.99, 17.85, 16.88, 16.88]"


In [270]:
# Write the merged results table (including the row index) to an Excel workbook.
full_with_feature_selection_df.to_excel('/content/gdrive/MyDrive/ml-bgu/results/final_results.xlsx')

Full report: 
[https://github.com/Ofir408/ml-feature-selection/blob/main/report.pdf](https://github.com/Ofir408/ml-feature-selection/blob/main/report.pdf)