<a href="https://colab.research.google.com/github/Ofir408/ml-feature-selection/blob/main/data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Ofir Ben Shoham

This nootbook is part of the code of the final project. 

Here I do the required preprocessing on the 20 datasets

## Setup

In [104]:
import pandas as pd
from google.colab import drive
import scipy

In [36]:
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Data Preprocessing

In [92]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import Normalizer, PowerTransformer
from sklearn.feature_selection import VarianceThreshold

def preprocess(X, y):
  prep_df = pd.DataFrame(X, columns=["Feature" + str(x) for x in range(X.shape[1])])

  ct = ColumnTransformer(
    [
        ("simpleImputer", SimpleImputer(missing_values=np.nan, strategy='mean'), prep_df.columns),
        ("varianceThreshold", VarianceThreshold(), prep_df.columns),
        ("normalizer", PowerTransformer(), prep_df.columns)

    ])
  X = ct.fit_transform(prep_df)
  prep_df['y'] = y

  preprocess_df = pd.DataFrame(X, columns=["Feature" + str(x) for x in range(X.shape[1])])
  preprocess_df['y'] = y
  return preprocess_df



### scikit-feature datasets preprocessing

In [None]:
for ds_name in ['colon', 'Yale', 'GLIOMA', 'arcene', 'Carcinom']:
  mat = scipy.io.loadmat(f'/content/gdrive/MyDrive/ml-bgu/datasets/scikit-feature/{ds_name}.mat')
  X = mat['X']
  y = mat['Y']
  final_df = preprocess(X, y)
  final_df.to_csv(f'/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/{ds_name}.csv')


### Datamicroarray datasets preprocessing


In [144]:
import pyreadr
dir_path = '/content/gdrive/MyDrive/ml-bgu/datasets/Datamicroarray/csv/'

for ds_name in ['alon', 'christensen', 'khan', 'sorlie', 'west']:
  inputs_file_path = f'{dir_path}/{ds_name}/{ds_name}_inputs.csv'
  outputs_file_path = f'{dir_path}/{ds_name}/{ds_name}_outputs.csv'
  inputs_df = pd.read_csv(inputs_file_path, header=None)
  inputs_df.columns = [str(x) for x in inputs_df.columns]
  outputs_df = pd.read_csv(outputs_file_path, header=None)
  outputs_df.columns = ['y']

  final_df = preprocess(inputs_df.to_numpy(), outputs_df.to_numpy())
  final_df.to_csv(f'/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/{ds_name}.csv')


### Misc datasets preprocessing

In [None]:
from scipy.io import arff
dir_path = '/content/gdrive/MyDrive/ml-bgu/datasets/Misc'

for ds_name in ['GDS3610', 'GDS6063']:
  df = pd.read_csv(f'{dir_path}/{ds_name}.csv')
  final_df = preprocess(df.to_numpy(), df['Class'].to_numpy())
  final_df.to_csv(f'/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/{ds_name}.csv')


In [None]:
from sklearn import preprocessing

for ds_name in ['journal.pone.0246039.s002', 'journal.pone.0246039.s005',
                'pone.0246039.s001']:
  df = pd.read_csv(f'{dir_path}/{ds_name}.csv')
  if 'samples' in df.columns:
    df.drop(columns=['samples'], inplace=True)
  le = preprocessing.LabelEncoder()
  class_column = le.fit_transform(df['Response'])
  final_df = preprocess(df.drop(columns=['Response']).to_numpy(), class_column)
  final_df.to_csv(f'/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/{ds_name}.csv')


### mAML datasets preprocessing

In [196]:
dir_path = '/content/gdrive/MyDrive/ml-bgu/datasets/mAML_benchmark_datasets'

for ds_name in ['Montassier2016_Bacteremia', 'Qin2012_Diabetes',
                'Ravel2011_Vaginal', 'Wu2011_Diet', 'Costello2009_Subject.7']:
  print(f'ds_name={ds_name}')
  inputs_df = pd.read_csv(f'{dir_path}/{ds_name}.csv')
  output_df = pd.read_csv(f'{dir_path}/{ds_name}.mf.csv')
  df = inputs_df.join(output_df, lsuffix='i')
  le = preprocessing.LabelEncoder()
  class_column = le.fit_transform(df['label'])
  final_df = preprocess(df.drop(columns=['#SampleID', '#SampleIDi', 'label']).to_numpy(), class_column)
  final_df.to_csv(f'/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/{ds_name}.csv')


ds_name=Montassier2016_Bacteremia
ds_name=Qin2012_Diabetes
ds_name=Ravel2011_Vaginal
ds_name=Wu2011_Diet
ds_name=Costello2009_Subject.7


### validation that everything was good

In [201]:
df = pd.read_csv('/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/Misc/GDS3610.csv')
X = df.drop(columns=['y'])
y = df['y']
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

clf = SVC(gamma='auto')
cross_val_score(clf, X, y.ravel(), cv=3)


array([0.9       , 0.88888889, 0.88888889])