<a href="https://colab.research.google.com/github/Ofir408/ml-feature-selection/blob/main/data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Ofir Ben Shoham

This nootbook is part of the code of the final project. 

Here I do the required preprocessing on the 20 datasets

## Setup

In [1]:
import pandas as pd
from google.colab import drive
import scipy
from sklearn import preprocessing
import numpy as np

In [2]:
drive.mount('/content/gdrive')

Mounted at /content/gdrive


## Data Preprocessing

In [3]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import Normalizer, PowerTransformer
from sklearn.feature_selection import VarianceThreshold

def preprocess(X, y):
  prep_df = pd.DataFrame(X, columns=["Feature" + str(x) for x in range(X.shape[1])])

  ct = ColumnTransformer(
    [
        ("simpleImputer", SimpleImputer(missing_values=np.nan, strategy='mean'), prep_df.columns),
        ("varianceThreshold", VarianceThreshold(), prep_df.columns),
        ("normalizer", PowerTransformer(), prep_df.columns)

    ])
  X = ct.fit_transform(prep_df)
  prep_df['y'] = y

  preprocess_df = pd.DataFrame(X, columns=["Feature" + str(x) for x in range(X.shape[1])])
  preprocess_df['y'] = y
  return preprocess_df



### scikit-feature datasets preprocessing

In [53]:
from scipy import io
#for ds_name in ['colon', 'Yale', 'GLIOMA', 'arcene', 'Carcinom', 'madelon', 'ORL', 'leukemia', 'PCMAC', 'pixraw10P', 'lung_small', 'lymphoma', 'nci9', 'orlraws10P']:
for ds_name in ['arcene']:
  mat = io.loadmat(f'/content/gdrive/MyDrive/ml-bgu/datasets/scikit-feature/{ds_name}.mat')
  X = mat['X']
  y = mat['Y']
  final_df = preprocess(X, y)
  final_df.to_csv(f'/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/{ds_name}.csv')


  loglike = -n_samples / 2 * np.log(x_trans.var())


### Datamicroarray datasets preprocessing


In [None]:
dir_path = '/content/gdrive/MyDrive/ml-bgu/datasets/Datamicroarray/csv/'

for ds_name in ['sorlie', 'su', 'gravier', 'christensen', 'singh', 'yeoh']:
  inputs_file_path = f'{dir_path}/{ds_name}/{ds_name}_inputs.csv'
  outputs_file_path = f'{dir_path}/{ds_name}/{ds_name}_outputs.csv'
  inputs_df = pd.read_csv(inputs_file_path, header=None)
  inputs_df.columns = [str(x) for x in inputs_df.columns]
  outputs_df = pd.read_csv(outputs_file_path, header=None)
  outputs_df.columns = ['y']

  final_df = preprocess(inputs_df.to_numpy(), outputs_df.to_numpy())
  final_df.to_csv(f'/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/{ds_name}.csv')


### Misc datasets preprocessing

In [None]:
from scipy.io import arff
dir_path = '/content/gdrive/MyDrive/ml-bgu/datasets/Misc'

for ds_name in ['GDS3610', 'GDS6063']:
  df = pd.read_csv(f'{dir_path}/{ds_name}.csv')
  final_df = preprocess(df.to_numpy(), df['Class'].to_numpy())
  final_df.to_csv(f'/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/{ds_name}.csv')


In [None]:
from sklearn import preprocessing

for ds_name in ['journal.pone.0246039.s002', 'journal.pone.0246039.s005',
                'pone.0246039.s001']:
  df = pd.read_csv(f'{dir_path}/{ds_name}.csv')
  if 'samples' in df.columns:
    df.drop(columns=['samples'], inplace=True)
  le = preprocessing.LabelEncoder()
  class_column = le.fit_transform(df['Response'])
  final_df = preprocess(df.drop(columns=['Response']).to_numpy(), class_column)
  final_df.to_csv(f'/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/{ds_name}.csv')


### mAML datasets preprocessing

In [None]:
dir_path = '/content/gdrive/MyDrive/ml-bgu/datasets/mAML_benchmark_datasets'

for ds_name in ['Montassier2016_Bacteremia', 'Qin2012_Diabetes',
                'Ravel2011_Vaginal', 'Wu2011_Diet', 'Costello2009_Subject.7', 'Gevers2014_IBD_rectum']:
  print(f'ds_name={ds_name}')
  inputs_df = pd.read_csv(f'{dir_path}/{ds_name}.csv')
  output_df = pd.read_csv(f'{dir_path}/{ds_name}.mf.csv')
  df = inputs_df.join(output_df, lsuffix='i')
  le = preprocessing.LabelEncoder()
  class_column = le.fit_transform(df['label'])
  final_df = preprocess(df.drop(columns=['#SampleID', '#SampleIDi', 'label']).to_numpy(), class_column)
  final_df.to_csv(f'/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/{ds_name}.csv')


ds_name=Montassier2016_Bacteremia
ds_name=Qin2012_Diabetes
ds_name=Ravel2011_Vaginal
ds_name=Wu2011_Diet
ds_name=Costello2009_Subject.7


In [11]:
dir_path = '/content/gdrive/MyDrive/ml-bgu/datasets/mAML_benchmark_datasets'

for ds_name in ['Gevers2014_IBD_ileum', 'Huttenhower2012_HMP.BS.5', 'Morgan2012_IBD.3', 'Gevers2014_IBD_rectum']:
  print(f'ds_name={ds_name}')
  inputs_df = pd.read_csv(f'{dir_path}/{ds_name}.csv')
  output_df = pd.read_csv(f'{dir_path}/{ds_name}.mf.csv')
  df = inputs_df.join(output_df, lsuffix='i')
  le = preprocessing.LabelEncoder()
  class_column = le.fit_transform(df['label'])
  final_df = preprocess(df.drop(columns=['#SampleID', '#SampleIDi', 'label']).to_numpy(), class_column)
  final_df.to_csv(f'/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/{ds_name}.csv')


ds_name=Gevers2014_IBD_rectum


## ARFF dataset preprocessing

In [9]:
from scipy.io import arff
import pandas as pd

data = arff.loadarff('/content/gdrive/MyDrive/ml-bgu/datasets/ARFF/Lung.arff')
df = pd.DataFrame(data[0])
df.head()


Unnamed: 0,AFFX-MurIL2_at,AFFX-MurIL10_at,AFFX-MurIL4_at,AFFX-MurFAS_at,AFFX-BioB-5_at,AFFX-BioB-M_at,AFFX-BioB-3_at,AFFX-BioC-5_at,AFFX-BioC-3_at,AFFX-BioDn-5_at,...,101_at,102_at,103_at,104_at,105_at,106_at,107_at,108_g_at,109_at,type
0,-18.6,10.54,0.01,19.44,-16.98,-27.5,-1.6,38.88,-29.12,-42.87,...,8.11,33.21,30.78,47.8,1.63,17.02,13.78,-103.49,76.98,b'1'
1,9.12,9.12,10.18,29.29,-4.68,-1.5,-3.62,20.8,-13.18,-35.47,...,15.49,27.17,26.11,45.22,10.18,69.64,-24.85,-34.41,105.73,b'1'
2,-2.175,-2.21,-0.06,6.32,-1.775,-16.53,-3.61,16.41,-17.97,-57.02,...,14.61,10.875,10.615,35.14,1.745,29.71,10.465,-42.63,73.735,b'1'
3,-1.54,21.75,5.835,23.815,-24.785,-12.89,-4.485,19.5,-21.445,-47.205,...,9.615,27.355,30.86,48.71,10.355,40.845,54.615,-71.38,65.435,b'1'
4,-9.07,3.08,-1.98,17.26,-10.09,-15.15,-18.19,13.21,-13.13,-39.47,...,-14.14,23.34,6.12,33.46,-10.09,40.55,16.25,-48.59,39.54,b'1'


## Eficifent FS

In [13]:
from scipy.io import arff
import pandas as pd

data = arff.loadarff('/content/gdrive/MyDrive/ml-bgu/datasets/EfficientFS/pone.0202167.s015.arff')
df = pd.DataFrame(data[0])
df.head()


Unnamed: 0,MZ-7.86E-05,MZ2.18E-07,MZ9.60E-05,MZ0.000366014,MZ0.000810195,MZ0.001428564,MZ0.002221123,MZ0.003187869,MZ0.004328805,MZ0.005643929,...,MZ19974.404,MZ19977.042,MZ19979.68,MZ19982.319,MZ19984.957,MZ19987.596,MZ19990.235,MZ19992.874,MZ19995.513,Class
0,0.494626,0.263735,0.321841,0.220934,0.297622,0.316458,0.154763,0.223685,0.304346,0.241757,...,0.483622,0.449296,0.449296,0.449296,0.449296,0.449296,0.449296,0.449296,0.449296,b'Normal'
1,0.258063,0.406593,0.321841,0.069771,0.333335,0.354432,0.321431,0.14474,0.260869,0.142853,...,0.631765,0.619718,0.619718,0.619718,0.619718,0.619718,0.619718,0.619718,0.619718,b'Normal'
2,0.537636,0.032966,0.321841,0.209307,0.404762,0.113927,0.369049,0.223685,0.536231,0.131865,...,0.038462,0.035918,0.035918,0.035918,0.035918,0.035918,0.035918,0.035918,0.035918,b'Normal'
3,0.0,0.395605,0.310347,0.197673,0.404762,0.455701,0.416666,0.210527,0.420292,0.274723,...,0.497864,0.486621,0.486621,0.486621,0.486621,0.486621,0.486621,0.486621,0.486621,b'Normal'
4,0.526884,0.395605,0.367817,0.383719,0.488099,0.392405,0.238094,0.5,0.362316,0.274723,...,0.267096,0.251408,0.251408,0.251408,0.251408,0.251408,0.251408,0.251408,0.251408,b'Normal'


In [14]:
pd.read_csv('/content/gdrive/MyDrive/ml-bgu/datasets/EfficientFS/pone.0202167.s016.csv').head()

Unnamed: 0,class,AFFX-MurIL2_at,AFFX-MurIL10_at,AFFX-MurIL4_at,AFFX-MurFAS_at,AFFX-BioB-5_at,AFFX-BioB-M_at,AFFX-BioB-3_at,AFFX-BioC-5_at,AFFX-BioC-3_at,...,100_g_at,101_at,102_at,103_at,104_at,105_at,106_at,107_at,108_g_at,109_at
0,1,-12,16,37,40,269,209,197,695,748,...,232,56,21,228,63,64,-23,-94,-99,248
1,1,6,8,46,46,261,215,153,676,658,...,188,36,27,93,79,52,14,-70,-179,129
2,1,-55,33,14,44,570,535,378,1362,1400,...,205,39,20,56,86,90,-15,-102,-89,117
3,1,-26,3,-10,39,273,249,177,632,672,...,198,100,19,78,61,9,16,-117,-64,153
4,1,-27,1,1,18,261,251,193,654,711,...,187,74,34,26,38,7,23,-53,-66,141


## microbiome data processing

In [204]:
dir_path = '/content/gdrive/MyDrive/ml-bgu/datasets/microbiome_data'

for ds_name in ['bacteremia',  'bushman_cafe', 'david', 'kostic', 'claesson', 'turnbaugh']:
  print(f'ds_name={ds_name}')
  inputs_df = pd.read_csv(f'{dir_path}/{ds_name}/gg/otutable.txt', delimiter = "\t")
  output_df = pd.read_csv(f'{dir_path}/{ds_name}/task.txt', delimiter = "\t")
  df = inputs_df.join(output_df, lsuffix='i')
  le = preprocessing.LabelEncoder()
  class_column = le.fit_transform(df['Var'])
  final_df = preprocess(df.drop(columns=['#SampleID', 'Var']).to_numpy(), class_column)
  final_df.to_csv(f'/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/microbiome_data/{ds_name}.csv')


ds_name=bushman_cafe


### validation that everything was good

In [11]:
df = pd.read_csv('/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/Misc/GDS3610.csv')
X = df.drop(columns=['y'])
y = df['y']
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

clf = SVC(gamma='auto')
cross_val_score(clf, X, y.ravel(), cv=3)


array([0.9       , 0.88888889, 0.88888889])

In [57]:
t = ['Gevers2014_IBD_ileum', 'Huttenhower2012_HMP.BS.5', 'Morgan2012_IBD.3']
for t1 in t: 
  t2 = pd.read_csv(f'/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/mAML_benchmark_datasets/{t1}.csv')
  print(t2.shape) 

(140, 299)
(1529, 971)
(128, 2063)


In [18]:
t = ['pone.0246039.s001', 'journal.pone.0246039.s005', 'journal.pone.0246039.s002', 'GDS6063', 'GDS3610'] 
for t1 in t: 
  t2 = pd.read_csv(f'/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/Misc/{t1}.csv')
  path = f'/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/Misc/{t1}.csv'
  print(path, 'shape=', t2.shape) 

/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/Misc/pone.0246039.s001.csv shape= (72, 10715)
/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/Misc/journal.pone.0246039.s005.csv shape= (60, 21389)
/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/Misc/journal.pone.0246039.s002.csv shape= (62, 6002)
/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/Misc/GDS6063.csv shape= (10, 49154)
/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/Misc/GDS3610.csv shape= (28, 42185)


In [42]:
import glob
ds_paths = glob.glob("/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/scikit-feature/*.csv", recursive=True)
for t1 in ds_paths: 
  t2 = pd.read_csv(t1)
  print(f'ds={t1}, shape={t2.shape}') 

ds=/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/scikit-feature/colon.csv, shape=(62, 6002)
ds=/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/scikit-feature/Yale.csv, shape=(165, 3074)
ds=/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/scikit-feature/GLIOMA.csv, shape=(50, 13304)
ds=/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/scikit-feature/arcene.csv, shape=(200, 29963)
ds=/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/scikit-feature/Carcinom.csv, shape=(174, 27548)
ds=/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/scikit-feature/madelon.csv, shape=(2600, 1502)
ds=/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/scikit-feature/ORL.csv, shape=(400, 3074)
ds=/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/scikit-feature/leukemia.csv, shape=(72, 21212)
ds=/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/scikit-feature/PCMAC.csv, shape=(1943, 9869)
ds=/content/gdrive/MyDrive/

In [98]:
import glob
ds_paths = glob.glob("/content/gdrive/MyDrive/ml-bgu/datasets/microbiomic data/*.csv", recursive=True)
for t1 in ds_paths: 
  t2 = pd.read_csv(t1)
  print(f'ds={t1}, shape={t2.shape}') 

ds=/content/gdrive/MyDrive/ml-bgu/datasets/microbiomic data/FSH.csv, shape=(1217, 99)
ds=/content/gdrive/MyDrive/ml-bgu/datasets/microbiomic data/PDX.csv, shape=(74018, 201)
ds=/content/gdrive/MyDrive/ml-bgu/datasets/microbiomic data/PBS.csv, shape=(74018, 201)
ds=/content/gdrive/MyDrive/ml-bgu/datasets/microbiomic data/CSS.csv, shape=(4793, 358)
ds=/content/gdrive/MyDrive/ml-bgu/datasets/microbiomic data/CBH.csv, shape=(6979, 553)
ds=/content/gdrive/MyDrive/ml-bgu/datasets/microbiomic data/CS.csv, shape=(2543, 141)
ds=/content/gdrive/MyDrive/ml-bgu/datasets/microbiomic data/FS.csv, shape=(1217, 105)
ds=/content/gdrive/MyDrive/ml-bgu/datasets/microbiomic data/BP.csv, shape=(13503, 152)


In [32]:
import glob
ds_paths = glob.glob("/content/gdrive/MyDrive/ml-bgu/datasets/scikit-feature/*.csv", recursive=True)
for t1 in ds_paths: 
  t2 = pd.read_csv(t1)
  print(f'ds={t1}, shape={t2.shape}') 

In [58]:
import glob
ds_paths = glob.glob("/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/Datamicroarray/*.csv", recursive=True)
for t1 in ds_paths: 
  t2 = pd.read_csv(t1)
  print(f'ds={t1}, shape={t2.shape}') 

ds=/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/Datamicroarray/alon.csv, shape=(62, 6002)
ds=/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/Datamicroarray/christensen.csv, shape=(217, 4241)
ds=/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/Datamicroarray/khan.csv, shape=(63, 6926)
ds=/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/Datamicroarray/sorlie.csv, shape=(85, 1370)
ds=/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/Datamicroarray/west.csv, shape=(49, 21389)


In [13]:
import glob
ds_paths = glob.glob("/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/mAML_benchmark_datasets/*.csv", recursive=True)
for t1 in ds_paths: 
  t2 = pd.read_csv(t1)
  if t2.shape[0]>100:
    print(f'ds={t1}, shape={t2.shape}') 

ds=/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/mAML_benchmark_datasets/Morgan2012_IBD.3.csv, shape=(128, 2063)
ds=/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/mAML_benchmark_datasets/Qin2012_Diabetes.csv, shape=(124, 35642)
ds=/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/mAML_benchmark_datasets/Ravel2011_Vaginal.csv, shape=(342, 1529)
ds=/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/mAML_benchmark_datasets/Costello2009_Subject.7.csv, shape=(140, 7631)
ds=/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/mAML_benchmark_datasets/Gevers2014_IBD_ileum (1).csv, shape=(140, 299)
ds=/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/mAML_benchmark_datasets/Yang2010_EsophagitisPDX.4.csv, shape=(200, 49151)
ds=/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/mAML_benchmark_datasets/Gevers2014_IBD_ileum.csv, shape=(140, 299)
ds=/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/mAML_benchmark_datas

In [62]:
import glob
ds_paths = glob.glob("/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/microbiome_data/*.csv", recursive=True)
paths = []
for t1 in ds_paths: 
  t2 = pd.read_csv(t1)
  if t2.shape[0]>100 and t2.shape[1]>200:
    paths.append(t1)
paths


['/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/microbiome_data/bushman_cafe.csv',
 '/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/microbiome_data/david.csv',
 '/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/microbiome_data/claesson.csv',
 '/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/microbiome_data/kostic.csv',
 '/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/microbiome_data/turnbaugh.csv']

## Filter features due to CPU runtime 

In [45]:
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectKBest

def filter_features(df):
  target = df['y']
  features_df = df.drop(columns=['y'])
  selector = SelectKBest(f_classif, k=1000)
  selector.fit(features_df, target)
  cols = selector.get_support(indices=True)
  features_df_new = features_df.iloc[:,cols]
  features_df_new['y'] = target
  return features_df_new


In [None]:
df_paths_to_filter = ['/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/scikit-feature/arcene.csv',
'/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/scikit-feature/Carcinom.csv',
'/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/scikit-feature/orlraws10P.csv',
'/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/scikit-feature/warpAR10P.csv']
for df_path in df_paths_to_filter:
  df = pd.read_csv(df_path)
  df = filter_features(df)
  df.to_csv(df_path)

In [None]:
df_paths_to_filter = ['/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/Datamicroarray/yeoh.csv',
'/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/Datamicroarray/singh.csv']
for df_path in df_paths_to_filter:
  df = pd.read_csv(df_path)
  df = filter_features(df)
  df.to_csv(df_path)

## Filter number of rows

In [179]:
df_paths = ['/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/microbiome_data/bushman_cafe.csv',
 '/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/microbiome_data/david.csv',
 '/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/microbiome_data/claesson.csv',
 '/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/microbiome_data/kostic.csv',
 '/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/microbiome_data/turnbaugh.csv']

for path in df_paths:
  df = pd.read_csv(path)
  print(df.shape)
  #df.groupby('y', group_keys=False).apply(lambda x: x.sample(frac=0.2 if df.shape[0] < 800 else 0.1))

  #df = df.sample(n=400)
  #df.to_csv(path)


(354, 291)
(157, 1383)
(113, 510)
(107, 576)
(121, 849)


In [173]:
path = '/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/microbiome_data/bushman_cafe.csv'
t = pd.read_csv(path)
t2 = t.groupby('y', group_keys=False).apply(lambda x: x.sample(frac=0.2))
t2.to_csv(path)

In [190]:
from collections import Counter 
path = '/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/microbiome_data/david.csv'
t = pd.read_csv(path)
print(Counter(t['y']))
t_temp = t[t['y'] != 2]
t_temp2 = t[t['y'] == 2].sample(frac=0.025)
t = pd.concat([t_temp, t_temp2], ignore_index=True)
t.to_csv(path)

Counter({2: 6274, 1: 10, 0: 9})


In [175]:
path = '/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/microbiome_data/claesson.csv'
t = pd.read_csv(path)
t2 = t.groupby('y', group_keys=False).apply(lambda x: x.sample(frac=0.03))
t2.to_csv(path)

In [176]:
path = '/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/microbiome_data/kostic.csv'
t = pd.read_csv(path)
t2 = t.groupby('y', group_keys=False).apply(lambda x: x.sample(frac=0.033))
t2.to_csv(path)

In [201]:
path = '/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/microbiome_data/turnbaugh.csv'
t = pd.read_csv(path)
print(Counter(t['y']))
t_temp = t[t['y'] != 2]
t_temp2 = t[t['y'] == 2].sample(frac=0.038)
t = pd.concat([t_temp, t_temp2], ignore_index=True)
t.to_csv(path)

Counter({2: 3909, 1: 107, 0: 35})


In [211]:
t = pd.read_csv('/content/gdrive/MyDrive/ml-bgu/datasets_after_preprocessing/microbiome_data/bushman_cafe.csv')
print(Counter(t['y']))
t_temp = t[t['y'] != 2]
t_temp2 = t[t['y'] == 2].sample(frac=0.08)
t = pd.concat([t_temp, t_temp2], ignore_index=True)
print(Counter(t['y']))
print(t.shape)
t.to_csv(path)

Counter({2: 1759, 0: 5, 1: 5})
Counter({2: 141, 0: 5, 1: 5})
(151, 290)
