# Raw data prep

In [2]:
# the raw data was already processed
prepare = True

In [3]:
import scipy.io
import glob
import numpy as np
import pandas as pd
import os
from scipy.io import arff
from sklearn import preprocessing
from tqdm import tqdm
# import cantools

In [19]:
#scikit_feature_datasets
if prepare:
    for filename in glob.glob('../data/microarrays/data/scikit_feature_datasets/raw/*.mat'):
        _mat = scipy.io.loadmat(filename)
        _data = np.hstack([_mat['X'], _mat['Y']])
        _columns = [f'feature_{ii}' for ii in range(_data.shape[1])]
        _columns[-1] = 'target'
        _df = pd.DataFrame(_data, columns=_columns)
        _csv_name = filename.split(os.sep)[-1].split('.')[0]+'.csv'
        _df.to_csv(os.path.join('data/scikit_feature_datasets/', _csv_name), index=False)

In [26]:
# ARFF
if prepare:
    for filename in glob.glob('../data/microarrays/data/ARFF/raw/*.arff'):
        try:
            _data = arff.loadarff(filename)
            _df = pd.DataFrame(_data[0])
            _columns = list(_df.columns)
            _columns[-1] = 'target'
            _df.columns = _columns
            # _df = _df[~_df['target'].isna()]
            _df.at[_df['target'].isna(), 'target'] = 'control'
            # _df = _df.fillna(0.0)
            _csv_name = filename.split(os.sep)[-1].split('.')[0]+'.csv'
            _df.to_csv(os.path.join('data/ARFF/', _csv_name), index=False)
        except:
            pass

In [27]:
# bioconductor
if prepare:
    for filename in glob.glob('../data/microarrays/data/bioconductor/raw/*.csv'):
        if 'COPDSexualDimorphism' in filename: 
            continue 
        if 'DLBCL' in filename:
            continue

        _df = pd.read_csv(filename)
        _df = _df.T.reset_index(drop=True)
        _df.columns = _df.iloc[0]
        _df = _df[1:]
        _columns = list(_df.columns)
        _columns[0] = 'target'
        _df.columns = _columns 
        # _df = _df[~_df['target'].isna()]
        _df.at[_df['target'].isna(), 'target'] = 'control'
        # _df = _df.fillna(0.0)
        _csv_name = filename.split(os.sep)[-1].split('.')[0]+'.csv'
        _df.to_csv(os.path.join('data/bioconductor/', _csv_name), index=False)


In [47]:
# EfficientFS
if prepare:
    for filename in glob.glob('../data/microarrays/data/EfficientFS/raw/*.*'):   
        if '.csv' in filename:
            _df = pd.read_csv(filename)
            _df = _df.rename(columns = {'class':'target', ' class':'target'})
            _csv_name = filename.split(os.sep)[-1].split('.csv')[0]+'.csv'    
        elif '.mat' in filename:
            _mat = scipy.io.loadmat(filename)
            _data = np.hstack([_mat['X'], _mat['Y']])
            _columns = [f'feature_{ii}' for ii in range(_data.shape[1])]
            _columns[-1] = 'target'
            _df = pd.DataFrame(_data, columns=_columns)
            _csv_name = filename.split(os.sep)[-1].split('.mat')[0]+'.csv'    
        elif '.arff' in filename:
            try:
                _data = arff.loadarff(filename)
                _df = pd.DataFrame(_data[0])
                _columns = list(_df.columns)
                _columns[-1] = 'target'
                _df.columns = _columns  
                _csv_name = filename.split(os.sep)[-1].split('.arff')[0]+'.csv'    
            except:
                continue
        else: # not a data file
            continue
        _df.to_csv(os.path.join('data/EfficientFS/', _csv_name), index=False)

In [8]:
# mAML_benchmark_datasets
if prepare:
    os.makedirs('../data/microarrays/data/mAML_benchmark_datasets/LabelEncoder/', exist_ok=True)
    for filename in glob.glob('data/mAML_benchmark_datasets/raw/*/'):
        print(filename)
        csv_data = list(set(glob.glob(filename + '*.csv')) - set(glob.glob(filename + '*mf.csv')))[0]
        csv_label = glob.glob(filename + '*mf.csv')[0]
        _df_data = pd.read_csv(csv_data)
        _df_label = pd.read_csv(csv_label)
        _columns = list(_df_data.columns)
        if '#SampleID' not in _columns:
            _columns[0] = '#SampleID'
            _df_data.columns = _columns
        try:
            _df = pd.merge(_df_data, _df_label, on='#SampleID')
        except:
            continue
        _df = _df.rename(columns = {'label':'target'})
        _df = _df[list(set(_df.columns) - set(['#SampleID']))]
        le = preprocessing.LabelEncoder()
        le.fit(list(_df['target']))
        _df_LabelEncoder = pd.DataFrame([{'target_str': target_str, 'target_index':idx}  for (idx, target_str) in enumerate(list(le.classes_))])
        list(le.classes_)
        _df['target'] = le.transform(_df['target'])        
        _csv_name = csv_data.split(os.sep)[-1].split('.csv')[0]+'.csv'    
        _df_LabelEncoder.to_csv(os.path.join('data/mAML_benchmark_datasets/LabelEncoder/', _csv_name.replace('.csv', '_LabelEncoder.csv')), index=False)
        _df.to_csv(os.path.join('data/mAML_benchmark_datasets/', _csv_name), index=False)


data/mAML_benchmark_datasets/raw/Cho2012_Antibiotics_cecal.5/
data/mAML_benchmark_datasets/raw/Cho2012_Antibiotics_fecal.5/
data/mAML_benchmark_datasets/raw/Costello2009_BodyHabitats.6/
data/mAML_benchmark_datasets/raw/Costello2009_SkinSites.12/
data/mAML_benchmark_datasets/raw/Costello2009_Subject.7/
data/mAML_benchmark_datasets/raw/Fierer2010_Subject.3/
data/mAML_benchmark_datasets/raw/Fierer2010_SubjectH.6/
data/mAML_benchmark_datasets/raw/Gevers2014_IBD_ileum/
data/mAML_benchmark_datasets/raw/Gevers2014_IBD_rectum/
data/mAML_benchmark_datasets/raw/Huttenhower2012_HMP.BS.5/
data/mAML_benchmark_datasets/raw/Karlsson2013_Diabetes.3/
data/mAML_benchmark_datasets/raw/Montassier2016_Bacteremia/
data/mAML_benchmark_datasets/raw/Morgan2012_IBD.3/
data/mAML_benchmark_datasets/raw/Qin2012_Diabetes/
data/mAML_benchmark_datasets/raw/Qin2014_Cirrhosis/
data/mAML_benchmark_datasets/raw/Ravel2011_Vaginal/
data/mAML_benchmark_datasets/raw/Wu2011_Diet/
data/mAML_benchmark_datasets/raw/Yang2010_Esop

# How to load

In [2]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score
# from tqdm import tqdm
summary_data = []

for idx, csv in tqdm(enumerate(glob.glob('data/*/*.csv'))):
    # print(csv)
    df = pd.read_csv(csv)
#     train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['target'])
    
#     y_train = train_df.pop('target')
#     y_train = y_train.to_numpy()
#     X_train = train_df.to_numpy()

#     y_test = test_df.pop('target')
#     y_test = y_test.to_numpy()
#     X_test = test_df.to_numpy()
    
#     clf = RandomForestClassifier(random_state=42)
#     clf.fit(X_train, y_train)
#     y_pred = clf.predict(X_test)
#     acc = accuracy_score(y_test, y_pred)
    
    features_count = df.shape[1]
    samples_count = df.shape[0]
    # num_classes = len(set(y_test))
    num_classes = len(set(df['target']))
    
    
    summary_data.append({
        'path':csv,
        'name':csv.split(os.sep)[-1],
        'corpus':csv.split(os.sep)[-2],
        'samples': samples_count,
        'features': features_count,        
        'num_classes': num_classes,
        'has_na': True if df.isnull().values.any() else False,
        'na_target_to_control': True if 'control' in list(df['target']) else False,
        # 'baseline_acc': acc,
    })
    
summary_df = pd.DataFrame(summary_data)

70it [00:53,  1.31it/s]


In [3]:
pd.set_option('display.max_rows', None)
display(summary_df)
pd.set_option('display.max_rows', 10)

Unnamed: 0,path,name,corpus,samples,features,num_classes,has_na,na_target_to_control
0,data/ARFF/Breast.csv,Breast.csv,ARFF,97,24482,2,False,False
1,data/ARFF/CNS.csv,CNS.csv,ARFF,60,7130,2,False,False
2,data/ARFF/Leukemia_3c.csv,Leukemia_3c.csv,ARFF,72,7130,3,False,False
3,data/ARFF/Leukemia_4c.csv,Leukemia_4c.csv,ARFF,72,7130,4,False,False
4,data/ARFF/Lung.csv,Lung.csv,ARFF,203,12601,5,False,False
5,data/ARFF/Lymphoma.csv,Lymphoma.csv,ARFF,66,4027,3,True,False
6,data/ARFF/MLL.csv,MLL.csv,ARFF,72,12583,3,False,False
7,data/ARFF/Ovarian.csv,Ovarian.csv,ARFF,253,15155,2,False,False
8,data/ARFF/SRBCT.csv,SRBCT.csv,ARFF,83,2309,4,False,False
9,data/bioconductor/ALL.csv,ALL.csv,bioconductor,128,12626,3,False,True


In [10]:
# summary_df.to_csv('../../data/microarrays/data_summary.csv', index=False)

In [22]:
# toy example
if prepare:
    filename = '../data/microarrays/data_toy_example/SPECTF.train'
    _df = pd.read_csv(filename, header=None)
    _columns = [f'feature_{ii-1}' for ii in range(_df.shape[1])]
    _columns[0] = 'target'
    _df.columns = _columns 
    display(_df.head(3))
    _df.to_csv('../data/microarrays/data_toy_example/SPECTF_train.csv', index=False)

Unnamed: 0,target,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_34,feature_35,feature_36,feature_37,feature_38,feature_39,feature_40,feature_41,feature_42,feature_43
0,1,59,52,70,67,73,66,72,61,58,...,66,56,62,56,72,62,74,74,64,67
1,1,72,62,69,67,78,82,74,65,69,...,65,71,63,60,69,73,67,71,56,58
2,1,71,62,70,64,67,64,79,65,70,...,73,70,66,65,64,55,61,41,51,46


In [16]:
set(_df['target'])

{0, 1}

In [20]:
_csv_name

'SPECTF.csv'