# Exploratory Data Analysis - covariants
by __Pawel Rosikiewicz__ 

---

## Setup
---

__global imports__
* I purposely placed the other imports, such as the custom-made functions for this project, in each section
* to allow fast inspection of my code, and to make it easy to copy these imports into new notebooks for pipeline development

In [1]:
import os
import sys
import re # module to use regular expressions, 
import glob # lists names in folders that match Unix shell patterns
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

from sklearn import set_config
from sklearn.preprocessing import RobustScaler # creates custom transfomers
from sklearn.preprocessing import FunctionTransformer # creates custom transfomers
from sklearn.pipeline import make_pipeline, Pipeline # like pipeline function, but give step names automatically, 
from sklearn.compose import ColumnTransformer # allows using different transformers to different columns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, KBinsDiscretizer # skleanr transformers,

In [2]:
# basedir
# The project root is taken to be the parent of the current working directory
# (assumes the notebook is started from a direct sub-folder of the project - TODO confirm).
# NOTE: this cell changes the working directory, so re-running it moves
# basedir one more level up each time.
basedir = os.path.dirname(os.getcwd())
# make the project root the working directory
os.chdir(basedir)
# put the project root on the import path, so that `src.*` modules can be imported
sys.path.append(basedir)

In [3]:
# paths
# All locations are built relative to basedir, defined in the previous cell.
PATH_data_raw     = os.path.join(basedir, "data/raw")      # input tables read with load_tsv
PATH_data_interim = os.path.join(basedir, "data/interim")  # presumably intermediate files - not used in the visible cells
PATH_results      = os.path.join(basedir, "data/results")  # presumably analysis outputs - not used in the visible cells
PATH_models       = os.path.join(basedir, "models")        # presumably saved models - not used in the visible cells

__load functions, and classes created for that project__

In [4]:
from src.utils.helper_data_loaders import load_tsv

# helper functions for QC and pipeline calibration
from src.utils.helper_tpm_summary import tpm_summary
from src.utils.helper_tpm_summary import tpm_plots
from src.utils.helper_cluster_histogram import spearman_clustermap
from src.utils.helper_boxplot import colored_boxplots
from src.utils.helper_colored_boxplot import plot_colored_boxplot
from src.utils.helper_gene_expression_clustermap import gene_expression_clustermap

# my custom transformers
from src.utils.preprocessing_spearman_filter import SpearmanFilter # to remove sample outliers, 
from src.utils.preprocessing_zero_value_filter import ZeroValueFilter # to remove genes with no tpm in most of the samples

__configurations__

In [5]:
# Column names of the covariates table, grouped by the role of the variable.
VAR_GROUPS = {
    "TARGET_VAR": ["target"],
    "CATEGORICAL_VAR": [
        'Baseline ECOG Score',
        'Enrollment IC',
        'IC Level',
        'TC Level',
        'Immune phenotype',
        'Sex',
        'TCGA Subtype',
        'Lund',
        'Lund2',
        'Received platinum',
        'Met Disease Status',
        'Sample age',
        'Sample collected pre-platinum',
        'Intravesical BCG administered',
        'Tobacco Use History',
    ],
    "QUANTITATIVE_VAR": [
        'FMOne mutation burden per MB',
        'Neoantigen burden per MB',
    ],
}

# dtype used to encode each variable group in data_cov
VAR_DTYPES = dict(
    TARGET_VAR="int",  # only for EDA
    CATEGORICAL_VAR="O",
    QUANTITATIVE_VAR="float64",
)

# human-readable class names for the encoded target values
TARGET_ENCODING = {
    0: "non-responder",
    1: "responder",
}

# potential confounding variables, used to stratify the results
CONFOUNDING_VAR = [
    'Sex',
    'Tobacco Use History',
]

## PART 1. Load the data

In [6]:
# load target data
# load_tsv is a project helper; judging by the recorded cell output
# it also prints the shape of each loaded table - TODO confirm
data_cov =  load_tsv(PATH_data_raw, 'X_covariates.tsv')
data_genes = load_tsv(PATH_data_raw, 'X_genes.tsv')
# y.tsv is read with header=None, i.e. it is treated as having no header row
target = load_tsv(PATH_data_raw, 'y.tsv', header=None)

# small correction
# give the single, unnamed target column the name configured in VAR_GROUPS
target.columns=[VAR_GROUPS["TARGET_VAR"][0]]

(200, 17)
(200, 31085)
(200, 1)


In [7]:
# inspect
# table dimensions, and the total number of missing cells over the whole table
print("shape:", data_genes.shape)
print("missing data nr:", data_genes.isnull().sum().sum())

# get example
# the first rows are rendered as the cell output
data_genes.head()

shape: (200, 31085)
missing data nr: 0


Unnamed: 0,TPM_hugo_A1BG,TPM_hugo_A1BG-AS1,TPM_hugo_A1CF,TPM_hugo_A2M,TPM_hugo_A2M-AS1,TPM_hugo_A2ML1,TPM_hugo_A2MP1,TPM_hugo_A3GALT2,TPM_hugo_A4GALT,TPM_hugo_A4GNT,...,TPM_hugo_ZWILCH,TPM_hugo_ZWINT,TPM_hugo_ZXDA,TPM_hugo_ZXDB,TPM_hugo_ZXDC,TPM_hugo_ZYG11A,TPM_hugo_ZYG11B,TPM_hugo_ZYX,TPM_hugo_ZZEF1,TPM_hugo_ZZZ3
0,1.564289,2.711834,0.0,599.387994,2.354073,43.245808,0.0,0.0,11.43709,0.070903,...,8.574489,6.467672,1.906227,3.293924,8.333586,2.189232,19.280571,168.26622,27.175332,17.83686
1,3.487859,1.717013,0.0,222.711937,2.288359,5.718716,0.0,0.564476,6.026609,0.108688,...,10.409939,3.572365,2.76178,3.411667,9.293182,1.813353,21.761841,66.403339,21.311923,22.296492
2,0.613334,0.50852,0.0,204.222937,0.627338,300.472716,0.0,0.0,11.797474,0.040773,...,6.272013,3.109443,1.068439,2.559726,5.181549,0.225283,15.800051,172.944084,14.743828,18.920023
3,2.385017,1.600782,0.0,1851.589619,3.30154,1.346349,0.0,0.0,23.938826,0.0,...,4.586123,1.150169,1.464567,1.386418,4.50198,0.036808,11.444219,116.271619,19.22279,11.936066
4,1.964353,0.791064,0.0,982.752783,0.589165,85.088254,0.0,0.096887,17.058419,0.447727,...,4.292896,2.469881,1.809374,3.056738,6.604204,0.436553,20.036719,143.793153,24.820985,17.297542


## PART 2. Establish Preprocessing Pipeline
---

__set up low-level transformers and helper functions for the tpm data preprocessor__

In [11]:
# Function, .................................
def transpose_rebuild(transposed_arr, df):
    ''' Transposes the array back and wraps it in a dataframe
        that reuses the column names of df.
        . transposed_arr - array to transpose, after transposing its
          columns must correspond to the columns of df
        . df - dataframe, only its column names are used
        CAUTION: row names of df are not copied, the returned
        dataframe has a default 0..n-1 index
    '''
    return pd.DataFrame(transposed_arr.T, columns=df.columns)


# Function, .................................
def log_transformer(x):
    ''' Applies log1p to every value of the input dataframe (x)
        and returns the result as a new dataframe with the same column names.
        CAUTION: log1p returns runtime warning if negative data are used'''
    # one-step pipeline, the log1p step has no parameters to fit
    log1p_step = FunctionTransformer(np.log1p, validate=False)
    pipeline = make_pipeline(log1p_step)
    transformed = pipeline.fit_transform(x)
    return pd.DataFrame(transformed, columns=x.columns)

__set up automated diagnostic functions to check whether each transformer does its job correctly__

In [12]:
def make_tpm_summary(df_list, name_list=None):
    ''' Creates tpm summary table for several dataframes provided in the list (df_list)
        . df_list - iterable with dataframes, each of them is summarized with tpm_summary()
        . name_list - list with column names for the new summary table, one per dataframe,
          if None, the columns are numbered 0, 1, 2, ...
        returns: summary table, transposed so that each input dataframe is one column
          (assumes tpm_summary() returns one row per dataframe - TODO confirm)
        raises: ValueError if df_list is empty
        for more info see help for tpm_summary()
    '''
    # summarize every dataframe first, then stack all summaries in one call
    summaries = [tpm_summary(df=df) for df in df_list]
    if not summaries:
        raise ValueError("df_list must contain at least one dataframe")
    summary_table = pd.concat(summaries)

    # transpose for nice looking output
    summary_table = summary_table.transpose()

    # add column names corresponding to different df's
    if name_list is not None:
        summary_table.columns = name_list
    else:
        summary_table.columns = list(range(len(summaries)))

    return summary_table
            

__Create a custom function for detecting differentially expressed genes between the two classes, using Student's t-test and fold change__
* I applied this method as the simplest possible option,
* it can be replaced at any moment with another method,
* more information on the topic can be found here:
    * "Detecting differentially expressed genes in heterogeneous diseases using half Student’s t-test" https://academic.oup.com/ije/article/39/6/1597/736515
    * "Robustness of differential gene expression analysis of RNA-seq" https://doi.org/10.1016/j.csbj.2021.05.040

In [15]:
def select_genes(x, y):
    ''' Ranks genes by how differently they are expressed between the two classes of samples.
        For each gene (column of x) it runs a two-sample Student t-test between the samples
        labelled 0 and the samples labelled 1, and calculates the ratio of the class medians.

        parameters:
        . x - pd.DataFrame, samples in rows, genes in columns, numeric values only
        . y - pd.Series with class labels (0 / 1), in the same row order as x,
          labels are matched to the rows of x by position, not by index

        returns:
        . pd.DataFrame indexed with gene names (columns of x), sorted by pvalue
          from the most, to the least significant, with the columns:
          'pvalues' - pvalue of the t-test,
          'Log2FC'  - median(class 0) / median(class 1)

        NOTE(review): despite its name, 'Log2FC' holds the plain ratio of the class medians,
        no log2 is applied; the ratio is inf/nan when the class 1 median is 0.
        ...............
        important: t-test should be used only with a relatively large number of replicates
        in each group, eg >20. For more information see these articles:
           https://academic.oup.com/ije/article/39/6/1597/736515
           https://doi.org/10.1016/j.csbj.2021.05.040
    '''
    # test input objects
    assert isinstance(x, pd.DataFrame), "x_train Incorrect obj type"
    assert isinstance(y, pd.Series), "y_train Incorrect obj type"
    assert x.shape[0] == len(y), "x and y must have the same number of samples"

    # plain arrays, so the class masks are applied by row position
    values = x.to_numpy(dtype=float)
    labels = y.to_numpy()

    # divide the set into two groups
    group_0 = values[labels == 0]
    group_1 = values[labels == 1]

    # .. ttest, for all genes at once (one test per column)
    pvalues = stats.ttest_ind(group_0, group_1, axis=0).pvalue

    # .. ratio of the class medians, for all genes at once
    median_ratio = np.median(group_0, axis=0) / np.median(group_1, axis=0)

    # store results in nice dataframe
    ttest_results = pd.DataFrame(
        {'pvalues': pvalues, 'Log2FC': median_ratio},
        index=x.columns
    )

    return ttest_results.sort_values(by="pvalues", ascending=True)

__create main preprocessing function for tpm data__

In [10]:
# Function, .................................
def prepare_tpm_data(
    x_train, y_train, x_te=None, y_te=None, noise_tr=0.5, sp_filter_tr=0.95,
    sp_filter_quantile=True, genes_to_select=1000
):
    ''' Runs all tpm preprocessing steps on the train data, and optionally applies
        the fitted steps to one or more test datasets. For each dataset it prints
        the removed samples, and displays a summary table of all preprocessing stages.
        The function returns nothing, the results are only reported.

        steps:
        1. log1p transformation
        2. ZeroValueFilter, fitted on train data
        3. RobustScaler applied to the transposed table, ie. each sample is scaled separately
        4. SpearmanFilter, fitted on train data, removes potential sample outliers
        5. selection of the top genes, ranked on train data with select_genes()

        parameters:
        . x_train; pd.DataFrame, samples in rows, genes in columns
        . y_train; pd.Series, with the labels of train samples
        . x_te; None, or list with pd.DataFrame's with test data
        . y_te; list with pd.Series, with labels for each df in x_te, required if x_te is given
        . noise_tr; passed as na_tr to ZeroValueFilter.fit_transform()
        . sp_filter_tr; passed as tr to SpearmanFilter.fit_transform()
        . sp_filter_quantile; passed as quantile to SpearmanFilter.fit_transform()
        . genes_to_select; int, number of genes with the lowest pvalues that are kept

        NOTE(review): transformed data are not returned - add a return statement
        when the function is used outside of this notebook.
    '''
    # names of the preprocessing stages shown in each report
    stage_names = ['input', 'log', 'log_filtered', 'log_filtered_scaled',
                   'outliers_removed', 'top_genes']

    # - instantiate transformers ...........................................
    robust_scaler = RobustScaler()
    zv_filter = ZeroValueFilter() # custom build for the project
    sp_filter = SpearmanFilter() # custom build for the project

    # - fit_transform train data ...........................................

    # test input df, & work on df copy,
    assert isinstance(x_train, pd.DataFrame), "x_train Incorrect obj type"
    assert isinstance(y_train, pd.Series), "y_train Incorrect obj type"
    x, y = x_train.copy(), y_train.copy()

    # step 1. log1p to combat heteroscedasticity,- no params
    x_log = log_transformer(x)

    # step 2. remove genes with too much noise,
    x_log_filtered = zv_filter.fit_transform(x_log, na_tr=noise_tr)

    # step 3. robust scaler - transpose to work per sample
    arr = robust_scaler.fit_transform(x_log_filtered.transpose())
    # rebuild original df, shape
    x_log_filtered_scaled = transpose_rebuild(arr, x_log_filtered)

    # step 4. remove potential outliers from train data - params to set
    x_transf, y_transf = sp_filter.fit_transform(
        x=x_log_filtered_scaled,
        y=y,
        tr=sp_filter_tr,
        quantile=sp_filter_quantile
    )
    # ... report idx of removed samples
    train_samples_removed = sp_filter._train_samples_removed
    print("removed samples", len(train_samples_removed), train_samples_removed)

    # step 5. select most significantly differently expressed genes
    genes_sumary = select_genes(x_transf, y_transf)
    top_scores_genes_names = genes_sumary.index.values[0:genes_to_select].tolist()
    x_transf_topgenes = x_transf.loc[:, top_scores_genes_names]

    # info
    report = make_tpm_summary(
        df_list=[x, x_log, x_log_filtered, x_log_filtered_scaled, x_transf, x_transf_topgenes],
        name_list=stage_names
    )
    display(report)

    # - transform test data ...........................................
    if x_te is None:
        return
    assert y_te is not None, "y_te is required when x_te is provided"

    for x_test, y_test in zip(x_te, y_te):

        # test input df, & work on df copy,
        assert isinstance(x_test, pd.DataFrame), "x_test Incorrect obj type"
        assert isinstance(y_test, pd.Series), "y_test Incorrect obj type"
        x, y = x_test.copy(), y_test.copy()

        # step 1. log1p to combat heteroscedasticity,- no params
        x_log = log_transformer(x)

        # step 2. remove the genes rejected on train data,
        x_log_filtered = zv_filter.transform(x_log)

        # step 3. robust scaler - transpose to work per sample
        # Each sample is scaled with its own median and IQR, so nothing learned
        # from the train samples can be reused here; the scaler fitted on train
        # data also expects exactly as many columns as there were train samples.
        arr = RobustScaler().fit_transform(x_log_filtered.transpose())
        x_log_filtered_scaled = transpose_rebuild(arr, x_log_filtered)

        # step 4. remove potential outliers, with the threshold found on train data
        x_transf, y_transf = sp_filter.transform(
            x=x_log_filtered_scaled,
            y=y
        )
        # ... report idx of removed test samples
        test_samples_removed = sp_filter._test_samples_removed
        print("removed samples", len(test_samples_removed), test_samples_removed)

        # step 5. keep the genes selected on train data
        x_transf_topgenes = x_transf.loc[:, top_scores_genes_names]

        # info
        report = make_tpm_summary(
            df_list=[x, x_log, x_log_filtered, x_log_filtered_scaled, x_transf, x_transf_topgenes],
            name_list=stage_names
        )
        display(report)






In [116]:
# ********************************************************************************** #
#                                                                                    #
#   Project: Ardigen task                                                            #                        
#   Author: Pawel Rosikiewicz                                                        #
#   Contact: prosikiewicz(a)gmail.com                                                #
#                                                                                    #
#   License: MIT License                                                             #
#   Copyright (C) 2022.06.04 Pawel Rosikiewicz                                       #
#                                                                                    #
# Permission is hereby granted, free of charge, to any person obtaining a copy       #
# of this software and associated documentation files (the "Software"), to deal      #
# in the Software without restriction, including without limitation the rights       #
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell          #
# copies of the Software, and to permit persons to whom the Software is              #
# furnished to do so, subject to the following conditions:                           #
#                                                                                    # 
# The above copyright notice and this permission notice shall be included in all     #
# copies or substantial portions of the Software.                                    #
#                                                                                    #
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR         #
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,           #
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE        #
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER             #
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,      #
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE      #
# SOFTWARE.                                                                          #
#                                                                                    #
# ********************************************************************************** #



#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os # allow changing, and navigating files and folders, 
import sys
import re # module to use regular expressions, 
import glob # lists names in folders that match Unix shell patterns
import random # functions that use and generate random numbers

import numpy as np # support for multi-dimensional arrays and matrices
import pandas as pd # library for data manipulation and analysis
import seaborn as sns # advance plots, for statistics, 
import matplotlib as mpl # to get basic plt   functions, heping with plot mnaking 
import matplotlib.pyplot as plt # for making plots, 



class SpearmanFilter():
    ''' .............................................................
        Custom made transformer for TPM data provided from RNAseq experiments
        .............................................................        
        Allows Rank-based filtering with Spearman correlation of samples on rnaseq data.
        * first it creates an average gene expression profile from all train samples
        * then, it calculates spearman rho correlation coef, for each sample, and the average profile
        * finally, it removes the samples, that are either below preset threshold, or at lower quantile, 
        * the threshold, methods, and averaging methods are parametrized  
        
        Samples are expected in rows, and genes in columns of the input dataframe.
        Samples with missing (nan) correlation are dropped from the returned data, 
        but they are not listed as removed, nor as accepted.
        
        FUNCTIONS
        . fit_transform()    : see below fit transform policy
        . transform()        : see function for transform policy
    '''

    # (a) private methods and variables, 

    # Method,.....................................
    def __init__(self):
        
        # parameters
        self._tr=None # correlation threshold, samples with corr<tr are removed
        self._quantile=None
        self._method=None
        self._avg_menthod=None

        # train data
        self._train_mean_profile=None
        self._train_samples_corr=None
        self._train_samples_removed=None
        self._train_samples_accepted=None
        
        # test data
        self._test_samples_corr=None
        self._test_samples_removed=None
        self._test_samples_accepted=None        

    # Method,.....................................
    @staticmethod
    def _correlate_with_profile(x, profile, method):
        ''' Calculates correlation between each sample (row of x) and the profile
            . x; Pandas DataFrame
            . profile; pandas Series, indexed with column names of x
            . method; str, method from pandas.df.corr()
            returns: pandas Series with one corr. value per sample, indexed like x
        '''
        # done with a loop over samples, the sample nr is relatively small
        samples_corr = [
            pd.concat([x.iloc[position, :], profile], axis=1).corr(method=method).iloc[0, 1]
            for position in range(x.shape[0])
        ]
        return pd.Series(samples_corr, index=x.index.values.tolist(), dtype=float)

    # Method,.....................................
    @staticmethod
    def _split_by_threshold(samples_corr, tr):
        ''' Compares corr. values with the threshold
            returns: boolean array marking accepted samples (by position), 
                     list with labels of removed samples, 
                     list with labels of accepted samples
        '''
        accepted_mask = (samples_corr >= tr).to_numpy()
        removed_mask = (samples_corr < tr).to_numpy()
        removed = samples_corr.index[removed_mask].tolist()
        accepted = samples_corr.index[accepted_mask].tolist()
        return accepted_mask, removed, accepted
    
    # (b) public methods
    
    # Method,.....................................
    def fit_transform(self, x, y, tr=0.95, quantile=True, method='spearman', avg_menthod="median"):
        ''' Allows Rank-based filtering with Spearman correlation of samples on rnaseq data.
            * first it creates an average gene expression profile for all samples
            * then, it calculates spearman rho correlation coef, for each sample, and the average profile
            * finally, it removes the samples, that are either below preset threshold, or at lower quantile, 
            * the threshold, methods, and averaging methods are parametrized  
            
            parameters:
            . x; Pandas DataFrame
            . y; pandas Series, in the same row order as x
            . tr; float [0-1], if quantile=False (see below) it will reject all samples with corr<tr, 
            . quantile; bool, if True, tr is used to calculate lower quantile boundary for 1-tr 
            . method; str, eg: 'spearman', method from pandas.df.corr()
            . avg_menthod; str, "median", any other value results in "mean"
            
            returns:
            . transformed x; Pandas DataFrame
            . transformed y; pandas Series            
            
            comments:
            parameters used, and list of corr, results, and samples rejected and accepted are available as private variables, 
            rejected and accepted samples are stored with their index labels in x, 
        '''
        
        # store threshold parameters,  
        self._tr=tr # replaced below with the quantile value, if quantile is used
        self._quantile=quantile
        self._method=method
        self._avg_menthod=avg_menthod
        
        # Test input df,
        assert isinstance(x, pd.DataFrame), "Incorrect obj type: x shoudl be pd.DataFrame"
        assert isinstance(y, pd.Series), "Incorrect obj type: y shoudl be pd.Series"
        assert x.shape[0] == len(y), "x and y must have the same number of samples"

        # create average gene expression profile for train data
        if avg_menthod=="median":
            self._train_mean_profile = x.apply(np.median, axis=0)
        else:
            self._train_mean_profile = x.apply(np.mean, axis=0)
            
        # calculate corr between each sample, and avg_profile
        train_samples_corr = self._correlate_with_profile(x, self._train_mean_profile, method)
        self._train_samples_corr = train_samples_corr

        # with quantile option, 1-threshold is used as quantile value, 
        # and the resulting corr. value becomes the threshold, also for transform()
        if quantile:
            self._tr = train_samples_corr.quantile(1-tr)

        # find samples to remove and accept
        accepted_mask, removed, accepted = self._split_by_threshold(train_samples_corr, self._tr)
        self._train_samples_removed = removed
        self._train_samples_accepted = accepted

        # remove rejected samples and return the data
        # boolean mask selects rows by position, thus it works with any index in x and y
        x_transf = x.iloc[accepted_mask,:]
        y_transf = y.iloc[accepted_mask]
        
        return x_transf, y_transf 
            
            
    # Method,.....................................
    def transform(self, x, y, inform=False):
        ''' transform method for Rank-based filtering with Spearman correlation of samples on rnaseq data.
            * it uses the average gene expression profile created with fit_transform on train data
            * it calculates correlation coef, for each sample in x, and that profile
            * finally, it removes the samples, with correlation below the threshold set by fit_transform
            
            parameters:
            . x; Pandas DataFrame
            . y; pandas Series, in the same row order as x
            . inform; bool; if True, function will return pd.series with correlation values only, 
            
            returns:
            . if inform=False: transformed x; Pandas DataFrame, and transformed y; pandas Series
            . if inform=True: pd.Series with train corr. values, and pd.Series with test corr. values
            
            raises:
            . RuntimeError, if used before fit_transform
            
            comments:
            parameters used, and list of corr, results, and samples rejected and accepted are available as private variables, 
        '''
        # the threshold, and the profile are taken from fit_transform method
        if self._train_mean_profile is None or self._tr is None:
            raise RuntimeError("SpearmanFilter: fit_transform() must be called before transform()")
        
        # Test input df,
        assert isinstance(x, pd.DataFrame), "Incorrect obj type: x shoudl be pd.DataFrame"
        assert isinstance(y, pd.Series), "Incorrect obj type: y shoudl be pd.Series"
        assert x.shape[0] == len(y), "x and y must have the same number of samples"

        # calculate corr between each sample, and avg_profile from train data
        test_samples_corr = self._correlate_with_profile(x, self._train_mean_profile, self._method)
        self._test_samples_corr = test_samples_corr

        # find samples to remove and accept
        accepted_mask, removed, accepted = self._split_by_threshold(test_samples_corr, self._tr)
        self._test_samples_removed = removed
        self._test_samples_accepted = accepted

        if inform:
            return self._train_samples_corr, self._test_samples_corr

        # remove rejected samples and return the data
        # boolean mask selects rows by position, thus it works with any index in x and y
        x_transf = x.iloc[accepted_mask,:]
        y_transf = y.iloc[accepted_mask]
        return x_transf, y_transf 

In [118]:
# Trial run of the preprocessing function: the same gene table and labels are
# passed as the train data, and as the only test dataset.
prepare_tpm_data(
    x_train=data_genes,
    y_train=pd.Series(target.iloc[:,0]),
    x_te=[data_genes],
    y_te=[pd.Series(target.iloc[:,0])]
)





            
            
    

removed samples 10 [1, 26, 36, 55, 86, 120, 121, 125, 137, 159]


Unnamed: 0,input,log,log_filtered,log_filtered_scaled,outliers_removed,top_genes
sample nr,200,200,200,200,190,190
gene nr,31085,31085,18510,18510,18510,1000
- VALUES -,,,,,,
min tpm value recorded,0.0,0.0,0.0,-0.9,-0.9,-0.9
median tpm value recorded,0.3,0.3,2.0,0.0,0.0,0.1
mean tpm value recorded,32.2,1.3,2.1,0.0,0.0,0.1
max tpm value recorded,314419.1,12.7,12.7,5.1,5.1,3.2
- DETECTED IN -,,,,,,
mean % of genes expressed per sample,60.1%,60.1%,95.9%,100.0%,100.0%,100.0%
% of genes not expressed in any sample,17.6%,17.6%,0.0%,0.0%,0.0%,0.0%


removed samples 10 [1, 26, 36, 55, 86, 120, 121, 125, 137, 159]


Unnamed: 0,input,log,log_filtered,log_filtered_scaled,outliers_removed,top_genes
sample nr,200,200,200,200,190,190
gene nr,31085,31085,18510,18510,18510,1000
- VALUES -,,,,,,
min tpm value recorded,0.0,0.0,0.0,-0.9,-0.9,-0.9
median tpm value recorded,0.3,0.3,2.0,0.0,0.0,0.1
mean tpm value recorded,32.2,1.3,2.1,0.0,0.0,0.1
max tpm value recorded,314419.1,12.7,12.7,5.1,5.1,3.2
- DETECTED IN -,,,,,,
mean % of genes expressed per sample,60.1%,60.1%,95.9%,100.0%,100.0%,100.0%
% of genes not expressed in any sample,17.6%,17.6%,0.0%,0.0%,0.0%,0.0%


In [86]:
def select_genes(x, y):
    ''' Ranks genes by how differently they are expressed between the two classes of samples.
        For each gene (column of x) it runs a two-sample Student t-test between the samples
        labelled 0 and the samples labelled 1, and calculates the relative change of the
        class medians.

        parameters:
        . x - pd.DataFrame, samples in rows, genes in columns, numeric values only
        . y - pd.Series with class labels (0 / 1), in the same row order as x,
          labels are matched to the rows of x by position, not by index

        returns:
        . pd.DataFrame indexed with gene names (columns of x), sorted by pvalue
          from the most, to the least significant, with the columns:
          'pvalues'    - pvalue of the t-test,
          'foldchange' - abs(median(class 0) - median(class 1)) / median(class 0),
                         it is inf/nan when the class 0 median is 0

        important: t-test should be used only with a relatively large number of replicates
        in each group, eg >20. For more information see these articles:

        > "Detecting differentially expressed genes in heterogeneous diseases using half Student’s t-test"
           https://academic.oup.com/ije/article/39/6/1597/736515

        > "Robustness of differential gene expression analysis of RNA-seq"
           https://doi.org/10.1016/j.csbj.2021.05.040

    '''
    # test input objects
    assert isinstance(x, pd.DataFrame), "x_train Incorrect obj type"
    assert isinstance(y, pd.Series), "y_train Incorrect obj type"
    assert x.shape[0] == len(y), "x and y must have the same number of samples"

    # plain arrays, so the class masks are applied by row position
    values = x.to_numpy(dtype=float)
    labels = y.to_numpy()

    # divide the set into two groups
    group_0 = values[labels == 0]
    group_1 = values[labels == 1]

    # .. ttest, for all genes at once (one test per column)
    pvalues = stats.ttest_ind(group_0, group_1, axis=0).pvalue

    # .. relative change of the class medians, for all genes at once
    median_0 = np.median(group_0, axis=0)
    median_1 = np.median(group_1, axis=0)
    fold_change = np.abs(median_0 - median_1) / median_0

    # store results in nice dataframe
    ttest_results = pd.DataFrame(
        {'pvalues': pvalues, 'foldchange': fold_change},
        index=x.columns
    )

    return ttest_results.sort_values(by="pvalues", ascending=True)
    


In [None]:
# Scratch version of the gene ranking, written as plain statements.
# It needs x_transf and y_transf to exist as notebook variables; in the visible
# cells they are only created inside prepare_tpm_data() - TODO confirm where they come from.

# inputs
x = x_transf
y = y_transf

# divide the set into two group
ttest_results=[]
fold_change_results=[]
for idx in range(x.shape[1]):
    # values of one gene (one column), in all samples
    one_row = x_transf.iloc[:,idx].values
    a = one_row[y==0]
    b = one_row[y==1]
    
    # .. ttest
    ttest_results.append((stats.ttest_ind(a, b).pvalue))
    
    # relative change of the class medians, no log is applied here
    fold_change_results.append(np.abs(np.median(a)-np.median(b))/np.median(a))
    
# store results in a dataframe, one row per gene
# (the ttest_results list is replaced with that dataframe)
ttest_results = pd.DataFrame([ttest_results,fold_change_results]).transpose()
ttest_results.columns = ['pvalues', 'foldchange']
ttest_results.index = x.columns