# Omnibus and post hoc testing

## Imports

In [1]:
import sys
from sklearn.ensemble import IsolationForest
import scikit_posthocs as sp
import pathlib
import numpy as np
import pandas as pd
from collections import defaultdict

# Root used to locate project resources; ".." is resolved against the kernel's
# current working directory, so the notebook must be launched from its own folder.
rel_root = pathlib.Path("..")
# Extend the import search path with "<rel_root>/utils" so the two project-local
# modules imported below can be found (presumably they live there -- confirm).
sys.path.append(f'{rel_root}/utils')
import analysis_utils as au
import preprocess_utils as ppu

  from .autonotebook import tqdm as notebook_tqdm


# Preprocess data using preprocess utils

In [2]:
# Single seed shared by every stochastic step so results are reproducible.
rnd_val = 0
# NumPy Generator seeded with the shared seed (not referenced by the cells visible here).
rng = np.random.default_rng(rnd_val)

In [3]:
filename = 'nf1_sc_all_cellprofiler.csv.gz'
plates = [1, 2]
platesdf = []  # One inlier-only dataframe per plate, appended in the order of `plates`

for plate in plates:
    # Initialize the preprocessing helper for this plate
    po = ppu.Preprocess_data(plate=plate, filename=filename, rel_root=rel_root,
                             kept_meta_columns=['Metadata_genotype'])

    # Dataframe with metadata columns removed, except for the
    # 'Metadata_genotype' column requested above. The object-number column is
    # dropped without mutating whatever frame the helper returned.
    platedf = po.get_ml_df().drop(columns=['Cytoplasm_Number_Object_Number'])

    # Fit an isolation forest on the feature columns only (genotype label excluded);
    # fit_predict labels inliers with 1 and outliers with -1.
    isof = IsolationForest(random_state=rnd_val)
    out_preds = isof.fit_predict(platedf.drop(columns=['Metadata_genotype']))
    ind = np.nonzero(out_preds == 1)[0]  # Positional indices of the inliers

    # Store an explicit copy of the inlier rows: later code assigns new columns
    # to these frames, and assigning into an .iloc slice triggers pandas'
    # SettingWithCopyWarning.
    platesdf.append(platedf.iloc[ind].copy())
    

# Conduct testing with Scheffé's test

In [4]:
# Post hoc procedure (Scheffe's test from scikit-posthocs) handed to posthoc_test below
test = sp.posthoc_scheffe
# Pass the plate dataframes in plate order so that genotypes are suffixed correctly (e.g. WT1WT2)
# Plates are 1-indexed
st = au.Sig_testing(platesdf)
# Omnibus ANOVA first; its two return values are forwarded unchanged to the post hoc step
# (presumably the tested features and their significant p-values -- confirm in analysis_utils)
anova_feats, sig_anova_pvals = st.anova_test()
res_test = st.posthoc_test(anova_feats, sig_anova_pvals, test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['plate'] = [str(i+1)]*len(df)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['group'] = df['Metadata_genotype'] + df['plate']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['Metadata_genotype','plate'], inplace=True) # Remove unnecessary columns for testing


In [5]:
# Build a dictionary keyed by genotype pair (e.g. 'WT1WT2') whose values hold the
# names of the columns found significant for that pair, taken from the
# 'sig_feat_phoc' entry of the post hoc results
sig_groups = st.get_columns(res_test['sig_feat_phoc'])

In [6]:
# Denominator for the proportions: count feature columns only. The plate frames
# also carry label columns ('Metadata_genotype', and 'plate'/'group' if the
# significance-testing step added them in place), which are never candidates
# for significance and would otherwise understate every proportion.
non_feature_cols = {'Metadata_genotype', 'plate', 'group'}
tot_columns = sum(col not in non_feature_cols for col in platesdf[0].columns)

# One row per genotype pair with the number of significant columns for that pair
sig_groupsdf = pd.DataFrame(
    [(grp, len(cols)) for grp, cols in sig_groups.items()],
    columns=['genotype', 'number_of_significant_columns'],
)
sig_groupsdf['proportion_of_significant_columns'] = (
    sig_groupsdf['number_of_significant_columns'] / tot_columns
)
# Two decimals are enough for the summary table
sig_groupsdf = sig_groupsdf.round({'proportion_of_significant_columns': 2})

In [7]:
# Destination of the tab-separated summary, relative to the current working directory.
# NOTE(review): 'signifance' looks like a misspelling of 'significance'; the name is
# left unchanged in case other code reads this exact path.
out_file = pathlib.Path('data/plate_1_2_genotype_signifance.tsv')
# Create the output directory on a fresh checkout so to_csv does not fail
out_file.parent.mkdir(parents=True, exist_ok=True)
sig_groupsdf.to_csv(out_file, sep='\t', index=False)

In [8]:
# Display the per-genotype-pair summary table as the cell output
sig_groupsdf

Unnamed: 0,genotype,number_of_significant_columns,proportion_of_significant_columns
0,WT1Null1,404,0.34
1,WT1WT2,622,0.52
2,WT1Null2,666,0.56
3,Null1WT2,750,0.63
4,Null1Null2,804,0.68
5,WT2Null2,794,0.67
