This notebook will produce a graph as follows:
- X axis is
    - em-var status (categorical) OR pleiotropy
    - mean predicted activity bin
- Y-axis is the odds ratio of phylop significance representation in that category


Minor technical note: the operations here may produce a table that is taller than required, i.e. the counts may be split by a column "X" that a given analysis never uses. This is harmless: the odds-ratio function `orc.compute_OR` performs a summation that collapses the irrelevant columns, so the extra rows can be left in place without consequence.

In [1]:
import glob
import pandas as pd

In [2]:
#importlib code is just for debugging: it lets changes made to the scripts we are importing from
#be reflected here. It can be removed if you are not making changes to those scripts.

import importlib ##can remove
import graphing_config as gc
import or_common as orc
importlib.reload(gc) ##can remove
importlib.reload(orc) ##can remove

<module 'or_common' from '/gpfs/gibbs/pi/reilly/VariantEffects/scripts/noon_scripts/5.graphs/odds_ratio/or_common.py'>

In [3]:
#Load the master count table. The path is a directory of CSV files; only the first
#match returned by glob is read, exactly as before.
#NOTE(review): the path is an absolute, user-specific location. An earlier draft read from
#a `data_base_path` setting instead - consider restoring that if graphing_config provides one.
master_table_glob="/home/mcn26/varef/scripts/noon_data/4.old.count/count_all.csv/*.csv"
master_table_paths=glob.glob(master_table_glob)

#fail with a message that names the pattern instead of a bare IndexError from `[0]`
if not master_table_paths:
    raise FileNotFoundError(f"no CSV file matches {master_table_glob!r}")

master_table=pd.read_csv(master_table_paths[0])

In [4]:
#undo the character substitutions in the column names
#(pyspark hates certain characters, so stand-ins were written instead:
# "&" stands for "." and "^" stands for ",")
_placeholder_to_original=str.maketrans({"&": ".", "^": ","})
master_table.columns=[label.translate(_placeholder_to_original) for label in master_table.columns]

In [5]:
#group the column names by prefix; these lists are used below to subset the table
#and to pick the strata / categories for the odds-ratio computation

columns=master_table.columns.tolist()

def _columns_starting_with(prefix):
    """Return the names in `columns` that begin with `prefix`, in table order."""
    return [label for label in columns if label.startswith(prefix)]

genomic_regions_cols=_columns_starting_with("is_in_")
mean_activity_cols=_columns_starting_with("mean_skew_")
emVar_cols=_columns_starting_with("emVar_")

In [6]:
#trim master_table down to the columns used from here on
#columns_to_keep holds the columns both analyses need: the genomic-region columns,
#the mean-activity columns, the counts and the phylop significance flag
columns_to_keep=[*genomic_regions_cols,*mean_activity_cols,"count","phylop_significant"]

#on top of those, keep the emVar columns and the raw pleiotropy column
master_columns=[*columns_to_keep,*emVar_cols,"pleio"]
master_table=master_table[master_columns]

In [9]:
#turn the pleiotropy column into one boolean mask column per value (0 through 3):
#pleio_<n> is True on the rows where "pleio" equals n
pleio_columns=[f"pleio_{level}" for level in range(4)]
for level,name in enumerate(pleio_columns):
    master_table[name]=(master_table["pleio"]==level)

In [10]:
#split into two independent sub-tables that share the common columns:
#first the pleiotropy analysis table, then the emVar status analysis table
pleio_table,emvar_table=(
    master_table[columns_to_keep+extra_cols].copy()
    for extra_cols in (pleio_columns,emVar_cols)
)

#the original table is no longer needed : drop it to free up memory
del master_table

In [17]:
def get_or(emvar_or_pleio):
    """Compute odds-ratio records for every (genome region, activity stratum, category) combination.

    Not a very reusable function: it reads the notebook-level tables and column
    lists directly and only exists so the emVar and pleiotropy analyses can
    share one loop.

    Parameters
    ----------
    emvar_or_pleio : str
        "emvar" to use `emvar_table` with the `emVar_cols` categories, or
        "pleio" to use `pleio_table` with the `pleio_columns` categories.

    Returns
    -------
    list of dict
        One record per combination. Each record has the keys
        "mean_activity_strata", "genome_region" (a region column name, or
        "all" when no region filter was applied) and a key named after
        `emvar_or_pleio` holding the category column name, merged with
        whatever `orc.compute_OR` returned for that subset.

    Raises
    ------
    ValueError
        If `emvar_or_pleio` is neither "emvar" nor "pleio".
    """
    if emvar_or_pleio=="emvar":
        table=emvar_table
        inner_cols=emVar_cols
    elif emvar_or_pleio=="pleio":
        table=pleio_table
        inner_cols=pleio_columns
    else:
        #raise instead of print + exit(-1): exit() would shut the notebook kernel down
        raise ValueError(
            "valid values for `emvar_or_pleio` are `emvar` and `pleio`, "
            f"got {emvar_or_pleio!r}"
        )

    #this is the return value : a list of dictionaries
    records=[]

    #"all" is a pseudo-region meaning "do not filter by genomic region"
    for genome_region in genomic_regions_cols+["all"]:
        for mean_activity_strata in mean_activity_cols:

            #Subset once per (region, stratum) pair, always starting again from the
            #full table so that filters from earlier iterations never accumulate.
            #The category loop below performs no subsetting of its own, so doing
            #this inside it would only repeat the same work.

            #only the mean activity stratum of interest (boolean mask)
            selected=table[table[mean_activity_strata]]

            #only the genomic region of interest (boolean mask)
            if genome_region!="all":
                selected=selected[selected[genome_region]]

            for cata in inner_cols:
                stats=orc.compute_OR(df=selected,
                                     a=cata,
                                     a_val=True,
                                     b="phylop_significant",
                                     b_val=True)

                #describe the current combination, then merge in the statistics
                record={"mean_activity_strata":mean_activity_strata,
                        "genome_region":genome_region,
                        emvar_or_pleio:cata}
                record.update(stats)

                records.append(record)

    return records
#show the records for the emVar analysis (pass "pleio" instead for the pleiotropy analysis)
get_or("emvar")

[{'mean_activity_strata': 'mean_skew_(-Inf, -1.5)',
  'genome_region': 'is_in_dELS',
  'emvar': 'emVar_K562',
  'OR': 1.3715766231536959,
  'ci_lower': 1.0452874519886601,
  'ci_upper': 1.7997177997331433,
  'p': 0.020942491308712997},
 {'mean_activity_strata': 'mean_skew_(-Inf, -1.5)',
  'genome_region': 'is_in_dELS',
  'emvar': 'emVar_SKNSH',
  'OR': 0.7809148430628667,
  'ci_lower': 0.47463416264180447,
  'ci_upper': 1.2848379659854465,
  'p': 0.32215385877954383},
 {'mean_activity_strata': 'mean_skew_(-Inf, -1.5)',
  'genome_region': 'is_in_dELS',
  'emvar': 'emVar_HepG2',
  'OR': 1.3721178954386355,
  'ci_lower': 0.33114095219376777,
  'ci_upper': 5.685517017784258,
  'p': 1.0},
 {'mean_activity_strata': 'mean_skew_[-1.5, -1.0)',
  'genome_region': 'is_in_dELS',
  'emvar': 'emVar_K562',
  'OR': 1.1887162133335365,
  'ci_lower': 1.0898765650644404,
  'ci_upper': 1.2965195152704962,
  'p': 7.243196815372367e-05},
 {'mean_activity_strata': 'mean_skew_[-1.5, -1.0)',
  'genome_region':