In [1]:
import pandas as pd
import numpy as np
import os
from statsmodels.stats.proportion import proportion_confint
from scipy.stats import fisher_exact

In [2]:
# Phenotype table used by `do_stuff`, which indexes it by its 'eid' column.
dataset = pd.read_csv(filepath_or_buffer = "/cs/labs/michall/roeizucker/10krun/ukbb_dataset.csv")

In [4]:
# Destination passed to `document.save` further down when SAVE_DOCX is enabled.
OUTPUT_DOCX_FILE_PATH = '/cs/phd/nadavb/pwas_results/ukbb_imputation_cancer_significant_pwas_genes_risk_plots.docx'
# Switch to True to create a Word document object (needs the python-docx package).
SAVE_DOCX = False

if SAVE_DOCX:
    # Imported lazily so the notebook runs without python-docx when no document is requested.
    from docx import Document
    document = Document()

def mask_to_index_intervals(mask):
    """
    Yield (start, end) index pairs for every run of consecutive truthy entries in `mask`.

    `start` is the index of the first truthy entry of the run and `end` is the index one past
    its last truthy entry (half-open interval). A run that reaches the end of the mask is closed
    at len(mask).
    """
    # A trailing False sentinel guarantees that a run touching the end of the mask is emitted.
    padded_flags = list(mask)
    padded_flags.append(False)

    run_start = None

    for position, is_set in enumerate(padded_flags):
        if is_set:
            if run_start is None:
                run_start = position
        elif run_start is not None:
            yield run_start, position
            run_start = None

def choose_spaced_values(prioritized_values, min_diff):
    """
    Greedily pick values, in the given priority order, that are at least `min_diff` apart.

    The first value is always taken. Every later value is taken only if its absolute distance
    to each already-taken value is >= `min_diff`. The taken values are returned as a numpy
    array sorted in ascending order.
    """
    remaining = np.array(prioritized_values)
    picked = []

    while len(remaining) > 0:
        # The head of the queue is the highest-priority candidate still alive.
        head, remaining = remaining[0], remaining[1:]
        picked.append(head)

        # Distance from each surviving candidate to its nearest picked value.
        pairwise_gaps = np.abs(remaining.reshape(-1, 1) - np.array(picked).reshape(1, -1))
        nearest_gap = pairwise_gaps.min(axis = -1)
        remaining = remaining[nearest_gap >= min_diff]

    picked.sort()
    return np.array(picked)

def create_fisher_cont_table(n_total, n1, n2, n_overlap):
    """
    Build a 2x2 contingency table (list of two rows) for two subsets of a population.

    n_total is the population size, n1 and n2 are the sizes of the two subsets and n_overlap
    is the size of their intersection. The rows are:
        [in both,   in set 1 only]
        [in set 2 only, in neither]
    """
    only_in_first = n1 - n_overlap
    only_in_second = n2 - n_overlap
    in_neither = n_total - n1 - n2 + n_overlap

    first_row = [n_overlap, only_in_first]
    second_row = [only_in_second, in_neither]
    return [first_row, second_row]

def add_fig_to_doc(document, fig, width_inches = 5, save_fig_kwargs = None):
    """
    Render a figure into an in-memory buffer and insert it into a document as a picture.

    Parameters:
        document: object exposing `add_picture(stream, width = ...)`.
        fig: object exposing `savefig(stream, **kwargs)`.
        width_inches: picture width, wrapped in `docx.shared.Inches` before being passed on.
        save_fig_kwargs: optional dict of extra keyword arguments forwarded to `fig.savefig`;
            None (the default) or an empty dict forwards nothing.
    """

    from io import BytesIO
    from docx.shared import Inches

    # A None default avoids sharing one mutable dict object between calls.
    if save_fig_kwargs is None:
        save_fig_kwargs = {}

    with BytesIO() as memfile:
        fig.savefig(memfile, **save_fig_kwargs)
        # savefig leaves the position at the end of the written data; rewind so the picture is
        # read from the first byte regardless of whether add_picture seeks on its own.
        memfile.seek(0)
        document.add_picture(memfile, width = Inches(width_inches))

        
        
def _compute_score_partitions(gene_effect_scores, score_name, total_n_samples, total_n_cases, min_x_spacing = 0.05):
    """
    Build the table of cumulative "bottom"/"top" partitions for one effect-score column.

    Samples are grouped by their value in `gene_effect_scores[score_name]`. For every distinct
    score s, a "bottom" partition holds all samples with score <= s and a "top" partition holds
    all samples with score >= s. Each kept partition is annotated with its size, case count,
    case frequency, the same statistics for its complement, a display name, a Wilson confidence
    interval for the case proportion and a Fisher exact test (odds ratio and p-value).

    Parameters:
        gene_effect_scores: DataFrame with a `score_name` column and a 'phenotype' column.
        score_name: name of the score column to partition by.
        total_n_samples: number of rows in `gene_effect_scores` (must be > 0).
        total_n_cases: sum of the 'phenotype' column.
        min_x_spacing: minimal distance, on the signed -log10(quantile) axis, between kept partitions.

    Returns:
        DataFrame with one row per kept partition, sorted by the 'x' column.
    """

    score_counts = (
        gene_effect_scores.groupby(score_name)['phenotype']
        .agg(['size', 'sum'])
        .astype(int)
        .sort_index()
        .reset_index()
        .rename(columns = {'size': 'n_samples', 'sum': 'n_cases', score_name: 'score'})
    )
    cumulative_score_counts = score_counts[['n_samples', 'n_cases']].cumsum()

    # Bottom partitions: everything up to and including each score (plain cumulative counts).
    bottom_score_partitions = pd.concat([pd.Series(len(score_counts) * ['bottom']).rename('type'), score_counts['score'],
            cumulative_score_counts['n_samples'], cumulative_score_counts['n_cases']], axis = 1)
    # Top partitions: everything from each score upwards (total minus the strictly-lower part).
    top_score_partitions = pd.concat([pd.Series(len(score_counts) * ['top']).rename('type'), score_counts['score'],
            total_n_samples - cumulative_score_counts['n_samples'] + score_counts['n_samples'],
            total_n_cases - cumulative_score_counts['n_cases'] + score_counts['n_cases']], axis = 1)
    score_partitions = pd.concat([bottom_score_partitions, top_score_partitions]).reset_index(drop = True)

    score_partitions['quantile'] = score_partitions['n_samples'] / total_n_samples
    score_partitions['case_freq'] = score_partitions['n_cases'] / score_partitions['n_samples']

    score_partitions['complement_n_samples'] = total_n_samples - score_partitions['n_samples']
    score_partitions['complement_n_cases'] = total_n_cases - score_partitions['n_cases']
    score_partitions['complement_case_freq'] = score_partitions['complement_n_cases'] / score_partitions['complement_n_samples']

    # Signed -log10(quantile): positive for "top" partitions, negative for "bottom" ones.
    score_partitions['x'] = -np.log10(score_partitions['quantile']) * score_partitions['type'].map({'top': 1, 'bottom': -1})
    score_partitions = score_partitions.drop_duplicates(subset = ['x']).sort_values('x')
    # Thin out partitions that are too close on the x axis, giving priority to small |x|.
    selected_x = choose_spaced_values(sorted(score_partitions['x'].tolist(), key = abs), min_x_spacing)
    # .copy() so the column assignments below act on an independent frame rather than on a
    # slice of the previous one (which triggers pandas' SettingWithCopyWarning).
    score_partitions = score_partitions[score_partitions['x'].isin(set(selected_x))].copy()

    def partition_label(partition):
        if partition['quantile'] == 1:
            return 'All'
        label_format = '%s %d%%' if partition['quantile'] >= 0.1 else '%s %.1g%%'
        return label_format % (partition['type'].capitalize(), 100 * partition['quantile'])

    score_partitions['name'] = score_partitions.apply(partition_label, axis = 1)

    confidence_intervals = score_partitions.apply(lambda partition: pd.Series(list(proportion_confint(
            partition['n_cases'], partition['n_samples'], method = 'wilson')), index = ['case_p_low_ci', 'case_p_high_ci']),
            axis = 1)
    score_partitions = pd.concat([score_partitions, confidence_intervals], axis = 1)

    # 2x2 table of case status (all cases) against partition membership.
    fisher_results = score_partitions.apply(lambda partition: pd.Series(list(
            fisher_exact(create_fisher_cont_table(total_n_samples, total_n_cases, partition['n_samples'], partition['n_cases']))),
            index = ['OR', 'fisher_pval']), axis = 1)
    score_partitions = pd.concat([score_partitions, fisher_results], axis = 1)

    return score_partitions

def do_stuff(phenotype_name,uniprot_id,pwas_qval,is_dominant,is_recessive,gene_index, gene_record):
    """
    Compute score-partition risk tables for one gene/phenotype pair and save them as CSV.

    Reads the gene's effect scores from '<gene_index>.csv', attaches the phenotype column taken
    from the module-level `dataset` (matched on 'eid' / 'sample_id'), drops rows with missing
    values and, for the 'dominant' and then the 'recessive' score column, builds the partition
    table and writes it to
    '/cs/labs/michall/roeizucker/10krun/risk_scores_example/<phenotype_name>/<uniprot_id>.csv'.

    Parameters:
        phenotype_name: phenotype identifier; converted to the `dataset` column name and used as
            the output sub-directory name.
        uniprot_id: string used as the output file name (without extension).
        pwas_qval, is_dominant, is_recessive, gene_record: not used by the computation; kept so
            existing callers keep working.
        gene_index: integer used to locate the gene's effect-score CSV.

    Returns:
        None. Returns early, without processing further score columns, when no samples remain
        after dropping missing values or when a score column has no partition covering at most
        half of the samples.
    """

    phenotype_col = phenotype_name.replace('pan_', 'pan-').replace('non_', 'non-').replace('_', ' ').capitalize().replace('Pan-cancer', 'Pan-Cancer')

    eid_to_phenotype = dataset.set_index('eid')[phenotype_col].dropna()
    gene_effect_scores = pd.read_csv(os.path.join('/cs/usr/roeizucker/my_storage/virt_env_install_test/ukbb_imputation_gene_effect_scores/', '%d.csv' % gene_index),
            index_col = 'sample_id')
    gene_effect_scores['phenotype'] = eid_to_phenotype.reindex(gene_effect_scores.index)
    gene_effect_scores = gene_effect_scores.dropna()

    total_n_samples = len(gene_effect_scores)
    if total_n_samples == 0:
        # Nothing to partition; the quantile computation would otherwise divide by zero.
        return
    total_n_cases = int(gene_effect_scores['phenotype'].sum())

    output_path = '/cs/labs/michall/roeizucker/10krun/risk_scores_example/' + phenotype_name  +'/'+ uniprot_id + '.csv'

    for score_name in ('dominant', 'recessive'):

        score_partitions = _compute_score_partitions(gene_effect_scores, score_name, total_n_samples, total_n_cases)

        if not (score_partitions['quantile'] <= 0.5).any():
            return

        # NOTE(review): both passes write to the same path, so the 'recessive' table replaces the
        # 'dominant' one whenever both are produced. Kept as-is because consumers of this file
        # are not visible here - confirm whether per-score output files are wanted.
        score_partitions.to_csv(output_path)
       


   
# Persist the Word document only when document generation was switched on via SAVE_DOCX.
if SAVE_DOCX:
    document.save(OUTPUT_DOCX_FILE_PATH)

# Printed unconditionally once the statements above have run.
print('Done.')

Done.


In [5]:
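# Shell command kept for reference only (it is not Python; prefix it with `!` to run it from the notebook):
# sbatch --array=0-3 --mem=30g -c1 --time=1-0 --killable --requeue --wrap="~/my_storage/10krun/base_script.py"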
# Hand second_script.py (called with the argument 1) to sbatch; the string has no placeholders,
# so a plain literal is used.
os.system("sbatch --mem=8g -c1 --time=15:0:0 --wrap 'python3 /cs/labs/michall/roeizucker/10krun/second_script.py 1'")

In [None]:
import phenotype_specs

# Drive `do_stuff` over the genes listed in each phenotype's results CSV.
#
# NOTE(review): `done`, `current`, `counter` and `max_genes` are not defined in any visible
# notebook cell. This loop assumes they already exist in the kernel (presumably `done` is a list
# of finished phenotype names and `max_genes` a cap on processed genes) - TODO confirm and define
# them in a configuration cell so the notebook survives Restart & Run All.
for spec in phenotype_specs.specs:
    # Skip finished phenotypes before doing any file-system work for them.
    if spec['name'] in done:
        continue

    file_name = '/cs/labs/michall/roeizucker/10krun/' + spec['name'] + '.csv'
    if not os.path.isfile(file_name):
        continue

    # FIXME: disabled. The original f-string ended in an empty `{}` replacement field, which is a
    # SyntaxError and prevented the whole cell from compiling. The intended second argument is
    # unknown; supply it before re-enabling this submission.
    # os.system(f"sbatch --mem=8g -c1 --time=15:0:0 --wrap 'python3 minimap_fastq.py {spec['name']} <MISSING ARGUMENT>'")

    # Output directory that `do_stuff` writes its per-gene CSV files into.
    os.makedirs('/cs/labs/michall/roeizucker/10krun/risk_scores_example/' + spec['name'], exist_ok = True)
    doc = pd.read_csv(file_name)

    # NOTE(review): `counter` is only (re)set for the `current` phenotype and is otherwise carried
    # over from the previous phenotype / earlier kernel state - confirm whether the `max_genes`
    # cap is meant per phenotype or across all of them.
    if spec['name'] == current:
        counter=68
    for _, row in doc.iterrows():
        if counter >= max_genes:
            break
        # Genes without a precomputed effect-score file are reported and skipped.
        if not os.path.isfile('/cs/usr/roeizucker/my_storage/virt_env_install_test/ukbb_imputation_gene_effect_scores/'+ str(row['gene_index'])+ '.csv' ):
            print("error")
            continue

        do_stuff(spec['name'],row['uniprot_id'],row['fdr_qval'],True,True,row['gene_index'],{'symbol':row['symbol']})
        counter+=1
    print("done", spec['name'])
    done.append(spec['name'])
