# Collating the inferred CS effects for all 3 deconvolution approaches & calculating Pearson correlation with GT effects

### What this does:
- Loads the inferred effects from all CS screens and computes the Pearson correlation between GT & CS effects for
    - Linear model deconvolution
    - Mahalanobis distance deconvolution
    - Bootstrapped Mahalanobis distance deconvolution
- Formats data for plotting
- Generates a plot comparing the methods

In [6]:
import numpy as np
import pandas as pd
from sklearn.metrics import precision_recall_curve,roc_curve, roc_auc_score, RocCurveDisplay, auc,average_precision_score,PrecisionRecallDisplay,precision_score,recall_score
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr

import matplotlib
# Figure export settings: write PDF text as Type 42 (TrueType) fonts and use
# Helvetica as the default font family.
matplotlib.rcParams.update({
    'pdf.fonttype': 42,
    "font.family": "Helvetica",
})

# Part one: Building a table summarizing GT, Mahalanobis, bootstrapped Mahalanobis, and regression L1 norm for every run, compression scheme, etc.

In [7]:
# --- Input / output locations (relative paths, resolved against the working directory) ---

# Regression deconvolution outputs; the "*_permute_model_coef.csv" files are read from here.
regression_read_path = "../../regression_deconvolution_December2021/decon_out/raw_rd_OG316_raw_decon_out/"

# Per-run metadata is read from <data_read_path><run><data_read_suffix>_metadata.csv
data_read_suffix = "_PCH_rd_OG316"
data_read_path = "../../../../cell_painting_data_lock/4_CS_OBJECTS_median_aggregated/"

# Output folder name. NOTE(review): not referenced by the cells visible here,
# which write their CSVs to the working directory - confirm whether it is still needed.
save_path = "data_tables/"

### GT

In [8]:
# Load in the set of 316 drugs used in the screens (taken from the perturbation
# column of one of the CS Mahalanobis tables)
cs_drugs = pd.read_csv("../mahalanobis_calcs_on_compressed_15Nov2021/mahala_calculations/CS_run1_2.0x5.0r_randommahala_table.csv").Metadata_perturbation.values

gt_mahala = pd.read_csv("../../../../BEM/Manu_analysis/2_OG316_analysis/GT316_mahala_empirical_table.csv")

# One drug was annotated under two spellings; rewrite the GT spelling so that it
# matches the CS tables. Series.replace is used instead of writing through
# `.values`: that array is not guaranteed to be a writable view of the column
# (it is read-only under pandas Copy-on-Write), so the old in-place edit could
# raise or be silently lost.
gt_mahala['Metadata_perturbation'] = gt_mahala['Metadata_perturbation'].replace({
    'Methyldopa Sesquihydrate (L-A-Methyl-Dopa Sesquihyrate)':
        'Methyldopa Sesquihydrate (L-_-Methyl-Dopa Sesquihyrate)',
})

# Filter down to OG 316 and remove the DMSO wells. The explicit copy makes
# gt_mahala an independent frame rather than a slice of the raw table.
keep_rows = gt_mahala.Metadata_perturbation.isin(cs_drugs) & (gt_mahala.Metadata_perturbation != "DMSO")
gt_mahala = gt_mahala.loc[keep_rows].copy()

# Rename columns where necessary. The assignment is positional, so it requires
# the GT table to have exactly these three columns in this order.
gt_mahala.columns = ['Metadata_perturbation', 'gt_mahalanobis', 'gt_signif_mahala']

# Index by drug name (sorted) so the table can be aligned with the CS tables on the index
gt_mahala.index = gt_mahala.Metadata_perturbation.values
gt_mahala.sort_index(inplace=True)


### CS

In [9]:
bootstrap_percentile = 5  # the 5th percentile of the resampled Mahalanobis values is used as the "adjusted Mahalanobis"

# Folder holding the per-screen Mahalanobis tables and resampled-distance arrays
mahala_read_path = "../mahalanobis_calcs_on_compressed_15Nov2021/mahala_calculations/"

# One table per (run, compression, replicates, scheme) screen; concatenated once after the loops
screen_tables = []

for run in ['CS_run1', 'CS_run2', 'CS_run3']:

    metadata = pd.read_csv(data_read_path + run + data_read_suffix + "_metadata.csv", index_col=0)

    # Lay out all of the compression schemes present in this run, keep only the
    # random pooling schemes, and save the listing next to the notebook.
    # NOTE(review): the grouping includes Metadata_Plate, so a scheme that spans
    # several plates yields several rows here and is loaded once per row below -
    # confirm whether that duplication is intended.
    compression_methods = metadata.groupby(['Metadata_compression', 'Metadata_replicates', 'Metadata_perturbation', 'Metadata_Plate']).size().reset_index().rename(columns={0: ''})
    compression_methods = compression_methods.loc[np.isin(compression_methods['Metadata_perturbation'], ['random', 'random1', 'random2'])]
    compression_methods.to_csv(run + "_compression_methods.csv", index=False)

    for compression, replicates, scheme in zip(compression_methods['Metadata_compression'],
                                               compression_methods['Metadata_replicates'],
                                               compression_methods['Metadata_perturbation']):

        # File naming differs for run 3's 32x screen: the Mahalanobis outputs use
        # "32x5r" while the regression outputs (and the stored metadata values)
        # use 32.0 / 5.0. Replicates are forced to 5 for that screen.
        if run == 'CS_run3' and compression == 32.0:
            compression, replicates = 32.0, 5.0
            mahala_screen_tag = "32x5r"
        else:
            mahala_screen_tag = str(compression) + "x" + str(replicates) + "r"
        mahala_prefix = mahala_read_path + run + "_" + mahala_screen_tag + "_" + scheme

        # CS Mahalanobis distance and significance call per perturbation (first three columns only)
        cs_mahala = pd.read_csv(mahala_prefix + "mahala_table_DMSO_1e7_samples_17Nov2021_empiricalcov.csv", index_col=0)
        cs_mahala = cs_mahala.iloc[:, 0:3]
        cs_mahala.columns = ['Metadata_perturbation', 'cs_mahalanobis', 'cs_signif_mahala']
        # Copy so the bootstrap column below is added to an independent frame, not a slice
        cs_mahala = cs_mahala[cs_mahala.Metadata_perturbation != "DMSO"].copy()
        cs_mahala.index = cs_mahala.Metadata_perturbation.values

        # Bootstrapped Mahalanobis: per-row percentile of the resampled distances.
        # Assumes a 2-D array (perturbation x resample) whose rows are in the same
        # order as the non-DMSO rows of the table above - TODO confirm; the
        # assignment is positional and only the length is checked.
        bootstrap_values = np.load(mahala_prefix + "mahalas_resampled_DMSO_1e7_samples_17Nov2021_empiricalcov.npy")
        mahalas_boot = np.percentile(bootstrap_values, bootstrap_percentile, axis=1)
        cs_mahala['cs_mahalanobis_bootstrap'] = mahalas_boot

        # Regression deconvolution: summarise each perturbation by the L1 norm of its coefficients
        model_coef = pd.read_csv(regression_read_path + run + "_" + str(compression) + "x_" + str(replicates) + "r" + "_" + scheme + "_permute_model_coef.csv", index_col=0)
        model_coef.index.rename('Metadata_perturbation', inplace=True)
        model_coef = model_coef.loc[model_coef.index.values != "DMSO"].copy()
        model_coef.sort_index(inplace=True)
        model_coef['regression_l1_norm'] = np.abs(model_coef).sum(1)

        # Align GT, CS Mahalanobis and regression L1 on the perturbation index
        # (outer join), then tag the rows with the screen they came from.
        screen_table = pd.concat([gt_mahala, cs_mahala, model_coef.regression_l1_norm], axis=1, sort=True)
        screen_table['Metadata_run'] = run
        screen_table['Metadata_compression'] = compression
        screen_table['Metadata_replicates'] = replicates
        screen_table['Metadata_optimization'] = scheme
        screen_tables.append(screen_table)

if not screen_tables:
    raise ValueError("No random compression schemes were found in the metadata of any run; nothing to collate.")

# Single concatenation of all screens (replaces growing the table inside the loop)
mahalanobis_regression_comparison = pd.concat(screen_tables)

# Perturbation names are already the index; drop the duplicated name columns from the GT and CS tables
mahalanobis_regression_comparison.drop(labels=["Metadata_perturbation"], axis=1, inplace=True)
mahalanobis_regression_comparison.to_csv("mahalanobis_plusMinus_bootstrap_and_regressionL1_empiricalcov_with_permute_raw_reg.csv")



In [None]:
# Pearson correlation between the GT Mahalanobis distance and each CS-derived
# effect measure, computed separately for every screen
# (run x compression x replicates x pooling scheme).
screen_keys = ['Metadata_run', 'Metadata_compression', 'Metadata_replicates', 'Metadata_optimization']
screens = mahalanobis_regression_comparison.groupby(screen_keys)

# One summary row per screen; the unnamed column holds the number of rows in that screen
all_compression_methods = screens.size().reset_index().rename(columns={0: ''})

# The table built earlier in this notebook has no 'no_permute_regression_l1_norm'
# column, so reading it unconditionally raised AttributeError. That correlation
# is now computed only when the column is actually present.
has_no_permute = 'no_permute_regression_l1_norm' in mahalanobis_regression_comparison.columns
if not has_no_permute:
    print("Column 'no_permute_regression_l1_norm' not found; skipping the no-permute regression correlation.")

gt_cs_naive_mahala_cor = []
gt_cs_boot_mahala_cor = []
gt_mahala_cs_reg_corr = []
gt_mahala_cs_no_permute_corr = []

# Iterating the groupby visits the screens in the same (sorted-key) order as the
# rows of all_compression_methods, so the lists line up with the summary rows.
# NOTE(review): pearsonr is not NaN-aware; a perturbation missing from any of the
# joined tables propagates into that screen's correlation.
for _, data in screens:
    gt_cs_naive_mahala_cor.append(pearsonr(data.gt_mahalanobis, data.cs_mahalanobis)[0])
    gt_cs_boot_mahala_cor.append(pearsonr(data.gt_mahalanobis, data.cs_mahalanobis_bootstrap)[0])
    gt_mahala_cs_reg_corr.append(pearsonr(data.gt_mahalanobis, data.regression_l1_norm)[0])
    if has_no_permute:
        gt_mahala_cs_no_permute_corr.append(pearsonr(data.gt_mahalanobis, data.no_permute_regression_l1_norm)[0])

all_compression_methods['gt_mahala_vs_cs_mahala_pearson'] = gt_cs_naive_mahala_cor
all_compression_methods['gt_mahala_vs_cs_boot_mahala_pearson'] = gt_cs_boot_mahala_cor
all_compression_methods['gt_mahala_vs_cs_regression_l1_norm'] = gt_mahala_cs_reg_corr
if has_no_permute:
    all_compression_methods['gt_mahala_vs_cs_no_permute_regression_l1_norm'] = gt_mahala_cs_no_permute_corr
all_compression_methods.to_csv("CS_all_pearsons_gt_mahala_vs_mahala_mahalaBoot_regL1_empiricalcov_with_permute_raw_reg.csv", index=False)