# rescue-experiment-comparisons
1.18.22

So the full workflow is this: 

1. Run `rescue-experiment-optimized.py` to generate optimized reconstructed matrices with NMF, neural net and kNN. This will write optimized reconstructions to csvs. It will also write small "tester" matrices to csvs that you can use for a much faster test. 

2. Run `calculate-loq-runner.sh`. This is a shell script that will run `calculate-loq.py` on the reconstructed matrices from each imputation method. 

3. Run this notebook to get the "quantifiable" peptides for each reconstruction, test or full. 

Trying to generate UpSet plots to do comparisons across all five imputation methods. 

In [1]:
import pandas as pd
import numpy as np
import upsetplot
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn3
import seaborn as sns

# plotting templates
sns.set(context="talk", style="ticks") 
pal = sns.color_palette()

#### Get the "quantifiable" peptides for the original matrix

In [2]:
fom_orig = pd.read_csv("out/fom-orig-MCAR.csv")

# find the rows where LOQ == True
q_bool = [fom_orig["LOQ"] != np.inf]
# subset the original output dataframe
fom_orig_quant = fom_orig.loc[np.array(q_bool)[0]]
# reset the indices on the figures of merit quantitative dataframe
fom_orig_quant = fom_orig_quant.reset_index(drop=True)

#### Get the "quantifiable" peptides for the NMF imputed matrix

In [3]:
fom_nmf = pd.read_csv("out/fom-NMF-recon-MCAR.csv")

# find the rows where LOQ == True
q_bool = [fom_nmf["LOQ"] != np.inf]
# subset the original output dataframe
fom_nmf_quant = fom_nmf.loc[np.array(q_bool)[0]]
# reset the indices on the figures of merit quantitative dataframe
fom_nmf_quant = fom_nmf_quant.reset_index(drop=True)

#### Get the "quantifiable" peptides in the kNN imputed matrix

In [4]:
fom_knn = pd.read_csv("out/fom-KNN-recon-MCAR.csv")

# find the rows where LOQ == True
q_bool = [fom_knn["LOQ"] != np.inf]
# subset the original output dataframe
fom_knn_quant = fom_knn.loc[np.array(q_bool)[0]]
# reset the indices on the figures of merit quantitative dataframe
fom_knn_quant = fom_knn_quant.reset_index(drop=True)

#### Get the "quantifiable" peptides in the missForest imputed matrix

In [5]:
fom_mf = pd.read_csv("out/fom-mf-recon-MCAR.csv")

# find the rows where LOQ == True
q_bool = [fom_mf["LOQ"] != np.inf]
# subset the original output dataframe
fom_mf_quant = fom_mf.loc[np.array(q_bool)[0]]
# reset the indices on the figures of merit quantitative dataframe
fom_mf_quant = fom_mf_quant.reset_index(drop=True)

#### Get the "quantifiable" peptides in the sample min imputed matrix

In [6]:
fom_min = pd.read_csv("out/fom-min-recon-MCAR.csv")

# find the rows where LOQ == True
q_bool = [fom_min["LOQ"] != np.inf]
# subset the original output dataframe
fom_min_quant = fom_min.loc[np.array(q_bool)[0]]
# reset the indices on the figures of merit quantitative dataframe
fom_min_quant = fom_min_quant.reset_index(drop=True)

#### Get the "quantifiable" peptides in the Gaussian random draw imputed matrix

In [7]:
fom_std = pd.read_csv("out/fom-std-recon-MCAR.csv")

# find the rows where LOQ == True
q_bool = [fom_std["LOQ"] != np.inf]
# subset the original output dataframe
fom_std_quant = fom_std.loc[np.array(q_bool)[0]]
# reset the indices on the figures of merit quantitative dataframe
fom_std_quant = fom_std_quant.reset_index(drop=True)

#### Compare

In [8]:
orig_nquant = fom_orig_quant.shape[0]
nmf_recon_nquant = fom_nmf_quant.shape[0]
knn_recon_nquant = fom_knn_quant.shape[0]
mf_recon_nquant = fom_mf_quant.shape[0]
min_recon_nquant = fom_min_quant.shape[0]
std_recon_nquant = fom_std_quant.shape[0]

print("n quant peptides, original: ", orig_nquant)
print("n quant peptides, NMF impute: ", nmf_recon_nquant)
print("n quant peptides, kNN impute: ", knn_recon_nquant)
print("n quant peptides, missForest impute: ", mf_recon_nquant)
print("n quant peptides, sample min impute: ", min_recon_nquant)
print("n quant peptides, random sample impute: ", std_recon_nquant)

n quant peptides, original:  7707
n quant peptides, NMF impute:  4723
n quant peptides, kNN impute:  3505
n quant peptides, missForest impute:  10475
n quant peptides, sample min impute:  1487
n quant peptides, random sample impute:  1424


***

## Create a dataframe where rows are imputation methods and columns are the 300 peptides
1 indicates the peptide was quantitative, 0 not.

#### Init dataframe

In [11]:
all_pids = list(fom_orig["peptide"])
all_pids.insert(0, "method")

upset_df = pd.DataFrame(np.zeros((6,21876)), columns=all_pids)
upset_df["method"] = ["orig", "NMF", "kNN", "mf", "min", "std"]

upset_df.shape

(6, 21876)

#### Get the lists of quantitative peptides

In [12]:
orig_qpids = list(fom_orig_quant["peptide"])
nmf_qpids = list(fom_nmf_quant["peptide"])
knn_qpids = list(fom_knn_quant["peptide"])
mf_qpids = list(fom_mf_quant["peptide"])
min_qpids = list(fom_min_quant["peptide"])
std_qpids = list(fom_std_quant["peptide"])

#### Fill in the upset plot dataframe

In [13]:
for pid in upset_df.columns:
    if pid in orig_qpids:
        upset_df.loc[0, pid] = 1
    if pid in nmf_qpids:
        upset_df.loc[1, pid] = 1
    if pid in knn_qpids:
        upset_df.loc[2, pid] = 1
    if pid in mf_qpids:
        upset_df.loc[3, pid] = 1
    if pid in min_qpids:
        upset_df.loc[4, pid] = 1
    if pid in std_qpids:
        upset_df.loc[5, pid] = 1

In [14]:
upset_df

Unnamed: 0,method,AAAALAGGKK,AAAAQDEITGDGTTTVVC[+57.0214635]LVGELLR,AAADALSDLEIK,AAAEGVANLHLDEATGEMVSK,AAAEYEKGEYETAISTLNDAVEQGR,AAAGSVLLEDC[+57.0214635]K,AAALVYPGSETFIVWGHVGLDEVSPIGK,AAANHTPPDMTNMDTR,AAAPGIQLVAGEGFQSPLEDR,...,VILNQAFIER,VILPIASMFVK,VILPTQNMDGTIAK,VILQILNYIR,VILTQVGSGPQETNESLIDAK,VINAITGGVATDIADK,VINALDYDIIAAESHTISQAVR,VINATPTMVIPPLILVR,VINDAFGIEEGLMTTVHSLTATQK,VINDAFGIEEGLMTTVHSMTATQK
0,orig,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,NMF,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,kNN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,mf,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,std,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Transpose, bc I think this is what `UpsetR` wants

In [15]:
upset_T = upset_df.T
upset_T.columns = ["orig", "NMF", "kNN", "mf", "min", "std"]
upset_T = upset_T.drop(["method"])
upset_T

Unnamed: 0,orig,NMF,kNN,mf,min,std
AAAALAGGKK,0.0,0.0,0.0,0.0,0.0,0.0
AAAAQDEITGDGTTTVVC[+57.0214635]LVGELLR,1.0,1.0,0.0,1.0,0.0,0.0
AAADALSDLEIK,1.0,0.0,0.0,1.0,0.0,0.0
AAAEGVANLHLDEATGEMVSK,1.0,0.0,0.0,1.0,0.0,0.0
AAAEYEKGEYETAISTLNDAVEQGR,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...
VINAITGGVATDIADK,0.0,0.0,0.0,1.0,0.0,0.0
VINALDYDIIAAESHTISQAVR,0.0,0.0,0.0,0.0,0.0,0.0
VINATPTMVIPPLILVR,1.0,0.0,0.0,1.0,0.0,0.0
VINDAFGIEEGLMTTVHSLTATQK,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
upset_T.sum()

orig     7707.0
NMF      4723.0
kNN      3505.0
mf      10475.0
min      1487.0
std      1424.0
dtype: object

#### Write to csv

In [17]:
#upset_T.to_csv("upset_df_tester.csv", index=False)
upset_T.to_csv("upset_df_full.csv", index=False)