In [None]:
import pandas as pd
import numpy as np
import itertools
from matplotlib import pyplot as plt

In [None]:
# Load the tab-separated result table that the rest of the notebook benchmarks.
# NOTE(review): the path is an empty placeholder and must be filled in before running.
df = pd.read_csv(
    "",
    sep="\t",
    low_memory=False,
)

In [None]:
# Disabled alternative configuration (MaxQuant 1.5.8.3).
# The triple-quoted block below is a bare string literal: Python evaluates it and
# throws it away, so none of the names inside it are actually defined here.
# It only spells out mapper, meta_data, replicate_mapper, species_dict and
# decoy_flag; species_expected_ratio, contaminant_flag and min_count_multispec
# are not part of it.
# NOTE(review): presumably kept so the notebook can be switched to MaxQuant input
# by swapping it with the active configuration that follows — confirm before deleting.
"""
mapper = {
    "Raw file" : "Raw file",
    "Proteins" : "Proteins",
    "Modified sequence" : "Modified sequence",
    "Charge" : "Charge"
}

meta_data = {
    "software" : "MaxQuant",
    "version" : "1.5.8.3",
    "MBR" : True,
    "fasta" : "human_yeast_ecoli.fasta"
}

replicate_mapper = {
 'LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01' : 1,
 'LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02' : 1,
 'LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_03' : 1,
 'LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01' : 2,
 'LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02' : 2,
 'LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03' : 2,
}

species_dict = {
    "YEAST" : "_YEAST",
    "ECOLI" : "_ECOLI",
    "HUMAN" : "_HUMAN"
}

decoy_flag = "+"
"""

# ---------------------------------------------------------------------------
# Active configuration (AlphaPept output)
# ---------------------------------------------------------------------------

# Input column name -> column name used by the rest of the notebook
# (applied with DataFrame.rename).
mapper = {
    "shortname": "Raw file",
    "protein": "Proteins",
    "sequence": "Modified sequence",
    "charge": "Charge",
    "decoy": "Reverse",
    "ms1_int_sum_apex_dn": "Intensity",
}

# Provenance of the analysed result; "software" and "version" also form the
# output file names, and the whole dict is written to the *.meta.csv file.
meta_data = {
    "software": "AlphaPept",
    "version": "0-4-8_default",
    "MBR": True,
    "fasta": "human_yeast_ecoli.fasta",
}

# Raw file name -> replicate/condition id.
replicate_mapper = {
    "LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01": 1,
    "LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02": 1,
    "LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_03": 1,
    "LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01": 2,
    "LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02": 2,
    "LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03": 2,
}

# Species column name -> substring searched for in the "Proteins" column.
species_dict = {
    "YEAST": "_YEAST",
    "ECOLI": "_ECOLI",
    "HUMAN": "_HUMAN",
}

# Expected ratio per species, keyed by "<condition>|<condition>".
species_expected_ratio = {
    "YEAST": {"1|2": 0.5},
    "ECOLI": {"1|2": 1.5},
    "HUMAN": {"1|2": 1.0},
}

# Substring of "Proteins" that marks a contaminant entry.
contaminant_flag = "Cont_"
# Rows whose "Reverse" value equals this flag are removed as decoys.
decoy_flag = True
# Rows matching more than this many species are flagged as MULTI_SPEC.
min_count_multispec = 1

In [None]:
# Harmonise the input column names with the names used throughout the notebook.
# Rebinding instead of inplace=True leaves the previously loaded object untouched.
df = df.rename(columns=mapper)

In [None]:
# Invert replicate_mapper: replicate id -> list of raw file names, in the
# order the raw files are listed in replicate_mapper.
replicate_to_raw = {}
for raw_name, replicate_id in replicate_mapper.items():
    replicate_to_raw.setdefault(replicate_id, []).append(raw_name)

In [None]:
# Drop decoy hits: keep only rows whose "Reverse" value differs from decoy_flag.
# The explicit copy makes the result an independent frame, so the column
# assignments in the following cells do not trigger SettingWithCopyWarning.
df = df[df["Reverse"] != decoy_flag].copy()

In [None]:
# Annotate every row from its protein string and raw file name.
protein_names = df["Proteins"]

# Contaminant marker: protein string contains contaminant_flag.
df["contaminant"] = protein_names.str.contains(contaminant_flag)

# One indicator column per species: protein string contains the species tag.
for species_name, species_tag in species_dict.items():
    df[species_name] = protein_names.str.contains(species_tag)

# A row is multi-species when it matches more than min_count_multispec species.
species_hits = df[list(species_dict)].sum(axis=1)
df["MULTI_SPEC"] = species_hits > min_count_multispec

# Replicate id of the raw file the row came from.
df["replicate"] = df["Raw file"].map(replicate_mapper)

In [None]:
# Append one indicator column per raw file (named after the raw file) so that
# presence of a peptidoform in each run can be counted later.
raw_file_indicators = pd.get_dummies(df["Raw file"])
df = pd.concat([df, raw_file_indicators], axis="columns")

In [None]:
# Keep only rows that are not flagged as matching several species.
# MULTI_SPEC is the boolean result of a comparison, so it can be negated directly.
df = df[~df["MULTI_SPEC"]]

In [None]:
# A peptidoform is the modified sequence followed by the charge (as text).
# assign() returns a new frame, so no .loc workaround is needed to avoid
# SettingWithCopyWarning on the previously filtered frame.
df = df.assign(peptidoform=df["Modified sequence"] + df["Charge"].astype(str))

# Count, per peptidoform, the rows observed in each raw file. Only the raw-file
# indicator columns are summed: aggregating the whole frame would also try to
# "sum" text columns, which is slow and can fail on pandas >= 2.0.
run_columns = list(replicate_mapper)
runs_per_peptidoform = df.groupby("peptidoform")[run_columns].sum()

# The minimum over runs is non-zero only if the peptidoform was seen in every run.
grouped_sum = runs_per_peptidoform.min(axis=1)
allowed_peptidoforms = list(grouped_sum[grouped_sum != 0].index)
filtered_df = df[df["peptidoform"].isin(allowed_peptidoforms)]

In [None]:
# Number of distinct peptidoforms that have at least one row in the first raw
# file listed in replicate_mapper.
first_run = list(replicate_mapper.keys())[0]
seen_in_first_run = filtered_df[first_run] != 0
num_vertical = len(set(filtered_df.loc[seen_in_first_run, "peptidoform"]))

In [None]:
# Mean intensity per (peptidoform, raw file); missing means are replaced by 0.0.
# "Intensity" is selected before aggregating: averaging the whole frame would
# also hit the text columns, which raises TypeError on pandas >= 2.0.
quant_df = (
    filtered_df
    .groupby(["peptidoform", "Raw file"])["Intensity"]
    .mean()
    .fillna(0.0)
)

In [None]:
def get_cv(peptidoforms_replicate, alpha=1e-20):
    """Return the coefficient of variation of the given values, in percent.

    Computed as ``np.std(values) / (np.mean(values) + alpha) * 100``.

    Parameters
    ----------
    peptidoforms_replicate : array-like
        Values to summarise (here: the intensities of one peptidoform across
        the runs of one replicate group).
    alpha : float, optional
        Small constant added to the mean so that an all-zero input does not
        divide by zero.

    Returns
    -------
    float
        Standard deviation relative to the (alpha-shifted) mean, times 100.
    """
    spread = np.std(peptidoforms_replicate)
    centre = np.mean(peptidoforms_replicate) + alpha
    return (spread / centre) * 100

# For every replicate id, take the quantities measured in its raw files and
# compute one CV per peptidoform. Result: replicate id -> Series of CVs.
raw_file_level = quant_df.index.get_level_values("Raw file")

replicate_quant_list = {
    replicate_id: (
        quant_df[raw_file_level.isin(run_names)]
        .groupby(["peptidoform"])
        .apply(get_cv)
    )
    for replicate_id, run_names in replicate_to_raw.items()
}

In [None]:
# Table of CVs: one row per peptidoform, one column per replicate id.
cv_replicate_quant_df = pd.DataFrame(data=replicate_quant_list)

In [None]:
# Build a lookup peptidoform -> {species flags..., "peptidoform": name}.
species_peptidoform = [*species_dict.keys(), "peptidoform"]

# Unique (species flags, peptidoform) combinations, indexed by peptidoform.
peptidoform_to_species = df[species_peptidoform].drop_duplicates()
peptidoform_to_species.index = peptidoform_to_species["peptidoform"]

# Transposing first makes each peptidoform a column, so to_dict() yields one
# inner dict per peptidoform.
# NOTE(review): if a peptidoform occurs with different species flags it appears
# more than once here and only one entry can survive in the dict — confirm that
# this is acceptable.
peptidoform_to_species_dict = peptidoform_to_species.T.to_dict()

In [None]:
# Look up the species record of every peptidoform in the CV table (same order),
# then index the resulting frame by peptidoform.
species_records = [
    peptidoform_to_species_dict[peptidoform_name]
    for peptidoform_name in cv_replicate_quant_df.index
]
species_quant_df = pd.DataFrame(species_records).set_index("peptidoform", drop=True)

In [None]:
# Put the per-replicate CV columns and the species flags side by side,
# aligned on the peptidoform index.
cv_and_species_frames = [cv_replicate_quant_df, species_quant_df]
cv_replicate_quant_species_df = pd.concat(cv_and_species_frames, axis="columns")

In [None]:
# For every species and every pair of condition ids, compute the ratio between
# the two per-condition columns and its absolute deviation (times 100) from the
# expected ratio configured in species_expected_ratio.
#
# NOTE(review): the columns addressed by condition id in
# cv_replicate_quant_species_df are the per-replicate CV values, so `ratio` is a
# ratio of CVs. Confirm that comparing it with species_expected_ratio is intended
# (a ratio of mean intensities may have been meant).

# Sort the condition ids so the "a|b" comparison key is deterministic and matches
# the keys of species_expected_ratio; plain set iteration order is not guaranteed.
condition_ids = sorted(set(replicate_mapper.values()))

# Column name -> list of per-species Series; concatenated once at the end
# instead of re-concatenating on every iteration.
ratio_parts = {}
for species in species_dict.keys():
    species_df_slice = cv_replicate_quant_species_df[cv_replicate_quant_species_df[species] == True]
    for conditions in itertools.combinations(condition_ids, 2):
        condition_comp_id = "|".join(map(str, conditions))

        ratio = species_df_slice[conditions[0]] / species_df_slice[conditions[1]]
        ratio_diff = abs(ratio - species_expected_ratio[species][condition_comp_id]) * 100

        ratio_parts.setdefault(condition_comp_id + "_ratio", []).append(ratio)
        ratio_parts.setdefault(condition_comp_id + "_expected_ratio_diff", []).append(ratio_diff)

# Both columns of a comparison are assembled in the same species order.
ratio_dict = {column: pd.concat(parts) for column, parts in ratio_parts.items()}
ratio_df = pd.DataFrame(ratio_dict)

In [None]:
# Final table: CV columns, species flags and ratio columns, aligned on peptidoform.
performance_frames = [cv_replicate_quant_species_df, ratio_df]
result_performance = pd.concat(performance_frames, axis="columns")

In [None]:
# Write the result table and the run metadata next to each other; both file
# names are derived from the software name and version in meta_data.
results_path = f"{meta_data['software']}-{meta_data['version']}.csv"
meta_path = f"{meta_data['software']}-{meta_data['version']}.meta.csv"

result_performance.to_csv(results_path)
pd.Series(meta_data).to_csv(meta_path)