In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as pl

## Analysis DMSO

In [None]:
## Load the peptide-level table (tab-separated text file)
df_results = pd.read_table("./peptides.txt")

## Remove decoy sequences and contaminants (rows flagged with "+")
df_results_filter_cont = df_results.loc[
    (df_results["Reverse"] != "+") & (df_results["Potential contaminant"] != "+")
]

## Remove false heavy signal: discard peptides whose heavy intensity is already > 0 at t0
## (rows where that intensity is missing are kept; .copy() so the columns added below
## are written to an independent frame)
df_results_remove_H0 = df_results_filter_cont.loc[
    ~(df_results_filter_cont["Intensity H DMSO_t0"] > 0)
].copy()

## Compute %L = 1/(1 + H/L), i.e. the light fraction L/(L+H)
df_results_remove_H0["percent_DMSO_L2"] = 1 / (1 + df_results_remove_H0["Ratio H/L DMSO_t2"])
df_results_remove_H0["percent_DMSO_L6"] = 1 / (1 + df_results_remove_H0["Ratio H/L DMSO_t6"])

## Compute log(%L)
df_results_remove_H0["percent_DMSO_L2_log"] = np.log(df_results_remove_H0["percent_DMSO_L2"])
df_results_remove_H0["percent_DMSO_L6_log"] = np.log(df_results_remove_H0["percent_DMSO_L6"])

## Fit the data: for every protein group, regress log(%L) on time through the origin,
## pooling all peptides of the group. The model is log(%L) = -k * t, so k = -slope.
dmso_time_points = [0, 2, 6]  # presumably hours (later axis labels use h-1) -- confirm
dmso_log_columns = ["percent_DMSO_L2_log", "percent_DMSO_L6_log"]

dmso_fit_records = []
## sort=False keeps the groups in order of first appearance in the table
for group_id, sub_df in df_results_remove_H0.groupby("Protein group IDs", sort=False):
    ## Only peptides quantified at both time points contribute to the fit
    complete_logs = sub_df[dmso_log_columns].dropna()
    n_peptides = len(complete_logs)
    if n_peptides == 0:
        continue

    ## Each peptide contributes the points (0, 0), (2, log %L_2), (6, log %L_6);
    ## log(%L) is 0 at t = 0 because no heavy label is present yet (%L = 1)
    X = np.tile(dmso_time_points, n_peptides).reshape(-1, 1)
    y = np.column_stack(
        [np.zeros(n_peptides), complete_logs.to_numpy(dtype=float)]
    ).reshape(-1, 1)

    reg = LinearRegression(fit_intercept=False).fit(X, y)
    dmso_fit_records.append(
        {
            "Protein_group_ID": group_id,
            "k_computed": -reg.coef_[0, 0],
            "r_sq": reg.score(X, y),
        }
    )

## Build the result table once from the collected records
## (the explicit column list keeps the layout even when nothing could be fitted)
df_results_analysed = pd.DataFrame(
    dmso_fit_records, columns=["Protein_group_ID", "k_computed", "r_sq"]
).astype({"k_computed": "float64", "r_sq": "float64"})

## Save the data
df_results_analysed.to_csv("./res_fit_DMSO_no_cont.csv")

## Analysis CHX

In [None]:
## Load the peptide-level table (tab-separated text file)
df_results = pd.read_table("./peptides.txt")

## Remove decoy sequences and contaminants (rows flagged with "+")
df_results_filter_cont = df_results.loc[
    (df_results["Reverse"] != "+") & (df_results["Potential contaminant"] != "+")
]

## Remove false heavy signal: discard peptides whose heavy intensity is already > 0 at t0
## (rows where that intensity is missing are kept; .copy() so the columns added below
## are written to an independent frame)
df_results_remove_H0 = df_results_filter_cont.loc[
    ~(df_results_filter_cont["Intensity H CHX_t0"] > 0)
].copy()

## Compute %L = 1/(1 + H/L), i.e. the light fraction L/(L+H)
df_results_remove_H0["percent_CHX_L2"] = 1 / (1 + df_results_remove_H0["Ratio H/L CHX_t2"])
df_results_remove_H0["percent_CHX_L6"] = 1 / (1 + df_results_remove_H0["Ratio H/L CHX_t6"])

## Compute log(%L)
df_results_remove_H0["percent_CHX_L2_log"] = np.log(df_results_remove_H0["percent_CHX_L2"])
df_results_remove_H0["percent_CHX_L6_log"] = np.log(df_results_remove_H0["percent_CHX_L6"])

## Fit the data: for every protein group, regress log(%L) on time through the origin,
## pooling all peptides of the group. The model is log(%L) = -k * t, so k = -slope.
chx_time_points = [0, 2, 6]  # presumably hours (later axis labels use h-1) -- confirm
chx_log_columns = ["percent_CHX_L2_log", "percent_CHX_L6_log"]

chx_fit_records = []
## sort=False keeps the groups in order of first appearance in the table
for group_id, sub_df in df_results_remove_H0.groupby("Protein group IDs", sort=False):
    ## Only peptides quantified at both time points contribute to the fit
    complete_logs = sub_df[chx_log_columns].dropna()
    n_peptides = len(complete_logs)
    if n_peptides == 0:
        continue

    ## Each peptide contributes the points (0, 0), (2, log %L_2), (6, log %L_6);
    ## log(%L) is 0 at t = 0 because no heavy label is present yet (%L = 1)
    X = np.tile(chx_time_points, n_peptides).reshape(-1, 1)
    y = np.column_stack(
        [np.zeros(n_peptides), complete_logs.to_numpy(dtype=float)]
    ).reshape(-1, 1)

    reg = LinearRegression(fit_intercept=False).fit(X, y)
    chx_fit_records.append(
        {
            "Protein_group_ID": group_id,
            "k_computed": -reg.coef_[0, 0],
            "r_sq": reg.score(X, y),
        }
    )

## Build the result table once from the collected records
## (the explicit column list keeps the layout even when nothing could be fitted)
df_results_analysed = pd.DataFrame(
    chx_fit_records, columns=["Protein_group_ID", "k_computed", "r_sq"]
).astype({"k_computed": "float64", "r_sq": "float64"})

## Save the data
df_results_analysed.to_csv("./res_fit_CHX_nocont.csv")

## Plot unique protein_IDs

In [None]:
def _load_per_protein_half_lives(csv_path):
    """Load a saved fit table and reduce it to one row per unique protein group.

    Steps (each intermediate count is printed, in this order):
      1. read ``csv_path`` and print the number of fitted groups;
      2. discard fits with ``r_sq`` < 0.9 and print how many remain;
      3. print the percentage of remaining groups whose ID lists several
         protein groups (IDs joined by ";"), then discard those groups;
      4. cast the remaining IDs to int64, print the number of unique IDs and
         average all numeric columns per ID;
      5. add the half-life ``HL = ln(2) / k_computed`` and print how many
         half-lives are negative.

    Parameters
    ----------
    csv_path : str
        Path of a CSV with at least the columns ``Protein_group_ID``,
        ``k_computed`` and ``r_sq``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Protein_group_ID``; holds the per-ID means of the CSV
        columns plus the ``HL`` column.
    """
    fits = pd.read_csv(csv_path)
    print(len(fits))

    fits = fits.astype({"r_sq": "float64", "k_computed": "float64"})
    ## Written as "not (< 0.9)" so rows with a missing r_sq are kept rather than dropped
    good_fits = fits.loc[~(fits["r_sq"] < 0.9)]
    print(len(good_fits))

    ## Compare on the string form so the test also works when the ID column
    ## was parsed as integers (no ";" present anywhere in the file)
    is_shared = good_fits["Protein_group_ID"].astype(str).str.contains(";", regex=False)
    print(100*np.sum(is_shared.to_numpy())/len(good_fits))

    unique_fits = good_fits.loc[~is_shared].copy()
    unique_fits["Protein_group_ID"] = unique_fits["Protein_group_ID"].astype("int64")
    print("%s unique prot_id!"%len(unique_fits["Protein_group_ID"].unique()))

    per_protein = unique_fits.groupby("Protein_group_ID").mean()
    per_protein["HL"] = np.log(2)/per_protein["k_computed"]
    print(np.sum(per_protein["HL"]<0))
    return per_protein


def _plot_half_life_density(per_protein, label):
    """Add a density histogram of ``per_protein["HL"]`` to the current figure.

    The histogram uses 100 bins over the range 0-100 and is tagged with
    ``label`` for the legend. The mean and median of the ``HL`` column are
    printed (over all values, not only those inside the plotted range).
    """
    pl.hist(per_protein["HL"], range=(0,100), bins=100, label=label, alpha=0.5, density=True)
    print("Mean = ", np.mean(per_protein["HL"]))
    print("Median = ", np.median(per_protein["HL"]))


## CHX
df_results_analysed_filtered_rsq_drop_duplicate_average_protID_CHX = _load_per_protein_half_lives("./res_fit_CHX_nocont.csv")
_plot_half_life_density(df_results_analysed_filtered_rsq_drop_duplicate_average_protID_CHX, "CHX")

## DMSO
df_results_analysed_filtered_rsq_drop_duplicate_average_protID_DMSO = _load_per_protein_half_lives("./res_fit_DMSO_no_cont.csv")
_plot_half_life_density(df_results_analysed_filtered_rsq_drop_duplicate_average_protID_DMSO, "DMSO")

## Both histograms share the same axes; finish the figure
pl.xlim(0,100)
pl.xlabel("Protein HL (hours)")
pl.ylabel("Density")
pl.legend()
#pl.savefig("./figures/density_CHX_DMSO.pdf")

## Find shared protein_IDs

In [None]:
## Keep only the protein groups present in both conditions: join the two per-protein
## tables on their index (pd.merge defaults to an inner join); columns common to both
## tables get the suffixes _DMSO / _CHX
df_merged = pd.merge(
    df_results_analysed_filtered_rsq_drop_duplicate_average_protID_DMSO,
    df_results_analysed_filtered_rsq_drop_duplicate_average_protID_CHX,
    left_index=True,
    right_index=True,
    suffixes=("_DMSO","_CHX"),
)
print(len(df_merged))

## Per-protein ratio of the two fitted rate constants
df_merged["FC_k"] = df_merged["k_computed_DMSO"]/df_merged["k_computed_CHX"]

## density takes a boolean; a string such as "True" or "False" is always truthy
pl.hist(df_merged["FC_k"], density=True, range=(0,4), bins=50)

pl.xlim(0,4)
pl.xlabel("k_DMSO/k_CHX (per prot)")
pl.ylabel("Density")

In [None]:
## Scatter of the per-protein rate constants of both conditions on log-log axes,
## drawn on the current axes together with the dashed y = x reference line
ax = pl.gca()
diagonal = np.linspace(0, 1, 1000)

ax.scatter(df_merged["k_computed_DMSO"], df_merged["k_computed_CHX"], alpha=0.5)
ax.plot(diagonal, diagonal, color="k", ls="--")

## Limits are set before switching both axes to a log scale
ax.set_xlim(0.01, 1)
ax.set_ylim(0.01, 1)
ax.set_xscale("log")
ax.set_yscale("log")

ax.set_xlabel("k_DMSO (h-1)")
ax.set_ylabel("k_CHX (h-1)")