In [1]:
import os
import subprocess
import time
import sys
import pandas as pd
import random
import matplotlib.pyplot as plt
from collections import defaultdict
import numpy as np
import regex as re

In [2]:
root = "/data/luojaa/eukgen/"
sys.path.append(root)

import altair as alt
from core_functions.altair_plots import plot_alignment, plot_cumsum_counts

#disable altair max rows
alt.data_transformers.disable_max_rows()
#get default altair style'
%run /data/luojaa/eukgen/styling_and_visualisation.py

#dont wrap text output from cells
from IPython.core.display import display, HTML
display(HTML("<style>div.output_area pre {white-space: pre;}</style>"))

  from IPython.core.display import display, HTML


In [30]:
def parse_clean_annot(file):
    try:
        data = pd.read_csv(file, sep=',', index_col=0, names=["protein_acc", "ec1", "ec2", "ec3", "ec4", "ec5", "ec6", "ec7", "ec8", "ec9", "ec10"]).dropna(axis = 1, how = "all")
    except pd.errors.ParserError:
        data = pd.read_csv(file, sep=',', index_col=0, names=["protein_acc", "ec1"])
    data["ec"] = [mappings[0].replace("EC:", "") for mappings in data["ec1"].str.split("/")]
    data["confidence_ec1"] = [float(mappings[1]) for mappings in data["ec1"].str.split("/")]
    return data

In [31]:
# pid2eclabel_df = pd.read_csv("/data/tobiassonva/data/databases/reviewed_EC_numbers/uniprotkb_reviewed_and_published_with_EC.tsv", sep = "\t")
# pid2eclabel_df.rename(columns = {"Entry": "Query", "EC number" : "EC_label"}, inplace = True)

In [32]:
def calc_stats(df, ec, ec_pos):
    ec = ".".join(ec.split(".")[:ec_pos])
    label, pred = df.EC_label.apply(lambda x: ".".join(x.split(".")[:ec_pos])), df.EC.apply(lambda x: ".".join(x.split(".")[:ec_pos]))
    num_members = len(set(df[label == ec].Query))
    TP = len(set(df[(label == pred) & (label == ec)].Query))
    FP = len(set(df[label != ec].Query))
    FN = num_members - TP
#     if TP == 0:
#         return ["0","0"]
#     sensitivity = TP / (TP + FN)
#     precision = TP / (TP + FP)
    return ",".join([ec, str(TP), str(FP), str(FN)])


In [33]:
def calc_stats_all(eval_dfs, fullec):
    stats_nested = [calc_stats(eval_dfs[0], fullec, 1), calc_stats(eval_dfs[1], fullec, 2), calc_stats(eval_dfs[2], fullec, 3), calc_stats(eval_dfs[3], fullec, 4)]
    for stats in stats_nested:
        print(stats)

def calc_correlations(df, ec, ec_pos):
    ec = ".".join(ec.split(".")[:ec_pos])
    label, pred = df.EC_label.apply(lambda x: ".".join(x.split(".")[:ec_pos])), df.EC.apply(lambda x: ".".join(x.split(".")[:ec_pos]))
    num_members = len(set(df[label == ec].Query))
    TP = len(set(df[(label == pred) & (label == ec)].Query))
    FP = len(set(df[label != ec].Query))
    FN = num_members - TP
    conf_avg = np.mean([float(conf) for conf in list(df.confidence)])
    if TP == 0:
        return [ec, 0, 0, conf_avg]
    sensitivity = TP / (TP + FN)
    precision = TP / (TP + FP)
    return [ec, sensitivity, precision, conf_avg]


def calc_correlations_all(eval_dfs, fullec):
    stats_nested = [calc_correlations(eval_dfs[0], fullec, 1), calc_correlations(eval_dfs[1], fullec, 2), calc_correlations(eval_dfs[2], fullec, 3), calc_correlations(eval_dfs[3], fullec, 4)]
    return stats_nested
        


### clean parse

In [34]:
def parse_clean(file):
    try:
        data = pd.read_csv(file, sep=',', index_col=0, names=["protein_acc", "ec1", "ec2", "ec3", "ec4", "ec5", "ec6", "ec7", "ec8", "ec9", "ec10"]).dropna(axis = 1, how = "all")
    except pd.errors.ParserError:
        data = pd.read_csv(file, sep=',', index_col=0, names=["protein_acc", "ec1"])
    data = data[data.ec1.apply(lambda x: float(x.split("/")[1]) > .3)]
    # get list of ecs each protein is annotated by
    rows = [data.loc[ind] for ind in data.index.tolist()]
    data["ecs"] = ["|".join([row.iloc[colind].split("/")[0].replace("EC:", "") for colind in range(sum(~row.isnull()))]) for row in rows]
    data["confidences"] = ["|".join([row.iloc[colind].split("/")[1] for colind in range(sum(~row.isnull()))]) for row in rows]
    
    #top ec summary stats
    data["ec"] = [mappings[0].replace("EC:", "") for mappings in data["ec1"].str.split("/")]
    data["confidence_ec1"] = [float(mappings[1]) for mappings in data["ec1"].str.split("/")]
    return data

In [35]:
clean_outdir = "/data/luojaa/CLEAN_processing/repseq/OUTPUT/"
output_csvs = [f"{clean_outdir}{file}" for file in os.listdir(clean_outdir) if ".csv" in file]

In [36]:
len(output_csvs)

878

In [38]:
clean_output_df

Unnamed: 0,Query,ec1,ec2,ec3,ec4,ec5,ecs,confidences,ec,confidence_ec1,domain
0,XP_017208601.1,EC:2.3.2.31/0.9693,,,,,2.3.2.31,0.9693,2.3.2.31,0.9693,euk72_ep
1,NP_477046.1,EC:3.4.15.1/0.9980,,,,,3.4.15.1,0.9980,3.4.15.1,0.9980,euk72_ep
2,XP_024538558.1,EC:2.7.11.1/0.6839,,,,,2.7.11.1,0.6839,2.7.11.1,0.6839,euk72_ep
3,KAA6374321.1,EC:3.4.22.45/0.4125,,,,,3.4.22.45,0.4125,3.4.22.45,0.4125,euk72_ep
4,XP_006580564.1,EC:2.3.2.27/0.9862,,,,,2.3.2.27,0.9862,2.3.2.27,0.9862,euk72_ep
...,...,...,...,...,...,...,...,...,...,...,...
4188,EP00972P005929,EC:3.5.1.76/0.5325,,,,,3.5.1.76,0.5325,3.5.1.76,0.5325,euk72_ep
4189,EP00855P004141,EC:2.5.1.22/0.9963,,,,,2.5.1.22,0.9963,2.5.1.22,0.9963,euk72_ep
4190,EP00919P011160,EC:6.1.1.7/0.9153,,,,,6.1.1.7,0.9153,6.1.1.7,0.9153,euk72_ep
4191,EP00919P076696,EC:1.13.11.18/0.3297,,,,,1.13.11.18,0.3297,1.13.11.18,0.3297,euk72_ep


In [41]:
# concat all dfs for regrouping
clean_benchmark_dfs = []
for i in range(len(output_csvs)):
    clean_output_df = parse_clean(output_csvs[i]).reset_index().rename(columns = {"predicted_label":"EC", "protein_acc": "Query"})
    clean_output_df["domain"] = [output_csvs[i].split("/")[-1].split(".")[0]]*clean_output_df.shape[0]
    clean_benchmark_dfs.append(clean_output_df[["Query","ecs","ec1","confidences","domain"]])
clean_benchmark_concat = pd.concat(clean_benchmark_dfs).drop_duplicates().dropna()
clean_benchmark_concat["EC"] = clean_benchmark_concat.ecs.str.split("|")
clean_benchmark_concat["confidence"] = clean_benchmark_concat.confidences.str.split("|")
clean_benchmark_concat = clean_benchmark_concat.explode(["EC", "confidence"]).reset_index()
clean_benchmark_concat.drop("index", axis = 1, inplace = True)
clean_benchmark_concat.drop(["ecs","ec1", "confidences"], axis = 1, inplace = True)

In [42]:
clean_benchmark_concat.to_csv("clean_ecpred.repseq.tsv", sep = "\t")

In [43]:
clean_benchmark_concat = pd.read_csv("clean_ecpred.repseq.tsv", sep = "\t")

### proteinfer parse

In [44]:
pinfer_outdir = "/data/luojaa/proteinfer/annot_microcosms/repseq_annots_old/"
output_tsvs = [f"{pinfer_outdir}{file}" for file in os.listdir(pinfer_outdir) if ".tsv" in file]

In [45]:
def parse_proteinfer(file):
    data = pd.read_csv(file, sep='\t', index_col=0)

    data_EC = data[data.predicted_label.str.contains('EC')].reset_index()
    data_EC[['label_type', 'predicted_label']] = [label.split(':') for label in data_EC.predicted_label]
    if len(data_EC) == 0:
        return
    
    # track which rows are essentially duplicates with less detail
    data_EC["next_row_more_detail"] = pd.Series(list(data_EC.predicted_label.apply(lambda x: x.count("-")))) > pd.Series(list(data_EC.predicted_label.apply(lambda x: x.count("-")))[1:] + [5])
    data_EC["next_row_same_query"] = pd.Series(list(data_EC.sequence_name)) == pd.Series(list(data_EC.sequence_name)[1:] + ["last_row_false"])
    data_EC["next_row_same_class"] = pd.Series(list(data_EC.predicted_label.apply(lambda x: x.replace("-", "").replace(".", "")))) == pd.Series(list(data_EC.predicted_label.apply(lambda x: "".join(x.replace("-", "").strip(".").split(".")[:-1])))[1:] + ["last_row_false"])
    
    return data_EC[~(data_EC.next_row_more_detail & data_EC.next_row_same_query & data_EC.next_row_same_class)]

In [46]:
# concat all dfs for regrouping
pinfer_output_dfs = []
for i in range(len(output_tsvs)):
    pinfer_output_df = parse_proteinfer(output_tsvs[i]).rename(columns = {"predicted_label":"EC", "sequence_name":"Query"})[["Query","EC","confidence", "description"]]
    pinfer_output_df["domain"] = [output_tsvs[i].split("/")[-1].split(".")[0]]*pinfer_output_df.shape[0]
    pinfer_output_dfs.append(pinfer_output_df)
pinfer_output_concat = pd.concat(pinfer_output_dfs).drop_duplicates().dropna().reset_index().drop("index", axis = 1)
pinfer_output_concat.to_csv("pinfer_ecpred.repseq.tsv", sep = "\t")

In [54]:
pinfer_output_concat = pd.read_csv("pinfer_ecpred.repseq.tsv", sep = "\t")
pinfer_output_concat.drop("Unnamed: 0", axis = 1, inplace = True)
pinfer_output_concat["EC_level"] = pinfer_output_concat.EC.apply(lambda x: 4 - x.count("-"))

In [55]:
pinfer_fullec = pinfer_output_concat[pinfer_output_concat.EC_level == 4]

In [56]:
ec_descriptions_pinfer = pinfer_fullec.drop_duplicates("EC")[["EC", "description"]].reset_index().drop("index", axis = 1)

In [57]:
ec_descriptions_pinfer.to_csv("/data/luojaa/microcosm_metadata/ec_descriptions_pinfer.tsv", sep = "\t")

In [58]:
ec_descriptions_pinfer = pd.read_csv("/data/luojaa/microcosm_metadata/ec_descriptions_pinfer.tsv", sep = "\t")

### consensus analysis

In [None]:
merged_consensus

In [60]:
clean_consensus = clean_benchmark_concat[["Query", "EC", "confidence", "domain"]].sort_values(["Query", "confidence"], ascending = False).drop_duplicates("Query").rename(columns={"confidence":"clean_confidence"}).set_index("Query")
pinfer_consensus = pinfer_fullec[["Query", "EC", "confidence", "domain"]].sort_values(["Query", "confidence"], ascending = False).drop_duplicates("Query").rename(columns={"confidence":"pinfer_confidence"}).set_index("Query")
merged_consensus = pd.merge(clean_consensus, pinfer_consensus, on = "Query", how = "outer")
merged_consensus.fillna({"EC_x": "0.0.0.0", "EC_y": "0.0.0.0", "clean_confidence": 0, "pinfer_confidence": 0}, inplace = True)
merged_consensus["pinfer_confidence"] = merged_consensus["pinfer_confidence"].astype(float)
merged_consensus["clean_confidence"] = merged_consensus["clean_confidence"].astype(float)
merged_consensus["domain"] = merged_consensus.domain_x.fillna(merged_consensus.domain_y)


In [62]:
df_same = merged_consensus[merged_consensus.EC_x == merged_consensus.EC_y].loc[:,["EC_x", "clean_confidence", "domain"]].rename(columns={"EC_x":"EC", "clean_confidence":"confidence"})
df_same

Unnamed: 0_level_0,EC,confidence,domain
Query,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAA03053.1,2.7.2.8,0.9968,euk72_ep
AAD13944.1,7.1.1.2,0.8899,euk72_ep
AAK38119.1,4.1.1.39,0.4510,euk72_ep
AAP74370.1,2.3.1.199,0.9958,euk72_ep
AAR38861_1,2.1.1.63,0.9752,prok2111_as
...,...,...,...
YP_654087.1,7.1.1.2,0.9779,euk72_ep
YP_654221.1,4.1.1.39,0.9989,euk72_ep
YP_740422.1,7.1.1.2,0.8803,euk72_ep
YP_874516.1,3.6.4.12,0.7892,euk72_ep


In [33]:
df = merged_consensus[merged_consensus.EC_x != merged_consensus.EC_y]

clean_highconf_orwaymoreconf = (df.clean_confidence > .9) | (df.clean_confidence > .3) & (df.pinfer_confidence != 1)
pinfer_second_choice = (df.clean_confidence < .9) & (df.pinfer_confidence == 1)

df_clean_resolve = df[clean_highconf_orwaymoreconf].loc[:,["EC_x", "clean_confidence", "domain"]].rename(columns={"EC_x":"EC", "clean_confidence":"confidence"})
df_pinfer_resolve = df[pinfer_second_choice].loc[:,["EC_y", "pinfer_confidence", "domain"]].rename(columns={"EC_y":"EC", "pinfer_confidence":"confidence"})


In [40]:
df_resolve = pd.concat([df_clean_resolve, df_pinfer_resolve, df_same])
df_resolve.to_csv("repseq.microcosms.tmp.tsv", sep = "\t")

In [3]:
df_resolve = pd.read_csv("repseq.microcosms.tmp.tsv", sep = "\t")

In [22]:
df_resolve[df_resolve.confidence == 1]

Unnamed: 0,Query,EC,confidence,domain
2979934,AAB27397.1,7.1.1.2,1.0,euk72_ep
2979935,AAB94064.1,2.7.7.6,1.0,euk72_ep
2979936,AAF05789.1,1.6.99.3,1.0,euk72_ep
2979937,AAR38857_1,3.6.4.13,1.0,prok2111_as
2979938,AAR38876_1,3.6.4.12,1.0,prok2111_as
...,...,...,...,...
3702819,YP_538765.1,2.7.7.6,1.0,euk72_ep
3702820,YP_539016.2,7.1.1.2,1.0,euk72_ep
3702821,YP_588349.1,2.7.7.6,1.0,euk72_ep
3702822,YP_874512.1,1.11.1.15,1.0,euk72_ep


In [23]:
len(df_resolve)

6171043

# visualize stats

In [37]:
# len(df_resolve_prok), len(df_resolve_euk)

(3806370, 2364673)

In [280]:
# len(df_resolve_prok), len(df_resolve_euk)

(2667224, 2164314)

In [4]:
import seaborn as sns
import matplotlib.pyplot as plt

In [284]:
ec_domains = defaultdict(list)
for i in range(len(df_resolve)):
    ec = df_resolve.EC[i]
    domain = df_resolve.domain[i]
    if domain not in ec_domains[ec]:
        ec_domains[ec] += domain

  ec = df_resolve.EC[i]
  domain = df_resolve.domain[i]


KeyboardInterrupt: 

In [7]:
df_resolve_prok

Unnamed: 0,Query,EC,confidence,domain
7,AAR38860_1,2.1.1.218,0.9660,prok2111_as
8,AAR38871_1,3.1.26.5,0.4996,prok2111_as
9,AAR38877_1,3.1.12.1,0.9816,prok2111_as
10,AAR38879_1,1.8.5.4,0.3467,prok2111_as
11,AAR38900_1,5.6.2.1,0.6151,prok2111_as
...,...,...,...,...
6171025,YP_499302_1,1.17.4.1,0.4623,prok2111_as
6171026,YP_499747_1,2.1.1.228,0.9970,prok2111_as
6171027,YP_500068_1,3.6.1.23,0.9791,prok2111_as
6171028,YP_500550_1,3.6.1.23,0.9844,prok2111_as


In [12]:
df_resolve_prok.EC.value_counts()[df_resolve_prok.EC.value_counts() > 3].index

Index(['2.7.13.3', '3.6.4.12', '3.1.21.4', '2.7.11.1', '2.7.7.65', '5.2.1.8',
       '2.7.7.7', '3.4.16.4', '3.1.4.52', '3.6.4.13',
       ...
       '2.3.1.23', '1.2.1.84', '3.1.3.78', '1.13.11.19', '3.4.24.13',
       '2.4.1.333', '1.4.3.4', '2.4.1.338', '2.5.1.121', '3.1.1.105'],
      dtype='object', name='EC', length=3678)

In [17]:
df_resolve_prok = df_resolve[df_resolve.domain == "prok2111_as"].reset_index()
df_resolve_euk = df_resolve[df_resolve.domain == "euk72_ep"].reset_index()

df_resolve_prok_keep = set(df_resolve_prok.EC.value_counts()[df_resolve_prok.EC.value_counts() > 3].index)
df_resolve_euk_keep = set(df_resolve_euk.EC.value_counts()[df_resolve_euk.EC.value_counts() > 5].index)
ECs_pass_filter = df_resolve_prok_keep.intersection(df_resolve_euk_keep)

df_resolve_prok_f = df_resolve_prok[df_resolve_prok.EC.isin(ECs_pass_filter)]
df_resolve_euk_f = df_resolve_euk[df_resolve_euk.EC.isin(ECs_pass_filter)]

In [18]:
len(ECs_pass_filter), len(df_resolve_euk_keep)

(2440, 3574)

In [19]:
df_resolve_euk_f.EC.value_counts()

EC
2.7.11.1     247823
2.3.2.27     136935
3.2.2.22      50655
3.4.21.97     49341
3.1.3.16      48807
              ...  
2.8.3.10          6
4.2.1.85          6
1.5.1.40          6
6.3.4.6           6
2.3.1.267         6
Name: count, Length: 2440, dtype: int64

In [20]:
plot1, data1 = plot_cumsum_counts(df_resolve_euk_f.EC.value_counts(), title='euk', plot_type='default')
plot2, data2 = plot_cumsum_counts(df_resolve_prok_f.EC.value_counts(), title='prok', plot_type='default')
plot2, data2 = plot_cumsum_counts(pd.concat([df_resolve_euk_f, df_resolve_prok_f]).EC.value_counts(), title='combined', plot_type='default')
all_data = pd.concat([data1, data2])
plot, data = plot_cumsum_counts(all_data, formatted_data=True, title='microcosm size distributions', plot_type='default', x_label = "Number of euk/prok proteins in microcosm")
plot

In [279]:
plot1, data1 = plot_cumsum_counts(df_resolve_euk.EC.value_counts(), title='euk', plot_type='default')
plot2, data2 = plot_cumsum_counts(df_resolve_prok.EC.value_counts(), title='prok', plot_type='default')
all_data = pd.concat([data1, data2])
plot, data = plot_cumsum_counts(all_data, formatted_data=True, title='microcosm size distributions', plot_type='default', x_label = "Number of domain level proteins in microcosm")
plot

In [None]:
plot, data = plot_cumsum_counts(df_resolve_euk.EC.value_counts(), title='repseq microcosm prok members distribution', plot_type='default')
plot