In [None]:
from pathlib import Path

# Resolve the project root once; every data path below is built from it.
try:
    # Probe only: plain scripts define __file__, notebooks and REPLs do not.
    __file__
except NameError:
    # Interactive session: use the parent of the current working directory.
    base_dir = Path.cwd().resolve().parent
else:
    # Script run: go two levels up from this file (the parent of its directory).
    base_dir = Path(__file__).resolve().parent.parent

import pandas as pd

from read_fs_tsv import read_fs_tsv
from label_hits import label_hits
from pre_rec import pre_rec

In [None]:
# "fs" hit table: build its path under the project root, then parse it.
fs_cut_path = f"{base_dir}/data/alis/tb_pfam_fs_exh.tsv"
fs_cut_df = read_fs_tsv(fs_cut_path)

# "cif" hit table: same reader, different input file.
cif_cut_path = f"{base_dir}/data/alis/tb_pfam_cif_exh.tsv"
cif_cut_df = read_fs_tsv(cif_cut_path)

In [None]:
# Read the headerless, tab-separated prediction table and name its six columns.
ipr_preds = pd.read_csv(
    f"{base_dir}/data/tb_ipr.tsv",
    sep="\t",
    header=None,
    names=["unip_id", "ipr_id", "description", "db_id", "qstart", "qend"],
)

# Keep only rows whose database id starts with "PF" (presumably Pfam
# accessions -- TODO confirm). na=False makes a missing db_id count as
# "no match"; without it the mask contains NA and boolean indexing raises.
pf_preds = (
    ipr_preds[ipr_preds['db_id'].str.startswith('PF', na=False)]
    # Downstream cells refer to the family id as "pred_fam".
    .rename(columns={"db_id": "pred_fam"})
)

In [None]:
# Label both hit tables against the same set of PF predictions
# (fs first, then cif -- same call order as writing the two calls out).
fs_cut_labeled, cif_cut_labeled = (
    label_hits(hits_df, pf_preds) for hits_df in (fs_cut_df, cif_cut_df)
)

In [None]:
# Print the pre_rec summary for each labeled table: fs first, then cif.
for labeled_hits in (fs_cut_labeled, cif_cut_labeled):
    print(pre_rec(labeled_hits))

In [None]:
# Columns used as the join key when comparing the two hit tables.
dom_desc_cols = ["unip_id", "pred_fam_gs", "qstart_gs", "qend_gs"]

# Restrict each labeled table to the rows whose label equals 1.
fs_cut_perf = fs_cut_labeled.loc[fs_cut_labeled["label"].eq(1)]
cif_cut_perf = cif_cut_labeled.loc[cif_cut_labeled["label"].eq(1)]

In [None]:
# Outer-join the label-1 rows of both tables on the domain key columns.
# Overlapping non-key columns get a _cif / _fs suffix, and the merge
# indicator is stored in a column named "origin".
fs_cif_comp = pd.merge(
    cif_cut_perf,
    fs_cut_perf,
    how="outer",
    on=dom_desc_cols,
    suffixes=('_cif', '_fs'),
    indicator='origin',
)

# Translate pandas' indicator values into the table each row came from
# (cif is the left frame, fs is the right frame).
fs_cif_comp = fs_cif_comp.assign(
    origin=fs_cif_comp['origin'].map({
        'left_only': 'cif',
        'right_only': 'fs',
        'both': 'both'
    })
)

In [None]:
# Rows whose key was found only in the fs table (origin == 'fs').
fs_better = fs_cif_comp.loc[fs_cif_comp['origin'].eq('fs')]

In [None]:
# Pull the full fs rows for every domain key that appears only in fs.
# The key rows are deduplicated first: if the same key occurs k times in
# fs_better, merging it back would repeat every matching fs_cut_labeled row
# k times and inflate the value_counts() summaries computed from this frame.
fs_better_full_data = (
    fs_better[dom_desc_cols]
    .drop_duplicates()
    .merge(fs_cut_labeled, on=dom_desc_cols)
)

In [None]:
# Frequency of each "description" value among the fs-only rows.
fs_better_full_data.loc[:, ["description"]].value_counts()

In [None]:
# Number of fs-only rows per "unip_id".
fs_better_full_data.loc[:, ["unip_id"]].value_counts()

In [None]:
# Number of fs-only rows per "pred_fam_ali" value.
fs_better_full_data.loc[:, ["pred_fam_ali"]].value_counts()

In [None]:
# unip_id values of the fs-only rows whose pred_fam_ali is PF17862.
unip_id_pf17862 = fs_better_full_data.loc[
    fs_better_full_data["pred_fam_ali"] == "PF17862", "unip_id"
]

In [None]:
# All labeled cif rows for the unip_ids selected via PF17862.
cif_cut_labeled.loc[cif_cut_labeled["unip_id"].isin(unip_id_pf17862)]

In [None]:
# All labeled fs rows for the same unip_ids, for side-by-side comparison
# with the cif rows.
fs_cut_labeled.loc[fs_cut_labeled["unip_id"].isin(unip_id_pf17862)]