In [None]:
import pandas as pd
import re
from datetime import datetime

# Local-time timestamp (YYYYMMDD_HHMMSS) used to give the output file a
# distinct name on every run.
rightnow = datetime.now().strftime("%Y%m%d_%H%M%S")

# ----------------------------
# Load AsCOG definitions
# ----------------------------
# The table has no header row, so column names are supplied here.
# NOTE(review): the third column appears to hold single-letter category codes
# and the fourth the gene name -- confirm against the definition file.
ascogs = pd.read_csv(
    "/home/anirudh/synteny/hmms/Sources/asCOGs.2020-10_modified.def.tab",
    sep=None,           # let pandas sniff the delimiter
    engine="python",    # sep=None is only supported by the python engine;
                        # naming it avoids the ParserWarning fallback
    header=None,
    names=["ascog_id", "arcog_id", "category", "gene", "description"],
    encoding="latin1"   # safer for NCBI-style tables
)


# ----------------------------
# Search terms (case-insensitive)
# ----------------------------
# Each entry is a regular-expression fragment, not an escaped literal: the
# entries are joined unescaped into one alternation and matched as
# case-insensitive substrings. Short terms such as "eap" or "vps" therefore
# also hit any longer word that contains them.
search_terms = [
    "escrt",
    "vps",
    "ubiquitin",
    "steadiness",
    # Correct spelling is "pleckstrin"; the optional "c" keeps the previously
    # used "plekstrin" spelling matching as well.
    "plec?kstrin",
    "eap",
    "MPN",
    "ALIX",
    "UFM",
]

# Build regex pattern: a single alternation over all terms.
pattern = re.compile("|".join(search_terms), re.IGNORECASE)

# ----------------------------
# Keep rows whose gene name OR
# description matches a search term
# ----------------------------
# Both columns are cast to str first so every cell is searched as text.
gene_hits = ascogs["gene"].astype(str).str.contains(pattern, na=False)
description_hits = ascogs["description"].astype(str).str.contains(pattern, na=False)

# .copy() yields an independent frame rather than a slice of `ascogs`.
filtered_ascogs = ascogs.loc[gene_hits | description_hits].copy()

# ----------------------------
# Write the filtered table and
# print a short summary
# ----------------------------

# Rows are written in the order they were read; no sorting is applied.
# The file name is relative, so it lands in the current working directory.
output_name = f"ascogs_escrt_system_{rightnow}.tsv"
filtered_ascogs.to_csv(output_name, sep="\t", index=False)

# Report how many rows matched, then preview the first ten.
print(f"Filtered AsCOGs: {len(filtered_ascogs)}")
print(filtered_ascogs.head(10))


Filtered AsCOGs: 89
       ascog_id    arcog_id gene  category  \
67   cog.000460  arCOG01676    H      ThiF   
361  cog.000486  arCOG01307    U      Vps4   
363  cog.001760  arCOG00453    U  ESCRTIII   
364  cog.001938  arCOG00453    U  ESCRTIII   
365  cog.002769           -    U         -   
366  cog.003176           -    U     VPS25   
386  cog.000160  arCOG01138    O         -   
388  cog.000179           -    U         -   
392  cog.000309  arCOG01676    O      Uba5   
407  cog.001103  arCOG01138    O         -   

                                           description  
67   Dinucleotide-utilizing enzyme involved in moly...  
361  Cell division ATPase of the AAA+ class, ESCRT ...  
363  Archaeal division protein CdvB, Snf7/Vps24/ESC...  
364  Archaeal division protein CdvB, Snf7/Vps24/ESC...  
365                         EAP30/Vps36 family protein  
366                  Vps25 subunit of ESCRT-II complex  
386  Jab1/MPN domain containing protein, possible C...  
388  EAP30/Vps36 

  ascogs = pd.read_csv(
