[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Unfettered-b/Asgard_Protein_annotation/blob/main/search_cds.ipynb)

# 🔍 CDS Search and Mapping Notebook

This notebook allows you to:
- Search for coding sequences (CDS) by ID or keyword, based on the asCOGs database  
- Retrieve corresponding protein sequences and headers  

📁 **Repository:** [Asgard_Protein_annotation](https://github.com/Unfettered-b/Asgard_Protein_annotation)


In [None]:
# ============================================
# üîß Setup + Interactive CDS Search Tool in Colab
# ============================================

print("üîß Setting up... please wait a few seconds for the CDS Search Tool to load üëá")

# --- Clone and prepare repo ---
!apt-get -qq install git-lfs
!git clone https://github.com/Unfettered-b/Asgard_Protein_annotation.git
%cd Asgard_Protein_annotation
!git lfs install
!git lfs pull
!ls -lh data/


import pandas as pd
from IPython.display import display, HTML

# --- 🔧 Search parameters ---
completeness = "50"  # @param ["50", "70", "80", "90", "95"]

# --- Load the database ---
# The form value above selects which CSV under data/ is loaded.
cds_file = f"data/Proteins_genomes_cp{completeness}.csv"
try:
    cds_db = pd.read_csv(cds_file)
except FileNotFoundError as err:
    # Re-raise with an actionable hint, keeping the original error as the cause.
    raise FileNotFoundError(
        f"❌ File not found: {cds_file}\nMake sure this file exists in the 'data/' folder."
    ) from err

# Fail early with a readable message if the columns used below are absent,
# instead of an opaque KeyError from the column reorder.
required_cols = ("header", "organism_name", "sequence")
missing_cols = [col for col in required_cols if col not in cds_db.columns]
if missing_cols:
    raise ValueError(
        f"{cds_file} is missing expected column(s): {missing_cols}. "
        "If the file is unexpectedly small it may be a Git LFS pointer "
        "rather than the real data; re-run `git lfs pull`."
    )

# Move 'sequence' column to the end for cleaner display
cds_db = cds_db[[c for c in cds_db.columns if c != 'sequence'] + ['sequence']]

# Drop the first whitespace-separated token of every header and re-join the
# remaining tokens with single spaces.
# NOTE(review): presumably the dropped token is the sequence identifier — confirm.
cds_db['header'] = cds_db['header'].str.split().str[1:].str.join(" ").str.strip()


print(f"✅ Loaded database with {len(cds_db)} entries.")
print(f"🧬 Unique species in database: {cds_db['organism_name'].nunique()}")


In [None]:
# ============================================
# 🧬 CDS Search Tool for Asgard Proteins (Colab-ready)
# ============================================
# Filters the `cds_db` table by header text and/or organism name, plots the
# most frequent organisms and headers among the hits, and renders the
# matching rows as an HTML table. Requires `cds_db` to be loaded already.

import re

import pandas as pd
from IPython.display import display, HTML
import matplotlib.pyplot as plt

# --- 🔧 Search parameters ---
search_term = "tubulin"  # @param {type:"string"}
organism_filter = ""      # @param {type:"string"}


def _contains(column, pattern):
    """Return a boolean mask of the rows in *column* that match *pattern*.

    Matching is case-insensitive and missing values never match. *pattern*
    is treated as a regular expression when it compiles; if it does not
    (for example an unbalanced "(" typed into the form) it is matched as
    literal text instead of raising ``re.error``.
    """
    try:
        re.compile(pattern)
    except re.error:
        return column.str.contains(pattern, case=False, na=False, regex=False)
    return column.str.contains(pattern, case=False, na=False)


# --- Primary search logic ---
term = search_term.strip()
org = organism_filter.strip()

if not term and not org:
    print("⚠️ Please enter at least a search term or an organism filter.")
else:
    # No up-front copy of the table: at least one filter below applies, and
    # boolean indexing already returns a new DataFrame.
    results = cds_db

    if term:
        results = results[_contains(results['header'], term)]

    if org:
        results = results[_contains(results['organism_name'], org)]

    # --- Results summary ---
    print(f"🔎 Found {len(results)} results", end="")
    if term:
        print(f" for header search: '{term}'", end="")
    if org:
        print(f" (filtered by organism: '{org}')", end="")
    print("\n")

    # --- Visualization ---
    if not results.empty:
        fig, axes = plt.subplots(1, 2, figsize=(12, 6))

        # Both pies show only the ten most frequent values, so the printed
        # percentages are shares of those slices, not of all results.

        # 1️⃣ Organism distribution pie chart
        org_counts = results['organism_name'].value_counts().head(10)
        axes[0].pie(
            org_counts.values,
            labels=org_counts.index,
            autopct='%1.1f%%',
            startangle=90,
            textprops={'fontsize': 9}
        )
        axes[0].set_title("Organism Distribution (Top 10)", fontsize=12)

        # 2️⃣ Header (protein type) distribution pie chart
        header_counts = results['header'].value_counts().head(10)
        axes[1].pie(
            header_counts.values,
            labels=header_counts.index,
            autopct='%1.1f%%',
            startangle=90,
            textprops={'fontsize': 9}
        )
        axes[1].set_title("Protein/Header Distribution (Top 10)", fontsize=12)

        plt.tight_layout()
        plt.show()

        # --- Display results table ---
        display(HTML(results.to_html(index=False)))
    else:
        print("No matches found.")

# --- Optional clean table styling ---
# Left as the cell's final expression so the notebook renders it as output.
HTML("""
<style>
table {border-collapse: collapse; width:100%;}
th, td {padding: 6px 10px; text-align: left; border-bottom: 1px solid #ddd;}
tr:hover {background-color: #f5f5f5;}
</style>
""")
