[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Unfettered-b/Asgard_Protein_annotation/blob/main/search_cds.ipynb)

# 🔍 CDS Search and Mapping Notebook

This notebook allows you to:
- Search for coding sequences (CDS) by ID or keyword  
- Retrieve corresponding protein sequences and headers  
- Map each CDS to its genome and organism name using metadata from the repository  
- Explore protein and genome relationships interactively

📁 **Repository:** [Asgard_Protein_annotation](https://github.com/Unfettered-b/Asgard_Protein_annotation)


In [None]:
# ============================================
# 🔧 Setup + Interactive CDS Search Tool in Colab
# ============================================
# Environment bootstrap cell: installs Git LFS, clones the repository,
# changes into it, pulls the LFS objects, and lists the data/ directory.
# The `!` and `%` lines are IPython/Colab magics, so this cell only runs
# inside a notebook kernel, not as a plain Python script.
# NOTE(review): the clone below is unconditional, so re-running this cell
# after the %cd has taken effect will attempt a second clone from inside
# the repository -- confirm whether a guard is wanted.

# NOTE(review): the emoji in this message look mis-encoded (mojibake);
# confirm the notebook's file encoding.
print("üîß Setting up... please wait a few seconds for the CDS Search Tool to load üëá")

# --- Clone and prepare repo ---
# Install the Git LFS client; -qq keeps apt output quiet.
!apt-get -qq install git-lfs
# Fetch the repository into ./Asgard_Protein_annotation.
!git clone https://github.com/Unfettered-b/Asgard_Protein_annotation.git
# Work from the repository root so the relative data/ path below resolves.
%cd Asgard_Protein_annotation
# Enable the LFS hooks, then download the LFS-tracked objects
# (presumably the large files under data/ -- verify against .gitattributes).
!git lfs install
!git lfs pull
# List data/ with human-readable sizes as a quick check of what was fetched.
!ls -lh data/


In [None]:
# ===================================================
# 🧬 CDS Search Tool for Asgard Proteins (Colab Form)
# ===================================================

import pandas as pd
from IPython.display import display, HTML

# --- 🔧 User Inputs (Colab will render dropdowns and text boxes) ---
completeness = "50"  # @param ["50", "60", "70", "80", "90", "95"]
search_term = "tubulin"  # @param {type:"string"}


def search_headers(cds_db, term):
    """Return the rows of ``cds_db`` whose ``header`` contains ``term``.

    The match is a case-insensitive *literal* substring match. ``regex=False``
    matters because the term is typed by the user: with pandas' default
    ``regex=True`` a term such as "(" raises ``re.error`` and "." matches
    every header. Rows whose header is missing (NaN) never match.

    Args:
        cds_db: DataFrame with a string column named ``header``.
        term: Text to look for inside each header.

    Returns:
        A DataFrame holding only the matching rows, in their original order.
    """
    matches = cds_db['header'].str.contains(
        term, case=False, na=False, regex=False
    )
    return cds_db[matches]


# --- 📂 Load dataset ---
# The completeness choice selects which CSV is read.
cds_file = f"data/Proteins_genomes_cp{completeness}.csv"

try:
    cds_db = pd.read_csv(cds_file)
except FileNotFoundError:
    print(f"❌ File not found: {cds_file}")
    print("Please ensure the CSV is in the 'data/' folder or mount Google Drive if needed.")
else:
    print(f"✅ Loaded database with {len(cds_db)} entries.")
    print(f"🧬 Unique species in database: {cds_db['organism_name'].nunique()}")

    # --- 🔍 Search for term ---
    # Surrounding whitespace is ignored; a blank term is rejected rather
    # than matching every row.
    term = search_term.strip()
    if term:
        results = search_headers(cds_db, term)
        print(f"🔎 Found {len(results)} results for search term: '{term}'")

        # Display only first 10 matches neatly
        if not results.empty:
            display(HTML(results.head(10).to_html(index=False)))
        else:
            print("No matches found.")
    else:
        print("⚠️ Please enter a valid search term.")
