In [None]:
!pip install -q transformers peft biopython pandas scikit-learn numpy tqdm

In [None]:
!pip uninstall -y protobuf
!pip install protobuf==3.20.3

In [None]:
import tensorflow as tf
import transformers
print("ok!")

In [None]:
import os
# --- Input locations (Kaggle dataset mounts) ---
# FASTA file to analyse.
INPUT_FASTA = "/kaggle/input/test-fasta/Sara_phaeo_field_161223_2d_assembly_1374_1871_V2.fasta"  
# Directory with the fine-tuned model; the same directory is loaded further down
# as both the tokenizer and the PEFT adapter.
FINE_TUNED_MODEL_PATH = "/kaggle/input/fine-tuned-model" 
# Reference embeddings / IDs (.npy) and the taxonomy table (.tsv); the same three
# paths are repeated in the PATHS dict used by load_knowledge_base.
REF_EMBEDDINGS_PATH = "/kaggle/input/finetuned-npy/embeddings_finetuned.npy"
REF_IDS_PATH = "/kaggle/input/finetuned-npy/ids_finetuned.npy"
REF_TAXONOMY_PATH = "/kaggle/input/normalized-tsv/combined_normalized_taxonomy.tsv"

# --- Output file names ---
# NOTE(review): no code visible in this notebook writes these files - confirm they are still needed.
OUTPUT_CSV = "final_taxonomy_report.csv"
NOVELTY_CSV = "novel_candidates_list.csv"

# --- Model / inference settings ---
# Base checkpoint the adapter is applied on top of.
MODEL_BASE = "InstaDeepAI/nucleotide-transformer-v2-100m-multi-species"
# Length of each chunk handed to chunk_sequence; generate_embeddings also passes it
# to the tokenizer as max_length.
WINDOW_SIZE = 512
# Step between consecutive chunk start positions (chunks overlap when STRIDE < WINDOW_SIZE).
STRIDE = 256
# Presumably the number of chunks per forward pass - TODO confirm against the inference code.
BATCH_SIZE = 16
# The same values appear in the PARAMS dict below, where they set the number of
# k-NN neighbours and the mean-distance cutoff between "Known" and "POTENTIALLY NOVEL".
K_NEIGHBORS = 5  
NOVELTY_THRESHOLD = 0.15

In [None]:
import torch
import numpy as np
import pandas as pd
import re
from Bio import SeqIO
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForMaskedLM
from peft import PeftModel
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize

# 1. NORMALIZATION
def normalize_seq(seq):
    """Return `seq` as an upper-case DNA string over the alphabet ACGTN.

    Uracil (U/u) is rewritten as thymine, all whitespace is dropped, and every
    remaining character outside ACGTN is substituted with N (not deleted).
    """
    # Same order as before: U->T first, then upper-case, then strip whitespace.
    as_dna = seq.translate(str.maketrans("Uu", "Tt")).upper()
    without_whitespace = "".join(as_dna.split())
    return re.sub(r"[^ACGTN]", "N", without_whitespace)

# 2. CHUNKING 
def chunk_sequence(seq, window_size, stride, min_chunk_len=50):
    """Split a sequence into overlapping windows.

    Args:
        seq: Sequence string to split.
        window_size: Maximum length of each window.
        stride: Distance between consecutive window start positions; must be > 0.
        min_chunk_len: Windows whose length is <= this value are discarded
            (default 50, the value that used to be hard-coded). As a result a
            sequence no longer than ``min_chunk_len`` yields no chunks at all.

    Returns:
        List of window strings in order of their start position.

    Raises:
        ValueError: if ``stride`` is not positive.
    """
    if stride <= 0:
        raise ValueError(f"stride must be a positive integer, got {stride}")

    chunks = []
    seq_len = len(seq)
    for start in range(0, seq_len, stride):
        end = start + window_size
        chunk = seq[start:end]
        if len(chunk) > min_chunk_len:
            chunks.append(chunk)
        # Once a window reaches the end of the sequence, further windows would
        # only be shorter suffixes of it, so stop here.
        if end >= seq_len:
            break
    return chunks

# 3. AI INFERENCE ENGINE 
def generate_embeddings(fasta_path, model_path, base_model_name):
    """Embed every record of a FASTA file with the fine-tuned model.

    Each record is normalised, split into overlapping windows (module-level
    WINDOW_SIZE / STRIDE), and the windows are run through the model in batches
    of at most BATCH_SIZE. A window embedding is the attention-masked mean of
    the last hidden state; a record embedding is the mean of its window
    embeddings.

    Args:
        fasta_path: Path of the FASTA file to read.
        model_path: Directory holding the tokenizer and the PEFT adapter.
        base_model_name: Name or path of the base model the adapter is applied to.

    Returns:
        (embeddings, ids): a NumPy array with one row per embedded record and
        the list of matching record ids. Records that produce no chunks
        (see chunk_sequence) are skipped.
    """
    print(f"Loading AI Model from: {model_path}")

    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    base_model = AutoModelForMaskedLM.from_pretrained(base_model_name, trust_remote_code=True)

    model = PeftModel.from_pretrained(base_model, model_path)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    embeddings = []
    ids = []

    print(f"Processing {fasta_path}...")
    for record in tqdm(SeqIO.parse(fasta_path, "fasta")):
        clean_seq = normalize_seq(str(record.seq))

        chunks = chunk_sequence(clean_seq, WINDOW_SIZE, STRIDE)
        if not chunks:
            continue

        # Run the windows in bounded batches: a long record can produce
        # hundreds of windows, and sending them through the model in a single
        # forward pass can exhaust GPU memory.
        pooled_batches = []
        for start in range(0, len(chunks), BATCH_SIZE):
            batch = chunks[start : start + BATCH_SIZE]
            inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=WINDOW_SIZE)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model(**inputs, output_hidden_states=True)
                hidden_states = outputs.hidden_states[-1]
                # Zero out padded positions so they do not contribute to the mean.
                attention_mask = inputs["attention_mask"].unsqueeze(-1)
                masked_hidden = hidden_states * attention_mask
                pooled = masked_hidden.sum(dim=1) / attention_mask.sum(dim=1).clamp(min=1e-9)
            pooled_batches.append(pooled.cpu())

        final_seq_emb = torch.cat(pooled_batches, dim=0).mean(dim=0).numpy()

        embeddings.append(final_seq_emb)
        ids.append(record.id)

    return np.array(embeddings), ids

In [None]:
!pip install fastapi uvicorn python-multipart pyngrok

In [None]:
import os
from pyngrok import ngrok

# The ngrok auth token is a credential, so it is read from the environment
# instead of being stored in the notebook.
# NOTE(review): the token that used to be hard-coded here should be treated as
# leaked and rotated in the ngrok dashboard.
NGROK_TOKEN = os.environ.get("NGROK_AUTHTOKEN")
if not NGROK_TOKEN:
    raise RuntimeError(
        "NGROK_AUTHTOKEN is not set; export it (or load it from Kaggle Secrets) before running this cell."
    )

ngrok.set_auth_token(NGROK_TOKEN)

In [None]:
#1. Install Dependencies
!pip install -q transformers peft biopython pandas scikit-learn numpy fastapi uvicorn python-multipart pyngrok nest-asyncio

#2. Fix Protobuf
!pip uninstall -y protobuf
!pip install protobuf==3.20.3

#3. Imports
import os
import shutil
import numpy as np
import pandas as pd
import torch
import nest_asyncio
from pyngrok import ngrok
from transformers import AutoTokenizer, AutoModelForMaskedLM
from peft import PeftModel
from Bio import SeqIO
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize
import uvicorn
from fastapi import FastAPI, UploadFile, File
from fastapi.middleware.cors import CORSMiddleware

nest_asyncio.apply()

print("Environment Ready.")

In [None]:
# Where the base model, the fine-tuned adapter and the reference artefacts live.
PATHS = dict(
    MODEL_BASE="InstaDeepAI/nucleotide-transformer-v2-100m-multi-species",
    FINE_TUNED_MODEL="/kaggle/input/fine-tuned-model",  # fine-tuned weights folder
    REF_EMBEDDINGS="/kaggle/input/finetuned-npy/embeddings_finetuned.npy",  # saved embeddings (.npy)
    REF_IDS="/kaggle/input/finetuned-npy/ids_finetuned.npy",  # saved IDs (.npy)
    REF_TAXONOMY="/kaggle/input/normalized-tsv/combined_normalized_taxonomy.tsv",  # taxonomy table (.tsv)
)

# Chunking, neighbour-search and novelty settings shared by the cells below.
PARAMS = dict(
    WINDOW_SIZE=512,
    STRIDE=256,
    K_NEIGHBORS=5,
    NOVELTY_THRESHOLD=0.15,
)

def load_knowledge_base():
    """Load the reference embeddings and taxonomy and fit the k-NN search index.

    Reads the files named in ``PATHS``, keeps only the reference embeddings
    whose accession also appears in the taxonomy table, L2-normalises them and
    fits a cosine ``NearestNeighbors`` index.

    Returns:
        (knn_engine, valid_taxa, status): the fitted index, the taxonomy labels
        aligned row-for-row with the index, and ``"Success"``. On failure the
        first two items are ``None`` and ``status`` starts with ``"Error"``.
    """
    print("Loading Knowledge Base...")

    # 1. Load Files
    try:
        ref_emb = np.load(PATHS["REF_EMBEDDINGS"])
        ref_ids = np.load(PATHS["REF_IDS"])
        ref_tax_df = pd.read_csv(PATHS["REF_TAXONOMY"], sep='\t')
    except FileNotFoundError as e:
        return None, None, f"Error: {e}"

    # 2. Align Data (ID Matching)
    # Embedding IDs: keep the part before the first '|' and drop any '>' marker.
    clean_ref_ids = [str(x).split('|')[0].replace('>', '').strip() for x in ref_ids]
    ref_df_ids = pd.DataFrame({'accession': clean_ref_ids, 'idx': range(len(clean_ref_ids))})

    # Taxonomy headers: first whitespace-separated token with '>' removed.
    # `.str.replace` is required here: plain `Series.replace('>', '')` only
    # swaps cells that are exactly '>' and would leave '>ACC123' untouched,
    # so those rows would silently drop out of the merge below.
    ref_tax_df['accession'] = (
        ref_tax_df['header']
        .astype(str)
        .str.split()
        .str[0]
        .str.replace('>', '', regex=False)
    )

    merged = pd.merge(ref_df_ids, ref_tax_df, on='accession', how='inner')
    if merged.empty:
        return None, None, "Error: no reference IDs matched the taxonomy table"

    valid_indices = merged['idx'].values
    valid_embs = ref_emb[valid_indices]
    valid_taxa = merged['taxonomy'].values

    # 3. Build Search Engine (k-NN)
    print("Building Search Engine...")
    ref_emb_norm = normalize(valid_embs, axis=1)
    # Never ask for more neighbours than there are reference rows; otherwise
    # every later query would raise.
    n_neighbors = min(PARAMS["K_NEIGHBORS"], len(valid_embs))
    knn_engine = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine', n_jobs=-1)
    knn_engine.fit(ref_emb_norm)

    print(f"Knowledge Base Loaded! ({len(valid_taxa)} sequences)")
    return knn_engine, valid_taxa, "Success"

knn_engine, ref_taxa, status = load_knowledge_base()
if "Error" in status: print(status)

In [None]:
import re

In [None]:
import os
# Force install specific compatible versions
!pip install -U -q "bitsandbytes>=0.48.0" "transformers>=4.57.0" "accelerate>=1.0.0" "peft>=0.10.0"

print("Libraries installed.")
print("NOW: Go to the Menu Bar -> 'Runtime' (or 'Kernel') -> 'Restart Session' (or 'Restart Kernel').")
print("DO NOT proceed until you have restarted!")

In [None]:
!pip install -q -U bitsandbytes transformers accelerate scipy

In [None]:
import bitsandbytes as bnb
import transformers

print(f"BitsAndBytes Version: {bnb.__version__}")
print(f"Transformers Version: {transformers.__version__}")

# Check if CUDA (GPU) is available - 4-bit ONLY works on GPU
import torch
print(f"CUDA Available: {torch.cuda.is_available()}")

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM, BitsAndBytesConfig
from peft import PeftModel

# Loads the base model in 4-bit and attaches the fine-tuned adapter.
# NOTE(review): the next two cells repeat this same load and rebind
# `base_model`, `model` and `tokenizer`; keeping a single copy would avoid
# loading the model three times.
# NOTE(review): BiodiversityPipeline.__init__ further down loads its own,
# non-quantized model, so the `model` created here does not appear to be used
# by the pipeline - confirm.

# 1. Define the missing path variable
FINE_TUNED_DIR = "/kaggle/input/fine-tuned-model" 

# 2. Configure 4-bit Quantization
# NF4 weights with float16 compute and double quantization enabled.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
)

print("Loading Base Model (4-bit)...")
# device_map="auto" leaves device placement to the loader, so no explicit .to(...) follows.
base_model = AutoModelForMaskedLM.from_pretrained(
    "InstaDeepAI/nucleotide-transformer-v2-100m-multi-species",
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

print(f"Loading Fine-Tuned Adapters from: {FINE_TUNED_DIR}")
# Adapter and tokenizer are both read from the fine-tuned directory.
model = PeftModel.from_pretrained(base_model, FINE_TUNED_DIR)
tokenizer = AutoTokenizer.from_pretrained(FINE_TUNED_DIR, trust_remote_code=True)
model.eval()

print("‚úÖ Model loaded successfully!")

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM, BitsAndBytesConfig
from peft import PeftModel

# NOTE(review): this cell repeats the 4-bit load from the previous cell with the
# same settings and rebinds `bnb_config`, `base_model`, `model` and `tokenizer`.
# It relies on FINE_TUNED_DIR having been defined by the previous cell.

# --- CONFIGURATION FOR QUANTIZATION ---
# This converts weights to 4-bit integers to save memory and increase throughput
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,  # Compute in fast 16-bit
    bnb_4bit_quant_type="nf4",             # Normalized float 4 (better for pre-trained models)
    bnb_4bit_use_double_quant=True         # Compresses constants for more savings
)

print("Loading Fine-Tuned Model with 4-bit Quantization...")

# 1. Load Base Model (Quantized)
base_model = AutoModelForMaskedLM.from_pretrained(
    "InstaDeepAI/nucleotide-transformer-v2-100m-multi-species",
    quantization_config=bnb_config,        # <--- Apply Quantization here
    device_map="auto",                     # <--- Automatically map to GPU
    trust_remote_code=True
)

# 2. Load your LoRA Adapters
# Note: We do NOT use .to("cuda") because device_map handled it
model = PeftModel.from_pretrained(base_model, FINE_TUNED_DIR)
tokenizer = AutoTokenizer.from_pretrained(FINE_TUNED_DIR, trust_remote_code=True)

# Switch to inference mode before use.
model.eval()
print("‚úÖ Model loaded in 4-bit mode!")

In [None]:
import torch
import transformers
import bitsandbytes
from transformers import AutoTokenizer, AutoModelForMaskedLM, BitsAndBytesConfig
from peft import PeftModel

# NOTE(review): third copy of the 4-bit model load in this notebook; it rebinds
# `bnb_config`, `base_model`, `model` and `tokenizer` once more. Unlike the
# previous copy it defines FINE_TUNED_DIR itself, so it can run on its own.

# Verify versions match what we just installed
print(f"Transformers Version: {transformers.__version__} (Should be >= 4.57)")
print(f"BitsAndBytes Version: {bitsandbytes.__version__} (Should be >= 0.48)")

# --- CONFIGURATION ---
# NF4 weights with float16 compute and double quantization enabled.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
)

print("\nLoading Base Model in 4-bit...")
base_model = AutoModelForMaskedLM.from_pretrained(
    "InstaDeepAI/nucleotide-transformer-v2-100m-multi-species",
    quantization_config=bnb_config,  # 4-bit optimization
    device_map="auto",
    trust_remote_code=True
)

print("Loading LoRA Adapters...")
# Replace 'FINE_TUNED_DIR' with your actual path (e.g., "/kaggle/input/fine-tuned-model")
FINE_TUNED_DIR = "/kaggle/input/fine-tuned-model" 

# Adapter and tokenizer are both read from the fine-tuned directory.
model = PeftModel.from_pretrained(base_model, FINE_TUNED_DIR)
tokenizer = AutoTokenizer.from_pretrained(FINE_TUNED_DIR, trust_remote_code=True)
model.eval()

print("Step 1 Complete: Model loaded efficiently!")

In [None]:
from collections import defaultdict

class BiodiversityPipeline:
    """Classify FASTA reads by embedding them and voting among nearest references.

    The pipeline loads the base masked-LM plus the PEFT adapter, embeds every
    unique read (attention-masked mean of the last hidden state, averaged over
    overlapping windows) and looks each embedding up in a pre-fitted k-NN
    index. Window size, stride and the novelty threshold are read from the
    module-level ``PARAMS`` dict.
    """

    def __init__(self, base_model_path, ft_model_path, knn_engine, ref_taxa):
        """Load tokenizer and model and remember the reference index.

        Args:
            base_model_path: Name or path of the base model.
            ft_model_path: Directory holding the tokenizer and the PEFT adapter.
            knn_engine: Fitted neighbour index exposing ``kneighbors``.
            ref_taxa: Taxonomy labels aligned row-for-row with the index.

        Raises:
            ValueError: if ``knn_engine`` or ``ref_taxa`` is ``None`` (i.e. the
                knowledge base failed to load), instead of failing later on the
                first request.
        """
        if knn_engine is None or ref_taxa is None:
            raise ValueError("Knowledge base is not loaded: knn_engine and ref_taxa are required.")

        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Loading AI Model on {self.device}...")

        # Load Tokenizer & Model
        self.tokenizer = AutoTokenizer.from_pretrained(ft_model_path, trust_remote_code=True)
        base_model = AutoModelForMaskedLM.from_pretrained(base_model_path, trust_remote_code=True)
        self.model = PeftModel.from_pretrained(base_model, ft_model_path)
        self.model.to(self.device)
        self.model.eval()

        self.knn_engine = knn_engine
        # Array form so the labels can be indexed with the neighbour index rows.
        self.ref_taxa = np.asarray(ref_taxa)
        # The model above is loaded without a quantization config, so the
        # message no longer claims 4-bit.
        print("Pipeline Initialized (Deduplication, Smart Sorting).")

    def _normalize_seq(self, seq):
        """Upper-case ``seq``, map U->T, strip whitespace, replace non-ACGTN with N."""
        s = seq.replace("U", "T").replace("u", "t").upper()
        s = "".join(s.split())
        return re.sub(r"[^ACGTN]", "N", s)

    def _chunk_sequence(self, seq):
        """Split ``seq`` into overlapping windows; windows of length <= 50 are dropped."""
        chunks = []
        for i in range(0, len(seq), PARAMS["STRIDE"]):
            chunk = seq[i : i + PARAMS["WINDOW_SIZE"]]
            if len(chunk) > 50:
                chunks.append(chunk)
            if i + PARAMS["WINDOW_SIZE"] >= len(seq):
                break
        return chunks

    @staticmethod
    def _majority_vote(neighbor_taxa):
        """Return the most frequent label; ties go to the label seen first.

        The neighbours are passed nearest-first, so a tie is resolved in favour
        of the closest reference instead of depending on set iteration order.
        """
        counts = {}
        for taxon in neighbor_taxa:
            counts[taxon] = counts.get(taxon, 0) + 1
        # dicts keep insertion order and max() returns the first maximal key.
        return max(counts, key=counts.get)

    def process_file(self, fasta_path, batch_size=32):
        """Classify every record of a FASTA file.

        Args:
            fasta_path: Path of the FASTA file to read.
            batch_size: Number of windows per forward pass.

        Returns:
            List of dicts with keys ``sequence_id``, ``status``,
            ``novelty_score`` and ``taxonomy``, one per input record whose
            sequence produced at least one window. Identical sequences are
            embedded once and the result is copied to each of their ids.
        """
        print(f"Reading {fasta_path}...")

        # --- PHASE 1: DEDUPLICATION (CPU) ---
        unique_seq_map = defaultdict(list)
        total_reads = 0

        for record in SeqIO.parse(fasta_path, "fasta"):
            total_reads += 1
            seq = self._normalize_seq(str(record.seq))
            unique_seq_map[seq].append(record.id)

        unique_sequences = list(unique_seq_map.keys())
        print(f"Pruning: Reduced {total_reads} reads to {len(unique_sequences)} unique sequences.")

        # --- PHASE 2: PREPARE SMART BATCHES ---
        # Tuples of (chunk_text, unique_seq_index), sorted by length so each
        # batch is padded only to the longest chunk it contains.
        flat_chunks = [
            (chunk, idx)
            for idx, seq in enumerate(unique_sequences)
            for chunk in self._chunk_sequence(seq)
        ]
        flat_chunks.sort(key=lambda x: len(x[0]))

        print(f"Processing {len(flat_chunks)} chunks (Sorted by length)...")

        # --- PHASE 3: BATCH INFERENCE (GPU) ---
        # { unique_seq_idx: [chunk_embedding, ...] }
        seq_embeddings = defaultdict(list)

        for i in range(0, len(flat_chunks), batch_size):
            batch_items = flat_chunks[i : i + batch_size]
            batch_texts = [x[0] for x in batch_items]
            batch_indices = [x[1] for x in batch_items]

            # Tokenize (Dynamic Padding - pads only to longest in THIS batch)
            inputs = self.tokenizer(
                batch_texts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=PARAMS["WINDOW_SIZE"]
            )
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = self.model(**inputs, output_hidden_states=True)
                hidden = outputs.hidden_states[-1]
                mask = inputs["attention_mask"].unsqueeze(-1)

                # Mean pool over non-padded positions only.
                chunk_embs = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
                chunk_embs = chunk_embs.cpu()

            # Distribute back to the correct unique sequence
            for seq_idx, emb in zip(batch_indices, chunk_embs):
                seq_embeddings[seq_idx].append(emb)

        # --- PHASE 4: AGGREGATE & IDENTIFY ---
        print("Matching & Expanding Results...")

        # Sequences that produced no window have no embedding and are skipped.
        embedded_idx = [idx for idx in range(len(unique_sequences)) if seq_embeddings.get(idx)]
        if not embedded_idx:
            return []

        # One row per unique sequence: the mean of its chunk embeddings.
        query = torch.stack(
            [torch.stack(seq_embeddings[idx]).mean(dim=0) for idx in embedded_idx]
        ).numpy()

        # Single k-NN query for all sequences instead of one call per sequence.
        query_norm = normalize(query, axis=1)
        dists, indices = self.knn_engine.kneighbors(query_norm)

        results = []
        for row, idx in enumerate(embedded_idx):
            avg_dist = float(np.mean(dists[row]))
            status = "Known" if avg_dist < PARAMS["NOVELTY_THRESHOLD"] else "POTENTIALLY NOVEL"
            predicted_tax = self._majority_vote(self.ref_taxa[indices[row]])

            # Copy result to all original IDs
            for original_id in unique_seq_map[unique_sequences[idx]]:
                results.append({
                    "sequence_id": original_id,
                    "status": status,
                    "novelty_score": round(avg_dist, 4),
                    "taxonomy": predicted_tax
                })

        return results

# Initialize Pipeline with the optimized class
pipeline = BiodiversityPipeline(PATHS["MODEL_BASE"], PATHS["FINE_TUNED_MODEL"], knn_engine, ref_taxa)

In [2]:
"""
Setup script for Kaggle notebook - Database Caching
Run this in your Kaggle notebook to set up database caching
"""

# Step 1: Install required packages
print("üì¶ Installing required packages...")
import subprocess
import sys

def install_package(package):
    """Install `package` with pip, using the interpreter that runs this kernel.

    Raises subprocess.CalledProcessError when pip exits with a non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_command)

try:
    install_package("supabase")
    supabase_installed = True
    print("‚úÖ Supabase installed")
except Exception as e:
    supabase_installed = False
    print(f"‚ùå Failed to install supabase: {e}")

# Step 2: Set environment variables
import getpass
import os

# The project URL is not secret, so it stays as a default that can be overridden.
os.environ.setdefault("SUPABASE_URL", "https://nbnyhdwbnxbheombbhtv.supabase.co")
# The API key is a credential: take it from the environment (e.g. Kaggle
# Secrets) or prompt for it instead of storing it in the notebook.
# NOTE(review): the key that used to be hard-coded here should be rotated.
if not os.environ.get("SUPABASE_KEY"):
    os.environ["SUPABASE_KEY"] = getpass.getpass("Supabase API key: ")
os.environ["USE_DATABASE"] = "true"

print("‚úÖ Environment variables set")

# Step 3: Test database connection
print("üß™ Testing database connection...")
db_connected = False
try:
    from supabase import create_client

    client = create_client(
        os.environ["SUPABASE_URL"],
        os.environ["SUPABASE_KEY"]
    )

    # Test query
    response = client.table('analysis_jobs').select('*').limit(1).execute()
    db_connected = True
    print("‚úÖ Database connection successful!")
    print(f"üìä Found {len(response.data)} existing records")

except Exception as e:
    print(f"‚ùå Database connection failed: {e}")

# Step 4: Ready message
# Only report success when the install and the connection test both succeeded;
# previously the "complete" banner was printed even after a failure.
if supabase_installed and db_connected:
    print("\nüöÄ SETUP COMPLETE!")
    print("=" * 50)
    print("‚úÖ Supabase package installed")
    print("‚úÖ Environment variables configured")
    print("‚úÖ Database connection tested")
    print("\nüí° Now run your backend with database caching!")
    print("üîÑ Caching will work automatically!")
    print("   ‚Ä¢ First upload: Processes and stores in DB")
    print("   ‚Ä¢ Same file again: Returns cached result instantly")
else:
    print("\nSETUP INCOMPLETE - fix the errors reported above before starting the backend.")

üì¶ Installing required packages...
Collecting supabase
  Downloading supabase-2.25.1-py3-none-any.whl.metadata (4.6 kB)
Collecting realtime==2.25.1 (from supabase)
  Downloading realtime-2.25.1-py3-none-any.whl.metadata (7.0 kB)
Collecting supabase-functions==2.25.1 (from supabase)
  Downloading supabase_functions-2.25.1-py3-none-any.whl.metadata (2.4 kB)
Collecting storage3==2.25.1 (from supabase)
  Downloading storage3-2.25.1-py3-none-any.whl.metadata (2.1 kB)
Collecting supabase-auth==2.25.1 (from supabase)
  Downloading supabase_auth-2.25.1-py3-none-any.whl.metadata (6.4 kB)
Collecting postgrest==2.25.1 (from supabase)
  Downloading postgrest-2.25.1-py3-none-any.whl.metadata (3.4 kB)
Collecting deprecation>=2.1.0 (from postgrest==2.25.1->supabase)
  Downloading deprecation-2.1.0-py2.py3-none-any.whl.metadata (4.6 kB)
Collecting strenum>=0.4.15 (from supabase-functions==2.25.1->supabase)
  Downloading StrEnum-0.4.15-py3-none-any.whl.metadata (5.3 kB)
Downloading supabase-2.25.1-py

In [3]:
"""
Taxaformer Backend API with Database Caching
FastAPI server with file hash-based idempotency and Supabase storage
"""
import os
import sys
import shutil
import json
import hashlib
from datetime import datetime
from typing import Dict, Any, Optional
import uvicorn
from fastapi import FastAPI, File, UploadFile, HTTPException, Form
from fastapi.middleware.cors import CORSMiddleware
from pyngrok import ngrok

# Import your existing pipeline
from pipeline import TaxonomyPipeline

# Database imports
try:
    from supabase import create_client, Client

    class TaxaformerDB:
        """Supabase-backed store that caches analysis results by file hash."""

        def __init__(self):
            """Create the client from the SUPABASE_URL / SUPABASE_KEY environment variables."""
            self.url = os.getenv("SUPABASE_URL")
            self.key = os.getenv("SUPABASE_KEY")
            print(f"üîó Connecting to Supabase: {self.url}")
            self.client = create_client(self.url, self.key)

        def compute_file_hash(self, file_bytes: bytes) -> str:
            """Return the hex SHA-256 digest of the uploaded bytes."""
            digest = hashlib.sha256(file_bytes)
            return digest.hexdigest()

        def get_job_by_hash(self, file_hash: str):
            """Return the first 'analysis_jobs' row with this hash, or None.

            Lookup errors are logged and reported as None (cache miss).
            """
            try:
                lookup = self.client.table('analysis_jobs').select('*').eq('file_hash', file_hash).limit(1)
                rows = lookup.execute().data
                if rows:
                    return rows[0]
                return None
            except Exception as e:
                print(f"Error getting job by hash: {e}")
                return None

        def store_analysis(self, file_hash: str, filename: str, result_json: Dict[str, Any]) -> str:
            """Insert a completed job and its per-sequence rows; return the job id.

            Any error is logged and re-raised to the caller.
            """
            try:
                job_row = {
                    "file_hash": file_hash,
                    "filename": filename,
                    "status": "complete",
                    "result": result_json,
                    "completed_at": datetime.utcnow().isoformat()
                }

                inserted = self.client.table('analysis_jobs').insert(job_row).execute()
                job_id = inserted.data[0]['job_id']

                # Store sequences: copy the known per-sequence fields and tag
                # every row with the job it belongs to.
                if "sequences" in result_json:
                    sequence_fields = (
                        "accession", "taxonomy", "length", "confidence",
                        "overlap", "cluster", "novelty_score", "status",
                    )
                    sequence_records = [
                        {"job_id": job_id, **{field: seq.get(field) for field in sequence_fields}}
                        for seq in result_json["sequences"]
                    ]

                    if sequence_records:
                        self.client.table('sequences').insert(sequence_records).execute()

                return job_id
            except Exception as e:
                print(f"Error storing analysis: {e}")
                raise

    # Initialize database
    db = TaxaformerDB()
    print("‚úÖ Supabase database connected")

except Exception as e:
    # Import or connection failure: run without database/caching.
    print(f"‚ö†Ô∏è Database not available: {e}")
    db = None

# Initialize FastAPI app
app = FastAPI(
    title="Taxaformer API",
    description="Taxonomic analysis pipeline with caching",
    version="1.1.0"
)

# Configure CORS
# NOTE(review): every origin is allowed together with allow_credentials=True.
# If the frontend does not send cookies/credentials, consider disabling
# allow_credentials or listing the allowed origins explicitly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Initialize pipeline
# NOTE(review): TaxonomyPipeline comes from `from pipeline import ...` at the top
# of this cell; the recorded cell output shows that import failing with
# "No module named 'pipeline'". This assignment also rebinds the notebook-level
# name `pipeline` that an earlier cell sets to a BiodiversityPipeline instance.
pipeline = TaxonomyPipeline()

# Directory for temporary files
# Created on import if missing; uploads are written here before processing.
TEMP_DIR = "temp_uploads"
os.makedirs(TEMP_DIR, exist_ok=True)

@app.get("/")
async def root():
    """Service banner: name, version, database/caching availability and current UTC time."""
    db_available = bool(db)
    banner = {
        "status": "online",
        "service": "Taxaformer API",
        "version": "1.1.0",
    }
    banner["database"] = "connected" if db_available else "disabled"
    banner["caching"] = db_available
    banner["timestamp"] = datetime.utcnow().isoformat()
    return banner

@app.post("/analyze")
async def analyze_endpoint(
    file: UploadFile = File(...),
    metadata: Optional[str] = Form(None)
):
    """Analyse an uploaded sequence file, using the database as a cache.

    The upload is hashed; when a completed job with the same hash exists its
    stored result is returned. Otherwise the file is written to TEMP_DIR, run
    through ``pipeline.process_file`` and, if the database is available, stored.

    Args:
        file: Uploaded file; its extension must be one of the allowed ones.
        metadata: Optional form field; accepted but not used by this endpoint.

    Returns:
        ``{"status": "success", "cached": bool, "data": ..., "job_id": ...}`` on
        success (``job_id`` only when known), or ``{"status": "error",
        "message": ...}`` when processing fails.

    Raises:
        HTTPException(400): missing filename or unsupported extension.
    """
    temp_filepath = None
    file_hash = None

    try:
        # Validate file
        if not file.filename:
            raise HTTPException(status_code=400, detail="No filename provided")

        # The filename is client-controlled: keep only its last path component
        # so it cannot inject directories into the temp path built below.
        safe_name = os.path.basename(file.filename.replace("\\", "/"))
        if not safe_name:
            raise HTTPException(status_code=400, detail="No filename provided")

        allowed_extensions = ['.fasta', '.fa', '.fastq', '.fq', '.txt']
        file_ext = os.path.splitext(safe_name)[1].lower()
        if file_ext not in allowed_extensions:
            raise HTTPException(
                status_code=400,
                detail=f"Unsupported file type. Allowed: {', '.join(allowed_extensions)}"
            )

        # Read file bytes for hashing
        file_bytes = await file.read()

        if db:
            file_hash = db.compute_file_hash(file_bytes)
            print(f"üìÅ File: {file.filename} ({len(file_bytes)} bytes)")
            print(f"üîç Hash: {file_hash[:16]}...")

            # Check cache
            cached_job = db.get_job_by_hash(file_hash)
            if cached_job and cached_job.get('status') == 'complete':
                print(f"üíæ Cache HIT: Returning cached result for job {cached_job['job_id']}")
                return {
                    "status": "success",
                    "job_id": cached_job["job_id"],
                    "cached": True,
                    "data": cached_job["result"]
                }

        # Save file temporarily for processing
        temp_filepath = os.path.join(TEMP_DIR, f"temp_{datetime.now().timestamp()}_{safe_name}")
        with open(temp_filepath, "wb") as buffer:
            buffer.write(file_bytes)

        print(f"üî¨ Processing file: {file.filename}")

        # Process file through pipeline
        start_time = datetime.now()
        result_data = pipeline.process_file(temp_filepath, safe_name)
        processing_time = (datetime.now() - start_time).total_seconds()

        # Add processing time
        if "metadata" in result_data:
            result_data["metadata"]["processingTime"] = f"{processing_time:.2f}s"

        print(f"‚úÖ Analysis complete: {file.filename} ({processing_time:.2f}s)")

        # Store in database if available; a failed save must not fail the request.
        job_id = None
        if db:
            try:
                job_id = db.store_analysis(file_hash, safe_name, result_data)
                print(f"üíæ Saved to database with job_id: {job_id}")
            except Exception as db_error:
                print(f"‚ö†Ô∏è Database save failed: {db_error}")

        # Return response
        response = {
            "status": "success",
            "cached": False,
            "data": result_data
        }

        if job_id:
            response["job_id"] = job_id

        return response

    except HTTPException:
        # Validation errors keep their HTTP status.
        raise
    except Exception as e:
        print(f"‚ùå Error processing file: {str(e)}")
        return {
            "status": "error",
            "message": f"Analysis failed: {str(e)}"
        }
    finally:
        # Always remove the temp copy, whether or not processing succeeded.
        if temp_filepath and os.path.exists(temp_filepath):
            try:
                os.remove(temp_filepath)
            except Exception as e:
                print(f"Warning: Could not delete temp file: {e}")

@app.get("/health")
async def health_check():
    """Health probe: database/caching availability, temp-dir presence and current UTC time."""
    db_available = bool(db)
    report = {
        "status": "healthy",
        "pipeline": "initialized",
    }
    report["database"] = "connected" if db_available else "disabled"
    report["caching"] = db_available
    report["temp_dir"] = os.path.exists(TEMP_DIR)
    report["timestamp"] = datetime.utcnow().isoformat()
    return report

@app.get("/jobs")
async def list_jobs(limit: int = 50):
    """Return up to `limit` rows of 'analysis_jobs', newest first.

    Raises HTTPException 503 when no database is configured and 500 when the
    query fails.
    """
    if not db:
        raise HTTPException(status_code=503, detail="Database not available")

    try:
        jobs_query = db.client.table('analysis_jobs')
        jobs_query = jobs_query.select('job_id, filename, status, created_at, completed_at')
        jobs_query = jobs_query.order('created_at', desc=True)
        jobs_query = jobs_query.limit(limit)
        rows = jobs_query.execute().data
        return {"jobs": rows or []}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

def start_server(port: int = 8000, use_ngrok: bool = True, ngrok_token: str = None):
    """Start the API with uvicorn, optionally exposing it through an ngrok tunnel.

    Args:
        port: Local port to serve on.
        use_ngrok: When true, close any existing tunnels and open a new one.
        ngrok_token: ngrok auth token; required when ``use_ngrok`` is true.

    Raises:
        ValueError: if ``use_ngrok`` is true and no token was given.
        Exception: re-raised when the tunnel cannot be created.
    """
    if not use_ngrok:
        print(f"\nüöÄ Server starting on http://localhost:{port}")
        print(f"üíæ DATABASE: {'Connected' if db else 'Disabled'}")
    else:
        if not ngrok_token:
            raise ValueError("ngrok_token is required when use_ngrok=True")

        ngrok.set_auth_token(ngrok_token)

        # Best effort: drop tunnels left over from a previous run.
        try:
            for tunnel in ngrok.get_tunnels():
                print(f"Closing existing tunnel: {tunnel.public_url}")
                ngrok.disconnect(tunnel.public_url)
        except Exception as e:
            print(f"Note: {e}")

        try:
            public_url = ngrok.connect(port).public_url
            rule = "="*60
            banner_lines = [
                "\n" + rule,
                "üöÄ TAXAFORMER API STARTED (WITH CACHING)",
                rule,
                f"üì° PUBLIC URL: {public_url}",
                f"üîß LOCAL URL:  http://localhost:{port}",
                f"üíæ DATABASE:   {'Connected' if db else 'Disabled'}",
                f"üîÑ CACHING:    {'Enabled' if db else 'Disabled'}",
                rule,
                f"\n‚ö° Copy the PUBLIC URL to your frontend!",
                f"   Update API_URL to: {public_url}",
                "\nüîÑ Caching Features:",
                "   ‚Ä¢ First upload: Processes and stores in DB",
                "   ‚Ä¢ Same file again: Returns cached result instantly",
                "\n" + rule + "\n",
            ]
            for line in banner_lines:
                print(line)
        except Exception as e:
            print(f"\n‚ùå Failed to create ngrok tunnel: {e}")
            raise

    uvicorn.run(app, host="0.0.0.0", port=port)

if __name__ == "__main__":
    # Configuration
    # The ngrok token is a credential: read it from the environment instead of
    # storing it in the notebook. start_server raises ValueError when it is
    # missing and USE_NGROK is true.
    # NOTE(review): the token that used to be hard-coded here should be rotated.
    NGROK_TOKEN = os.environ.get("NGROK_AUTHTOKEN")
    PORT = 8000
    USE_NGROK = True

    # Start server
    start_server(port=PORT, use_ngrok=USE_NGROK, ngrok_token=NGROK_TOKEN)


ModuleNotFoundError: No module named 'pipeline'

In [None]:
import uvicorn
import shutil
import os
import nest_asyncio
from fastapi import FastAPI, UploadFile, File
from fastapi.middleware.cors import CORSMiddleware
from pyngrok import ngrok

# 1. Apply Asyncio patch (Required for Colab/Kaggle)
nest_asyncio.apply()

# 2. Define the API App
# NOTE(review): this rebinds the module-level `app`; routes registered on an
# `app` created by another cell are not part of this instance.
app = FastAPI()

# Enable CORS so your website can talk to this server
# NOTE(review): every origin is allowed together with allow_credentials=True.
# If the website does not send cookies/credentials, consider disabling
# allow_credentials or listing the allowed origins explicitly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

@app.post("/analyze")
async def analyze_endpoint(file: UploadFile = File(...)):
    """
    Receives a FASTA file, processes it with the OPTIMIZED pipeline,
    and returns the taxonomy results.

    Returns ``{"status": "success", "data": ...}`` or, when anything fails,
    ``{"status": "error", "message": ...}``.
    """
    import uuid  # local import: this cell's import list does not include uuid

    # The filename is client-controlled: keep only its last path component, and
    # add a random prefix so two uploads with the same name cannot overwrite
    # (or delete) each other's temp file.
    safe_name = os.path.basename((file.filename or "").replace("\\", "/")) or "upload"
    temp_filename = f"temp_{uuid.uuid4().hex}_{safe_name}"
    try:
        # Save uploaded file temporarily
        with open(temp_filename, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        # --- RUN THE OPTIMIZED PIPELINE ---
        # Uses the module-level `pipeline` object created in an earlier cell.
        print(f"Received file: {file.filename}")
        data = pipeline.process_file(temp_filename, batch_size=32)

        return {"status": "success", "data": data}

    except Exception as e:
        print(f"Error: {e}")
        return {"status": "error", "message": str(e)}
    finally:
        # Clean up temp file
        if os.path.exists(temp_filename):
            os.remove(temp_filename)

# 3. Start the Server
# The ngrok token is a credential: read it from the environment instead of
# storing it in the notebook.
# NOTE(review): the token that used to be hard-coded here should be rotated.
NGROK_TOKEN = os.environ.get("NGROK_AUTHTOKEN")
if not NGROK_TOKEN:
    raise RuntimeError(
        "NGROK_AUTHTOKEN is not set; export it (or load it from Kaggle Secrets) before running this cell."
    )
ngrok.set_auth_token(NGROK_TOKEN)

# Open a Tunnel
public_url = ngrok.connect(8000).public_url
print(f"\nSERVER IS LIVE!")
print(f"PUBLIC API URL: {public_url}")
print("Copy this URL and paste it into your website's JavaScript code.\n")

# Run the App (blocks until the server is stopped)
uvicorn.run(app, host="0.0.0.0", port=8000)

In [None]:
!pip install supabase

In [None]:
from fastapi import FastAPI, File, UploadFile, Form
from fastapi.middleware.cors import CORSMiddleware
from supabase import create_client, Client
import json
import shutil
import os
import uvicorn
from pyngrok import ngrok
from datetime import datetime
from uuid import uuid4

# --------------------------------
# 1. INIT FASTAPI
# --------------------------------
app = FastAPI()

# NOTE(review): every origin is allowed together with allow_credentials=True.
# If the website does not send cookies/credentials, consider disabling
# allow_credentials or listing the allowed origins explicitly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# --------------------------------
# 2. CONNECT TO SUPABASE
# --------------------------------
# This cell talks to a different Supabase project than the SUPABASE_URL /
# SUPABASE_KEY pair set by the caching setup cell, so it uses its own
# environment variable names. The URL is not secret and keeps its old value as
# the default; the API key is a credential and must come from the environment.
# NOTE(review): the key that used to be hard-coded here should be rotated.
url: str = os.environ.get("RESULTS_SUPABASE_URL", "https://hdzzhfcgyvqsqoghjewz.supabase.co")
key: str = os.environ.get("RESULTS_SUPABASE_KEY", "")
if not key:
    raise RuntimeError(
        "RESULTS_SUPABASE_KEY is not set; export it (or load it from Kaggle Secrets) before running this cell."
    )

supabase: Client = create_client(url, key)

# --------------------------------
# 3. ANALYZE ENDPOINT
#    (FILE + METADATA)
# --------------------------------
@app.post("/analyze")
async def analyze_endpoint(
    file: UploadFile = File(...),
    metadata: str = Form(None)
):
    """Analyse an uploaded FASTA file and store the result in Supabase.

    Args:
        file: Uploaded FASTA file.
        metadata: Optional JSON string; invalid JSON is replaced by a warning
            entry instead of failing the request.

    Returns:
        ``{"status": "success", "job_id": ..., "data": ...}``, or
        ``{"status": "error", "message": ...}`` when anything fails.
    """
    # The filename is client-controlled: keep only its last path component so
    # it cannot inject directories into the temp path. The uuid prefix keeps
    # concurrent uploads with the same name apart.
    safe_name = os.path.basename((file.filename or "").replace("\\", "/")) or "upload"
    temp_filename = f"temp_{uuid4()}_{safe_name}"

    try:
        # Save uploaded FASTA temporarily
        with open(temp_filename, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        # Parse metadata JSON safely
        try:
            metadata_dict = json.loads(metadata) if metadata else {}
        except json.JSONDecodeError:
            metadata_dict = {"warning": "invalid JSON metadata"}

        # --------------------------------
        # RUN YOUR PIPELINE
        # (uses the module-level `pipeline` object created in an earlier cell)
        # --------------------------------
        data = pipeline.process_file(temp_filename)

        # --------------------------------
        # SAVE RESULT IN SUPABASE
        # --------------------------------
        job_id = str(uuid4())
        # Prepare the row to insert; the nested dict/list values are sent as JSON.
        document = {
            "id": job_id,
            "filename": file.filename,
            "metadata": metadata_dict,
            "analysis_result": data,
            "created_at": datetime.utcnow().isoformat()
        }

        # Insert into table 'analysis_results'
        supabase.table("analysis_results").insert(document).execute()

        # --------------------------------
        # RETURN IMMEDIATE JSON TO FRONTEND
        # --------------------------------
        return {
            "status": "success",
            "job_id": job_id,
            "data": data
        }

    except Exception as e:
        return {"status": "error", "message": str(e)}

    finally:
        # Always remove the temp copy, whether or not processing succeeded.
        if os.path.exists(temp_filename):
            os.remove(temp_filename)


# --------------------------------
# 4. FETCH RESULTS BY ID
# --------------------------------
@app.get("/result/{job_id}")
async def get_result(job_id: str):
    """Fetch a stored analysis by its job id.

    Returns ``{"status": "success", "data": <row>}`` when a row with this id
    exists in 'analysis_results', otherwise ``{"status": "error",
    "message": ...}``.
    """
    try:
        # Select * from 'analysis_results' where id matches job_id
        response = supabase.table("analysis_results").select("*").eq("id", job_id).execute()

        # A pasted browser-side snippet (localStorage.setItem on an undefined
        # `result`) used to sit here; it raised NameError on every call, so the
        # endpoint always answered with an error. It belongs in the frontend.

        # response.data is a list of results
        if not response.data:
            return {"status": "error", "message": "Result not found."}

        return {"status": "success", "data": response.data[0]}

    except Exception as e:
        return {"status": "error", "message": str(e)}


# --------------------------------
# 5. NGROK SETUP (KAGGLE)
# --------------------------------
# The ngrok token is a credential: read it from the environment instead of
# storing it in the notebook.
# NOTE(review): the token that used to be hard-coded here should be rotated.
NGROK_TOKEN = os.environ.get("NGROK_AUTHTOKEN")
if not NGROK_TOKEN:
    raise RuntimeError(
        "NGROK_AUTHTOKEN is not set; export it (or load it from Kaggle Secrets) before running this cell."
    )
ngrok.set_auth_token(NGROK_TOKEN)

public_url = ngrok.connect(8000).public_url
print("PUBLIC API URL:", public_url)
print("Put this URL in your website fetch() calls")

# Blocks until the server is stopped.
uvicorn.run(app, host="0.0.0.0", port=8000)

##### 