In [1]:
#!/usr/bin/env python3
"""
Complete Insect COI Analysis Pipeline - Unified Version v11.1
Preserves all original metadata + adds new analysis columns
Author: Integrated Analysis Pipeline
Date: 2025
"""

import pandas as pd
import numpy as np
from collections import Counter
from datetime import datetime
from pathlib import Path
import re
import warnings
import sys
warnings.filterwarnings('ignore')

# BioPython
try:
    from Bio import SeqIO
    from Bio import Align
    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    BIOPYTHON_AVAILABLE = True
    PROTPARAM_AVAILABLE = True
except ImportError:
    BIOPYTHON_AVAILABLE = False
    PROTPARAM_AVAILABLE = False
    print("WARNING: BioPython not available. Install with: pip install biopython")

# Progress bar
try:
    from tqdm import tqdm
    TQDM_AVAILABLE = True
except ImportError:
    TQDM_AVAILABLE = False

# Visualization
try:
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    import seaborn as sns
    VISUALIZATION_AVAILABLE = True
except ImportError:
    VISUALIZATION_AVAILABLE = False

# ========================
# CONFIGURATION
# ========================
# Input files
METADATA_FILE = "/Users/sarawut/Desktop/Manuscript_ASV_selection/raw_data/ASV_Authentication_Results_030925.csv"
FASTA_FILE = "/Users/sarawut/Desktop/Manuscript_ASV_selection/raw_data/ASV_table_sequences_66595.fasta"

# Output files
OUTPUT_DIR = "/Users/sarawut/Desktop/Manuscript_ASV_selection/data_analysis/sequences_analysis"
OUTPUT_FILE = f"{OUTPUT_DIR}/ASV_Complete_Analysis.csv"
OUTPUT_SUMMARY = f"{OUTPUT_DIR}/Analysis_Summary.csv"
OUTPUT_CODON_USAGE = f"{OUTPUT_DIR}/Codon_Usage_Table.csv"
OUTPUT_AA_COMPOSITION = f"{OUTPUT_DIR}/AA_Composition.csv"
OUTPUT_MOTIF_SUMMARY = f"{OUTPUT_DIR}/Motif_Analysis_Summary.csv"
OUTPUT_FASTA_CORRECTED = f"{OUTPUT_DIR}/ASV_Corrected_Sequences.fasta"
OUTPUT_FASTA_HIGH_QUALITY = f"{OUTPUT_DIR}/ASV_High_Quality_Sequences.fasta"

# Analysis settings
ANALYZE_ALL_SEQUENCES = True
MAX_SEQUENCES_FOR_CODON = None  # None = analyze all
CREATE_VISUALIZATIONS = True

# COI Reference for alignment
COI_REFERENCE = "ATGGCNCAYCCNCCNCCNGCNGGNTCNAARAARGARGTNTTYAARTTYAGNWSNGTNAARWSNATYGTNATYCCNCCNGCN"

# ========================
# INSECT MITOCHONDRIAL GENETIC CODE
# ========================
INSECT_GENETIC_CODE = {
    'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
    'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
    'TAT': 'Y', 'TAC': 'Y', 'TAA': '*', 'TAG': '*',
    'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W',
    'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
    'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
    'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
    'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
    'ATT': 'I', 'ATC': 'I', 'ATA': 'M', 'ATG': 'M',
    'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
    'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
    'AGT': 'S', 'AGC': 'S', 'AGA': 'S', 'AGG': 'S',
    'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
    'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
    'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
    'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',
}

# ========================
# COI CONSERVED MOTIFS
# ========================

COI_DNA_MOTIFS = {
    'GLYCINE_RICH': r'GG[ATGC]GG[ATGC]',
    'LEUCINE_RICH_1': r'[CT][TC][ATGC](?:[CT][TC][ATGC]){2}',
    'LEUCINE_RICH_2': r'TT[ATGC][CT][TC][ATGC]',
    'PROLINE_PATTERN': r'CC[ATGC]CC[ATGC]',
    'THR_GLY': r'AC[ATGC]GG[ATGC]',
    'ALA_GLY': r'GC[ATGC]GG[ATGC]',
    'VAL_LEU': r'GT[ATGC][CT][TC][ATGC]',
    'PHE_LEU': r'TT[TC][CT][TC][ATGC]',
    'ILE_VAL': r'AT[ATC]GT[ATGC]',
    'GLY_THR_GLY': r'GG[ATGC]AC[ATGC]GG[ATGC]',
}

COI_PROTEIN_MOTIFS = {
    'QUAD_LEUCINE': r'LLLL',
    'TRI_LEUCINE': r'LLL',
    'DI_LEUCINE': r'LL',
    'GLYCINE_PAIR': r'GG',
    'PROLINE_PAIR': r'PP',
    'LEU_PRO': r'LP',
    'GLY_THR': r'GT',
    'PHE_LEU': r'FL',
    'VAL_LEU': r'VL',
    'ILE_VAL': r'IV',
    'MET_ASN': r'MN',
    'LEU_SER_LEU': r'LSL',
    'GLY_ALA': r'GA',
    'ALA_SER': r'AS',
    'SER_VAL': r'SV',
    'THR_TRP': r'TW',
}

COI_CONSERVED_REGIONS = {
    'START_REGION': r'^ATG[GC][CA][CT]',
    'HELIX_I': r'[CT][TC][ATGC]GG[ATGC][GC][CT][ATGC]',
    'HELIX_II': r'[CT][TC][ATGC][CT][TC][ATGC][GC]C[ATGC]',
}

HYDROPHOBIC_AA = set('AILMFVPGW')
POLAR_AA = set('STNQCY')
CHARGED_AA = set('DEKRH')
CHARGED_POSITIVE = set('KRH')
CHARGED_NEGATIVE = set('DE')
AROMATIC_AA = set('FYW')
ALIPHATIC_AA = set('ILV')

# ========================
# UTILITY FUNCTIONS
# ========================

def clean_sequence(seq: str) -> str:
    """Clean DNA sequence"""
    return str(seq).upper().replace('-', '').replace(' ', '').replace('.', '')

def reverse_complement(seq: str) -> str:
    """Get reverse complement of DNA sequence"""
    complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N'}
    return ''.join(complement.get(base, 'N') for base in reversed(seq))

def translate_insect(seq: str, frame: int = 0) -> str:
    """Translate DNA to protein using insect mitochondrial genetic code"""
    seq = seq[frame:]
    protein = ''
    for i in range(0, len(seq) - 2, 3):
        codon = seq[i:i+3]
        if len(codon) == 3:
            protein += INSECT_GENETIC_CODE.get(codon, 'X') if 'N' not in codon else 'X'
    return protein

def safe_value(val, default=0):
    """Safely convert value to float"""
    if pd.isna(val):
        return default
    try:
        if isinstance(val, pd.Series):
            if len(val) > 0:
                return float(val.iloc[0])
            return default
        return float(val)
    except:
        return default

# ========================
# SEQUENCE TRIMMING
# ========================

def trim_sequence(seq: str) -> tuple:
    """Trim low-quality regions from sequence"""
    original_len = len(seq)
    
    seq_trimmed = seq.strip('N')
    seq_trimmed = re.sub(r'^[AT]{10,}', '', seq_trimmed)
    seq_trimmed = re.sub(r'[AT]{10,}$', '', seq_trimmed)
    
    trimmed = original_len > len(seq_trimmed)
    
    return seq_trimmed, trimmed

# ========================
# ORF DETECTION & FRAME SELECTION
# ========================

def find_best_orf(seq: str) -> dict:
    """
    Find the best open reading frame across all 6 possible frames
    Returns: dict with frame, strand, sequence, protein, internal_stops, score
    """
    best_result = {
        'frame': 1,
        'strand': '+',
        'sequence': seq,
        'protein': '',
        'internal_stops': 999,
        'score': 0,
        'orf_length': 0
    }
    
    for strand_name, strand_seq in [('+', seq), ('-', reverse_complement(seq))]:
        for frame in range(3):
            protein = translate_insect(strand_seq, frame)
            
            internal_stops = protein[:-1].count('*') if len(protein) > 1 else 0
            
            score = 0
            
            if internal_stops == 0:
                score += 70
            elif internal_stops == 1:
                score += 35
            elif internal_stops == 2:
                score += 15
            
            if 200 <= len(protein) <= 250:
                score += 20
            elif 150 <= len(protein) <= 280:
                score += 10
            elif 100 <= len(protein) <= 300:
                score += 5
            
            at_pct = (strand_seq.count('A') + strand_seq.count('T')) / len(strand_seq) * 100
            if 60 <= at_pct <= 75:
                score += 10
            elif 55 <= at_pct <= 80:
                score += 5
            
            if score > best_result['score']:
                best_result = {
                    'frame': frame + 1,
                    'strand': strand_name,
                    'sequence': strand_seq,
                    'protein': protein,
                    'internal_stops': internal_stops,
                    'score': score,
                    'orf_length': len(protein)
                }
    
    return best_result

# ========================
# FRAMESHIFT CORRECTION
# ========================

def correct_frameshift(seq: str, protein: str, max_attempts: int = 3) -> tuple:
    """
    Attempt to correct frameshifts by inserting/deleting bases near stop codons
    Returns: (corrected_seq, corrected_protein, correction_applied)
    """
    if '*' not in protein[:-1]:
        return seq, protein, False
    
    best_seq = seq
    best_protein = protein
    min_stops = protein[:-1].count('*')
    corrected = False
    
    stop_positions = [i for i, aa in enumerate(protein[:-1]) if aa == '*']
    
    for stop_pos in stop_positions[:max_attempts]:
        for offset in range(-3, 4):
            test_pos = (stop_pos * 3) + offset
            
            if 0 <= test_pos < len(seq):
                test_seq = seq[:test_pos] + seq[test_pos+1:]
                test_protein = translate_insect(test_seq)
                test_stops = test_protein[:-1].count('*')
                
                if test_stops < min_stops:
                    min_stops = test_stops
                    best_seq = test_seq
                    best_protein = test_protein
                    corrected = True
    
    return best_seq, best_protein, corrected

# ========================
# MOTIF ANALYSIS
# ========================

def analyze_motifs_comprehensive(seq: str, protein: str) -> dict:
    """
    Comprehensive motif analysis for COI validation
    Analyzes both DNA and protein-level conserved patterns
    """
    if pd.isna(seq) or pd.isna(protein):
        return {
            'dna_motifs_found': 'None', 'dna_motif_count': 0, 'dna_motif_total_hits': 0,
            'dna_motif_coverage': 0, 'dna_motif_positions': 'NA',
            'protein_motifs_found': 'None', 'protein_motif_count': 0,
            'protein_motif_total_hits': 0, 'protein_motif_coverage': 0,
            'conserved_regions': 'None', 'conserved_region_count': 0,
            'max_leucine_run': 0, 'max_glycine_run': 0,
            'tga_count': 0, 'trp_count': 0, 'tga_trp_validated': False,
            'motif_score': 0, 'motif_quality': 'Poor', 'coi_confidence': 'Very_Low'
        }
    
    seq = str(seq).upper()
    protein = str(protein).replace('*', '').replace('X', '')
    
    # 1. DNA-level motifs
    dna_motifs_found = []
    dna_motif_counts = {}
    dna_positions = []
    
    for motif_name, pattern in COI_DNA_MOTIFS.items():
        matches = list(re.finditer(pattern, seq, re.IGNORECASE))
        if matches:
            dna_motifs_found.append(motif_name)
            dna_motif_counts[motif_name] = len(matches)
            dna_positions.append(matches[0].start())
    
    # 2. Protein-level motifs
    protein_motifs_found = []
    protein_motif_counts = {}
    
    for motif_name, pattern in COI_PROTEIN_MOTIFS.items():
        matches = list(re.finditer(pattern, protein))
        if matches:
            protein_motifs_found.append(motif_name)
            protein_motif_counts[motif_name] = len(matches)
    
    # 3. Conserved regions
    conserved_regions_found = []
    for region_name, pattern in COI_CONSERVED_REGIONS.items():
        if re.search(pattern, seq, re.IGNORECASE):
            conserved_regions_found.append(region_name)
    
    # 4. Leucine and glycine runs
    leucine_runs = re.findall(r'L{3,}', protein)
    max_leucine_run = max([len(run) for run in leucine_runs]) if leucine_runs else 0
    
    glycine_runs = re.findall(r'G{2,}', protein)
    max_glycine_run = max([len(run) for run in glycine_runs]) if glycine_runs else 0
    
    # 5. TGA (Trp) validation
    tga_count = len(re.findall(r'TGA', seq))
    trp_count = protein.count('W')
    tga_trp_validated = (tga_count > 0 and trp_count > 0)
    
    # 6. Calculate coverage scores
    dna_motif_coverage = (len(dna_motifs_found) / len(COI_DNA_MOTIFS)) * 100
    protein_motif_coverage = (len(protein_motifs_found) / len(COI_PROTEIN_MOTIFS)) * 100
    motif_score = (dna_motif_coverage + protein_motif_coverage) / 2
    
    # 7. Determine motif quality
    if motif_score >= 60 and tga_trp_validated:
        motif_quality = 'Excellent'
    elif motif_score >= 40 and tga_trp_validated:
        motif_quality = 'Good'
    elif motif_score >= 25:
        motif_quality = 'Fair'
    else:
        motif_quality = 'Poor'
    
    # 8. COI confidence assessment
    if dna_motif_coverage >= 30 and protein_motif_coverage >= 30:
        coi_confidence = 'High'
    elif dna_motif_coverage >= 20 and protein_motif_coverage >= 20:
        coi_confidence = 'Medium'
    elif dna_motif_coverage >= 10 or protein_motif_coverage >= 10:
        coi_confidence = 'Low'
    else:
        coi_confidence = 'Very_Low'
    
    return {
        'dna_motifs_found': ';'.join(dna_motifs_found) if dna_motifs_found else 'None',
        'dna_motif_count': len(dna_motifs_found),
        'dna_motif_total_hits': sum(dna_motif_counts.values()),
        'dna_motif_coverage': dna_motif_coverage,
        'dna_motif_positions': ';'.join(map(str, dna_positions[:10])) if dna_positions else 'NA',
        'protein_motifs_found': ';'.join(protein_motifs_found) if protein_motifs_found else 'None',
        'protein_motif_count': len(protein_motifs_found),
        'protein_motif_total_hits': sum(protein_motif_counts.values()),
        'protein_motif_coverage': protein_motif_coverage,
        'conserved_regions': ';'.join(conserved_regions_found) if conserved_regions_found else 'None',
        'conserved_region_count': len(conserved_regions_found),
        'max_leucine_run': max_leucine_run,
        'max_glycine_run': max_glycine_run,
        'tga_count': tga_count,
        'trp_count': trp_count,
        'tga_trp_validated': tga_trp_validated,
        'motif_score': motif_score,
        'motif_quality': motif_quality,
        'coi_confidence': coi_confidence
    }

# ========================
# NUCLEOTIDE COMPOSITION
# ========================

def analyze_nucleotide_composition(seq: str) -> dict:
    """Comprehensive nucleotide composition analysis"""
    if pd.isna(seq) or len(seq) == 0:
        return {}
    
    seq = str(seq).upper()
    length = len(seq)
    if length == 0:
        return {}
    
    a_count = seq.count('A')
    t_count = seq.count('T')
    g_count = seq.count('G')
    c_count = seq.count('C')
    n_count = seq.count('N')
    
    gc_content = ((g_count + c_count) / length) * 100
    at_content = ((a_count + t_count) / length) * 100
    
    gc_skew = (g_count - c_count) / (g_count + c_count) if (g_count + c_count) > 0 else 0
    at_skew = (a_count - t_count) / (a_count + t_count) if (a_count + t_count) > 0 else 0
    
    purine_count = a_count + g_count
    purine_pct = (purine_count / length) * 100
    
    entropy = 0
    for base in 'ATGC':
        count = seq.count(base)
        if count > 0:
            p = count / length
            entropy -= p * np.log2(p)
    
    codon_pos_gc = {}
    for pos in range(3):
        pos_bases = seq[pos::3]
        if len(pos_bases) > 0:
            pos_gc = (pos_bases.count('G') + pos_bases.count('C')) / len(pos_bases) * 100
            codon_pos_gc[f'gc_pos{pos+1}'] = pos_gc
        else:
            codon_pos_gc[f'gc_pos{pos+1}'] = 0
    
    return {
        'seq_length': length,
        'A_count': a_count, 'T_count': t_count, 'G_count': g_count, 'C_count': c_count, 'N_count': n_count,
        'A_percent': (a_count / length) * 100,
        'T_percent': (t_count / length) * 100,
        'G_percent': (g_count / length) * 100,
        'C_percent': (c_count / length) * 100,
        'N_percent': (n_count / length) * 100,
        'GC_content': gc_content,
        'AT_content': at_content,
        'GC_skew': gc_skew,
        'AT_skew': at_skew,
        'purine_percent': purine_pct,
        'shannon_entropy': entropy,
        **codon_pos_gc
    }

# ========================
# PROTEIN PROPERTIES
# ========================

def analyze_protein_properties(protein_seq: str) -> dict:
    """Comprehensive protein property analysis"""
    if pd.isna(protein_seq) or len(protein_seq) == 0:
        return {}
    
    clean_protein = str(protein_seq).replace('*', '').replace('X', '')
    if len(clean_protein) == 0:
        return {}
    
    aa_counts = Counter(clean_protein)
    length = len(clean_protein)
    
    hydrophobic_count = sum(aa_counts.get(aa, 0) for aa in HYDROPHOBIC_AA)
    polar_count = sum(aa_counts.get(aa, 0) for aa in POLAR_AA)
    positive_count = sum(aa_counts.get(aa, 0) for aa in CHARGED_POSITIVE)
    negative_count = sum(aa_counts.get(aa, 0) for aa in CHARGED_NEGATIVE)
    aromatic_count = sum(aa_counts.get(aa, 0) for aa in AROMATIC_AA)
    aliphatic_count = sum(aa_counts.get(aa, 0) for aa in ALIPHATIC_AA)
    
    molecular_weight_val = 0
    aromaticity = 0
    instability_index = 0
    isoelectric_point = 0
    gravy = 0
    
    if PROTPARAM_AVAILABLE:
        try:
            protein_analysis = ProteinAnalysis(clean_protein)
            molecular_weight_val = protein_analysis.molecular_weight()
            aromaticity = protein_analysis.aromaticity()
            instability_index = protein_analysis.instability_index()
            isoelectric_point = protein_analysis.isoelectric_point()
            gravy = protein_analysis.gravy()
        except:
            pass
    
    if molecular_weight_val == 0:
        aa_weights = {
            'A': 89, 'R': 174, 'N': 132, 'D': 133, 'C': 121, 'E': 147, 'Q': 146,
            'G': 75, 'H': 155, 'I': 131, 'L': 131, 'K': 146, 'M': 149, 'F': 165,
            'P': 115, 'S': 105, 'T': 119, 'W': 204, 'Y': 181, 'V': 117
        }
        molecular_weight_val = sum(aa_weights.get(aa, 110) for aa in clean_protein)
    
    return {
        'protein_length': length,
        'clean_protein_length': length,
        'hydrophobic_count': hydrophobic_count,
        'hydrophobic_percent': (hydrophobic_count / length) * 100,
        'polar_count': polar_count,
        'polar_percent': (polar_count / length) * 100,
        'charged_positive_count': positive_count,
        'charged_negative_count': negative_count,
        'net_charge': positive_count - negative_count,
        'aromatic_count': aromatic_count,
        'aromatic_percent': (aromatic_count / length) * 100,
        'aliphatic_count': aliphatic_count,
        'aliphatic_percent': (aliphatic_count / length) * 100,
        'leucine_count': aa_counts.get('L', 0),
        'leucine_percent': (aa_counts.get('L', 0) / length) * 100,
        'molecular_weight': molecular_weight_val,
        'aromaticity': aromaticity,
        'instability_index': instability_index,
        'isoelectric_point': isoelectric_point,
        'gravy_score': gravy
    }

# ========================
# SEQUENCE FEATURES
# ========================

def analyze_sequence_features(seq: str) -> dict:
    """Detect special sequence features"""
    if pd.isna(seq):
        return {}
    
    seq = str(seq).upper()
    
    has_poly_a = bool(re.search(r'A{8,}', seq))
    has_poly_t = bool(re.search(r'T{8,}', seq))
    has_poly_g = bool(re.search(r'G{8,}', seq))
    has_poly_c = bool(re.search(r'C{8,}', seq))
    
    window_size = 20
    low_complexity_windows = 0
    if len(seq) >= window_size:
        for i in range(0, len(seq) - window_size + 1, 10):
            window = seq[i:i+window_size]
            if len(set(window)) <= 2:
                low_complexity_windows += 1
    
    cpg_count = seq.count('CG')
    
    has_coi_reverse = bool(re.search(r'GG[AT]TA[TC][ACGT]{3}GT', seq))
    
    return {
        'has_poly_A': has_poly_a,
        'has_poly_T': has_poly_t,
        'has_poly_G': has_poly_g,
        'has_poly_C': has_poly_c,
        'low_complexity_windows': low_complexity_windows,
        'cpg_count': cpg_count,
        'has_coi_reverse_pattern': has_coi_reverse
    }

# ========================
# CODON USAGE
# ========================

def analyze_codon_usage(seq: str) -> dict:
    """Analyze codon usage in sequence"""
    if pd.isna(seq) or len(seq) < 3:
        return {
            'total_codons': 0, 'unique_codons': 0,
            'most_common_codon': 'N/A', 'codon_diversity': 0
        }
    
    codons = [seq[i:i+3] for i in range(0, len(seq)-2, 3) if len(seq[i:i+3]) == 3]
    codon_counts = Counter(codons)
    top_codon = codon_counts.most_common(1)[0] if codon_counts else ('N/A', 0)
    
    return {
        'total_codons': len(codons),
        'unique_codons': len(codon_counts),
        'most_common_codon': top_codon[0],
        'codon_diversity': (len(codon_counts) / 61) * 100 if len(codon_counts) > 0 else 0
    }

# ========================
# SEQUENCE ALIGNMENT
# ========================

def align_to_reference(seq: str, protein: str) -> dict:
    """Align sequence to COI reference"""
    if not BIOPYTHON_AVAILABLE:
        return {
            'aligned_sequence': seq[:100],
            'aligned_protein': protein[:50],
            'alignment_score': 0
        }
    
    try:
        aligner = Align.PairwiseAligner()
        aligner.mode = 'global'
        aligner.match_score = 2
        aligner.mismatch_score = -1
        aligner.open_gap_score = -2
        aligner.extend_gap_score = -0.5
        
        alignments = aligner.align(COI_REFERENCE, seq[:min(len(seq), len(COI_REFERENCE))])
        
        if alignments:
            best_alignment = alignments[0]
            aligned_seq = str(best_alignment).split('\n')[1][:100]
            alignment_score = best_alignment.score
        else:
            aligned_seq = seq[:100]
            alignment_score = 0
        
        ref_protein = translate_insect(COI_REFERENCE.replace('N', 'A'))
        
        if len(protein) > 0 and len(ref_protein) > 0:
            prot_aligner = Align.PairwiseAligner()
            prot_aligner.mode = 'global'
            prot_aligner.match_score = 3
            prot_aligner.mismatch_score = -1
            prot_aligner.open_gap_score = -2
            prot_aligner.extend_gap_score = -1
            
            prot_alignments = prot_aligner.align(ref_protein[:50], protein[:50])
            if prot_alignments:
                aligned_prot = str(prot_alignments[0]).split('\n')[1][:50]
            else:
                aligned_prot = protein[:50]
        else:
            aligned_prot = protein[:50] if protein else 'N/A'
        
        return {
            'aligned_sequence': aligned_seq if aligned_seq else seq[:100],
            'aligned_protein': aligned_prot if aligned_prot else protein[:50],
            'alignment_score': alignment_score
        }
    
    except Exception:
        return {
            'aligned_sequence': seq[:100] + '...' if len(seq) > 100 else seq,
            'aligned_protein': protein[:50] + '...' if len(protein) > 50 else protein,
            'alignment_score': 0
        }

# ========================
# QUALITY ASSESSMENT
# ========================

def calculate_sequence_quality(seq: str, internal_stops: int, motif_data: dict) -> dict:
    """Calculate overall sequence quality score"""
    if pd.isna(seq):
        return {'sequence_quality_score': 0, 'quality_grade': 'F'}
    
    seq = str(seq).upper()
    length = len(seq)
    if length == 0:
        return {'sequence_quality_score': 0, 'quality_grade': 'F'}
    
    score = 0
    
    # 1. Length score (max 25 points)
    if 600 <= length <= 700:
        score += 25
    elif 550 <= length <= 750:
        score += 20
    elif 450 <= length <= 800:
        score += 15
    elif 300 <= length < 450:
        score += 10
    else:
        score += 5
    
    # 2. Internal stops (max 30 points)
    if internal_stops == 0:
        score += 30
    elif internal_stops == 1:
        score += 15
    elif internal_stops == 2:
        score += 8
    
    # 3. Motif coverage (max 25 points)
    dna_coverage = motif_data.get('dna_motif_coverage', 0)
    protein_coverage = motif_data.get('protein_motif_coverage', 0)
    
    if dna_coverage >= 40 and protein_coverage >= 40:
        score += 25
    elif dna_coverage >= 30 and protein_coverage >= 30:
        score += 20
    elif dna_coverage >= 20 and protein_coverage >= 20:
        score += 15
    elif dna_coverage >= 10 or protein_coverage >= 10:
        score += 5
    
    # 4. AT content (max 10 points)
    at_content = (seq.count('A') + seq.count('T')) / length * 100
    if 60 <= at_content <= 75:
        score += 10
    elif 55 <= at_content <= 80:
        score += 5
    
    # 5. GC content (max 10 points)
    gc_content = (seq.count('G') + seq.count('C')) / length * 100
    if 25 <= gc_content <= 40:
        score += 10
    elif 20 <= gc_content <= 45:
        score += 5
    
    # Assign grade
    if score >= 85:
        grade = 'A+'
        quality_class = 'Excellent'
    elif score >= 75:
        grade = 'A'
        quality_class = 'High_Quality'
    elif score >= 65:
        grade = 'B+'
        quality_class = 'Good_Quality'
    elif score >= 55:
        grade = 'B'
        quality_class = 'Acceptable'
    elif score >= 45:
        grade = 'C'
        quality_class = 'Moderate'
    elif score >= 35:
        grade = 'D'
        quality_class = 'Low_Quality'
    else:
        grade = 'F'
        quality_class = 'Poor_Quality'
    
    return {
        'sequence_quality_score': score,
        'quality_grade': grade,
        'quality_classification': quality_class
    }

# ========================
# QC STATUS DETERMINATION
# ========================

def determine_qc_status(internal_stops: int, length: int, motif_data: dict) -> tuple:
    """
    Determine QC status and flags
    Returns: (status, flags_list)
    """
    flags = []
    
    if internal_stops == 0:
        status = 'PASS'
    elif internal_stops <= 2:
        status = 'WARNING'
        flags.append(f'internal_stops:{internal_stops}')
    else:
        status = 'FAIL'
        flags.append(f'many_stops:{internal_stops}')
    
    if length < 300:
        flags.append('short_sequence')
        if status == 'PASS':
            status = 'WARNING'
    elif length > 800:
        flags.append('long_sequence')
        if status == 'PASS':
            status = 'WARNING'
    
    dna_cov = motif_data.get('dna_motif_coverage', 0)
    protein_cov = motif_data.get('protein_motif_coverage', 0)
    
    if dna_cov < 10 and protein_cov < 10:
        flags.append('very_low_motif_coverage')
        if status == 'PASS':
            status = 'WARNING'
    
    return status, flags

# ========================
# CLASSIFICATION
# ========================

def get_sequence_usability(qc_status: str) -> str:
    """Determine sequence usability"""
    if qc_status == 'PASS':
        return 'Ready_for_Analysis'
    elif qc_status == 'WARNING':
        return 'Review_Required'
    else:
        return 'Exclude_from_Analysis'

def get_analysis_priority(row: dict) -> str:
    """Determine analysis priority"""
    qc = row.get('ANALYSIS_QC_status', row.get('QC_status', ''))
    coi = row.get('ANALYSIS_coi_confidence', row.get('coi_confidence', ''))
    auth = row.get('Authentication_Status', '')
    
    if auth == 'Authenticated' and qc == 'PASS' and coi == 'High':
        return 'Priority_1_Highest'
    elif auth == 'Authenticated' and qc == 'PASS':
        return 'Priority_2_High'
    elif qc == 'PASS' and coi in ['High', 'Medium']:
        return 'Priority_3_Medium'
    elif auth == 'Authenticated' and qc == 'WARNING':
        return 'Priority_4_Review'
    elif qc in ['PASS', 'WARNING']:
        return 'Priority_5_Low'
    else:
        return 'Priority_6_Exclude'

# ========================
# MAIN SEQUENCE ANALYSIS
# ========================

def comprehensive_sequence_analysis(seq_id: str, original_seq: str, metadata_row: dict = None) -> dict:
    """
    Complete analysis pipeline for a single sequence
    Preserves ALL original metadata columns
    """
    
    # Start with original metadata
    if metadata_row and isinstance(metadata_row, dict):
        result = metadata_row.copy()
    else:
        result = {}
    
    # Store original sequence data
    result['ANALYSIS_original_sequence_full'] = original_seq
    result['ANALYSIS_original_sequence'] = original_seq[:100] + '...' if len(original_seq) > 100 else original_seq
    result['ANALYSIS_original_length'] = len(original_seq)
    
    # Clean sequence
    clean_seq = clean_sequence(original_seq)
    
    # Trim sequence
    trimmed_seq, trim_applied = trim_sequence(clean_seq)
    result['ANALYSIS_trim_applied'] = 'Yes' if trim_applied else 'No'
    result['ANALYSIS_trimmed_length'] = len(trimmed_seq)
    
    # Find best ORF
    orf = find_best_orf(trimmed_seq)
    result['ANALYSIS_best_frame'] = orf['frame']
    result['ANALYSIS_best_strand'] = orf['strand']
    result['ANALYSIS_orf_score'] = orf['score']
    result['ANALYSIS_orf_length'] = orf['orf_length']
    
    # Initial sequence and protein
    corrected_seq = orf['sequence']
    corrected_protein = orf['protein']
    result['ANALYSIS_internal_stops_initial'] = orf['internal_stops']
    
    # Attempt frameshift correction if needed
    frameshift_corrected = False
    if orf['internal_stops'] > 0:
        corrected_seq, corrected_protein, frameshift_corrected = correct_frameshift(
            orf['sequence'], orf['protein']
        )
        result['ANALYSIS_frameshift_correction_applied'] = 'Yes' if frameshift_corrected else 'No'
    else:
        result['ANALYSIS_frameshift_correction_applied'] = 'No'
    
    # Final internal stops count
    final_internal_stops = corrected_protein[:-1].count('*') if len(corrected_protein) > 1 else 0
    result['ANALYSIS_internal_stops'] = final_internal_stops
    
    # Store corrected sequences
    result['ANALYSIS_corrected_sequence_full'] = corrected_seq
    result['ANALYSIS_corrected_sequence'] = corrected_seq[:100] + '...' if len(corrected_seq) > 100 else corrected_seq
    result['ANALYSIS_corrected_length'] = len(corrected_seq)
    result['ANALYSIS_corrected_protein_full'] = corrected_protein
    result['ANALYSIS_corrected_protein'] = corrected_protein[:50] + '...' if len(corrected_protein) > 50 else corrected_protein
    
    # Motif analysis
    motif_data = analyze_motifs_comprehensive(corrected_seq, corrected_protein)
    for key, value in motif_data.items():
        result[f'ANALYSIS_{key}'] = value
    
    # Nucleotide composition
    nuc_stats = analyze_nucleotide_composition(corrected_seq)
    for key, value in nuc_stats.items():
        result[f'ANALYSIS_{key}'] = value
    
    # Protein properties
    prot_stats = analyze_protein_properties(corrected_protein)
    for key, value in prot_stats.items():
        result[f'ANALYSIS_{key}'] = value
    
    # Sequence features
    seq_features = analyze_sequence_features(corrected_seq)
    for key, value in seq_features.items():
        result[f'ANALYSIS_{key}'] = value
    
    # Codon usage
    codon_stats = analyze_codon_usage(corrected_seq)
    for key, value in codon_stats.items():
        result[f'ANALYSIS_{key}'] = value
    
    # Alignment to reference
    alignment = align_to_reference(corrected_seq, corrected_protein)
    for key, value in alignment.items():
        result[f'ANALYSIS_{key}'] = value
    
    # Quality assessment
    quality = calculate_sequence_quality(corrected_seq, final_internal_stops, motif_data)
    for key, value in quality.items():
        result[f'ANALYSIS_{key}'] = value
    
    # QC status
    qc_status, flags = determine_qc_status(final_internal_stops, len(corrected_seq), motif_data)
    result['ANALYSIS_QC_status'] = qc_status
    result['ANALYSIS_flags'] = ';'.join(flags) if flags else 'None'
    
    # Usability and priority
    result['ANALYSIS_sequence_usability'] = get_sequence_usability(qc_status)
    result['ANALYSIS_analysis_priority'] = get_analysis_priority(result)
    
    # Quality issues summary
    issues = []
    if final_internal_stops > 0:
        issues.append(f'stops:{final_internal_stops}')
    if len(corrected_seq) < 350:
        issues.append(f'short:{len(corrected_seq)}bp')
    if len(corrected_seq) > 750:
        issues.append(f'long:{len(corrected_seq)}bp')
    if motif_data.get('coi_confidence') == 'Low':
        issues.append('low_COI_confidence')
    if motif_data.get('coi_confidence') == 'Very_Low':
        issues.append('very_low_COI_confidence')
    result['ANALYSIS_quality_issues'] = ';'.join(issues) if issues else 'None'
    
    # Analysis timestamp
    result['ANALYSIS_date'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    
    return result

# ========================
# BATCH PROCESSING
# ========================

def process_fasta_file(fasta_file: str, metadata_df: pd.DataFrame = None) -> pd.DataFrame:
    """Process all sequences from FASTA file and merge back to original metadata"""
    
    if not BIOPYTHON_AVAILABLE:
        print("ERROR: BioPython is required but not available")
        print("Install with: pip install biopython")
        sys.exit(1)
    
    print(f"\nProcessing FASTA file: {fasta_file}")
    
    if metadata_df is None:
        print("ERROR: Metadata file is required for this analysis")
        sys.exit(1)
    
    # เก็บลำดับคอลัมน์เดิม
    original_columns_order = metadata_df.columns.tolist()
    original_row_count = len(metadata_df)
    print(f"  Original metadata: {original_row_count:,} rows × {len(original_columns_order)} columns")
    
    # หาคอลัมน์ ASV_ID
    asv_col = None
    for col in ['asv_id', 'ASV_ID', 'sequence_id', 'ASV_Id']:
        if col in metadata_df.columns:
            asv_col = col
            break
    
    if asv_col is None:
        print("ERROR: Cannot find ASV_ID column in metadata")
        sys.exit(1)
    
    print(f"  Using ASV_ID column: '{asv_col}'")
    
    # นับจำนวน unique ASVs
    unique_asvs = metadata_df[asv_col].nunique()
    print(f"  Unique ASV_IDs in metadata: {unique_asvs:,}")
    print(f"  Duplicate ASV_IDs exist: {original_row_count > unique_asvs}")
    
    # วิเคราะห์ลำดับจาก FASTA
    print("\nAnalyzing sequences from FASTA...")
    
    analysis_results = {}
    total_count = 0
    
    try:
        total_seqs = sum(1 for _ in SeqIO.parse(fasta_file, "fasta"))
    except:
        total_seqs = None
    
    iterator = SeqIO.parse(fasta_file, "fasta")
    if TQDM_AVAILABLE and total_seqs:
        iterator = tqdm(iterator, total=total_seqs, desc="  Processing sequences")
    
    for record in iterator:
        total_count += 1
        seq_id = record.id
        original_seq = str(record.seq)
        
        # วิเคราะห์ลำดับ (ไม่ต้องส่ง metadata_row เพราะจะ merge ทีหลัง)
        result = comprehensive_sequence_analysis(seq_id, original_seq, metadata_row=None)
        
        # เก็บเฉพาะคอลัมน์ ANALYSIS_*
        analysis_only = {k: v for k, v in result.items() if k.startswith('ANALYSIS_')}
        analysis_results[seq_id] = analysis_only
        
        if not TQDM_AVAILABLE and total_count % 1000 == 0:
            print(f"  Processed {total_count:,} sequences...")
    
    print(f"\n  Completed: {total_count:,} unique sequences analyzed")
    print(f"  Analysis results stored for {len(analysis_results):,} ASV_IDs")
    
    # สร้าง DataFrame จากผลการวิเคราะห์
    analysis_df = pd.DataFrame.from_dict(analysis_results, orient='index')
    analysis_df.index.name = asv_col
    analysis_df = analysis_df.reset_index()
    
    print(f"\n  Analysis DataFrame: {len(analysis_df):,} rows × {len(analysis_df.columns)} columns")
    
    # Merge กลับเข้ากับ metadata เดิม
    print(f"\nMerging analysis results back to original metadata...")
    print(f"  Original metadata rows: {len(metadata_df):,}")
    
    # Left join เพื่อรักษาทุกแถวของ metadata
    result_df = metadata_df.merge(
        analysis_df,
        on=asv_col,
        how='left'
    )
    
    print(f"  After merge: {len(result_df):,} rows × {len(result_df.columns)} columns")
    
    if len(result_df) != original_row_count:
        print(f"  ⚠ WARNING: Row count changed! {original_row_count:,} → {len(result_df):,}")
    else:
        print(f"  ✓ Row count preserved: {len(result_df):,} rows")
    
    # จัดเรียงคอลัมน์: metadata เดิม + ANALYSIS_*
    original_cols = [col for col in original_columns_order if col in result_df.columns]
    analysis_cols = [col for col in result_df.columns if col.startswith('ANALYSIS_')]
    other_cols = [col for col in result_df.columns 
                  if col not in original_cols and not col.startswith('ANALYSIS_')]
    
    final_column_order = original_cols + other_cols + analysis_cols
    result_df = result_df[final_column_order]
    
    # ตรวจสอบ ASVs ที่ไม่มีในผลการวิเคราะห์
    missing_asvs = result_df[result_df['ANALYSIS_original_sequence'].isna()][asv_col].nunique()
    if missing_asvs > 0:
        print(f"\n  ⚠ Note: {missing_asvs:,} unique ASV_IDs not found in FASTA file")
        print(f"     (these rows will have empty ANALYSIS_* columns)")
    
    # นับ ASVs ที่ซ้ำ
    asv_counts = result_df[asv_col].value_counts()
    duplicated_asvs = (asv_counts > 1).sum()
    if duplicated_asvs > 0:
        print(f"\n  ✓ {duplicated_asvs:,} ASV_IDs appear multiple times")
        print(f"     (same analysis results copied to all rows with same ASV_ID)")
        max_dup = asv_counts.max()
        most_dup_asv = asv_counts.idxmax()
        print(f"     Max duplicates: {max_dup} times (ASV: {most_dup_asv})")
    
    print(f"\n  Final column organization:")
    print(f"    - Original metadata (preserved order): {len(original_cols)}")
    if other_cols:
        print(f"    - Other columns: {len(other_cols)}")
    print(f"    - Analysis columns (at the end): {len(analysis_cols)}")
    print(f"    - Total: {len(result_df.columns)}")
    
    return result_df

# ========================
# SUMMARY STATISTICS
# ========================

def generate_summary_statistics(df: pd.DataFrame) -> pd.DataFrame:
    """Generate comprehensive summary statistics"""
    print("\nGenerating summary statistics...")
    
    summary_data = []
    total = len(df)
    
    qc_col = 'ANALYSIS_QC_status' if 'ANALYSIS_QC_status' in df.columns else 'QC_status'
    coi_col = 'ANALYSIS_coi_confidence' if 'ANALYSIS_coi_confidence' in df.columns else 'coi_confidence'
    grade_col = 'ANALYSIS_quality_grade' if 'ANALYSIS_quality_grade' in df.columns else 'quality_grade'
    
    rows_with_qc = int(df[qc_col].notna().sum())
    
    for status in ['PASS', 'WARNING', 'FAIL']:
        count = int((df[qc_col] == status).sum())
        pct = (count / rows_with_qc * 100) if rows_with_qc > 0 else 0
        summary_data.append({
            'Category': 'QC_Status',
            'Value': status,
            'Count': count,
            'Percentage': pct,
            'Percentage_str': f"{pct:.2f}%"
        })
    
    for conf in ['High', 'Medium', 'Low', 'Very_Low']:
        count = int((df[coi_col] == conf).sum())
        pct = (count / total * 100) if total > 0 else 0
        summary_data.append({
            'Category': 'COI_Confidence',
            'Value': conf,
            'Count': count,
            'Percentage': pct,
            'Percentage_str': f"{pct:.2f}%"
        })
    
    for grade in ['A+', 'A', 'B+', 'B', 'C', 'D', 'F']:
        count = int((df[grade_col] == grade).sum())
        pct = (count / total * 100) if total > 0 else 0
        summary_data.append({
            'Category': 'Quality_Grade',
            'Value': grade,
            'Count': count,
            'Percentage': pct,
            'Percentage_str': f"{pct:.2f}%"
        })
    
    summary_df = pd.DataFrame(summary_data)
    print(f"  Generated {len(summary_df)} summary statistics")
    
    return summary_df

# ========================
# CODON USAGE TABLE
# ========================

def generate_codon_usage_table(df: pd.DataFrame, max_seqs: int = None) -> pd.DataFrame:
    """Generate codon usage frequency table"""
    print("\nGenerating codon usage table...")
    
    qc_col = 'ANALYSIS_QC_status' if 'ANALYSIS_QC_status' in df.columns else 'QC_status'
    seq_col = 'ANALYSIS_corrected_sequence_full' if 'ANALYSIS_corrected_sequence_full' in df.columns else 'corrected_sequence_full'
    
    pass_seqs = df[df[qc_col] == 'PASS']
    if len(pass_seqs) == 0:
        print("  No PASS sequences found")
        return None
    
    if max_seqs and len(pass_seqs) > max_seqs:
        print(f"  Analyzing {max_seqs:,} of {len(pass_seqs):,} sequences")
        pass_seqs = pass_seqs.sample(n=max_seqs, random_state=42)
    else:
        print(f"  Analyzing all {len(pass_seqs):,} sequences")
    
    total_codon_counter = Counter()
    total_sequences = 0
    
    iterator = pass_seqs.iterrows()
    if TQDM_AVAILABLE:
        iterator = tqdm(iterator, total=len(pass_seqs), desc="  Processing")
    
    for _, row in iterator:
        seq = row.get(seq_col, '')
        if pd.notna(seq) and len(seq) >= 3:
            seq = str(seq).replace('...', '')
            codons = [seq[i:i+3] for i in range(0, len(seq)-2, 3) if len(seq[i:i+3]) == 3]
            valid_codons = [c for c in codons if c in INSECT_GENETIC_CODE]
            total_codon_counter.update(valid_codons)
            total_sequences += 1
    
    if len(total_codon_counter) == 0:
        print("  No valid codons found")
        return None
    
    codon_data = []
    for codon, count in total_codon_counter.items():
        aa = INSECT_GENETIC_CODE.get(codon, 'X')
        frequency = count / sum(total_codon_counter.values()) * 100
        avg_per_seq = count / total_sequences if total_sequences > 0 else 0
        codon_data.append({
            'codon': codon,
            'amino_acid': aa,
            'total_count': count,
            'frequency_percent': frequency,
            'avg_per_sequence': avg_per_seq
        })
    
    codon_df = pd.DataFrame(codon_data)
    codon_df = codon_df.sort_values(['amino_acid', 'total_count'], ascending=[True, False])
    print(f"  Analyzed {len(codon_df)} unique codons from {total_sequences:,} sequences")
    
    return codon_df

# ========================
# AMINO ACID COMPOSITION
# ========================

def generate_aa_composition(df: pd.DataFrame) -> pd.DataFrame:
    """Generate amino acid composition analysis"""
    print("\nGenerating amino acid composition table...")
    
    qc_col = 'ANALYSIS_QC_status' if 'ANALYSIS_QC_status' in df.columns else 'QC_status'
    prot_col = 'ANALYSIS_corrected_protein_full' if 'ANALYSIS_corrected_protein_full' in df.columns else 'corrected_protein_full'
    
    pass_seqs = df[df[qc_col] == 'PASS']
    if len(pass_seqs) == 0:
        print("  No PASS sequences found")
        return None
    
    print(f"  Analyzing {len(pass_seqs):,} PASS sequences")
    
    aa_totals = Counter()
    total_aa_length = 0
    
    iterator = pass_seqs.iterrows()
    if TQDM_AVAILABLE:
        iterator = tqdm(iterator, total=len(pass_seqs), desc="  Processing")
    
    for _, row in iterator:
        protein = row.get(prot_col, '')
        if pd.notna(protein) and len(protein) > 0:
            protein = str(protein).replace('...', '')
            clean_protein = protein.replace('*', '').replace('X', '')
            if len(clean_protein) > 0:
                aa_counter = Counter(clean_protein)
                aa_totals.update(aa_counter)
                total_aa_length += len(clean_protein)
    
    if total_aa_length == 0:
        print("  No valid protein sequences found")
        return None
    
    aa_summary = []
    aa_properties = {
        'A': 'Aliphatic', 'I': 'Aliphatic', 'L': 'Aliphatic', 'V': 'Aliphatic',
        'F': 'Aromatic', 'W': 'Aromatic', 'Y': 'Aromatic',
        'D': 'Acidic', 'E': 'Acidic',
        'R': 'Basic', 'H': 'Basic', 'K': 'Basic',
        'S': 'Polar', 'T': 'Polar', 'N': 'Polar', 'Q': 'Polar', 'C': 'Polar',
        'G': 'Special', 'P': 'Special', 'M': 'Special'
    }
    
    for aa, count in sorted(aa_totals.items()):
        frequency = (count / total_aa_length) * 100
        aa_summary.append({
            'amino_acid': aa,
            'property': aa_properties.get(aa, 'Other'),
            'total_count': count,
            'frequency_percent': frequency,
            'sequences_analyzed': len(pass_seqs)
        })
    
    summary_df = pd.DataFrame(aa_summary)
    summary_df = summary_df.sort_values('total_count', ascending=False)
    
    print(f"  Total amino acids analyzed: {total_aa_length:,}")
    print(f"  Unique amino acids found: {len(aa_totals)}")
    
    return summary_df

# ========================
# MOTIF SUMMARY
# ========================

def generate_motif_summary(df: pd.DataFrame) -> pd.DataFrame:
    """Generate motif detection summary"""
    print("\nGenerating motif summary...")
    
    qc_col = 'ANALYSIS_QC_status' if 'ANALYSIS_QC_status' in df.columns else 'QC_status'
    dna_motif_col = 'ANALYSIS_dna_motifs_found' if 'ANALYSIS_dna_motifs_found' in df.columns else 'dna_motifs_found'
    prot_motif_col = 'ANALYSIS_protein_motifs_found' if 'ANALYSIS_protein_motifs_found' in df.columns else 'protein_motifs_found'
    cons_reg_col = 'ANALYSIS_conserved_regions' if 'ANALYSIS_conserved_regions' in df.columns else 'conserved_regions'
    
    pass_seqs = df[df[qc_col] == 'PASS']
    if len(pass_seqs) == 0:
        print("  No PASS sequences for motif analysis")
        return None
    
    motif_data = []
    
    for motif_name in COI_DNA_MOTIFS.keys():
        count = int(pass_seqs[dna_motif_col].astype(str).str.contains(motif_name, na=False).sum())
        pct = (count / len(pass_seqs) * 100) if len(pass_seqs) > 0 else 0
        motif_data.append({
            'motif_type': 'DNA',
            'motif_name': motif_name,
            'sequences_with_motif': count,
            'percentage': pct,
            'percentage_str': f"{pct:.2f}%"
        })
    
    for motif_name in COI_PROTEIN_MOTIFS.keys():
        count = int(pass_seqs[prot_motif_col].astype(str).str.contains(motif_name, na=False).sum())
        pct = (count / len(pass_seqs) * 100) if len(pass_seqs) > 0 else 0
        motif_data.append({
            'motif_type': 'Protein',
            'motif_name': motif_name,
            'sequences_with_motif': count,
            'percentage': pct,
            'percentage_str': f"{pct:.2f}%"
        })
    
    for region_name in COI_CONSERVED_REGIONS.keys():
        count = int(pass_seqs[cons_reg_col].astype(str).str.contains(region_name, na=False).sum())
        pct = (count / len(pass_seqs) * 100) if len(pass_seqs) > 0 else 0
        motif_data.append({
            'motif_type': 'Conserved_Region',
            'motif_name': region_name,
            'sequences_with_motif': count,
            'percentage': pct,
            'percentage_str': f"{pct:.2f}%"
        })
    
    motif_df = pd.DataFrame(motif_data)
    motif_df = motif_df.sort_values(['motif_type', 'percentage'], ascending=[True, False])
    
    print(f"  Generated summary for {len(motif_df)} motifs")
    
    return motif_df

# ========================
# FASTA EXPORT
# ========================

def export_fasta(df: pd.DataFrame, output_file: str, priority_filter: list = None) -> str:
    """Export sequences to FASTA format"""
    print(f"\nExporting FASTA file: {output_file}")
    
    priority_col = 'ANALYSIS_analysis_priority' if 'ANALYSIS_analysis_priority' in df.columns else 'analysis_priority'
    qc_col = 'ANALYSIS_QC_status' if 'ANALYSIS_QC_status' in df.columns else 'QC_status'
    seq_col = 'ANALYSIS_corrected_sequence_full' if 'ANALYSIS_corrected_sequence_full' in df.columns else 'corrected_sequence_full'
    
    if priority_filter:
        export_df = df[df[priority_col].isin(priority_filter)]
        print(f"  Filtering by priority: {priority_filter}")
    else:
        export_df = df[df[qc_col] == 'PASS']
        print("  Using all PASS sequences")
    
    export_df = export_df[export_df[seq_col].notna()]
    if len(export_df) == 0:
        print("  No sequences to export")
        return None
    
    print(f"  Exporting {len(export_df):,} sequences...")
    
    with open(output_file, 'w') as f:
        for _, row in export_df.iterrows():
            seq = str(row[seq_col])
            asv_id = row.get('asv_id', row.get('sequence_id', 'Unknown'))
            family = row.get('family', 'Unknown')
            
            coi_conf_col = 'ANALYSIS_coi_confidence' if 'ANALYSIS_coi_confidence' in row else 'coi_confidence'
            motif_qual_col = 'ANALYSIS_motif_quality' if 'ANALYSIS_motif_quality' in row else 'motif_quality'
            
            coi_conf = row.get(coi_conf_col, 'Unknown')
            motif_qual = row.get(motif_qual_col, 'Unknown')
            qc_status = row.get(qc_col, 'Unknown')
            
            header = f">{asv_id}|Family:{family}|COI:{coi_conf}|Motif:{motif_qual}|QC:{qc_status}|Length:{len(seq)}bp"
            f.write(header + '\n')
            
            for i in range(0, len(seq), 80):
                f.write(seq[i:i+80] + '\n')
    
    print(f"  Exported to: {output_file}")
    print(f"  Total sequences: {len(export_df):,}")
    
    return output_file

# ========================
# VISUALIZATION
# ========================

def create_visualizations(df: pd.DataFrame, output_dir: str) -> int:
    """Create comprehensive visualizations"""
    if not VISUALIZATION_AVAILABLE:
        print("\n  Visualization libraries not available")
        print("  Install with: pip install matplotlib seaborn")
        return 0
    
    try:
        print("\nCreating visualizations...")
        sns.set_style("whitegrid")
        plt.rcParams['figure.facecolor'] = 'white'
        
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)
        
        qc_col = 'ANALYSIS_QC_status' if 'ANALYSIS_QC_status' in df.columns else 'QC_status'
        pass_df = df[df[qc_col] == 'PASS'].copy()
        viz_count = 0
        
        # Use flexible column names
        def get_col(base_name):
            analysis_col = f'ANALYSIS_{base_name}'
            return analysis_col if analysis_col in df.columns else base_name
        
        # 1. Quality Distribution
        try:
            fig, axes = plt.subplots(2, 2, figsize=(14, 10))
            fig.suptitle('Quality Metrics Distribution', fontsize=16, fontweight='bold')
            
            qc_counts = df[qc_col].value_counts()
            axes[0,0].bar(qc_counts.index, qc_counts.values, color=['green', 'orange', 'red'])
            axes[0,0].set_title('QC Status')
            axes[0,0].set_ylabel('Count')
            for i, v in enumerate(qc_counts.values):
                axes[0,0].text(i, v, f'{v:,}', ha='center', va='bottom')
            
            coi_col = get_col('coi_confidence')
            coi_counts = df[coi_col].value_counts()
            colors_coi = {'High': 'darkgreen', 'Medium': 'orange', 'Low': 'red', 'Very_Low': 'darkred'}
            axes[0,1].bar(coi_counts.index, coi_counts.values,
                         color=[colors_coi.get(x, 'gray') for x in coi_counts.index])
            axes[0,1].set_title('COI Confidence')
            axes[0,1].set_ylabel('Count')
            for i, v in enumerate(coi_counts.values):
                axes[0,1].text(i, v, f'{v:,}', ha='center', va='bottom')
            
            grade_col = get_col('quality_grade')
            grade_order = ['A+', 'A', 'B+', 'B', 'C', 'D', 'F']
            grade_counts = df[grade_col].value_counts()
            grade_counts = grade_counts.reindex(grade_order, fill_value=0)
            axes[1,0].bar(range(len(grade_counts)), grade_counts.values, color='steelblue')
            axes[1,0].set_xticks(range(len(grade_counts)))
            axes[1,0].set_xticklabels(grade_counts.index)
            axes[1,0].set_title('Quality Grade Distribution')
            axes[1,0].set_ylabel('Count')
            for i, v in enumerate(grade_counts.values):
                if v > 0:
                    axes[1,0].text(i, v, f'{v:,}', ha='center', va='bottom')
            
            motif_qual_col = get_col('motif_quality')
            motif_counts = df[motif_qual_col].value_counts()
            colors_motif = {'Excellent': 'darkgreen', 'Good': 'green', 'Fair': 'orange', 'Poor': 'red'}
            axes[1,1].bar(motif_counts.index, motif_counts.values,
                         color=[colors_motif.get(x, 'gray') for x in motif_counts.index])
            axes[1,1].set_title('Motif Quality')
            axes[1,1].set_ylabel('Count')
            for i, v in enumerate(motif_counts.values):
                axes[1,1].text(i, v, f'{v:,}', ha='center', va='bottom', fontsize=8)
            
            plt.tight_layout()
            plt.savefig(output_path / 'Quality_Distribution.png', dpi=300, bbox_inches='tight')
            plt.close()
            viz_count += 1
            print("  ✓ Quality_Distribution.png")
        except Exception as e:
            print(f"  ✗ Error creating quality distribution: {e}")
        
        # 2. Sequence Characteristics
        try:
            fig, axes = plt.subplots(2, 2, figsize=(14, 10))
            fig.suptitle('Sequence Characteristics', fontsize=16, fontweight='bold')
            
            length_col = get_col('corrected_length')
            lengths = pass_df[length_col].dropna()
            axes[0,0].hist(lengths, bins=30, color='skyblue', edgecolor='black', alpha=0.7)
            axes[0,0].axvline(lengths.mean(), color='red', linestyle='--', linewidth=2,
                             label=f'Mean: {lengths.mean():.0f} bp')
            axes[0,0].set_title('Corrected Sequence Length')
            axes[0,0].set_xlabel('Length (bp)')
            axes[0,0].set_ylabel('Frequency')
            axes[0,0].legend()
            
            gc_col = get_col('GC_content')
            gc_content = pass_df[gc_col].dropna()
            axes[0,1].hist(gc_content, bins=30, color='lightgreen', edgecolor='black', alpha=0.7)
            axes[0,1].axvline(gc_content.mean(), color='red', linestyle='--', linewidth=2,
                             label=f'Mean: {gc_content.mean():.1f}%')
            axes[0,1].set_title('GC Content')
            axes[0,1].set_xlabel('GC Content (%)')
            axes[0,1].set_ylabel('Frequency')
            axes[0,1].legend()
            
            motif_score_col = get_col('motif_score')
            motif_score = pass_df[motif_score_col].dropna()
            axes[1,0].hist(motif_score, bins=30, color='coral', edgecolor='black', alpha=0.7)
            axes[1,0].axvline(motif_score.mean(), color='red', linestyle='--', linewidth=2,
                             label=f'Mean: {motif_score.mean():.1f}')
            axes[1,0].set_title('Motif Score Distribution')
            axes[1,0].set_xlabel('Motif Score')
            axes[1,0].set_ylabel('Frequency')
            axes[1,0].legend()
            
            prot_len_col = get_col('protein_length')
            prot_length = pass_df[prot_len_col].dropna()
            prot_length = prot_length[prot_length > 0]
            axes[1,1].hist(prot_length, bins=30, color='plum', edgecolor='black', alpha=0.7)
            axes[1,1].axvline(prot_length.mean(), color='red', linestyle='--', linewidth=2,
                             label=f'Mean: {prot_length.mean():.0f} aa')
            axes[1,1].set_title('Protein Length')
            axes[1,1].set_xlabel('Length (aa)')
            axes[1,1].set_ylabel('Frequency')
            axes[1,1].legend()
            
            plt.tight_layout()
            plt.savefig(output_path / 'Sequence_Characteristics.png', dpi=300, bbox_inches='tight')
            plt.close()
            viz_count += 1
            print("  ✓ Sequence_Characteristics.png")
        except Exception as e:
            print(f"  ✗ Error creating sequence characteristics: {e}")
        
        # 3-6: Continue with other visualizations...
        print(f"\n  Created {viz_count} visualization files")
        print(f"  Saved to: {output_path}")
        
        return viz_count
        
    except Exception as e:
        print(f"\n  Error creating visualizations: {e}")
        return 0

# ========================
# REPORTING
# ========================

def print_comprehensive_report(df: pd.DataFrame):
    """Print comprehensive analysis report"""
    
    qc_col = 'ANALYSIS_QC_status' if 'ANALYSIS_QC_status' in df.columns else 'QC_status'
    
    print("\n" + "=" * 80)
    print("COMPREHENSIVE ANALYSIS REPORT")
    print("=" * 80)
    
    print("\n1. OVERALL STATISTICS")
    print("-" * 80)
    print(f"  Total sequences analyzed: {len(df):,}")
    
    rows_with_qc = int(df[qc_col].notna().sum())
    
    print("\n2. QC STATUS")
    print("-" * 80)
    for status in ['PASS', 'WARNING', 'FAIL']:
        count = int((df[qc_col] == status).sum())
        pct = (count / len(df) * 100) if len(df) > 0 else 0
        icon = '✓' if status == 'PASS' else '⚠' if status == 'WARNING' else '✗'
        print(f"  {icon} {status:10s}: {count:6,} ({pct:5.1f}%)")
    
    print("\n" + "=" * 80)

# ========================
# MAIN EXECUTION
# ========================

def main():
    """Main execution function"""
    
    print("=" * 80)
    print("COMPLETE INSECT COI ANALYSIS PIPELINE - UNIFIED v11.1")
    print("Preserves Original Metadata + Adds Analysis Results")
    print("=" * 80)
    
    print("\n🆕 NEW IN v11.1:")
    print("  ✓ Preserves ALL original metadata columns IN ORIGINAL ORDER")
    print("  ✓ Analysis results prefixed with 'ANALYSIS_' added at the END")
    print("  ✓ Easy to identify new vs original data")
    print("  ✓ Compatible with downstream abundance analysis")
    
    print("\nConfiguration:")
    print(f"  Metadata file: {METADATA_FILE}")
    print(f"  FASTA file: {FASTA_FILE}")
    print(f"  Output directory: {OUTPUT_DIR}")
    
    print("\nAnalysis Features:")
    print("  ✓ ORF detection (6 frames)")
    print("  ✓ Frameshift correction")
    print(f"  ✓ {len(COI_DNA_MOTIFS)} DNA-level COI motifs")
    print(f"  ✓ {len(COI_PROTEIN_MOTIFS)} Protein-level COI patterns")
    print(f"  ✓ {len(COI_CONSERVED_REGIONS)} Conserved regions")
    print("  ✓ Insect mitochondrial genetic code (TGA=W, ATA=M)")
    print("  ✓ Nucleotide composition analysis")
    print("  ✓ Protein property analysis")
    print("  ✓ Quality scoring and grading")
    print("  ✓ COI confidence assessment")
    print("  ✓ Alignment to reference")
    print("  ✓ Codon usage analysis")
    print("  ✓ Amino acid composition")
    print("  ✓ Comprehensive visualization")
    
    # Check dependencies
    print("\nChecking dependencies...")
    if not BIOPYTHON_AVAILABLE:
        print("  ✗ BioPython NOT available - REQUIRED")
        print("    Install with: pip install biopython")
        sys.exit(1)
    else:
        print("  ✓ BioPython available")
    
    if PROTPARAM_AVAILABLE:
        print("  ✓ Bio.SeqUtils.ProtParam available")
    else:
        print("  ⚠ Bio.SeqUtils.ProtParam not available (will use fallback)")
    
    if TQDM_AVAILABLE:
        print("  ✓ tqdm available (progress bars enabled)")
    else:
        print("  ⚠ tqdm not available (install with: pip install tqdm)")
    
    if VISUALIZATION_AVAILABLE:
        print("  ✓ Visualization libraries available")
    else:
        print("  ⚠ matplotlib/seaborn not available (install with: pip install matplotlib seaborn)")
    
    # Validate input files
    print("\nValidating input files...")
    
    if not Path(FASTA_FILE).exists():
        print(f"  ✗ FASTA file not found: {FASTA_FILE}")
        sys.exit(1)
    else:
        print(f"  ✓ FASTA file found")
    
    metadata_df = None
    if Path(METADATA_FILE).exists():
        print(f"  ✓ Metadata file found")
        try:
            metadata_df = pd.read_csv(METADATA_FILE, low_memory=False)
            print(f"    Loaded {len(metadata_df):,} metadata records")
            print(f"    Original metadata columns: {len(metadata_df.columns)}")
            print(f"    Column order will be preserved in output")
        except Exception as e:
            print(f"  ⚠ Error loading metadata: {e}")
            print("    Continuing without metadata...")
    else:
        print(f"  ⚠ Metadata file not found (continuing without metadata)")
    
    # Create output directory
    Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
    
    # Process FASTA file
    print("\n" + "=" * 80)
    print("PROCESSING SEQUENCES")
    print("=" * 80)
    
    df = process_fasta_file(FASTA_FILE, metadata_df)
    
    print(f"\nProcessing complete!")
    print(f"  Total sequences: {len(df):,}")
    print(f"  Total columns: {len(df.columns)}")
    
    # นับคอลัมน์แบบละเอียด
    if metadata_df is not None:
        original_metadata_cols = metadata_df.columns.tolist()
        original_cols = [c for c in df.columns if c in original_metadata_cols]
        analysis_cols = [c for c in df.columns if c.startswith('ANALYSIS_')]
        other_cols = [c for c in df.columns if c not in original_cols and not c.startswith('ANALYSIS_')]
        
        print(f"    - Original metadata (preserved order): {len(original_cols)}")
        if other_cols:
            print(f"    - Other columns: {len(other_cols)}")
        print(f"    - Analysis columns (at the end): {len(analysis_cols)}")
    else:
        original_cols = [c for c in df.columns if not c.startswith('ANALYSIS_')]
        analysis_cols = [c for c in df.columns if c.startswith('ANALYSIS_')]
        print(f"    - Original: {len(original_cols)}")
        print(f"    - Analysis: {len(analysis_cols)}")
    
    # Save main results
    print("\n" + "=" * 80)
    print("SAVING RESULTS")
    print("=" * 80)
    
    try:
        df.to_csv(OUTPUT_FILE, index=False)
        print(f"\n✓ Main results: {OUTPUT_FILE}")
        print(f"  {len(df):,} rows × {len(df.columns)} columns")
        print(f"  Column order: Original metadata → Analysis results")
    except Exception as e:
        print(f"\n✗ Error saving main results: {e}")
    
    # Generate summaries
    try:
        summary_df = generate_summary_statistics(df)
        if summary_df is not None:
            summary_df.to_csv(OUTPUT_SUMMARY, index=False)
            print(f"\n✓ Summary statistics: {OUTPUT_SUMMARY}")
    except Exception as e:
        print(f"\n✗ Error generating summary: {e}")
    
    try:
        codon_df = generate_codon_usage_table(df, max_seqs=MAX_SEQUENCES_FOR_CODON)
        if codon_df is not None:
            codon_df.to_csv(OUTPUT_CODON_USAGE, index=False)
            print(f"\n✓ Codon usage table: {OUTPUT_CODON_USAGE}")
    except Exception as e:
        print(f"\n✗ Error generating codon usage: {e}")
    
    try:
        aa_df = generate_aa_composition(df)
        if aa_df is not None:
            aa_df.to_csv(OUTPUT_AA_COMPOSITION, index=False)
            print(f"\n✓ Amino acid composition: {OUTPUT_AA_COMPOSITION}")
    except Exception as e:
        print(f"\n✗ Error generating AA composition: {e}")
    
    try:
        motif_df = generate_motif_summary(df)
        if motif_df is not None:
            motif_df.to_csv(OUTPUT_MOTIF_SUMMARY, index=False)
            print(f"\n✓ Motif summary: {OUTPUT_MOTIF_SUMMARY}")
    except Exception as e:
        print(f"\n✗ Error generating motif summary: {e}")
    
    # Export FASTA files
    print("\n" + "=" * 80)
    print("EXPORTING FASTA FILES")
    print("=" * 80)
    
    try:
        export_fasta(df, OUTPUT_FASTA_HIGH_QUALITY,
                    priority_filter=['Priority_1_Highest', 'Priority_2_High'])
        
        export_fasta(df, OUTPUT_FASTA_CORRECTED, priority_filter=None)
    except Exception as e:
        print(f"\n✗ Error exporting FASTA: {e}")
    
    # Create visualizations
    if CREATE_VISUALIZATIONS and VISUALIZATION_AVAILABLE:
        print("\n" + "=" * 80)
        print("CREATING VISUALIZATIONS")
        print("=" * 80)
        
        viz_dir = f"{OUTPUT_DIR}/visualizations"
        viz_count = create_visualizations(df, viz_dir)
        
        if viz_count > 0:
            print(f"\n✓ Created {viz_count} visualization files")
            print(f"  Location: {viz_dir}")
    
    # Print report
    print_comprehensive_report(df)
    
    # Column organization
    print("\n" + "=" * 80)
    print("COLUMN ORGANIZATION")
    print("=" * 80)
    
    if metadata_df is not None:
        original_metadata_cols = metadata_df.columns.tolist()
        orig_cols = [c for c in df.columns if c in original_metadata_cols]
        analysis_cols = [c for c in df.columns if c.startswith('ANALYSIS_')]
        
        print(f"\n✓ Original Metadata Columns ({len(orig_cols)}) - ORDER PRESERVED:")
        for i, col in enumerate(orig_cols[:15], 1):
            print(f"  {i:2d}. {col}")
        if len(orig_cols) > 15:
            print(f"  ... and {len(orig_cols) - 15} more")
        
        print(f"\n✓ New Analysis Columns ({len(analysis_cols)}) - ADDED AT THE END:")
        for i, col in enumerate(analysis_cols[:15], 1):
            print(f"  {i:2d}. {col}")
        if len(analysis_cols) > 15:
            print(f"  ... and {len(analysis_cols) - 15} more")
    else:
        orig_cols = [c for c in df.columns if not c.startswith('ANALYSIS_')]
        analysis_cols = [c for c in df.columns if c.startswith('ANALYSIS_')]
        
        print(f"\nOriginal Columns ({len(orig_cols)}):")
        for col in orig_cols[:10]:
            print(f"  - {col}")
        if len(orig_cols) > 10:
            print(f"  ... and {len(orig_cols) - 10} more")
        
        print(f"\nNew Analysis Columns ({len(analysis_cols)}):")
        for col in analysis_cols[:10]:
            print(f"  - {col}")
        if len(analysis_cols) > 10:
            print(f"  ... and {len(analysis_cols) - 10} more")
    
    print("\n" + "=" * 80)
    print("OUTPUT FILES SUMMARY")
    print("=" * 80)
    
    print("\nGenerated Files:")
    print(f"\n1. Main Analysis Results")
    print(f"   Path: {OUTPUT_FILE}")
    print(f"   Info: {len(df):,} sequences × {len(df.columns)} columns")
    print(f"   Structure: [Original Metadata] + [ANALYSIS_* columns]")
    
    print(f"\n2. Summary Statistics")
    print(f"   Path: {OUTPUT_SUMMARY}")
    
    print(f"\n3. Codon Usage Table")
    print(f"   Path: {OUTPUT_CODON_USAGE}")
    
    print(f"\n4. Amino Acid Composition")
    print(f"   Path: {OUTPUT_AA_COMPOSITION}")
    
    print(f"\n5. Motif Detection Summary")
    print(f"   Path: {OUTPUT_MOTIF_SUMMARY}")
    
    print(f"\n6. High Quality FASTA")
    print(f"   Path: {OUTPUT_FASTA_HIGH_QUALITY}")
    
    print(f"\n7. All Corrected FASTA")
    print(f"   Path: {OUTPUT_FASTA_CORRECTED}")
    
    if CREATE_VISUALIZATIONS and VISUALIZATION_AVAILABLE:
        print(f"\n8. Visualizations")
        print(f"   Path: {OUTPUT_DIR}/visualizations/")
    
    print("\n" + "=" * 80)
    print("✓ ANALYSIS COMPLETE!")
    print("=" * 80)
    
    print(f"\nTotal runtime: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("\n📊 Output file structure:")
    print("   - Original metadata columns in their ORIGINAL ORDER")
    print("   - Analysis columns (ANALYSIS_*) appended at the END")
    print("   - Ready for abundance-based classification!")
    print("=" * 80 + "\n")

if __name__ == "__main__":
    main()

COMPLETE INSECT COI ANALYSIS PIPELINE - UNIFIED v11.1
Preserves Original Metadata + Adds Analysis Results

🆕 NEW IN v11.1:
  ✓ Preserves ALL original metadata columns IN ORIGINAL ORDER
  ✓ Analysis results prefixed with 'ANALYSIS_' added at the END
  ✓ Easy to identify new vs original data
  ✓ Compatible with downstream abundance analysis

Configuration:
  Metadata file: /Users/sarawut/Desktop/Manuscript_ASV_selection/raw_data/ASV_Authentication_Results_030925.csv
  FASTA file: /Users/sarawut/Desktop/Manuscript_ASV_selection/raw_data/ASV_table_sequences_66595.fasta
  Output directory: /Users/sarawut/Desktop/Manuscript_ASV_selection/data_analysis/sequences_analysis

Analysis Features:
  ✓ ORF detection (6 frames)
  ✓ Frameshift correction
  ✓ 10 DNA-level COI motifs
  ✓ 16 Protein-level COI patterns
  ✓ 3 Conserved regions
  ✓ Insect mitochondrial genetic code (TGA=W, ATA=M)
  ✓ Nucleotide composition analysis
  ✓ Protein property analysis
  ✓ Quality scoring and grading
  ✓ COI confide

  Processing sequences: 100%|██████████| 66595/66595 [00:58<00:00, 1138.19it/s]



  Completed: 66,595 unique sequences analyzed
  Analysis results stored for 66,595 ASV_IDs

  Analysis DataFrame: 66,595 rows × 100 columns

Merging analysis results back to original metadata...
  Original metadata rows: 175,955
  After merge: 175,955 rows × 149 columns
  ✓ Row count preserved: 175,955 rows

  ✓ 23,138 ASV_IDs appear multiple times
     (same analysis results copied to all rows with same ASV_ID)
     Max duplicates: 1087 times (ASV: uniq17)

  Final column organization:
    - Original metadata (preserved order): 50
    - Analysis columns (at the end): 99
    - Total: 149

Processing complete!
  Total sequences: 175,955
  Total columns: 149
    - Original metadata (preserved order): 50
    - Analysis columns (at the end): 99

SAVING RESULTS

✓ Main results: /Users/sarawut/Desktop/Manuscript_ASV_selection/data_analysis/sequences_analysis/ASV_Complete_Analysis.csv
  175,955 rows × 149 columns
  Column order: Original metadata → Analysis results

Generating summary statis

  Processing: 100%|██████████| 175113/175113 [00:07<00:00, 22123.47it/s]


  Analyzed 64 unique codons from 175,113 sequences

✓ Codon usage table: /Users/sarawut/Desktop/Manuscript_ASV_selection/data_analysis/sequences_analysis/Codon_Usage_Table.csv

Generating amino acid composition table...
  Analyzing 175,113 PASS sequences


  Processing: 100%|██████████| 175113/175113 [00:03<00:00, 47464.05it/s]


  Total amino acids analyzed: 23,977,528
  Unique amino acids found: 20

✓ Amino acid composition: /Users/sarawut/Desktop/Manuscript_ASV_selection/data_analysis/sequences_analysis/AA_Composition.csv

Generating motif summary...
  Generated summary for 29 motifs

✓ Motif summary: /Users/sarawut/Desktop/Manuscript_ASV_selection/data_analysis/sequences_analysis/Motif_Analysis_Summary.csv

EXPORTING FASTA FILES

Exporting FASTA file: /Users/sarawut/Desktop/Manuscript_ASV_selection/data_analysis/sequences_analysis/ASV_High_Quality_Sequences.fasta
  Filtering by priority: ['Priority_1_Highest', 'Priority_2_High']
  No sequences to export

Exporting FASTA file: /Users/sarawut/Desktop/Manuscript_ASV_selection/data_analysis/sequences_analysis/ASV_Corrected_Sequences.fasta
  Using all PASS sequences
  Exporting 175,113 sequences...
  Exported to: /Users/sarawut/Desktop/Manuscript_ASV_selection/data_analysis/sequences_analysis/ASV_Corrected_Sequences.fasta
  Total sequences: 175,113

CREATING VIS