# In [14]:
"""
Metabarcoding Analysis Pipeline
Clean version with visualization components removed, reporting functionality maintained
"""

# Standard libraries
import os
import sys
import glob
import pickle
import json
import warnings
import traceback
from datetime import datetime
import psutil
from typing import Dict, List, Optional, Tuple, Any, Union

import numpy as np
import pandas as pd
from tqdm import tqdm
from Bio import SeqIO
from scipy import stats
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    roc_curve, 
    auc, 
    precision_recall_curve,
    accuracy_score, 
    precision_score, 
    recall_score, 
    f1_score,
    roc_auc_score,
    matthews_corrcoef,
    balanced_accuracy_score
)
from sklearn.inspection import permutation_importance

# For report generation
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter, landscape
from pathlib import Path

# Suppress warnings
warnings.filterwarnings('ignore')

class OutputManager:
    """Owns the directory layout used for pipeline outputs."""

    def __init__(self, base_path: str, create_dirs: bool = True):
        """
        Remember the base directory and build the category -> path table.

        Args:
            base_path (str): Root directory under which outputs are stored
            create_dirs (bool): When True, the category directories are
                created on disk immediately
        """
        self.base_path = Path(base_path)
        self.paths = self._setup_paths(create_dirs)

    def _setup_paths(self, create_dirs: bool) -> Dict[str, Path]:
        """
        Build the mapping of output categories to directories.

        Args:
            create_dirs (bool): When True, create every directory
                (including missing parents)

        Returns:
            Dict[str, Path]: One entry each for 'reports', 'data' and 'temp'
        """
        layout = {
            category: self.base_path / category
            for category in ('reports', 'data', 'temp')
        }

        if not create_dirs:
            return layout

        for directory in layout.values():
            directory.mkdir(parents=True, exist_ok=True)
        return layout

    def get_path(self, category: str) -> Path:
        """
        Look up the directory registered for an output category.

        Args:
            category (str): Output category name

        Returns:
            Path: The category directory, or the base path when the
            category is not registered
        """
        try:
            return self.paths[category]
        except KeyError:
            # Unknown categories fall back to the base directory.
            return self.base_path

class MemoryMonitor:
    """Monitors system memory usage through ``psutil.virtual_memory()``."""

    @staticmethod
    def check_memory() -> Tuple[float, float]:
        """
        Check current memory usage

        Returns:
            Tuple[float, float]: Used memory (GB), Available memory (GB).
            Falls back to ``(0.0, inf)`` when the psutil call raises
            ImportError or AttributeError.
        """
        try:
            memory = psutil.virtual_memory()
            used_gb = memory.used / (1024 ** 3)
            available_gb = memory.available / (1024 ** 3)
            return used_gb, available_gb
        except (ImportError, AttributeError):
            # Report "unlimited" available memory so callers never warn
            # when the measurement itself is unavailable.
            return 0.0, float('inf')

    @staticmethod
    def memory_warning(threshold_gb: float = 1.0) -> bool:
        """
        Check if available memory is below threshold

        Args:
            threshold_gb (float): Memory threshold in GB

        Returns:
            bool: True if available memory is below the threshold; False
            when memory cannot be measured
        """
        try:
            _, available_gb = MemoryMonitor.check_memory()
            return available_gb < threshold_gb
        except Exception:
            # Best effort: if we can't check memory, assume we're ok.
            # ``Exception`` (not a bare except) so that KeyboardInterrupt
            # and SystemExit still propagate.
            return False

    @staticmethod
    def get_memory_info() -> dict:
        """
        Get detailed memory information

        Returns:
            dict: 'total_gb', 'available_gb', 'used_gb' and 'percent_used';
            all values are 0.0 when memory cannot be measured
        """
        try:
            memory = psutil.virtual_memory()
            return {
                'total_gb': memory.total / (1024 ** 3),
                'available_gb': memory.available / (1024 ** 3),
                'used_gb': memory.used / (1024 ** 3),
                'percent_used': memory.percent
            }
        except Exception:
            # Same best-effort policy as memory_warning, without trapping
            # interpreter-exit exceptions.
            return {
                'total_gb': 0.0,
                'available_gb': 0.0,
                'used_gb': 0.0,
                'percent_used': 0.0
            }

# Main ASV Analyzer class
class IntegratedASVAnalyzer:
    """Integrated system for ASV analysis"""
    
    def __init__(self, output_dir: str = "asv_analysis_output"):
        """
        Check dependencies, then set up configuration, empty state and the
        output directory.

        Args:
            output_dir (str): Directory for outputs; created if missing

        Raises:
            ImportError: Propagated from the environment check
        """
        # Dependency check comes first so nothing is touched on failure.
        self._validate_environment()

        # --- Tunable analysis settings -------------------------------
        threshold_settings = {
            'memory_gb': 0.5,
            'correlation': 0.8,
            'feature_importance': 0.01,
        }
        self.params = {
            'random_state': 42,
            'test_size': 0.2,
            'cv_folds': 5,
            'chunk_size': 1000,
            'n_bootstrap': 1000,
            'threshold': threshold_settings,
        }

        # --- Category colours (kept for data categorization only) ----
        self.color_scheme = {
            'main': '#00008B',                       # Dark Blue
            'candidate': '#FF8C00',                  # Dark Orange
            'authenticated': '#007000',              # Green
            'unauthenticated': '#D2222D',            # Red
            'main-authenticated': '#06d6a0',         # Blue-Green
            'main-unauthenticated': '#ffd166',       # Yellow
            'candidate-authenticated': '#118ab2',    # Blue
            'candidate-unauthenticated': '#ef476f',  # Pink
        }

        # --- Loaded data (filled in by the loading step) -------------
        self.df = self.sequences = None

        # --- Model slots and the shared scaler -----------------------
        self.models = dict(
            main_candidate=None,
            authentication=None,
            scaler=StandardScaler(),
        )

        # --- Feature columns used per task ---------------------------
        # 'percentage' is deliberately left out of both lists, and
        # 'normalized_read_count' is left out of the first one because of
        # its correlation with 'log_read'.
        main_candidate_features = [
            'log_read',
            'rank_read',
            'log_group_size',       # log-transformed group size
            'group_size_quantile',  # quantile-based group size
        ]
        authentication_features = [
            'read_count',
            'total_read',
            'nearest_main_dist',
            'nearest_cand_dist',
            'dist_ratio',
            'log_group_size',       # log-transformed group size
        ]
        self.features = {
            'main_candidate': main_candidate_features,
            'authentication': authentication_features,
        }

        # --- Decision thresholds (unset until computed) --------------
        self.thresholds = {
            'main_candidate': None,
            'authentication': dict.fromkeys(('main', 'candidate')),
        }

        # --- Performance metric containers ---------------------------
        self.metrics = {
            'main_candidate': {},
            'authentication': {},
            'advanced': dict.fromkeys(
                ('mcc', 'stability', 'confidence_intervals')
            ),
        }

        # --- Output location -----------------------------------------
        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)

    def _validate_environment(self) -> None:
        """Validate execution environment and dependencies"""
        required_libs = ['pandas', 'numpy', 'Bio', 'sklearn']
        missing_libs = []
        for lib in required_libs:
            try:
                __import__(lib)
            except ImportError:
                missing_libs.append(lib)
        if missing_libs:
            raise ImportError(f"Missing required libraries: {', '.join(missing_libs)}")

    def run_complete_analysis(self, metadata_path: str, fasta_path: str, 
                            output_path: str) -> bool:
        """
        Run complete ASV analysis pipeline with progress tracking
        
        Args:
            metadata_path (str): Path to metadata Excel file
            fasta_path (str): Path to FASTA sequence file
            output_path (str): Path for output files
            
        Returns:
            bool: Success status of analysis
        """
        try:
            print("\n=== STARTING COMPLETE ASV ANALYSIS ===")
            print("=" * 50)
            
            # Validate input files
            self._validate_input_files(metadata_path, fasta_path)
            
            # Track progress with tqdm
            steps = [
                ("Loading and preprocessing data", self._load_and_preprocess_data),
                ("Main/Candidate classification", self.analyze_main_candidate),
                ("Authentication analysis", self.analyze_authentication),
                ("Exporting results", self.export_results)
            ]
            
            for step_name, step_func in tqdm(steps, desc="Analysis Progress"):
                print(f"\n=== {step_name.upper()} ===")
                if step_name == "Loading and preprocessing data":
                    success = step_func(metadata_path, fasta_path)
                elif step_name == "Exporting results":
                    success = step_func(output_path)
                else:
                    success = step_func()
                    
                if not success:
                    print(f"Error in {step_name}")
                    return False
            
            print("\n=== ANALYSIS COMPLETED SUCCESSFULLY ===")
            return True
            
        except Exception as e:
            print(f"\nError in analysis pipeline: {str(e)}")
            print("\nFull error details:")
            print(traceback.format_exc())
            return False

    def _validate_input_files(self, metadata_path: str, fasta_path: str) -> None:
        """
        Validate input file existence and format
        
        Args:
            metadata_path (str): Path to metadata file
            fasta_path (str): Path to FASTA file
        
        Raises:
            FileNotFoundError: If files don't exist
            ValueError: If file formats are invalid
        """
        # Check file existence
        for path, name in [(metadata_path, "Metadata"), (fasta_path, "FASTA")]:
            if not os.path.exists(path):
                raise FileNotFoundError(f"{name} file not found: {path}")
        
        # Validate metadata file format
        if not metadata_path.endswith(('.xlsx', '.xls', '.csv')):
            raise ValueError("Metadata file must be Excel or CSV format")
            
        # Validate FASTA file
        try:
            with open(fasta_path) as f:
                first_line = f.readline()
                if not first_line.startswith('>'):
                    raise ValueError("Invalid FASTA file format")
        except Exception as e:
            raise ValueError(f"Error reading FASTA file: {str(e)}")

    def _load_and_preprocess_data(self, metadata_path: str, fasta_path: str) -> bool:
        """
        Load and preprocess data with improved error handling and memory management
        
        Args:
            metadata_path (str): Path to metadata file
            fasta_path (str): Path to FASTA file
            
        Returns:
            bool: Success status
        """
        try:
            print("\n=== LOADING AND PREPROCESSING DATA ===")
            print("-" * 50)
            
            # Load metadata with progress bar
            print("\nLoading metadata...")
            
            # Check if the file is space-delimited or standard format
            if metadata_path.endswith('.xlsx'):
                self.df = pd.read_excel(metadata_path)
            elif metadata_path.endswith('.csv'):
                self.df = pd.read_csv(metadata_path)
            else:
                # Try to load as space-delimited text file
                try:
                    self.df = pd.read_csv(metadata_path, sep=r'\s+', engine='python')
                    print("Loaded file as space-delimited text.")
                except:
                    # If that fails, try standard CSV loading
                    self.df = pd.read_csv(metadata_path)
            
            # Print columns to verify
            print("\nAvailable columns in your data:")
            print(self.df.columns.tolist())
                
            initial_count = len(self.df)
            
            # Memory check
            if MemoryMonitor.memory_warning():
                print("Warning: High memory usage detected")
            
            # Filter outgroups if we have the required columns
            if 'step1' in self.df.columns and 'step2' in self.df.columns:
                # Remove outgroups with progress tracking
                print("\nFiltering outgroups...")
                original_shape = self.df.shape
                self.df = self.df[
                    (self.df['step1'] != 'outgroup') & 
                    (self.df['step2'] != 'outgroup')
                ].reset_index(drop=True)
                
                filtered_count = original_shape[0] - len(self.df)
                print(f"Removed {filtered_count} outgroup entries")
            else:
                print("\nSkipping outgroup filtering - required columns not found.")
            
            # Load sequences with progress tracking
            print("\nLoading sequences...")
            with tqdm(total=os.path.getsize(fasta_path), 
                    desc="Reading FASTA", unit='B', unit_scale=True) as pbar:
                self.sequences = {}
                for record in SeqIO.parse(fasta_path, "fasta"):
                    self.sequences[record.id] = record
                    pbar.update(len(str(record).encode('utf-8')))
            
            # Create the basic features
            print("\nCreating basic features...")
            
            # Basic features that need to be created before further processing
            # Log transformations
            self.df['log_read'] = np.log1p(self.df['read_count'])
            
            # Proportions
            self.df['read_proportion'] = self.df['read_count'] / self.df['total_read']
            self.df['sample_proportion'] = self.df['count_read'] / self.df['sample_count']
            
            # Normalized counts
            self.df['normalized_read_count'] = stats.zscore(self.df['read_count'])
            
            # Basic ratios
            self.df['dist_ratio'] = self.df['nearest_main_dist'] / self.df['nearest_cand_dist']
            
            # Check for infinite values
            inf_mask = np.isinf(self.df['dist_ratio'])
            if inf_mask.any():
                print(f"Warning: {inf_mask.sum()} infinite values in distance ratio")
                self.df.loc[inf_mask, 'dist_ratio'] = np.nan
                
            # Now proceed with full feature creation
            try:
                print("\nCreating advanced features...")
                self._create_features()
                
                # Validate data
                self._validate_data()
                
                # Print summary
                self._print_data_summary(initial_count)
            except Exception as e:
                print(f"Warning in feature creation: {str(e)}")
                print("Will proceed with basic features only.")
            
            return True
            
        except Exception as e:
            print(f"Error in data loading: {str(e)}")
            print(traceback.format_exc())
            return False

    def _validate_data(self) -> None:
        """
        Validate loaded data integrity
        
        Raises:
            ValueError: If data validation fails
        """
        # Check for required columns
        required_columns = [
            'asv_id', 'project_readfile_id', 'read_count', 
            'total_read', 'count_read', 'percentage',
            'nearest_main_dist', 'nearest_cand_dist',
            'family_tree', 'subfamily_tree', 'step1', 'step2'
        ]
        
        missing_columns = [col for col in required_columns if col not in self.df.columns]
        if missing_columns:
            raise ValueError(f"Missing required columns: {missing_columns}")
            
        # Check for null values in critical columns
        null_counts = self.df[required_columns].isnull().sum()
        if null_counts.any():
            print("\nWarning: Null values found in columns:")
            print(null_counts[null_counts > 0])
            
        # Validate sequence IDs
        missing_sequences = set(self.df['asv_id']) - set(self.sequences.keys())
        if missing_sequences:
            raise ValueError(f"Missing sequences for {len(missing_sequences)} ASVs")
            
        # Validate numeric columns
        numeric_columns = ['read_count', 'total_read', 'count_read', 'percentage']
        for col in numeric_columns:
            if not pd.to_numeric(self.df[col], errors='coerce').notnull().all():
                raise ValueError(f"Non-numeric values found in {col}")

    def _create_feature_summary(self) -> pd.DataFrame:
        """
        Create summary statistics for features
        
        Returns:
            pd.DataFrame: Feature summary statistics
        """
        all_features = (
            self.features['main_candidate'] + 
            self.features['authentication']
        )
        
        summary_stats = self.df[all_features].describe()
        
        # Add additional statistics
        summary_stats.loc['skew'] = self.df[all_features].skew()
        summary_stats.loc['kurtosis'] = self.df[all_features].kurtosis()
        summary_stats.loc['missing'] = self.df[all_features].isnull().sum()
        
        return summary_stats

    def _print_data_summary(self, initial_count: int) -> None:
        """
        Print comprehensive data summary
        
        Args:
            initial_count (int): Initial number of ASVs
        """
        print("\nData Summary:")
        print(f"Initial ASVs: {initial_count}")
        print(f"ASVs after filtering: {len(self.df)}")
        
        # Print distributions
        print("\nStep1 Distribution:")
        step1_dist = self.df['step1'].value_counts()
        print(step1_dist.to_string())
        
        print("\nStep2 Distribution:")
        step2_dist = self.df['step2'].value_counts()
        print(step2_dist.to_string())
        
        # Print feature summary
        print("\nFeature Summary:")
        feature_summary = self._create_feature_summary()
        print(feature_summary.round(2).to_string())

    def _create_features(self) -> None:
        """
        Run each feature-building stage in order, tolerating stage failures.

        A failing stage is reported and skipped; the feature summary is
        attempted afterwards regardless. Errors raised outside the
        per-stage guards are printed and re-raised.
        """
        try:
            print("\nCreating and validating features...")

            # Insertion order defines execution order.
            stage_table = {
                "Creating basic features": self._create_basic_features,
                "Creating taxonomic features": self._create_taxonomic_features,
                "Creating abundance features": self._create_abundance_features,
                "Creating distance features": self._create_distance_features,
                "Creating quality metrics": self._create_quality_metrics,
                "Cleaning features": self._clean_features,
                "Validating features": self._validate_features,
            }

            for label, run_stage in tqdm(stage_table.items(), desc="Feature Creation"):
                try:
                    print(f"\n{label}...")
                    run_stage()
                except Exception as e:
                    # Best effort: later stages still get their chance.
                    print(f"Error in {label.lower()}: {str(e)}")
                    print("Attempting to continue with analysis...")

            # Print feature summary even if some steps failed
            try:
                self._print_feature_summary()
            except Exception as e:
                print(f"Error in feature summary: {str(e)}")

        except Exception as e:
            print(f"Error in feature creation: {str(e)}")
            raise

    def _create_basic_features(self) -> None:
        """Create basic transformation features with improved robust transformations"""
        print("Creating basic features...")
        
        # Log transformations
        self.df['log_read'] = np.log1p(self.df['read_count'])
        
        # Rank-based transformation (extremely robust to outliers)
        self.df['rank_read'] = self.df['read_count'].rank(pct=True)
        
        # Proportions
        self.df['read_proportion'] = self.df['read_count'] / self.df['total_read']
        self.df['sample_proportion'] = self.df['count_read'] / self.df['sample_count']
        
        # Normalized counts
        self.df['normalized_read_count'] = stats.zscore(self.df['read_count'])
        
        # Basic ratios
        self.df['dist_ratio'] = self.df['nearest_main_dist'] / self.df['nearest_cand_dist']
        
        # Check for infinite values
        inf_mask = np.isinf(self.df['dist_ratio'])
        if inf_mask.any():
            print(f"Warning: {inf_mask.sum()} infinite values in distance ratio")
            self.df.loc[inf_mask, 'dist_ratio'] = np.nan
            
        # Add group size as a feature to account for variation
        group_sizes = self.df.groupby('project_readfile_id').size()
        
        # Use a more robust approach for group size normalization
        self.df['group_size'] = self.df['project_readfile_id'].map(group_sizes)
        self.df['log_group_size'] = np.log1p(self.df['group_size'])
        
        # Use quantile-based normalization instead of simple division
        quantiles = pd.qcut(group_sizes, 4, labels=False, duplicates='drop')
        self.df['group_size_quantile'] = self.df['project_readfile_id'].map(
            dict(zip(group_sizes.index, quantiles))
        )
        
        # Create a more robust group size factor
        mean_group_size = group_sizes.mean()
        self.df['group_size_factor'] = np.log1p(self.df['group_size'] / mean_group_size)

    def _create_taxonomic_features(self) -> None:
        """Create taxonomy-based features"""
        try:
            print("Creating taxonomic features...")
            
            # Calculate taxonomic depth
            self.df['taxonomic_depth'] = self.df.apply(
                lambda row: len(str(row['family_tree']).split(';')) + 
                        len(str(row['subfamily_tree']).split(';')),
                axis=1
            )
            
            # Calculate family-level statistics - use groupby.agg for better performance
            family_stats = self.df.groupby('family_tree', as_index=False).agg({
                'read_count': ['mean', 'std', 'count'],
                'count_read': ['mean', 'std']
            })
            
            # Flatten column names
            family_stats.columns = ['family_tree', 'read_count_mean', 'read_count_std', 
                                'read_count_count', 'count_read_mean', 'count_read_std']
            
            # Merge stats back to main dataframe
            self.df = self.df.merge(family_stats, on='family_tree', how='left')
            
            # Rename columns for clarity
            self.df.rename(columns={
                'read_count_mean': 'family_abundance_mean',
                'read_count_std': 'family_abundance_std',
                'count_read_mean': 'family_prevalence'
            }, inplace=True)
            
            # Calculate taxonomic similarity
            self._calculate_taxonomic_similarity()
            
            # Fill any missing values
            numeric_columns = self.df.select_dtypes(include=[np.number]).columns
            self.df[numeric_columns] = self.df[numeric_columns].fillna(0)
            
        except Exception as e:
            print(f"Error in feature creation: {str(e)}")
            raise

    def _calculate_taxonomic_similarity(self) -> None:
        """Calculate taxonomic similarity scores between ASVs"""
        try:
            # Get family and subfamily info for main ASVs
            main_tax_info = (
                self.df[self.df['step1'] == 'main']
                [['asv_id', 'family_tree', 'subfamily_tree']]
                .drop_duplicates('asv_id')  # Remove duplicates
                .set_index('asv_id')
            )
            
            # Create mapping dictionaries
            main_family_map = main_tax_info['family_tree'].to_dict()
            main_subfamily_map = main_tax_info['subfamily_tree'].to_dict()
            
            # Map taxonomic info using dictionaries
            self.df['nearest_main_family'] = self.df['nearest_main_ASV'].map(main_family_map)
            self.df['nearest_main_subfamily'] = self.df['nearest_main_ASV'].map(main_subfamily_map)
            
            # Calculate similarity score
            self.df['taxonomic_similarity'] = self.df.apply(
                lambda row: (
                    (int(row['family_tree'] == row['nearest_main_family']) +
                    int(row['subfamily_tree'] == row['nearest_main_subfamily'])) / 2
                ),
                axis=1
            )
            
            # Fill missing values
            self.df['taxonomic_similarity'].fillna(0, inplace=True)
        
        except Exception as e:
            print(f"Error in taxonomic similarity calculation: {str(e)}")
            # Set default values if calculation fails
            self.df['nearest_main_family'] = self.df['family_tree']
            self.df['nearest_main_subfamily'] = self.df['subfamily_tree']
            self.df['taxonomic_similarity'] = 0

    def _create_abundance_features(self) -> None:
        """Create abundance-based features with improved handling of normalization"""
        try:
            print("Creating abundance features...")
            
            # Group-level statistics
            group_stats = self.df.groupby('project_readfile_id').agg({
                'read_count': ['sum', 'mean', 'std', 'count'],  # Added count for normalization
                'count_read': ['mean', 'std']
            })
            
            # Calculate relative abundances
            def calculate_group_abundance(row):
                group_sum = group_stats.loc[row['project_readfile_id'], ('read_count', 'sum')]
                return row['read_count'] / group_sum
            
            self.df['relative_abundance'] = self.df.apply(calculate_group_abundance, axis=1)
            
            # Calculate abundance z-scores within groups with outlier handling
            self.df['abundance_zscore'] = self.df.groupby('project_readfile_id')['read_count'].transform(
                lambda x: (x - x.median()) / (x.quantile(0.75) - x.quantile(0.25)) if len(x) > 1 else 0
            )
            
            # Add normalization by group size to address high group size variation
            self.df['normalized_by_group_size'] = self.df.groupby('project_readfile_id')['read_count'].transform(
                lambda x: x / x.sum() if x.sum() > 0 else 0
            )
            
            # Add within-group rank for robust comparative analysis
            self.df['within_group_rank'] = self.df.groupby('project_readfile_id')['read_count'].transform(
                lambda x: x.rank(pct=True)
            )
            
            # Add more robust within-group normalization
            # For each group, normalize features relative to the group's own statistics
            for group_id, group_df in self.df.groupby('project_readfile_id'):
                # Calculate group-specific metrics
                group_median = group_df['read_count'].median()
                group_iqr = group_df['read_count'].quantile(0.75) - group_df['read_count'].quantile(0.25)
                
                # Avoid division by zero
                if group_iqr == 0:
                    group_iqr = 1
                    
                # Create robust normalized features within the group
                self.df.loc[group_df.index, 'robust_group_norm'] = (
                    (group_df['read_count'] - group_median) / group_iqr
                )
            
            # Calculate quartiles with improved handling for duplicate values
            def safe_qcut(x):
                try:
                    if len(x.unique()) >= 4:
                        return pd.qcut(x, q=4, labels=['Q1', 'Q2', 'Q3', 'Q4'], duplicates='drop')
                    else:
                        # If we can't create 4 quartiles, create fewer bins
                        unique_values = len(x.unique())
                        labels = [f'Q{i+1}' for i in range(unique_values)]
                        return pd.qcut(x, q=unique_values, labels=labels, duplicates='drop')
                except ValueError:
                    # Fallback if qcut fails
                    return pd.Series(['Q1'] * len(x), index=x.index)

            self.df['abundance_quartile'] = self.df.groupby('project_readfile_id')['read_count'].transform(safe_qcut)
            
            # Calculate prevalence metrics
            total_samples = self.df['sample_count'].max()
            self.df['prevalence_ratio'] = self.df['count_read'] / total_samples
            
            # Calculate co-occurrence within groups
            group_sizes = self.df.groupby('project_readfile_id').size()
            self.df['group_presence_ratio'] = self.df.apply(
                lambda row: row['count_read'] / group_sizes[row['project_readfile_id']],
                axis=1
            )
            
            # Fill any missing values
            numeric_columns = ['relative_abundance', 'abundance_zscore', 'prevalence_ratio', 
                            'group_presence_ratio', 'normalized_by_group_size', 'within_group_rank',
                            'robust_group_norm']
            for col in numeric_columns:
                if col in self.df.columns:
                    self.df[col] = self.df[col].fillna(0)
            
            print("Abundance features created successfully")
            
        except Exception as e:
            print(f"Error in abundance feature creation: {str(e)}")
            # Set default values if calculation fails
            default_features = ['relative_abundance', 'abundance_zscore', 'abundance_quartile',
                            'prevalence_ratio', 'group_presence_ratio', 'normalized_by_group_size',
                            'within_group_rank', 'robust_group_norm']
            for feature in default_features:
                if feature not in self.df.columns:
                    self.df[feature] = 0
            print("Set default values for abundance features")

    def _create_distance_features(self) -> None:
        """Create distance-based features"""
        print("Creating distance features...")
        
        # Basic distance metrics
        self.df['distance_sum'] = self.df['nearest_main_dist'] + self.df['nearest_cand_dist']
        self.df['distance_product'] = self.df['nearest_main_dist'] * self.df['nearest_cand_dist']
        
        # Relative distances within groups
        self.df['relative_main_dist'] = self.df.groupby('project_readfile_id')['nearest_main_dist'].transform(
            lambda x: x / x.mean()
        )
        self.df['relative_cand_dist'] = self.df.groupby('project_readfile_id')['nearest_cand_dist'].transform(
            lambda x: x / x.mean()
        )
        
        # Distance ratios and normalized distances
        self.df['log_distance_ratio'] = np.log1p(self.df['dist_ratio'])
        self.df['normalized_main_dist'] = stats.zscore(self.df['nearest_main_dist'])
        self.df['normalized_cand_dist'] = stats.zscore(self.df['nearest_cand_dist'])

    def _create_quality_metrics(self) -> None:
        """Create quality and reliability metrics"""
        print("Creating quality metrics...")
        
        # Quality scores
        self.df['quality_score'] = self._calculate_quality_scores()
        self.df['reliability_score'] = self._calculate_reliability_scores()
        
        # Confidence metrics
        self.df['classification_confidence'] = self._calculate_classification_confidence()
        
        # Sequence complexity
        if hasattr(self, 'sequences'):
            self.df['sequence_complexity'] = self.df['asv_id'].apply(
                lambda x: self._calculate_sequence_complexity(x)
            )

    def _calculate_quality_scores(self) -> pd.Series:
        """
        Calculate comprehensive quality scores
        
        Returns:
            pd.Series: Quality scores for each ASV
        """
        # Component scores
        read_score = stats.zscore(self.df['read_count'])
        coverage_score = stats.zscore(self.df['count_read'] / self.df['sample_count'])
        abundance_score = stats.zscore(self.df['percentage'])
        distance_score = -stats.zscore(self.df['distance_sum'])
        
        # Combine scores with weights
        quality_scores = (
            0.3 * read_score +
            0.3 * coverage_score +
            0.2 * abundance_score +
            0.2 * distance_score
        )
        
        # Normalize to 0-1 range
        quality_scores = (quality_scores - quality_scores.min()) / (
            quality_scores.max() - quality_scores.min()
        )
        
        return quality_scores

    def _calculate_reliability_scores(self) -> pd.Series:
        """
        Calculate reliability scores
        
        Returns:
            pd.Series: Reliability scores for each ASV
        """
        # Component scores
        abundance_reliability = self.df['read_count'] / self.df['total_read']
        coverage_reliability = self.df['count_read'] / self.df['sample_count']
        distance_reliability = 1 / (1 + self.df['distance_sum'])
        taxonomic_reliability = self.df['taxonomic_depth'] / self.df['taxonomic_depth'].max()
        
        # Combine scores
        reliability_scores = (
            0.3 * abundance_reliability +
            0.3 * coverage_reliability +
            0.2 * distance_reliability +
            0.2 * taxonomic_reliability
        )
        
        # Normalize
        reliability_scores = (reliability_scores - reliability_scores.min()) / (
            reliability_scores.max() - reliability_scores.min()
        )
        
        return reliability_scores

    def _calculate_classification_confidence(self) -> pd.Series:
        """
        Calculate classification confidence scores
        
        Returns:
            pd.Series: Confidence scores for each ASV
        """
        # Component confidences
        abundance_conf = stats.zscore(self.df['read_count'])
        coverage_conf = stats.zscore(self.df['count_read'])
        distance_conf = stats.zscore(1 / (1 + self.df['nearest_main_dist']))
        
        # Combine confidence scores
        confidence_scores = (
            0.4 * abundance_conf +
            0.3 * coverage_conf +
            0.3 * distance_conf
        )
        
        return confidence_scores

    def _calculate_sequence_complexity(self, asv_id: str) -> float:
        """
        Calculate sequence complexity score
        
        Args:
            asv_id (str): ASV identifier
            
        Returns:
            float: Sequence complexity score
        """
        if asv_id not in self.sequences:
            return np.nan
            
        sequence = str(self.sequences[asv_id].seq)
        
        # Calculate basic complexity metrics
        length = len(sequence)
        unique_bases = len(set(sequence))
        gc_content = sequence.count('G') + sequence.count('C')
        
        # Combine metrics
        complexity_score = (
            0.4 * (unique_bases / length) +
            0.3 * (gc_content / length) +
            0.3 * (1 - (sequence.count('N') / length))
        )
        
        return complexity_score

    def _clean_features(self) -> None:
        """
        Clean and validate all features, handling missing values and outliers.

        Processing order:
            1. Fill gaps in the reference-ASV, taxonomy and selected
               abundance/distance columns (mode, "Unknown", ``family_tree``,
               median or 0 respectively).
            2. Winsorize ``log_read`` and ``group_size_factor`` to the
               [Q1 - 2.0 * IQR, Q3 + 2.0 * IQR] fences.
            3. For every other feature listed in ``self.features``: replace
               +/-inf with NaN, fill NaN (column maximum when the name
               contains 'dist', median otherwise) and, when any |z-score|
               exceeds 4, cap values at the [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]
               fences.

        Filled columns are assigned back to ``self.df`` instead of calling
        ``self.df[col].fillna(..., inplace=True)``: under pandas Copy-on-Write
        that chained in-place form acts on a temporary object and leaves the
        frame unchanged. Quartiles are computed with ``np.nanpercentile`` so
        a stray NaN does not turn the fences into NaN.
        """
        print("Cleaning features...")

        # Get all features
        all_features = (
            self.features['main_candidate'] +
            self.features['authentication']
        )

        def has_gaps(column: str) -> bool:
            """Tell whether ``column`` exists and holds NaN, announcing the clean-up."""
            found = column in self.df.columns and self.df[column].isna().any()
            if found:
                print(f"Cleaning missing values in {column}")
            return found

        def iqr_fences(column: str, multiplier: float):
            """Return the (lower, upper) IQR fences of ``column``, ignoring NaN."""
            values = self.df[column]
            q1, q3 = np.nanpercentile(values, [25, 75])
            iqr = q3 - q1
            # The lower fence is never placed below the smallest observed value
            lower = max(q1 - multiplier * iqr, values.min())
            upper = q3 + multiplier * iqr
            return lower, upper

        # Handle reference ASV missing values: fill with the most common value
        for column in ('nearest_main_ASV', 'nearest_cand_ASV'):
            if has_gaps(column):
                modes = self.df[column].mode()
                # An all-NaN column has no mode; leave it untouched
                if not modes.empty:
                    self.df[column] = self.df[column].fillna(modes.iloc[0])

        # Handle taxonomic missing values
        if has_gaps('subfamily_tree'):
            self.df['subfamily_tree'] = self.df['subfamily_tree'].fillna("Unknown")

        if has_gaps('nearest_main_family'):
            self.df['nearest_main_family'] = self.df['nearest_main_family'].fillna(
                self.df['family_tree']
            )

        if has_gaps('nearest_main_subfamily'):
            self.df['nearest_main_subfamily'] = self.df['nearest_main_subfamily'].fillna("Unknown")

        # Handle missing values in abundance features
        if has_gaps('abundance_zscore'):
            self.df['abundance_zscore'] = self.df['abundance_zscore'].fillna(
                self.df['abundance_zscore'].median()
            )

        if has_gaps('relative_cand_dist'):
            self.df['relative_cand_dist'] = self.df['relative_cand_dist'].fillna(0)

        # Special handling for log_read and group_size_factor outliers:
        # both are always winsorized, using wider (2.0 * IQR) fences
        special_features = (
            ('log_read', "stricter winsorization"),
            ('group_size_factor', "winsorization"),
        )
        for column, label in special_features:
            if column not in self.df.columns:
                continue
            lower_bound, upper_bound = iqr_fences(column, 2.0)
            values = self.df[column]
            outliers_before = ((values < lower_bound) | (values > upper_bound)).sum()
            self.df[column] = np.clip(values, lower_bound, upper_bound)
            print(f"Applied {label} to {column} (capped {outliers_before} outliers)")

        # Clean each feature
        for feature in tqdm(all_features, desc="Cleaning features"):
            # Skip features we've already handled specially
            if feature in ('log_read', 'group_size_factor'):
                continue

            # Replace infinities
            self.df[feature] = self.df[feature].replace([np.inf, -np.inf], np.nan)

            # Handle missing values
            missing_count = self.df[feature].isna().sum()
            if missing_count > 0:
                print(f"Cleaning {missing_count} missing values in {feature}")
                if 'dist' in feature:
                    # A missing distance is treated as the largest one observed
                    fill_value = self.df[feature].max()
                else:
                    # Use median instead of 0 for better statistical properties
                    fill_value = self.df[feature].median()
                self.df[feature] = self.df[feature].fillna(fill_value)

            # Only winsorize when at least one value sits beyond 4 standard deviations
            z_scores = np.abs(stats.zscore(self.df[feature], nan_policy='omit'))
            outliers = z_scores > 4
            if outliers.any():
                print(f"Found {outliers.sum()} outliers in {feature}")

                # Cap extreme values at the fences without dropping any rows
                lower_bound, upper_bound = iqr_fences(feature, 1.5)
                self.df.loc[self.df[feature] < lower_bound, feature] = lower_bound
                self.df.loc[self.df[feature] > upper_bound, feature] = upper_bound
                
    def _validate_features(self) -> None:
        """Validate feature creation and cleaning"""
        # Check for remaining missing values
        missing_values = self.df[self.features['main_candidate'] + 
                            self.features['authentication']].isnull().sum()
        if missing_values.any():
            print("\nWarning: Missing values remain in features:")
            print(missing_values[missing_values > 0])
        
        # Check for infinite values
        inf_values = np.isinf(self.df[self.features['main_candidate'] + 
                                    self.features['authentication']]).sum()
        if inf_values.any():
            print("\nWarning: Infinite values remain in features:")
            print(inf_values[inf_values > 0])
            
        # Check for constant features
        for feature in self.features['main_candidate'] + self.features['authentication']:
            if self.df[feature].nunique() == 1:
                print(f"\nWarning: Feature '{feature}' has constant value")
        
        # Check value ranges
        for feature in self.features['main_candidate'] + self.features['authentication']:
            feature_range = self.df[feature].agg(['min', 'max'])
            if np.isinf(feature_range).any() or np.isnan(feature_range).any():
                print(f"\nWarning: Feature '{feature}' has invalid range: {feature_range.to_dict()}")
                
    def _print_feature_summary(self) -> None:
        """Print summary of created features"""
        print("\nFeature Summary:")
        print("-" * 50)
        
        # Get all numerical columns
        numeric_columns = self.df.select_dtypes(include=[np.number]).columns
        
        try:
            # Calculate and print basic statistics
            stats_df = self.df[numeric_columns].describe()
            print("\nBasic Statistics:")
            print(stats_df.round(3))
        except Exception as e:
            print(f"Error calculating basic statistics: {str(e)}")
        
        try:
            # Print missing values
            missing_values = self.df[numeric_columns].isnull().sum()
            if missing_values.any():
                print("\nMissing Values:")
                print(missing_values[missing_values > 0])
        except Exception as e:
            print(f"Error checking missing values: {str(e)}")
        
        try:
            # Print feature correlations with target variables
            print("\nFeature Correlations with Targets:")
            for feature in numeric_columns:
                try:
                    mc_corr = self.df[feature].corr(
                        (self.df['step1'] == 'main').astype(int)
                    )
                    auth_corr = self.df[feature].corr(
                        (self.df['step2'] == 'authenticated').astype(int)
                    )
                    print(f"{feature}:")
                    print(f"  Main/Candidate correlation: {mc_corr:.3f}")
                    print(f"  Authentication correlation: {auth_corr:.3f}")
                except Exception as e:
                    print(f"Error calculating correlations for {feature}: {str(e)}")
        except Exception as e:
            print(f"Error in correlation analysis: {str(e)}")
            
    def analyze_main_candidate(self) -> bool:
        """
        Perform main/candidate classification analysis.

        Runs four steps in order (data preparation, model selection, model
        evaluation, application to all data) and stops at the first step
        that reports failure. Exceptions are caught, printed with their
        traceback and reported as failure.

        Returns:
            bool: Success status
        """
        print("\n=== MAIN/CANDIDATE CLASSIFICATION ===")
        print("-" * 50)

        try:
            # (label, step callable, key of self.mc_data whose (X, y) pair the
            # step receives; None for steps that take no arguments)
            pipeline = [
                ("Preparing data", self._prepare_mc_data, None),
                ("Selecting best model", self._select_best_mc_model, 'train'),
                ("Evaluating model", self._evaluate_mc_model, 'test'),
                ("Applying classification", self._apply_mc_classification, None),
            ]

            for label, run_step, split_key in tqdm(pipeline, desc="Classification Progress"):
                print(f"\nExecuting: {label}")
                if split_key is None:
                    succeeded = run_step()
                else:
                    # self.mc_data is populated by the preparation step
                    features, targets = self.mc_data[split_key]
                    succeeded = run_step(features, targets)

                if not succeeded:
                    print(f"Error in {label}")
                    return False

            return True

        except Exception as e:
            print(f"Error in main/candidate analysis: {str(e)}")
            print(traceback.format_exc())
            return False
        
    def _prepare_mc_data(self) -> bool:
        """
        Prepare train/test data for main/candidate classification.

        Builds the feature matrix from ``self.features['main_candidate']``
        and a binary target (1 where ``step1 == 'main'``), splits them,
        scales both splits with a ``RobustScaler`` fitted on the training
        split, and stores the results in ``self.mc_data`` (keys ``'train'``
        and ``'test'``) and the scaler in ``self.models['scaler']``.

        When a ``group_size`` column exists the split is stratified on the
        combination of group-size quartile and class. If any such stratum
        has fewer than two rows (which ``train_test_split`` rejects), the
        split falls back to stratifying on the class alone.

        Returns:
            bool: True on success, False when any step raised.
        """
        try:
            print("\nPreparing classification data...")

            # Store data splits in instance variable
            self.mc_data = {}

            # Prepare features and target
            X = self.df[self.features['main_candidate']].copy()
            y = (self.df['step1'] == 'main').astype(int)

            # Default to class-only stratification; refine by group size when possible
            stratify_labels = y
            if 'group_size' in self.df.columns:
                size_bins = pd.qcut(self.df['group_size'], 4, duplicates='drop').astype(str)
                combined = size_bins + '_' + y.astype(str)
                if combined.value_counts().min() >= 2:
                    stratify_labels = combined
                else:
                    # A stratum with a single row cannot be split
                    print("Group-size strata too sparse; stratifying by class only")

            X_train, X_test, y_train, y_test = train_test_split(
                X, y,
                test_size=self.params['test_size'],
                random_state=self.params['random_state'],
                stratify=stratify_labels
            )

            # RobustScaler is used for better outlier handling
            from sklearn.preprocessing import RobustScaler
            robust_scaler = RobustScaler()
            X_train_scaled = robust_scaler.fit_transform(X_train)
            X_test_scaled = robust_scaler.transform(X_test)

            # Store the scaler for later use
            self.models['scaler'] = robust_scaler

            # Store splits
            self.mc_data['train'] = (X_train_scaled, y_train)
            self.mc_data['test'] = (X_test_scaled, y_test)

            # Print class distribution
            print("\nClass Distribution:")
            for label, count in zip(['Candidate', 'Main'], np.bincount(y)):
                print(f"{label}: {count} ({count/len(y)*100:.2f}%)")

            return True

        except Exception as e:
            print(f"Error in data preparation: {str(e)}")
            return False
        
    def _select_best_mc_model(self, X_train, y_train) -> bool:
        """
        Select the best main/candidate classifier via grid search.

        A random forest and a gradient boosting classifier are each tuned
        with ``GridSearchCV`` (F1 scoring, ``self.params['cv_folds']`` folds).
        The estimator with the highest cross-validated F1 is stored in
        ``self.models['main_candidate']``; per-model search results go to
        ``self.metrics['model_selection']``.

        Args:
            X_train: Training features
            y_train: Training labels

        Returns:
            bool: True when a model was selected, False when an error
            occurred or no candidate produced a comparable score.
        """
        try:
            print("\nPerforming model selection...")

            # Define models to evaluate with more robust configurations
            models = {
                'RandomForest': {
                    'model': RandomForestClassifier(random_state=self.params['random_state']),
                    'params': {
                        'n_estimators': [100, 200],
                        'max_depth': [10, None],
                        'min_samples_split': [5, 10],
                        'class_weight': ['balanced'],
                        'min_samples_leaf': [2, 4]  # Added to prevent overfitting
                    }
                },
                'GradientBoosting': {
                    'model': GradientBoostingClassifier(random_state=self.params['random_state']),
                    'params': {
                        'n_estimators': [100, 200],
                        'learning_rate': [0.05, 0.1],
                        'max_depth': [4, 5],
                        'subsample': [0.8, 1.0]  # Added to improve robustness
                    }
                }
            }

            # Start below any attainable score so that a model whose best F1
            # is exactly 0 can still be selected
            best_score = -np.inf
            best_model = None
            cv_results = {}

            for name, config in tqdm(models.items(), desc="Evaluating models"):
                # Cross-validated grid search over this model's parameter grid
                grid_search = GridSearchCV(
                    config['model'],
                    config['params'],
                    cv=self.params['cv_folds'],
                    scoring='f1',
                    n_jobs=-1,
                    verbose=0
                )
                grid_search.fit(X_train, y_train)

                # Store results
                cv_results[name] = {
                    'best_score': grid_search.best_score_,
                    'best_params': grid_search.best_params_,
                    'cv_results': grid_search.cv_results_
                }

                print(f"\n{name} Results:")
                print(f"Best F1 score: {grid_search.best_score_:.4f}")
                print(f"Best parameters: {grid_search.best_params_}")

                if grid_search.best_score_ > best_score:
                    best_score = grid_search.best_score_
                    best_model = grid_search.best_estimator_

            # Keep the search results even if no model could be chosen
            self.metrics['model_selection'] = cv_results

            if best_model is None:
                # Happens when every best score is NaN; storing None would
                # only defer the failure to the evaluation step
                print("Error in model selection: no model produced a valid cross-validation F1 score")
                return False

            self.models['main_candidate'] = best_model

            print(f"\nSelected model: {type(best_model).__name__}")
            print(f"Best cross-validation F1 score: {best_score:.4f}")

            return True

        except Exception as e:
            print(f"Error in model selection: {str(e)}")
            return False
        
    def _evaluate_mc_model(self, X_test, y_test):
        """
        Evaluate the selected main/candidate model on the held-out split.

        Stores the decision threshold that maximises ``tpr - fpr`` on the ROC
        curve in ``self.thresholds['main_candidate']`` and a metrics dict
        (accuracy, precision, recall, F1, ROC AUC, PR AUC, optimal
        threshold) in ``self.metrics['main_candidate']``.

        Args:
            X_test: Test features
            y_test: Test labels

        Returns:
            bool: True on success, False when any step raised.
        """
        try:
            print("\nEvaluating model performance...")

            classifier = self.models['main_candidate']

            # Hard labels and positive-class ("main") probabilities
            predicted_labels = classifier.predict(X_test)
            positive_scores = classifier.predict_proba(X_test)[:, 1]

            # ROC curve and its area
            false_pos_rate, true_pos_rate, roc_cutoffs = roc_curve(y_test, positive_scores)
            roc_area = auc(false_pos_rate, true_pos_rate)

            # Precision-recall curve and its area
            pr_precision, pr_recall, _ = precision_recall_curve(y_test, positive_scores)
            pr_area = auc(pr_recall, pr_precision)

            # Threshold at the largest gap between true and false positive rate
            best_cutoff = roc_cutoffs[np.argmax(true_pos_rate - false_pos_rate)]
            self.thresholds['main_candidate'] = best_cutoff

            # Label-based metrics use the model's own predict() output
            label_scorers = (
                ('accuracy', accuracy_score),
                ('precision', precision_score),
                ('recall', recall_score),
                ('f1', f1_score),
            )
            summary = {}
            for metric_name, scorer in label_scorers:
                summary[metric_name] = scorer(y_test, predicted_labels)
            summary['roc_auc'] = roc_area
            summary['pr_auc'] = pr_area
            summary['optimal_threshold'] = self.thresholds['main_candidate']
            self.metrics['main_candidate'] = summary

            # Print results
            print(f"\nOptimal threshold: {self.thresholds['main_candidate']:.4f}")
            print(f"ROC AUC: {roc_area:.4f}")
            print(f"PR AUC: {pr_area:.4f}")

            print("\nClassification Report:")
            print(classification_report(y_test, predicted_labels, target_names=['Candidate', 'Main']))

            return True

        except Exception as e:
            print(f"Error in model evaluation: {str(e)}")
            return False
        
    def _apply_mc_classification(self) -> bool:
        """
        Apply the main/candidate model to every row and summarise agreement.

        Writes ``predicted_status`` and ``mc_probability`` to ``self.df``,
        stores a cross-tabulation against ``step1``, the overall agreement
        percentage and per-class precision/recall/F1 in
        ``self.metrics['classification_results']``, then prints the results.

        Returns:
            bool: Success status
        """
        try:
            print("\nApplying classification to all data...")

            # Scale with the scaler fitted during data preparation
            scaled_features = self.models['scaler'].transform(
                self.df[self.features['main_candidate']]
            )

            # Probability of the positive ("main") class, cut at the stored threshold
            main_probability = self.models['main_candidate'].predict_proba(scaled_features)[:, 1]
            is_main = main_probability >= self.thresholds['main_candidate']

            # Store results
            self.df['predicted_status'] = np.where(is_main, 'main', 'candidate')
            self.df['mc_probability'] = main_probability

            # Predicted vs. original labels, with row/column totals
            agreement_table = pd.crosstab(
                self.df['predicted_status'],
                self.df['step1'],
                margins=True
            )

            # One-vs-rest metrics for each status
            scorers = (
                ('precision', precision_score),
                ('recall', recall_score),
                ('f1', f1_score),
            )
            per_class = {}
            for status in ('main', 'candidate'):
                actual = self.df['step1'] == status
                predicted = self.df['predicted_status'] == status
                per_class[status] = {
                    metric_name: scorer(actual, predicted)
                    for metric_name, scorer in scorers
                }

            # Store results
            matches = self.df['predicted_status'] == self.df['step1']
            self.metrics['classification_results'] = {
                'confusion_matrix': agreement_table,
                'agreement': matches.mean() * 100,
                'class_metrics': per_class
            }

            # Print results
            self._print_classification_application_results()

            return True

        except Exception as e:
            print(f"Error in classification application: {str(e)}")
            return False
        
    def _print_classification_application_results(self) -> None:
        """Print detailed classification application results"""
        results = self.metrics['classification_results']
        
        print("\nClassification Results:")
        print("-" * 30)
        print("\nConfusion Matrix:")
        print(results['confusion_matrix'])
        
        print(f"\nOverall Agreement: {results['agreement']:.2f}%")
        
        print("\nPer-Class Metrics:")
        for status, metrics in results['class_metrics'].items():
            print(f"\n{status.title()}:")
            for metric, value in metrics.items():
                print(f"  {metric.title()}: {value:.4f}")
        
        # Additional insights
        print("\nAdditional Insights:")
        
        # Analyze high confidence predictions
        high_conf_mask = (self.df['mc_probability'] >= 0.9) | (self.df['mc_probability'] <= 0.1)
        high_conf_agreement = (
            self.df.loc[high_conf_mask, 'predicted_status'] == 
            self.df.loc[high_conf_mask, 'step1']
        ).mean() * 100
        
        print(f"High confidence predictions: {high_conf_mask.sum()} "
              f"({high_conf_mask.mean()*100:.1f}% of total)")
        print(f"Agreement for high confidence predictions: {high_conf_agreement:.2f}%")
        
        # Analyze disagreements
        disagreement_mask = self.df['predicted_status'] != self.df['step1']
        if disagreement_mask.any():
            print("\nDisagreement Analysis:")
            disagreements = self.df[disagreement_mask].groupby(
                ['step1', 'predicted_status']
            ).size().unstack(fill_value=0)
            print(disagreements)

    def analyze_authentication(self) -> bool:
        """
        Perform authentication analysis with comprehensive evaluation.

        Runs four steps in order (data preparation, model training, model
        evaluation, application) and stops at the first step that reports
        failure. Exceptions are caught, printed together with their
        traceback and reported as failure.

        Returns:
            bool: True when every step succeeded, False otherwise.
        """
        print("\n=== AUTHENTICATION ANALYSIS ===")
        print("-" * 50)

        try:
            # (label, step callable, key of self.auth_data whose (X, y) pair
            # the step receives; None for steps that take no arguments)
            steps = [
                ("Preparing authentication data", self._prepare_auth_data, None),
                ("Training authentication model", self._train_auth_model, 'train'),
                ("Evaluating authentication model", self._evaluate_auth_model, 'test'),
                ("Applying authentication", self._apply_authentication, None),
            ]

            for step_name, step_func, split_key in tqdm(steps, desc="Authentication Analysis"):
                print(f"\nExecuting: {step_name}")

                if split_key is None:
                    success = step_func()
                else:
                    # self.auth_data is populated by the preparation step
                    features, targets = self.auth_data[split_key]
                    success = step_func(features, targets)

                if not success:
                    print(f"Error in {step_name}")
                    return False

            return True

        except Exception as e:
            print(f"Error in authentication analysis: {str(e)}")
            # Same handling as analyze_main_candidate: keep the stack trace,
            # otherwise the failing step cannot be located from the message
            print(traceback.format_exc())
            return False

    def _prepare_auth_data(self) -> bool:
        """
        Prepare train/test data for authentication analysis.

        Builds the feature matrix from ``self.features['authentication']``
        and a binary target (1 where ``step2 == 'authenticated'``), splits
        them, transforms both splits with a ``QuantileTransformer`` fitted on
        the training split, and stores the results in ``self.auth_data``
        (keys ``'train'`` and ``'test'``) and the transformer in
        ``self.models['auth_scaler']``.

        When a ``group_size`` column exists the split is stratified on the
        combination of group-size quartile and class. If any such stratum
        has fewer than two rows (which ``train_test_split`` rejects), the
        split falls back to stratifying on the class alone.

        Returns:
            bool: Success status
        """
        try:
            print("\nPreparing authentication data...")

            # Store data splits
            self.auth_data = {}

            # Prepare features
            X = self.df[self.features['authentication']].copy()

            # Prepare target (authenticated = 1, unauthenticated = 0)
            y = (self.df['step2'] == 'authenticated').astype(int)

            # Default to class-only stratification; refine by group size when possible
            stratify_labels = y
            if 'group_size' in self.df.columns:
                size_bins = pd.qcut(self.df['group_size'], 4, duplicates='drop').astype(str)
                combined = size_bins + '_' + y.astype(str)
                if combined.value_counts().min() >= 2:
                    stratify_labels = combined
                else:
                    # A stratum with a single row cannot be split
                    print("Group-size strata too sparse; stratifying by class only")

            X_train, X_test, y_train, y_test = train_test_split(
                X, y,
                test_size=self.params['test_size'],
                random_state=self.params['random_state'],
                stratify=stratify_labels
            )

            # Use QuantileTransformer for even more robust handling of outliers.
            # Seeded like the split so repeated runs give the same transform.
            from sklearn.preprocessing import QuantileTransformer
            quantile_scaler = QuantileTransformer(
                output_distribution='normal',
                random_state=self.params['random_state']
            )
            X_train_scaled = quantile_scaler.fit_transform(X_train)
            X_test_scaled = quantile_scaler.transform(X_test)

            # Store splits
            self.auth_data['train'] = (X_train_scaled, y_train)
            self.auth_data['test'] = (X_test_scaled, y_test)

            # Store scaler
            self.models['auth_scaler'] = quantile_scaler

            # Print class distribution
            print("\nAuthentication Class Distribution:")
            for label, count in zip(['Unauthenticated', 'Authenticated'], np.bincount(y)):
                print(f"{label}: {count} ({count/len(y)*100:.2f}%)")

            return True

        except Exception as e:
            print(f"Error in authentication data preparation: {str(e)}")
            return False

    def _train_auth_model(self, X_train: np.ndarray, y_train: np.ndarray) -> bool:
        """
        Cross-validate and fit the authentication random forest.

        The classifier is registered in ``self.models['authentication']``,
        scored with F1 cross-validation, fitted on the full training split,
        and its feature importances are stored (sorted, highest first) in
        ``self.metrics['auth_feature_importance']``.

        Args:
            X_train: Training features
            y_train: Training labels

        Returns:
            bool: Success status
        """
        try:
            print("\nTraining authentication model...")

            # Register the model before scoring it
            forest = RandomForestClassifier(
                n_estimators=300,
                max_depth=None,
                min_samples_split=5,
                class_weight='balanced',
                random_state=self.params['random_state']
            )
            self.models['authentication'] = forest

            # F1 across the configured number of folds
            fold_f1 = cross_val_score(
                forest,
                X_train,
                y_train,
                cv=self.params['cv_folds'],
                scoring='f1'
            )

            print("\nCross-validation scores:")
            print(f"Mean F1: {fold_f1.mean():.4f} (±{fold_f1.std()*2:.4f})")

            # Final fit on the whole training split
            forest.fit(X_train, y_train)

            # Importance per authentication feature, most important first
            importance_table = pd.DataFrame({
                'feature': self.features['authentication'],
                'importance': forest.feature_importances_
            }).sort_values('importance', ascending=False)
            self.metrics['auth_feature_importance'] = importance_table

            print("\nFeature Importance:")
            print(self.metrics['auth_feature_importance'])

            return True

        except Exception as e:
            print(f"Error in authentication model training: {str(e)}")
            return False

    def _evaluate_auth_model(self, X_test: np.ndarray, y_test: np.ndarray) -> bool:
        """
        Evaluate authentication model with comprehensive metrics
        
        Args:
            X_test: Test features
            y_test: Test labels
            
        Returns:
            bool: Success status
        """
        try:
            print("\nEvaluating authentication model performance...")
            
            # Get predictions and probabilities
            y_pred = self.models['authentication'].predict(X_test)
            y_prob = self.models['authentication'].predict_proba(X_test)[:, 1]
            
            # Calculate comprehensive metrics
            metrics = {
                'accuracy': accuracy_score(y_test, y_pred),
                'precision': precision_score(y_test, y_pred),
                'recall': recall_score(y_test, y_pred),
                'f1': f1_score(y_test, y_pred),
                'roc_auc': roc_auc_score(y_test, y_prob),
                'matthews_corrcoef': matthews_corrcoef(y_test, y_pred)
            }
            
            # Calculate ROC and PR curves
            fpr, tpr, thresholds = roc_curve(y_test, y_prob)
            precision, recall, pr_thresholds = precision_recall_curve(y_test, y_prob)
            
            # Find optimal thresholds
            optimal_threshold_metrics = self._find_optimal_thresholds(
                y_test, y_prob, fpr, tpr, precision, recall
            )
            
            # Store results
            self.metrics['authentication'] = {
                **metrics,
                **optimal_threshold_metrics
            }
            
            # Print results
            print("\nAuthentication Model Performance:")
            for metric, value in metrics.items():
                print(f"{metric.title()}: {value:.4f}")
            
            return True
            
        except Exception as e:
            print(f"Error in authentication model evaluation: {str(e)}")
            return False

    def _find_optimal_thresholds(self, y_test: np.ndarray, y_prob: np.ndarray,
                               fpr: np.ndarray, tpr: np.ndarray,
                               precision: np.ndarray, recall: np.ndarray) -> dict:
        """
        Grid-search the probability cut-offs used for authentication.

        Scans 100 evenly spaced cut-offs in [0, 1] and keeps the one with the
        best F1 score and the one with the best balanced accuracy. Both are
        stored in ``self.thresholds['authentication']`` ('main' uses the F1
        cut-off, 'candidate' the balanced-accuracy cut-off).

        Args:
            y_test: Test labels
            y_prob: Prediction probabilities
            fpr: False positive rates (accepted but not used by the search)
            tpr: True positive rates (accepted but not used by the search)
            precision: Precision values (accepted but not used by the search)
            recall: Recall values (accepted but not used by the search)

        Returns:
            dict: The two selected cut-offs and the score each one achieved
        """
        cutoffs = np.linspace(0, 1, 100)

        def labels_at(cutoff):
            # Hard labels obtained by thresholding the probabilities
            return (y_prob >= cutoff).astype(int)

        # Cut-off with the highest F1 (first one wins on ties)
        f1_by_cutoff = [f1_score(y_test, labels_at(cutoff)) for cutoff in cutoffs]
        best_f1_cutoff = cutoffs[np.argmax(f1_by_cutoff)]

        # Cut-off with the highest balanced accuracy (first one wins on ties)
        ba_by_cutoff = [
            balanced_accuracy_score(y_test, labels_at(cutoff)) for cutoff in cutoffs
        ]
        best_ba_cutoff = cutoffs[np.argmax(ba_by_cutoff)]

        # Main ASVs use the F1 cut-off, candidates the balanced-accuracy one
        self.thresholds['authentication'] = {
            'main': best_f1_cutoff,
            'candidate': best_ba_cutoff
        }

        return {
            'f1_threshold': best_f1_cutoff,
            'f1_threshold_score': max(f1_by_cutoff),
            'ba_threshold': best_ba_cutoff,
            'ba_threshold_score': max(ba_by_cutoff)
        }
    
    def _apply_authentication(self) -> bool:
        """
        Apply authentication rules with separate thresholds
        
        Returns:
            bool: Success status
        """
        try:
            print("\nApplying authentication rules...")
            
            # Scale features
            X = self.models['auth_scaler'].transform(self.df[self.features['authentication']])
            
            # Get probabilities
            probabilities = self.models['authentication'].predict_proba(X)[:, 1]
            
            # Initialize authentication status and store probabilities
            self.df['auth_status'] = 'unauthenticated'
            self.df['auth_probability'] = probabilities
            
            # Authenticate ASVs based on their status
            for status in ['main', 'candidate']:
                mask = self.df['predicted_status'] == status
                if mask.any():
                    threshold = self.thresholds['authentication'][status]
                    
                    # Additional criteria for candidates
                    if status == 'candidate':
                        self.df.loc[
                            mask & 
                            (probabilities >= threshold) &
                            (self.df['dist_ratio'] >= 1.5),  # Distance check
                            'auth_status'
                        ] = 'authenticated'
                    else:
                        self.df.loc[
                            mask & 
                            (probabilities >= threshold),
                            'auth_status'
                        ] = 'authenticated'
            
            # Calculate and store authentication metrics
            self._calculate_authentication_metrics()
            
            # Print results
            self._print_auth_results()
            
            return True
            
        except Exception as e:
            print(f"Error in authentication application: {str(e)}")
            return False

    def _calculate_authentication_metrics(self) -> None:
        """Calculate comprehensive authentication metrics"""
        # Overall metrics
        self.metrics['authentication_results'] = {
            'total_authenticated': (self.df['auth_status'] == 'authenticated').sum(),
            'authentication_rate': (self.df['auth_status'] == 'authenticated').mean() * 100,
            'agreement': (self.df['auth_status'] == self.df['step2']).mean() * 100
        }
        
        # Metrics by ASV type
        for status in ['main', 'candidate']:
            mask = self.df['predicted_status'] == status
            if mask.any():
                self.metrics['authentication_results'][f'{status}_metrics'] = {
                    'total': mask.sum(),
                    'authenticated': (
                        (self.df['auth_status'] == 'authenticated') & mask
                    ).sum(),
                    'authentication_rate': (
                        (self.df['auth_status'] == 'authenticated') & mask
                    ).mean() * 100,
                    'agreement': (
                        (self.df['auth_status'] == self.df['step2']) & mask
                    ).mean() * 100
                }
        
        # Distance-based metrics
        self.metrics['authentication_results']['distance_metrics'] = {
            'authenticated_mean_dist_ratio': self.df.loc[
                self.df['auth_status'] == 'authenticated',
                'dist_ratio'
            ].mean(),
            'authenticated_med_dist_ratio': self.df.loc[
                self.df['auth_status'] == 'authenticated',
                'dist_ratio'
            ].median()
        }

    def _print_auth_results(self) -> None:
        """Print comprehensive authentication results"""
        results = self.metrics['authentication_results']
        
        print("\nAuthentication Results:")
        print("-" * 30)
        
        print(f"\nOverall Results:")
        print(f"Total ASVs authenticated: {results['total_authenticated']}")
        print(f"Overall authentication rate: {results['authentication_rate']:.2f}%")
        print(f"Agreement with original labels: {results['agreement']:.2f}%")
        
        print("\nResults by ASV Type:")
        for status in ['main', 'candidate']:
            if f'{status}_metrics' in results:
                metrics = results[f'{status}_metrics']
                print(f"\n{status.title()} ASVs:")
                print(f"Total: {metrics['total']}")
                print(f"Authenticated: {metrics['authenticated']}")
                print(f"Authentication rate: {metrics['authentication_rate']:.2f}%")
                print(f"Agreement: {metrics['agreement']:.2f}%")
        
        print("\nDistance Metrics for Authenticated ASVs:")
        dist_metrics = results['distance_metrics']
        print(f"Mean distance ratio: {dist_metrics['authenticated_mean_dist_ratio']:.2f}")
        print(f"Median distance ratio: {dist_metrics['authenticated_med_dist_ratio']:.2f}")
        
        # Print confusion matrix
        print("\nConfusion Matrix:")
        cm = pd.crosstab(
            self.df['auth_status'],
            self.df['step2'],
            margins=True
        )
        print(cm)

    def export_results(self, output_path: str) -> bool:
        """
        Export comprehensive analysis results

        Writes the Excel workbook to ``output_path`` and three companion
        files next to it: ``<stem>.json``, ``<stem>_report.txt`` and
        ``<stem>_report.pdf``. ``<stem>`` is ``output_path`` without its
        trailing Excel extension; a path without such an extension is used
        as the stem unchanged, so a companion file can never overwrite the
        workbook.

        Args:
            output_path: Path for the Excel file (str or path-like)

        Returns:
            bool: Success status (False if any export step raised)
        """
        try:
            print("\n=== EXPORTING ANALYSIS RESULTS ===")
            print("-" * 50)

            output_path = os.fspath(output_path)

            # Create output directory if needed. A bare filename has an empty
            # dirname, and os.makedirs('') raises, so skip it in that case.
            output_dir = os.path.dirname(output_path)
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)

            # Export to different formats
            self._export_excel_results(output_path)

            # Strip only a trailing Excel extension; str.replace would also
            # rewrite '.xlsx' inside directory names and would leave the path
            # untouched (JSON == workbook path) when the suffix is absent.
            stem, extension = os.path.splitext(output_path)
            if extension.lower() not in ('.xlsx', '.xlsm', '.xls'):
                stem = output_path

            # Export JSON results
            json_path = stem + '.json'
            self._export_json_results(json_path)

            # Generate detailed report
            report_path = stem + '_report.txt'
            self._generate_detailed_report(report_path)

            # Generate PDF report
            pdf_path = stem + '_report.pdf'
            self._generate_pdf_report(pdf_path)

            print("\nResults exported successfully:")
            print(f"- Excel file: {output_path}")
            print(f"- JSON file: {json_path}")
            print(f"- Report file: {report_path}")
            print(f"- PDF report: {pdf_path}")

            return True

        except Exception as e:
            print(f"Error in results export: {str(e)}")
            return False
    
    def _create_feature_importance_df(self) -> pd.DataFrame:
        """
        Create feature importance DataFrame
        
        Returns:
            pd.DataFrame: Feature importance for all models
        """
        try:
            # Main/Candidate feature importance
            mc_importance = pd.DataFrame({
                'feature': self.features['main_candidate'],
                'importance': self.models['main_candidate'].feature_importances_,
                'analysis_type': 'Main/Candidate'
            })
            
            # Authentication feature importance
            auth_importance = pd.DataFrame({
                'feature': self.features['authentication'],
                'importance': self.models['authentication'].feature_importances_,
                'analysis_type': 'Authentication'
            })
            
            # Combine and sort
            importance_df = pd.concat([mc_importance, auth_importance], ignore_index=True)
            importance_df = importance_df.sort_values(['analysis_type', 'importance'], 
                                                    ascending=[True, False])
            
            return importance_df
            
        except Exception as e:
            print(f"Warning: Error creating feature importance DataFrame: {str(e)}")
            # Return empty DataFrame with correct columns if error occurs
            return pd.DataFrame(columns=['feature', 'importance', 'analysis_type'])

    def _export_excel_results(self, output_path: str) -> None:
        """
        Write the analysis workbook, one sheet per result table.

        The 'Main_Results' and 'Performance_Metrics' sheets are mandatory: a
        failure there is reported and re-raised. The remaining sheets are
        best-effort; a failure only prints a warning and the export goes on.

        Args:
            output_path: Path of the Excel file to write
        """
        # (builder method name, sheet name, write the index?)
        mandatory_sheets = (
            ('_create_main_results', 'Main_Results', False),
            ('_create_metrics_df', 'Performance_Metrics', True),
        )
        # (builder method name, sheet name, write the index?, warning prefix)
        optional_sheets = (
            ('_create_feature_importance_df', 'Feature_Importance', False,
             "Warning: Could not export feature importance: "),
            ('_create_group_stats_df', 'Group_Statistics', True,
             "Warning: Could not export group statistics: "),
            ('_create_stats_summary_df', 'Statistical_Summary', True,
             "Warning: Could not export statistical summary: "),
            ('_create_authentication_analysis_df', 'Authentication_Analysis', True,
             "Warning: Could not export authentication analysis: "),
        )

        try:
            with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
                for builder_name, sheet, with_index in mandatory_sheets:
                    getattr(self, builder_name)().to_excel(
                        writer, sheet_name=sheet, index=with_index
                    )

                for builder_name, sheet, with_index, warning in optional_sheets:
                    # Builders are resolved inside the try so that any
                    # failure for this sheet is downgraded to a warning
                    try:
                        getattr(self, builder_name)().to_excel(
                            writer, sheet_name=sheet, index=with_index
                        )
                    except Exception as e:
                        print(f"{warning}{str(e)}")

        except Exception as e:
            print(f"Error in Excel export: {str(e)}")
            raise

    def _create_authentication_analysis_df(self) -> pd.DataFrame:
        """
        Create authentication analysis DataFrame
        
        Returns:
            pd.DataFrame: Authentication analysis results
        """
        try:
            # Create authentication analysis summary
            auth_df = pd.DataFrame({
                'Metric': [
                    'Total ASVs',
                    'Authenticated ASVs',
                    'Authentication Rate (%)',
                    'Agreement with Original (%)',
                    'Mean Distance Ratio',
                    'Median Distance Ratio'
                ],
                'All': [
                    len(self.df),
                    (self.df['auth_status'] == 'authenticated').sum(),
                    (self.df['auth_status'] == 'authenticated').mean() * 100,
                    (self.df['auth_status'] == self.df['step2']).mean() * 100,
                    self.df['dist_ratio'].mean(),
                    self.df['dist_ratio'].median()
                ]
            })
            
            # Add metrics by predicted status
            for status in ['main', 'candidate']:
                mask = self.df['predicted_status'] == status
                if mask.any():
                    auth_df[status.title()] = [
                        mask.sum(),
                        (self.df.loc[mask, 'auth_status'] == 'authenticated').sum(),
                        (self.df.loc[mask, 'auth_status'] == 'authenticated').mean() * 100,
                        (self.df.loc[mask, 'auth_status'] == 
                        self.df.loc[mask, 'step2']).mean() * 100,
                        self.df.loc[mask, 'dist_ratio'].mean(),
                        self.df.loc[mask, 'dist_ratio'].median()
                    ]
            
            return auth_df.round(2)
            
        except Exception as e:
            print(f"Warning: Error creating authentication analysis DataFrame: {str(e)}")
            return pd.DataFrame()

    def _create_main_results(self) -> pd.DataFrame:
        """
        Create main results DataFrame
        
        Returns:
            pd.DataFrame: Main results
        """
        return pd.DataFrame({
            # Basic information
            'asv_id': self.df['asv_id'],
            'project_readfile_id': self.df['project_readfile_id'],
            'family_tree': self.df['family_tree'],
            'subfamily_tree': self.df['subfamily_tree'],
            
            # Original and predicted status
            'original_step1': self.df['step1'],
            'predicted_status': self.df['predicted_status'],
            'mc_probability': self.df['mc_probability'],
            'original_step2': self.df['step2'],
            'auth_status': self.df['auth_status'],
            'auth_probability': self.df['auth_probability'],
            
            # Feature values
            'read_count': self.df['read_count'],
            'total_read': self.df['total_read'],
            'count_read': self.df['count_read'],
            'percentage': self.df['percentage'],
            'nearest_main_dist': self.df['nearest_main_dist'],
            'nearest_cand_dist': self.df['nearest_cand_dist'],
            'dist_ratio': self.df['dist_ratio'],
            
            # Additional metrics
            'quality_score': self.df['quality_score'],
            'reliability_score': self.df['reliability_score']
        })

    def _create_metrics_df(self) -> pd.DataFrame:
        """
        Create comprehensive metrics DataFrame
        
        Returns:
            pd.DataFrame: Performance metrics
        """
        metrics = {
            'Main/Candidate Classification': {
                key: self.metrics['main_candidate'].get(key, None)
                for key in ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
            },
            'Authentication': {
                key: self.metrics['authentication'].get(key, None)
                for key in ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
            }
        }
        
        # Add thresholds
        metrics['Main/Candidate Classification']['threshold'] = self.thresholds['main_candidate']
        metrics['Authentication'].update({
            'main_threshold': self.thresholds['authentication']['main'],
            'candidate_threshold': self.thresholds['authentication']['candidate']
        })
        
        return pd.DataFrame.from_dict(metrics, orient='columns')

    def _create_group_stats_df(self) -> pd.DataFrame:
        """
        Create group statistics DataFrame
        
        Returns:
            pd.DataFrame: Group-wise statistics
        """
        try:
            # Calculate group statistics
            group_stats = self.df.groupby('project_readfile_id').agg({
                'asv_id': 'count',
                'predicted_status': lambda x: (x == 'main').mean() * 100,
                'auth_status': lambda x: (x == 'authenticated').mean() * 100,
                'read_count': ['sum', 'mean', 'std'],
                'total_read': 'first',
                'count_read': 'mean'
            }).round(2)
            
            # Rename columns for clarity
            group_stats.columns = [
                'ASV_Count',
                'Main_ASV_Percentage',
                'Authentication_Rate',
                'Total_Reads',
                'Mean_Reads',
                'Std_Reads',
                'Sequencing_Depth',
                'Average_Coverage'
            ]
            
            return group_stats
            
        except Exception as e:
            print(f"Error creating group statistics: {str(e)}")
            return pd.DataFrame()

    def _create_stats_summary_df(self) -> pd.DataFrame:
        """
        Create statistical summary DataFrame
        
        Returns:
            pd.DataFrame: Statistical summary
        """
        try:
            summary_stats = pd.DataFrame({
                'Metric': [
                    'Total ASVs',
                    'Main ASVs (%)',
                    'Candidate ASVs (%)',
                    'Authenticated ASVs (%)',
                    'Unauthenticated ASVs (%)',
                    'Average Read Count',
                    'Median Read Count',
                    'Average Sample Coverage',
                    'Average Distance Ratio',
                    'Total Groups',
                    'Average ASVs per Group'
                ],
                'Value': [
                    len(self.df),
                    (self.df['predicted_status'] == 'main').mean() * 100,
                    (self.df['predicted_status'] == 'candidate').mean() * 100,
                    (self.df['auth_status'] == 'authenticated').mean() * 100,
                    (self.df['auth_status'] == 'unauthenticated').mean() * 100,
                    self.df['read_count'].mean(),
                    self.df['read_count'].median(),
                    (self.df['count_read'] / self.df['sample_count']).mean() * 100,
                    self.df['dist_ratio'].mean(),
                    self.df['project_readfile_id'].nunique(),
                    len(self.df) / self.df['project_readfile_id'].nunique()
                ]
            }).round(2)
            
            return summary_stats
            
        except Exception as e:
            print(f"Error creating statistical summary: {str(e)}")
            return pd.DataFrame()

    def _export_json_results(self, json_path: str) -> None:
        """
        Export results in JSON format
        
        Args:
            json_path: Path for JSON file
        """
        results = {
            'metadata': {
                'total_asvs': len(self.df),
                'features': self.features,
                'thresholds': self.thresholds
            },
            'results': self.df[
                ['asv_id', 'predicted_status', 'auth_status', 
                 'mc_probability', 'auth_probability']
            ].to_dict('records'),
            'metrics': {
                'main_candidate': self.metrics['main_candidate'],
                'authentication': self.metrics['authentication'],
                'authentication_results': self.metrics['authentication_results']
            },
            'feature_importance': {
                'main_candidate': dict(zip(
                    self.features['main_candidate'],
                    self.models['main_candidate'].feature_importances_
                )),
                'authentication': dict(zip(
                    self.features['authentication'],
                    self.models['authentication'].feature_importances_
                ))
            }
        }
        
        # Convert numpy/pandas types to native Python types
        results = self._convert_to_serializable(results)
        
        with open(json_path, 'w') as f:
            json.dump(results, f, indent=2)

    def _convert_to_serializable(self, obj: Any) -> Any:
        """
        Convert object to JSON serializable format
        
        Args:
            obj: Input object
            
        Returns:
            JSON serializable object
        """
        if isinstance(obj, (np.integer, np.floating, np.int64)):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        elif isinstance(obj, pd.Series):
            return obj.tolist()
        elif isinstance(obj, dict):
            return {key: self._convert_to_serializable(value) 
                for key, value in obj.items()}
        elif isinstance(obj, list):
            return [self._convert_to_serializable(item) for item in obj]
        return obj

    def _generate_detailed_report(self, report_path: str) -> None:
        """
        Generate comprehensive analysis report
        
        Args:
            report_path: Path for report file
        """
        try:
            with open(report_path, 'w') as f:
                # Header
                f.write("ASV ANALYSIS DETAILED REPORT\n")
                f.write("=" * 50 + "\n\n")
                
                # 1. Executive Summary
                f.write("1. EXECUTIVE SUMMARY\n")
                f.write("-" * 20 + "\n")
                total_asvs = len(self.df)
                main_count = (self.df['predicted_status'] == 'main').sum()
                auth_count = (self.df['auth_status'] == 'authenticated').sum()
                
                f.write(f"Total ASVs analyzed: {total_asvs:,}\n")
                f.write(f"Main vs Candidate Ratio: {main_count}:{total_asvs-main_count} "
                    f"({main_count/total_asvs*100:.1f}%)\n")
                f.write(f"Authentication Rate: {auth_count/total_asvs*100:.1f}%\n\n")
                
                # 2. Classification Analysis
                f.write("\n2. CLASSIFICATION PERFORMANCE\n")
                f.write("-" * 20 + "\n")
                for metric, value in self.metrics['main_candidate'].items():
                    if isinstance(value, (int, float)):
                        f.write(f"{metric}: {value:.4f}\n")
                
                # 3. Authentication Analysis
                f.write("\n3. AUTHENTICATION PERFORMANCE\n")
                f.write("-" * 20 + "\n")
                for metric, value in self.metrics['authentication'].items():
                    if isinstance(value, (int, float)):
                        f.write(f"{metric}: {value:.4f}\n")
                
                # 4. Feature Analysis
                f.write("\n4. FEATURE IMPORTANCE\n")
                f.write("-" * 20 + "\n")
                
                # Main/Candidate features
                f.write("\nMain/Candidate Classification Features:\n")
                importance = pd.DataFrame({
                    'feature': self.features['main_candidate'],
                    'importance': self.models['main_candidate'].feature_importances_
                }).sort_values('importance', ascending=False)
                
                for _, row in importance.iterrows():
                    f.write(f"{row['feature']}: {row['importance']:.4f}\n")
                
                # Authentication features
                f.write("\nAuthentication Features:\n")
                importance = pd.DataFrame({
                    'feature': self.features['authentication'],
                    'importance': self.models['authentication'].feature_importances_
                }).sort_values('importance', ascending=False)
                
                for _, row in importance.iterrows():
                    f.write(f"{row['feature']}: {row['importance']:.4f}\n")
                
                # 5. Group Analysis
                f.write("\n5. GROUP ANALYSIS\n")
                f.write("-" * 20 + "\n")
                
                group_stats = self.df.groupby('project_readfile_id').agg({
                    'asv_id': 'count',
                    'predicted_status': lambda x: (x == 'main').mean() * 100,
                    'auth_status': lambda x: (x == 'authenticated').mean() * 100
                })
                
                f.write(f"\nTotal Groups: {len(group_stats)}\n")
                f.write(f"Average ASVs per Group: {group_stats['asv_id'].mean():.1f}\n")
                f.write(f"Average Main ASV Percentage: {group_stats['predicted_status'].mean():.1f}%\n")
                f.write(f"Average Authentication Rate: {group_stats['auth_status'].mean():.1f}%\n")
                
                # 6. Threshold Information
                f.write("\n6. THRESHOLD INFORMATION\n")
                f.write("-" * 20 + "\n")
                f.write(f"\nMain/Candidate Classification Threshold: {self.thresholds['main_candidate']:.4f}\n")
                f.write("\nAuthentication Thresholds:\n")
                f.write(f"Main ASVs: {self.thresholds['authentication']['main']:.4f}\n")
                f.write(f"Candidate ASVs: {self.thresholds['authentication']['candidate']:.4f}\n")
                
            print(f"Detailed report generated: {report_path}")
            
        except Exception as e:
            print(f"Error generating detailed report: {str(e)}")

    def _generate_pdf_report(self, pdf_path: str) -> None:
        """
        Generate PDF report with ReportLab
        
        Args:
            pdf_path: Path for PDF report
        """
        try:
            # Create document
            doc = SimpleDocTemplate(
                pdf_path,
                pagesize=letter,
                rightMargin=72,
                leftMargin=72,
                topMargin=72,
                bottomMargin=72
            )
            
            # Initialize styles
            styles = getSampleStyleSheet()
            title_style = ParagraphStyle(
                'CustomTitle',
                parent=styles['Heading1'],
                fontSize=24,
                spaceAfter=30
            )
            heading_style = ParagraphStyle(
                'CustomHeading',
                parent=styles['Heading2'],
                fontSize=18,
                spaceAfter=20
            )
            subheading_style = ParagraphStyle(
                'CustomSubHeading',
                parent=styles['Heading3'],
                fontSize=14,
                spaceAfter=10
            )
            body_style = ParagraphStyle(
                'CustomBody',
                parent=styles['Normal'],
                fontSize=12,
                spaceAfter=12
            )
            
            # Build content
            content = []
            
            # Title
            content.append(Paragraph("ASV Analysis Report", title_style))
            content.append(Spacer(1, 20))
            
            # Executive Summary
            self._add_executive_summary(content, heading_style, body_style)
            
            # Classification Analysis
            self._add_classification_analysis(content, heading_style, subheading_style, body_style)
            
            # Authentication Analysis
            self._add_authentication_analysis(content, heading_style, subheading_style, body_style)
            
            # Feature Analysis
            self._add_feature_analysis(content, heading_style, subheading_style, body_style)
            
            # Group Analysis
            self._add_group_analysis(content, heading_style, subheading_style, body_style)
            
            # Save document
            doc.build(content)
            print(f"PDF report generated: {pdf_path}")
            
        except Exception as e:
            print(f"Error generating PDF report: {str(e)}")
            print(traceback.format_exc())

    def _add_executive_summary(self, content: List, heading_style: ParagraphStyle, 
                             body_style: ParagraphStyle) -> None:
        """
        Add executive summary section to report

        Appends the section heading, one summary paragraph and a spacer to
        ``content``. Lines are joined with ``<br/>`` because a ReportLab
        ``Paragraph`` collapses plain newlines and indentation, which would
        otherwise merge the key findings into a single run-on paragraph.

        Args:
            content: List of flowables that is extended in place
            heading_style: Style used for the section heading
            body_style: Style used for the summary paragraph
        """
        content.append(Paragraph("1. Executive Summary", heading_style))

        # Calculate key metrics
        total_asvs = len(self.df)
        main_count = (self.df['predicted_status'] == 'main').sum()
        auth_count = (self.df['auth_status'] == 'authenticated').sum()

        # An empty table would otherwise divide by zero; report 0.0% instead
        main_pct = main_count / total_asvs * 100 if total_asvs else 0.0
        auth_pct = auth_count / total_asvs * 100 if total_asvs else 0.0

        summary_lines = [
            f"This analysis processed {total_asvs:,} ASVs, identifying "
            f"{main_count:,} ({main_pct:.1f}%) as main ASVs and "
            f"{auth_count:,} ({auth_pct:.1f}%) as authenticated sequences.",
            "",
            "Key Findings:",
            f"• Main/Candidate Classification achieved "
            f"{self.metrics['main_candidate']['f1']:.1%} F1 score",
            f"• Authentication Analysis achieved "
            f"{self.metrics['authentication']['f1']:.1%} F1 score",
            f"• Overall agreement with original labels: "
            f"{self.metrics['authentication_results']['agreement']:.1f}%",
        ]

        # Explicit line breaks survive Paragraph's whitespace normalisation
        content.append(Paragraph("<br/>".join(summary_lines), body_style))
        content.append(Spacer(1, 20))

    def _add_classification_analysis(self, content: List, heading_style: ParagraphStyle,
                                  subheading_style: ParagraphStyle,
                                  body_style: ParagraphStyle) -> None:
        """
        Add classification analysis section to report

        Appends the main/candidate performance metrics and, when the model
        exposes ``feature_importances_``, its five most important features
        to ``content`` (modified in place).

        Args:
            content: List of report elements being assembled
            heading_style: Style applied to the section heading
            subheading_style: Style applied to the sub-section headings
            body_style: Style applied to body paragraphs
        """
        # Local import keeps this block self-contained; escape() protects the
        # paragraph markup from feature names containing '&', '<' or '>'.
        from xml.sax.saxutils import escape

        content.append(Paragraph("2. Main/Candidate Classification Analysis", heading_style))

        # Performance Metrics
        content.append(Paragraph("2.1 Performance Metrics", subheading_style))
        mc_metrics = self.metrics['main_candidate']
        metric_lines = [
            "Classification Performance:",
            f"• Accuracy: {mc_metrics['accuracy']:.3f}",
            f"• Precision: {mc_metrics['precision']:.3f}",
            f"• Recall: {mc_metrics['recall']:.3f}",
            f"• F1 Score: {mc_metrics['f1']:.3f}",
            f"• ROC AUC: {mc_metrics['roc_auc']:.3f}",
            "",
            f"Optimal Classification Threshold: {self.thresholds['main_candidate']:.3f}",
        ]
        # Paragraph collapses plain newlines, so breaks are explicit <br/> tags
        content.append(Paragraph("<br/>".join(metric_lines), body_style))

        # Feature Importance
        content.append(Paragraph("2.2 Feature Importance", subheading_style))
        model = self.models['main_candidate']
        if hasattr(model, 'feature_importances_'):
            importance = pd.DataFrame({
                'Feature': self.features['main_candidate'],
                'Importance': model.feature_importances_
            }).sort_values('Importance', ascending=False)

            # head() keeps the five highest-ranked features
            top = importance.head()
            importance_lines = ["Key Features:"] + [
                f"• {escape(str(name))}: {score:.3f}"
                for name, score in zip(top['Feature'], top['Importance'])
            ]
            content.append(Paragraph("<br/>".join(importance_lines), body_style))

        content.append(Spacer(1, 20))

    def _add_authentication_analysis(self, content: List, heading_style: ParagraphStyle,
                                  subheading_style: ParagraphStyle,
                                  body_style: ParagraphStyle) -> None:
        """
        Add authentication analysis section to report

        Appends the overall authentication metrics and thresholds, followed
        by one paragraph per ASV type ('main' and 'candidate'), to
        ``content`` (modified in place).

        Args:
            content: List of report elements being assembled
            heading_style: Style applied to the section heading
            subheading_style: Style applied to the sub-section headings
            body_style: Style applied to body paragraphs
        """
        content.append(Paragraph("3. Authentication Analysis", heading_style))

        # Performance Metrics
        content.append(Paragraph("3.1 Authentication Performance", subheading_style))
        auth_metrics = self.metrics['authentication']
        auth_thresholds = self.thresholds['authentication']
        auth_lines = [
            "Overall Performance:",
            f"• Accuracy: {auth_metrics['accuracy']:.3f}",
            f"• Precision: {auth_metrics['precision']:.3f}",
            f"• Recall: {auth_metrics['recall']:.3f}",
            f"• F1 Score: {auth_metrics['f1']:.3f}",
            f"• ROC AUC: {auth_metrics['roc_auc']:.3f}",
            "",
            "Authentication Thresholds:",
            f"• Main ASVs: {auth_thresholds['main']:.3f}",
            f"• Candidate ASVs: {auth_thresholds['candidate']:.3f}",
        ]
        # Paragraph collapses plain newlines, so breaks are explicit <br/> tags
        content.append(Paragraph("<br/>".join(auth_lines), body_style))

        # Results by ASV Type
        content.append(Paragraph("3.2 Results by ASV Type", subheading_style))
        for status in ['main', 'candidate']:
            metrics = self.metrics['authentication_results'][f'{status}_metrics']
            type_lines = [
                f"{status.title()} ASVs:",
                f"• Total: {metrics['total']:,}",
                f"• Authenticated: {metrics['authenticated']:,}",
                f"• Authentication Rate: {metrics['authentication_rate']:.1f}%",
                f"• Agreement with Original Labels: {metrics['agreement']:.1f}%",
            ]
            content.append(Paragraph("<br/>".join(type_lines), body_style))

        content.append(Spacer(1, 20))

    def _add_feature_analysis(self, content: List, heading_style: ParagraphStyle,
                            subheading_style: ParagraphStyle,
                            body_style: ParagraphStyle) -> None:
        """
        Add feature analysis section to report

        Appends descriptive statistics (mean, std, min, max) for every
        feature of both feature sets, then lists feature pairs whose
        absolute correlation exceeds 0.7. ``content`` is modified in place.

        Args:
            content: List of report elements being assembled
            heading_style: Style applied to the section heading
            subheading_style: Style applied to the sub-section headings
            body_style: Style applied to body paragraphs
        """
        # Local import keeps this block self-contained; escape() protects the
        # paragraph markup from feature names containing '&', '<' or '>'.
        from xml.sax.saxutils import escape

        content.append(Paragraph("4. Feature Analysis", heading_style))

        # Feature Statistics
        content.append(Paragraph("4.1 Feature Statistics", subheading_style))
        for feature_set in ['main_candidate', 'authentication']:
            stats_lines = [f"{feature_set.replace('_', ' ').title()} Features:"]
            for feature in self.features[feature_set]:
                # Named 'summary' so the module-level scipy 'stats' import is not shadowed
                summary = self.df[feature].describe()
                stats_lines.extend([
                    "",
                    f"{escape(str(feature))}:",
                    f"• Mean: {summary['mean']:.3f}",
                    f"• Std: {summary['std']:.3f}",
                    f"• Min: {summary['min']:.3f}",
                    f"• Max: {summary['max']:.3f}",
                ])
            # Paragraph collapses plain newlines, so breaks are explicit <br/> tags
            content.append(Paragraph("<br/>".join(stats_lines), body_style))

        # Feature Correlations
        content.append(Paragraph("4.2 Feature Correlations", subheading_style))
        # A feature used by both models would otherwise appear as two columns
        # and be reported as perfectly correlated with itself.
        all_features = list(dict.fromkeys(
            self.features['main_candidate'] + self.features['authentication']
        ))
        corr_matrix = self.df[all_features].corr()
        corr_values = corr_matrix.to_numpy()
        corr_names = list(corr_matrix.columns)

        # Find strong correlations (upper triangle only: each pair once, no diagonal)
        strong_corr = []
        for i, j in zip(*np.triu_indices(len(corr_names), k=1)):
            if abs(corr_values[i, j]) > 0.7:
                strong_corr.append(
                    f"• {escape(str(corr_names[i]))} - {escape(str(corr_names[j]))}: "
                    f"{corr_values[i, j]:.3f}"
                )

        if strong_corr:
            corr_lines = ["Strong Feature Correlations:"] + strong_corr
            content.append(Paragraph("<br/>".join(corr_lines), body_style))

        content.append(Spacer(1, 20))

    def _add_group_analysis(self, content: List, heading_style: ParagraphStyle,
                          subheading_style: ParagraphStyle,
                          body_style: ParagraphStyle) -> None:
        """
        Add group analysis section to report

        Aggregates the results per ``project_readfile_id`` (ASV count,
        percentage predicted 'main', percentage 'authenticated') and appends
        a summary paragraph to ``content`` (modified in place).

        Args:
            content: List of report elements being assembled
            heading_style: Style applied to the section heading
            subheading_style: Accepted for signature parity with the other
                section builders; not used here
            body_style: Style applied to the summary paragraph
        """
        content.append(Paragraph("5. Group Analysis", heading_style))

        # Group Statistics
        group_stats = self.df.groupby('project_readfile_id').agg({
            'asv_id': 'count',
            'predicted_status': lambda x: (x == 'main').mean() * 100,
            'auth_status': lambda x: (x == 'authenticated').mean() * 100
        })
        group_sizes = group_stats['asv_id']

        stats_lines = [
            "Group Analysis Summary:",
            f"• Total Groups: {len(group_stats)}",
            f"• Average ASVs per Group: {group_sizes.mean():.1f}",
            f"• Average Main ASV Percentage: {group_stats['predicted_status'].mean():.1f}%",
            f"• Average Authentication Rate: {group_stats['auth_status'].mean():.1f}%",
            "",
            "Group Size Distribution:",
            f"• Minimum: {group_sizes.min():.0f}",
            f"• Maximum: {group_sizes.max():.0f}",
            f"• Median: {group_sizes.median():.0f}",
        ]

        # Paragraph collapses plain newlines, so breaks are explicit <br/> tags
        content.append(Paragraph("<br/>".join(stats_lines), body_style))
        content.append(Spacer(1, 20))

    def generate_summary_report(self) -> str:
        """
        Build the plain-text summary of the analysis

        The report has five numbered sections: basic statistics,
        classification performance, authentication performance, per-group
        statistics and recommendations. Groups whose authentication rate is
        more than one standard deviation below the mean are listed for review.

        Returns:
            str: Report lines joined with newlines
        """
        rule = "-" * 20
        frame = self.df

        # 1. Basic statistics
        report = [
            "ASV ANALYSIS SUMMARY REPORT",
            "=" * 50,
            "\n1. BASIC STATISTICS",
            rule,
            f"Total ASVs analyzed: {len(frame):,}",
            f"Main ASVs: {(frame['predicted_status'] == 'main').sum():,}",
            f"Authenticated ASVs: {(frame['auth_status'] == 'authenticated').sum():,}",
        ]

        # 2./3. Model performance
        mc_metrics = self.metrics['main_candidate']
        report += [
            "\n2. CLASSIFICATION PERFORMANCE",
            rule,
            f"Accuracy: {mc_metrics['accuracy']:.3f}",
            f"F1 Score: {mc_metrics['f1']:.3f}",
            f"ROC AUC: {mc_metrics['roc_auc']:.3f}",
        ]
        auth_metrics = self.metrics['authentication']
        report += [
            "\n3. AUTHENTICATION PERFORMANCE",
            rule,
            f"Accuracy: {auth_metrics['accuracy']:.3f}",
            f"F1 Score: {auth_metrics['f1']:.3f}",
            f"ROC AUC: {auth_metrics['roc_auc']:.3f}",
        ]

        # 4. Per-group aggregates: size plus percentage main / authenticated
        group_stats = frame.groupby('project_readfile_id').agg({
            'asv_id': 'count',
            'predicted_status': lambda col: (col == 'main').mean() * 100,
            'auth_status': lambda col: (col == 'authenticated').mean() * 100,
        })
        auth_rate = group_stats['auth_status']
        report += [
            "\n4. GROUP ANALYSIS",
            rule,
            f"Total Groups: {len(group_stats)}",
            f"Average ASVs per Group: {group_stats['asv_id'].mean():.1f}",
            f"Average Authentication Rate: {auth_rate.mean():.1f}%",
        ]

        # 5. Recommendations driven by the F1 scores
        report += ["\n5. RECOMMENDATIONS", rule]
        if mc_metrics['f1'] < 0.8:
            report.append("• Consider reviewing main/candidate classification criteria")
        if auth_metrics['f1'] < 0.8:
            report.append("• Authentication thresholds may need adjustment")

        # Groups more than one standard deviation below the mean rate
        cutoff = auth_rate.mean() - auth_rate.std()
        flagged = group_stats.index[(auth_rate < cutoff).to_numpy()]
        if len(flagged) > 0:
            report.append("\nGroups requiring review:")
            report.extend(f"• {group}" for group in flagged)

        return "\n".join(report)

    def save_analysis_state(self, filepath: str) -> bool:
        """
        Persist the current analysis state as a pickle file

        The stored dictionary holds a version tag, a timestamp, the results
        table converted with ``DataFrame.to_dict()``, the feature lists,
        thresholds, metrics and parameters, and the three models serialized
        individually via ``_serialize_model``.

        Args:
            filepath: Destination of the state file

        Returns:
            bool: True when the file was written, False if any error occurred
                (the error message is printed)
        """
        model_names = ('main_candidate', 'authentication', 'scaler')
        try:
            print(f"\nSaving analysis state to {filepath}...")

            # Plain data first, then the models as byte strings
            data_section = {
                'df': self.df.to_dict(),
                'features': self.features,
                'thresholds': self.thresholds,
                'metrics': self.metrics,
                'parameters': self.params,
            }
            model_section = {
                name: self._serialize_model(self.models[name])
                for name in model_names
            }
            snapshot = {
                'version': '1.0',
                'timestamp': pd.Timestamp.now().isoformat(),
                'data': data_section,
                'models': model_section,
            }

            with open(filepath, 'wb') as handle:
                pickle.dump(snapshot, handle)

            print("Analysis state saved successfully")
            return True

        except Exception as e:
            print(f"Error saving analysis state: {str(e)}")
            return False

    def load_analysis_state(self, filepath: str) -> bool:
        """
        Load analysis state from file

        The whole state is read and rebuilt into local objects first and
        only assigned to the analyzer once every part has been restored, so
        a corrupt or incomplete file leaves the current state untouched.

        Args:
            filepath: Path to state file

        Returns:
            bool: True when the state was restored, False if any error
                occurred (the error message is printed)
        """
        model_names = ('main_candidate', 'authentication', 'scaler')
        try:
            print(f"\nLoading analysis state from {filepath}...")

            # NOTE(security): unpickling can execute arbitrary code. Only load
            # state files that were produced by this pipeline and come from a
            # trusted location.
            with open(filepath, 'rb') as f:
                state = pickle.load(f)

            # Validate version (a mismatch is reported but not fatal)
            if state.get('version', '0.0') != '1.0':
                print("Warning: State file version mismatch")

            data = state['data']
            stored_models = state['models']
            restored = {
                'df': pd.DataFrame.from_dict(data['df']),
                'features': data['features'],
                'thresholds': data['thresholds'],
                'metrics': data['metrics'],
                'params': data['parameters'],
                'models': {
                    name: self._deserialize_model(stored_models[name])
                    for name in model_names
                },
            }
            timestamp = state['timestamp']

        except Exception as e:
            print(f"Error loading analysis state: {str(e)}")
            return False

        # Commit only after everything was restored successfully
        for attribute, value in restored.items():
            setattr(self, attribute, value)

        print("Analysis state loaded successfully")
        print(f"State timestamp: {timestamp}")
        return True

    def _serialize_model(self, model: Any) -> bytes:
        """
        Serialize model object to bytes
        
        Args:
            model: Model object to serialize
            
        Returns:
            bytes: Serialized model
        """
        return pickle.dumps(model)

    def _deserialize_model(self, model_bytes: bytes) -> Any:
        """
        Deserialize model from bytes
        
        Args:
            model_bytes: Serialized model
            
        Returns:
            Any: Deserialized model
        """
        return pickle.loads(model_bytes)

    def validate_results(self) -> List[str]:
        """
        Perform comprehensive validation of analysis results

        Checks, in order: missing values in the results table, F1 scores of
        both models (below 0.7), variation of group sizes (coefficient of
        variation above 0.5), spread of per-group authentication rates
        (standard deviation above 0.2) and the share of outliers (|z| > 3
        in more than 1% of rows) for every model feature.

        Returns:
            List[str]: List of validation warnings (empty when all checks pass)
        """
        # Named 'issues' so the module-level 'warnings' import is not shadowed
        issues: List[str] = []

        # Check for missing values
        missing_values = self.df.isnull().sum()
        if missing_values.any():
            issues.append(f"Missing values found in columns: "
                          f"{missing_values[missing_values > 0].index.tolist()}")

        # Check model performance
        if self.metrics['main_candidate']['f1'] < 0.7:
            issues.append("Main/Candidate classification F1 score below 0.7")
        if self.metrics['authentication']['f1'] < 0.7:
            issues.append("Authentication F1 score below 0.7")

        # Check group distributions
        group_sizes = self.df.groupby('project_readfile_id').size()
        if group_sizes.std() / group_sizes.mean() > 0.5:
            issues.append("High variation in group sizes detected")

        # Check authentication rates
        auth_rates = self.df.groupby('project_readfile_id')['auth_status'].apply(
            lambda x: (x == 'authenticated').mean()
        )
        if auth_rates.std() > 0.2:
            issues.append("High variation in authentication rates across groups")

        # Check feature distributions; a feature used by both models is
        # checked (and reported) only once.
        all_features = list(dict.fromkeys(
            self.features['main_candidate'] + self.features['authentication']
        ))
        max_outliers = len(self.df) * 0.01
        for feature in all_features:
            values = self.df[feature].to_numpy(dtype=float)
            # nan_policy='omit': with the default 'propagate' a single missing
            # value turns every z-score into NaN and the check silently passes.
            z_scores = np.abs(stats.zscore(values, nan_policy='omit'))
            outliers = int(np.sum(z_scores > 3))
            if outliers > max_outliers:
                issues.append(f"High number of outliers in feature {feature}")

        return issues

    def suggest_improvements(self) -> Dict[str, List[str]]:
        """
        Suggest potential improvements based on analysis results

        Suggestions are derived from the stored metrics, the main/candidate
        model's feature importances (when the model exposes them), pairwise
        correlations between main/candidate features (|r| > 0.9) and the
        variation in group sizes.

        Returns:
            Dict[str, List[str]]: Suggestions keyed by 'classification',
                'authentication', 'features' and 'general'; a list is empty
                when nothing applies
        """
        suggestions: Dict[str, List[str]] = {
            'classification': [],
            'authentication': [],
            'features': [],
            'general': []
        }

        # Classification suggestions
        mc_metrics = self.metrics['main_candidate']
        if mc_metrics['precision'] < mc_metrics['recall']:
            suggestions['classification'].append(
                "Consider increasing classification threshold for higher precision"
            )
        if mc_metrics['f1'] < 0.8:
            suggestions['classification'].append(
                "Review feature importance and consider adding new features"
            )

        # Authentication suggestions
        auth_metrics = self.metrics['authentication']
        if auth_metrics['precision'] < 0.8:
            suggestions['authentication'].append(
                "Consider stricter authentication criteria for candidates"
            )

        # Feature suggestions. Not every estimator exposes
        # feature_importances_, so guard the access the same way the PDF
        # report does instead of raising AttributeError.
        mc_features = self.features['main_candidate']
        model = self.models['main_candidate']
        if hasattr(model, 'feature_importances_'):
            feature_imp = pd.DataFrame({
                'feature': mc_features,
                'importance': model.feature_importances_
            }).sort_values('importance', ascending=False)

            # After the descending sort the last row is the weakest feature
            if not feature_imp.empty and feature_imp['importance'].iloc[-1] < 0.01:
                suggestions['features'].append(
                    f"Consider removing low-importance feature: {feature_imp.iloc[-1]['feature']}"
                )

        # Check feature correlations (upper triangle: each pair once, no diagonal)
        corr_matrix = self.df[mc_features].corr()
        corr_values = np.abs(corr_matrix.to_numpy())
        corr_names = list(corr_matrix.index)
        high_corr_pairs = {
            tuple(sorted([corr_names[i], corr_names[j]]))
            for i, j in zip(*np.triu_indices(len(corr_names), k=1))
            if corr_values[i, j] > 0.9
        }

        if high_corr_pairs:
            # sorted() makes the message reproducible between runs; plain set
            # iteration order depends on string hashing.
            suggestions['features'].append(
                "Consider removing one feature from highly correlated pairs: " +
                ", ".join([f"({f1}, {f2})" for f1, f2 in sorted(high_corr_pairs)])
            )

        # General suggestions
        group_sizes = self.df.groupby('project_readfile_id').size()
        if group_sizes.std() / group_sizes.mean() > 0.5:
            suggestions['general'].append(
                "Consider normalizing features within groups due to high size variation"
            )

        return suggestions

    def export_for_validation(self, output_path: str) -> bool:
        """
        Write a spreadsheet that an expert can use to check the predictions

        Selected columns of the results table are copied under review-friendly
        names, three blank columns ('Status_Correct', 'Auth_Correct', 'Notes')
        are appended for the reviewer, and the table is written with
        ``DataFrame.to_excel`` without the index.

        Args:
            output_path: Destination of the validation file

        Returns:
            bool: True when the file was written, False if any error occurred
                (the error message is printed)
        """
        # Exported column name -> source column in the results table
        column_sources = {
            'ASV_ID': 'asv_id',
            'Group_ID': 'project_readfile_id',
            'Original_Status': 'step1',
            'Predicted_Status': 'predicted_status',
            'Status_Confidence': 'mc_probability',
            'Original_Authentication': 'step2',
            'Predicted_Authentication': 'auth_status',
            'Auth_Confidence': 'auth_probability',
            'Read_Count': 'read_count',
            'Sample_Coverage': 'count_read',
            'Distance_Ratio': 'dist_ratio',
        }
        reviewer_columns = ('Status_Correct', 'Auth_Correct', 'Notes')

        try:
            validation_df = pd.DataFrame({
                exported: self.df[source]
                for exported, source in column_sources.items()
            })

            # Blank cells for the reviewer to fill in
            for blank_column in reviewer_columns:
                validation_df[blank_column] = ''

            validation_df.to_excel(output_path, index=False)

            print(f"Validation file exported to {output_path}")
            return True

        except Exception as e:
            print(f"Error exporting validation file: {str(e)}")
            return False

def run_complete_pipeline(metadata_path: str, fasta_path: str, output_dir: str) -> None:
    """
    Run the full ASV analysis and write every output into one directory

    Creates ``output_dir``, runs ``IntegratedASVAnalyzer.run_complete_analysis``
    and, if that reports success, saves the analysis state, exports the
    validation spreadsheet, and prints and saves validation warnings and
    improvement suggestions. Any exception is caught and printed with its
    traceback; nothing is raised to the caller.

    Args:
        metadata_path: Path to the metadata table (passed to the analyzer)
        fasta_path: Path to FASTA sequence file (passed to the analyzer)
        output_dir: Directory for outputs
    """
    def in_output_dir(filename: str) -> str:
        """Return the path of ``filename`` inside the output directory."""
        return os.path.join(output_dir, filename)

    try:
        os.makedirs(output_dir, exist_ok=True)

        print("\nInitializing ASV analyzer...")
        analyzer = IntegratedASVAnalyzer(output_dir)

        completed = analyzer.run_complete_analysis(
            metadata_path=metadata_path,
            fasta_path=fasta_path,
            output_path=in_output_dir('asv_analysis_results.xlsx')
        )
        if not completed:
            print("\nAnalysis pipeline failed!")
            return

        # Persist state and the expert-validation spreadsheet
        analyzer.save_analysis_state(in_output_dir('analysis_state.pkl'))
        analyzer.export_for_validation(in_output_dir('validation_file.xlsx'))

        # Validation warnings: shown on screen and saved only if there are any
        findings = analyzer.validate_results()
        if findings:
            print("\nValidation Warnings:")
            for finding in findings:
                print(f"- {finding}")

            with open(in_output_dir('validation_warnings.txt'), 'w') as f:
                f.write("\n".join(findings))

        # Improvement suggestions: only categories that have entries are shown
        suggestions = analyzer.suggest_improvements()
        sections = [
            (category.title(), items)
            for category, items in suggestions.items()
            if items
        ]

        print("\nImprovement Suggestions:")
        for title, items in sections:
            print(f"\n{title}:")
            for item in items:
                print(f"- {item}")

        with open(in_output_dir('improvement_suggestions.txt'), 'w') as f:
            for title, items in sections:
                f.write(f"\n{title}:\n")
                for item in items:
                    f.write(f"- {item}\n")

        print("\nAnalysis pipeline completed successfully!")
        print(f"Results directory: {output_dir}")

    except Exception as e:
        print(f"\nError in analysis pipeline: {str(e)}")
        print("\nFull error details:")
        print(traceback.format_exc())

# Script entry point: runs the pipeline on the example data set
if __name__ == "__main__":
    # Banner: title line followed by a 50-character rule
    print("\nMetabarcoding ASV Analysis Pipeline", "=" * 50, sep="\n")

    # Example locations on the author's machine; adjust before running elsewhere
    output_dir = "/Users/sarawut/Library/CloudStorage/OneDrive-ImperialCollegeLondon/2024_R/R_analysis/Chapter2_Data_generation/Metabarcoding_Machine_Learning/result"
    fasta_path = "/Users/sarawut/Library/CloudStorage/OneDrive-ImperialCollegeLondon/2024_R/R_analysis/Chapter2_Data_generation/Metabarcoding_Machine_Learning/MBCTH.fasta"
    metadata_path = "/Users/sarawut/Library/CloudStorage/OneDrive-ImperialCollegeLondon/2024_R/R_analysis/Chapter2_Data_generation/Metabarcoding_Machine_Learning/Metabarcoding_Machine_Learning.csv"

    # Run pipeline
    run_complete_pipeline(
        metadata_path=metadata_path,
        fasta_path=fasta_path,
        output_dir=output_dir,
    )


Metabarcoding ASV Analysis Pipeline

Initializing ASV analyzer...

=== STARTING COMPLETE ASV ANALYSIS ===


Analysis Progress:   0%|          | 0/4 [00:00<?, ?it/s]


=== LOADING AND PREPROCESSING DATA ===

=== LOADING AND PREPROCESSING DATA ===
--------------------------------------------------

Loading metadata...

Available columns in your data:
['well', 'project_readfile_id', 'image_id', 'sample_count', 'asv_count', 'asv_id', 'read_count', 'total_read', 'count_read', 'percentage', 'nearest_main_ASV', 'nearest_main_dist', 'nearest_cand_ASV', 'nearest_cand_dist', 'family_tree', 'subfamily_tree', 'step1', 'step2']

Filtering outgroups...
Removed 1286 outgroup entries

Loading sequences...


Reading FASTA:  33%|███▎      | 781k/2.37M [00:00<00:00, 25.2MB/s]



Creating basic features...

Creating advanced features...

Creating and validating features...





Creating basic features...
Creating basic features...

Creating taxonomic features...
Creating taxonomic features...





Creating abundance features...
Creating abundance features...




Abundance features created successfully

Creating distance features...
Creating distance features...

Creating quality metrics...
Creating quality metrics...

Cleaning features...
Cleaning features...
Cleaning missing values in nearest_main_ASV
Cleaning missing values in nearest_cand_ASV
Cleaning missing values in subfamily_tree
Cleaning missing values in nearest_main_family
Cleaning missing values in nearest_main_subfamily
Cleaning missing values in relative_cand_dist
Applied stricter winsorization to log_read (capped 291 outliers)
Applied winsorization to group_size_factor (capped 0 outliers)



Cleaning features: 100%|██████████| 10/10 [00:00<00:00, 1208.42it/s]
Feature Creation: 100%|██████████| 7/7 [00:00<00:00,  8.20it/s]
Analysis Progress:  25%|██▌       | 1/4 [00:01<00:03,  1.04s/it]

Found 2 outliers in log_group_size
Found 89 outliers in read_count
Found 147 outliers in total_read
Found 16 outliers in nearest_main_dist
Found 69 outliers in nearest_cand_dist
Found 106 outliers in dist_ratio

Validating features...

Feature Summary:
--------------------------------------------------

Basic Statistics:
       sample_count  asv_count  read_count  total_read  count_read   
count      7656.000   7656.000    7656.000    7656.000    7656.000  \
mean         20.990     82.090      17.541     170.915       3.653   
std          18.205     54.396      22.208     254.706       3.725   
min           1.000      1.000       1.000       4.000       1.000   
25%           8.000     40.000       2.000       7.000       1.000   
50%          15.000     64.000       5.000      18.000       2.000   
75%          28.000    126.000      25.000     260.000       5.000   
max          69.000    231.000      59.500     639.500      25.000   

       percentage  nearest_main_dist  nearest_




Executing: Preparing data

Preparing classification data...

Class Distribution:
Candidate: 5701 (74.46%)
Main: 1955 (25.54%)

Executing: Selecting best model

Performing model selection...



[A
[A


RandomForest Results:
Best F1 score: 0.9997
Best parameters: {'class_weight': 'balanced', 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}



Evaluating models: 100%|██████████| 2/2 [00:09<00:00,  4.75s/it]
Classification Progress: 100%|██████████| 4/4 [00:09<00:00,  2.40s/it]
Analysis Progress:  50%|█████     | 2/4 [00:10<00:12,  6.07s/it]


GradientBoosting Results:
Best F1 score: 0.9994
Best parameters: {'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 100, 'subsample': 0.8}

Selected model: RandomForestClassifier
Best cross-validation F1 score: 0.9997

Executing: Evaluating model

Evaluating model performance...

Optimal threshold: 0.9900
ROC AUC: 1.0000
PR AUC: 1.0000

Classification Report:
              precision    recall  f1-score   support

   Candidate       1.00      1.00      1.00      1141
        Main       1.00      1.00      1.00       391

    accuracy                           1.00      1532
   macro avg       1.00      1.00      1.00      1532
weighted avg       1.00      1.00      1.00      1532


Executing: Applying classification

Applying classification to all data...

Classification Results:
------------------------------

Confusion Matrix:
step1             candidate  main   All
predicted_status                       
candidate              5701     1  5702
main                      0  1954 




Executing: Preparing authentication data

Preparing authentication data...

Authentication Class Distribution:
Unauthenticated: 5369 (70.13%)
Authenticated: 2287 (29.87%)

Executing: Training authentication model

Training authentication model...

Cross-validation scores:
Mean F1: 0.9992 (±0.0022)


Authentication Analysis: 100%|██████████| 4/4 [00:04<00:00,  1.01s/it]
Analysis Progress:  75%|███████▌  | 3/4 [00:14<00:05,  5.14s/it]


Feature Importance:
             feature  importance
0         read_count    0.411708
4         dist_ratio    0.227022
3  nearest_cand_dist    0.175075
2  nearest_main_dist    0.109529
1         total_read    0.071033
5     log_group_size    0.005633

Executing: Evaluating authentication model

Evaluating authentication model performance...

Authentication Model Performance:
Accuracy: 0.9980
Precision: 1.0000
Recall: 0.9934
F1: 0.9967
Roc_Auc: 0.9998
Matthews_Corrcoef: 0.9953

Executing: Applying authentication

Applying authentication rules...

Authentication Results:
------------------------------

Overall Results:
Total ASVs authenticated: 2120
Overall authentication rate: 27.69%
Agreement with original labels: 97.82%

Results by ASV Type:

Main ASVs:
Total: 1954
Authenticated: 1954
Authentication rate: 25.52%
Agreement: 25.52%

Candidate ASVs:
Total: 5702
Authenticated: 166
Authentication rate: 2.17%
Agreement: 72.30%

Distance Metrics for Authenticated ASVs:
Mean distance ratio: 

Analysis Progress: 100%|██████████| 4/4 [00:16<00:00,  4.09s/it]

Detailed report generated: /Users/sarawut/Library/CloudStorage/OneDrive-ImperialCollegeLondon/2024_R/R_analysis/Chapter2_Data_generation/Metabarcoding_Machine_Learning/result/asv_analysis_results_report.txt
PDF report generated: /Users/sarawut/Library/CloudStorage/OneDrive-ImperialCollegeLondon/2024_R/R_analysis/Chapter2_Data_generation/Metabarcoding_Machine_Learning/result/asv_analysis_results_report.pdf

Results exported successfully:
- Excel file: /Users/sarawut/Library/CloudStorage/OneDrive-ImperialCollegeLondon/2024_R/R_analysis/Chapter2_Data_generation/Metabarcoding_Machine_Learning/result/asv_analysis_results.xlsx
- JSON file: /Users/sarawut/Library/CloudStorage/OneDrive-ImperialCollegeLondon/2024_R/R_analysis/Chapter2_Data_generation/Metabarcoding_Machine_Learning/result/asv_analysis_results.json
- Report file: /Users/sarawut/Library/CloudStorage/OneDrive-ImperialCollegeLondon/2024_R/R_analysis/Chapter2_Data_generation/Metabarcoding_Machine_Learning/result/asv_analysis_results_




Validation file exported to /Users/sarawut/Library/CloudStorage/OneDrive-ImperialCollegeLondon/2024_R/R_analysis/Chapter2_Data_generation/Metabarcoding_Machine_Learning/result/validation_file.xlsx

- High variation in group sizes detected

Improvement Suggestions:

Features:
- Consider removing low-importance feature: group_size_quantile
- Consider removing one feature from highly correlated pairs: (log_read, rank_read)

General:
- Consider normalizing features within groups due to high size variation

Analysis pipeline completed successfully!
Results directory: /Users/sarawut/Library/CloudStorage/OneDrive-ImperialCollegeLondon/2024_R/R_analysis/Chapter2_Data_generation/Metabarcoding_Machine_Learning/result
