In [1]:
#!/usr/bin/env python3
"""
Methylation Expression Matrix Generator - Memory Optimized for Jupyter
Generate three types of methylation expression matrices with minimal memory usage
"""

import pandas as pd
import numpy as np
import os
import gc
from collections import defaultdict
from intervaltree import IntervalTree

class MethylationMatrixGenerator:
    def __init__(self, batch_size=2):
        # Path configuration
        self.raw_data_dir = "/Users/heweilin/Desktop/P056_Code_2/Raw_Data"
        self.samples_dir = os.path.join(self.raw_data_dir, "MethylationLevels_samples")
        self.output_dir = "/Users/heweilin/Desktop/P056_Code_2/Processed_Data"
        
        # Memory optimization settings
        self.batch_size = batch_size  # Process only 2 samples at a time
        self.chunk_size = 100000      # Process 100k CpGs per chunk
        
        # Annotation file paths
        self.all_dmr_file = os.path.join(self.raw_data_dir, "3DNA_all.csv")
        self.significant_dmr_file = os.path.join(self.raw_data_dir, "4DNA_DMRs.csv")
        
        # Ensure output directory exists
        os.makedirs(self.output_dir, exist_ok=True)
        
        # Data storage using IntervalTree for efficient interval queries
        self.all_region_trees = defaultdict(IntervalTree)
        self.promoter_region_trees = defaultdict(IntervalTree)
        self.significant_region_trees = defaultdict(IntervalTree)
        self.sample_names = []  # Only store sample names, not data
        
    def get_gene_name(self, row):
        """Extract gene name with fallback strategy"""
        gene_name = str(row['HGNC.symbol']).strip()
        if not gene_name or gene_name == 'nan':
            if 'annot.symbol' in row:
                gene_name = str(row['annot.symbol']).strip()
        if not gene_name or gene_name == 'nan':
            return None
        return gene_name
    
    def get_region_priority(self, annot_type):
        """Get region type priority for deduplication"""
        annot_type = str(annot_type).lower()
        if any(keyword in annot_type for keyword in ['promoter', 'hg38_genes_promoters', '1to5kb']):
            return 3
        elif 'exon' in annot_type:
            return 2
        elif 'intron' in annot_type:
            return 1
        else:
            return 0
    
    def is_promoter_region(self, annot_type):
        """Check if annotation type indicates promoter region (strict definition)"""
        annot_type = str(annot_type).lower()
        promoter_keywords = ['promoter', '1to5kb', 'hg38_genes_promoters']
        return any(keyword in annot_type for keyword in promoter_keywords)
        
    def load_dmr_annotations(self):
        """Load DNA methylation annotation files with memory optimization"""
        print("Loading DNA methylation annotation files...")
        
        # Load and process all DMR data in chunks
        print("Processing all DNA methylation regions...")
        all_regions_dict = {}
        
        chunksize = 50000  # Process 50k rows at a time
        for chunk in pd.read_csv(self.all_dmr_file, chunksize=chunksize, low_memory=False):
            for _, row in chunk.iterrows():
                try:
                    gene_name = self.get_gene_name(row)
                    if not gene_name:
                        continue
                        
                    chr_info = str(row['seqnames'])
                    if not chr_info.startswith('chr'):
                        chr_info = 'chr' + chr_info
                        
                    start_pos = int(row['start'])
                    end_pos = int(row['end'])
                    annot_type = row['annot.type']
                    
                    region_key = (gene_name, chr_info, start_pos, end_pos)
                    current_priority = self.get_region_priority(annot_type)
                    
                    if region_key not in all_regions_dict:
                        all_regions_dict[region_key] = (current_priority, annot_type)
                    else:
                        existing_priority, existing_annot = all_regions_dict[region_key]
                        if current_priority > existing_priority:
                            all_regions_dict[region_key] = (current_priority, annot_type)
                            
                except Exception as e:
                    continue
            
            # Force garbage collection after each chunk
            gc.collect()
        
        print(f"Loaded {len(all_regions_dict)} unique regions")
        
        # Build IntervalTrees from deduplicated regions
        promoter_count = 0
        for (gene_name, chr_info, start_pos, end_pos), (priority, annot_type) in all_regions_dict.items():
            self.all_region_trees[chr_info].addi(start_pos, end_pos + 1, gene_name)
            
            if self.is_promoter_region(annot_type):
                self.promoter_region_trees[chr_info].addi(start_pos, end_pos + 1, gene_name)
                promoter_count += 1
        
        # Clear memory
        del all_regions_dict
        gc.collect()
        
        # Load and process significant DMR data
        print("Processing significant DMR regions...")
        significant_count = 0
        
        for chunk in pd.read_csv(self.significant_dmr_file, chunksize=chunksize, low_memory=False):
            for _, row in chunk.iterrows():
                try:
                    gene_name = self.get_gene_name(row)
                    if not gene_name:
                        continue
                        
                    chr_info = str(row['seqnames'])
                    if not chr_info.startswith('chr'):
                        chr_info = 'chr' + chr_info
                        
                    start_pos = int(row['start'])
                    end_pos = int(row['end'])
                    annot_type = row['annot.type']
                    
                    if self.is_promoter_region(annot_type):
                        self.significant_region_trees[chr_info].addi(start_pos, end_pos + 1, gene_name)
                        significant_count += 1
                        
                except Exception as e:
                    continue
            
            gc.collect()
        
        all_count = sum(len(tree) for tree in self.all_region_trees.values())
        print(f"All DNA methylation regions: {all_count}")
        print(f"Promoter regions: {promoter_count}")
        print(f"Significant promoter DMR regions: {significant_count}")
        
    def get_sample_files(self):
        """Get list of sample files"""
        sample_files = [f for f in os.listdir(self.samples_dir) 
                       if f.endswith('.txt') and 'mC_level_Identification_stat' in f]
        self.sample_names = [f.split('.')[0] for f in sample_files]
        return [(f.split('.')[0], os.path.join(self.samples_dir, f)) for f in sample_files]
    
    def load_sample_chunk(self, sample_name, file_path):
        """Load a single sample's data in chunks to save memory"""
        print(f"Loading sample: {sample_name}")
        
        sample_cpgs = {}
        total_sites = 0
        
        try:
            # Read in chunks to control memory usage
            for chunk in pd.read_csv(file_path, delimiter='\t', header=None, 
                                   chunksize=self.chunk_size, on_bad_lines='skip', low_memory=False):
                
                for _, row in chunk.iterrows():
                    try:
                        if len(row) < 7:
                            continue
                        
                        total_sites += 1
                        
                        # Filter for CG context only
                        context = str(row.iloc[6]).strip()
                        if context != 'CG':
                            continue
                            
                        chr_name = str(row.iloc[0]).strip()
                        if chr_name == '' or chr_name == 'nan':
                            continue
                            
                        if not chr_name.startswith('chr'):
                            chr_name = 'chr' + chr_name
                        
                        pos = int(row.iloc[1])
                        ml_value = float(row.iloc[5])
                        
                        if 0 <= ml_value <= 1:
                            sample_cpgs[(chr_name, pos)] = ml_value
                        
                    except (ValueError, IndexError, TypeError):
                        continue
                
                # Show progress and force garbage collection
                if total_sites % (self.chunk_size * 10) == 0:
                    print(f"  Processed {total_sites:,} sites...")
                    gc.collect()
            
            print(f"Sample {sample_name}: {len(sample_cpgs):,} CpG sites loaded")
            return sample_cpgs
            
        except Exception as e:
            print(f"Error loading sample {sample_name}: {e}")
            return {}
    
    def process_sample_against_regions(self, sample_name, sample_cpgs, region_trees):
        """Process a single sample against region trees"""
        gene_values = defaultdict(list)
        matches = 0
        
        # Process by chromosome to improve cache locality
        chr_grouped = defaultdict(list)
        for (chr_name, pos), ml_value in sample_cpgs.items():
            chr_grouped[chr_name].append((pos, ml_value))
        
        for chr_name, pos_values in chr_grouped.items():
            if chr_name not in region_trees:
                continue
                
            tree = region_trees[chr_name]
            for pos, ml_value in pos_values:
                overlapping_intervals = tree[pos]
                if overlapping_intervals:
                    matches += len(overlapping_intervals)
                    for interval in overlapping_intervals:
                        gene_values[interval.data].append(ml_value)
        
        # Calculate averages immediately to save memory
        gene_averages = {}
        for gene, values in gene_values.items():
            gene_averages[gene] = np.mean(values)
        
        print(f"  Sample {sample_name}: {matches:,} CpG-region matches")
        return gene_averages
    
    def build_gene_expression_matrix(self, region_trees, matrix_name):
        """Build gene expression matrix with memory optimization"""
        print(f"Building {matrix_name}...")
        
        sample_files = self.get_sample_files()
        all_gene_data = defaultdict(dict)
        
        # Process samples in batches
        for i in range(0, len(sample_files), self.batch_size):
            batch = sample_files[i:i + self.batch_size]
            print(f"Processing batch {i//self.batch_size + 1}/{(len(sample_files)-1)//self.batch_size + 1}")
            
            for sample_name, file_path in batch:
                # Load sample data
                sample_cpgs = self.load_sample_chunk(sample_name, file_path)
                
                if sample_cpgs:
                    # Process against regions
                    gene_averages = self.process_sample_against_regions(
                        sample_name, sample_cpgs, region_trees)
                    
                    # Store results
                    for gene, avg_value in gene_averages.items():
                        all_gene_data[gene][sample_name] = avg_value
                
                # Clear sample data from memory
                del sample_cpgs
                gc.collect()
        
        # Build final matrix
        if all_gene_data:
            all_genes = sorted(list(all_gene_data.keys()))
            all_samples = sorted(self.sample_names)
            
            print(f"Building matrix with {len(all_genes)} genes and {len(all_samples)} samples...")
            
            matrix_data = []
            for gene in all_genes:
                row_data = []
                for sample in all_samples:
                    value = all_gene_data[gene].get(sample, np.nan)
                    row_data.append(value)
                matrix_data.append(row_data)
            
            df = pd.DataFrame(matrix_data, index=all_genes, columns=all_samples)
            df.index.name = 'Gene_Symbol'
            
            print(f"Matrix shape: {df.shape}")
            return df
        else:
            print("No matching genes found")
            return pd.DataFrame()
    
    def generate_matrices(self):
        """Generate three methylation expression matrices"""
        print("Starting methylation expression matrix generation...")
        
        # 1. All DNA methylation regions
        print("\n=== Matrix 1: All DNA Methylation ===")
        df1 = self.build_gene_expression_matrix(self.all_region_trees, "All DNA methylation")
        output_file1 = os.path.join(self.output_dir, "1_PD_AllGenes_AllCpGs.csv")
        df1.to_csv(output_file1)
        print(f"Matrix 1 saved: {output_file1}")
        
        # Clear memory before next matrix
        gc.collect()
        
        # 2. Promoter region CpGs
        print("\n=== Matrix 2: Promoter Region CpGs ===")
        df2 = self.build_gene_expression_matrix(self.promoter_region_trees, "Promoter regions")
        output_file2 = os.path.join(self.output_dir, "1_PD_PromoterRegion_CpGs.csv")
        df2.to_csv(output_file2)
        print(f"Matrix 2 saved: {output_file2}")
        
        gc.collect()
        
        # 3. Promoter region significant CpGs
        print("\n=== Matrix 3: Promoter Significant DMRs ===")
        df3 = self.build_gene_expression_matrix(self.significant_region_trees, "Significant promoter DMRs")
        output_file3 = os.path.join(self.output_dir, "1_PD_PromoterRegion_SignificantCpGs.csv")
        df3.to_csv(output_file3)
        print(f"Matrix 3 saved: {output_file3}")
        
        return df1, df2, df3
    
    def display_matrix_preview(self, df, matrix_name):
        """Display first 5 rows of matrix with dimensions"""
        print(f"\n=== {matrix_name} ===")
        print(f"Dimensions: {df.shape[0]} rows x {df.shape[1]} columns")
        if not df.empty:
            print("First 5 rows:")
            print(df.head())
        else:
            print("Empty matrix")
    
    def run(self):
        """Run complete pipeline with memory optimization"""
        print("=== Memory-Optimized Methylation Matrix Generation ===")
        print(f"Batch size: {self.batch_size} samples")
        print(f"Chunk size: {self.chunk_size:,} CpGs per chunk")
        
        # Step 1: Load DMR annotations
        self.load_dmr_annotations()
        gc.collect()
        
        # Step 2 & 3: Generate matrices (sample loading integrated)
        matrices = self.generate_matrices()
        
        if matrices:
            df1, df2, df3 = matrices
            
            # Display previews
            self.display_matrix_preview(df1, "Matrix 1: All DNA Methylation")
            self.display_matrix_preview(df2, "Matrix 2: Promoter Region CpGs") 
            self.display_matrix_preview(df3, "Matrix 3: Promoter Significant DMRs")
            
            print(f"\nAll files saved to: {self.output_dir}")
        
        print("=== Generation Completed ===")
        return matrices

def main():
    """Main function"""
    # For Jupyter, use smaller batch size to prevent memory issues
    generator = MethylationMatrixGenerator(batch_size=1)  # Process 1 sample at a time
    matrices = generator.run()

if __name__ == "__main__":
    main()

=== Memory-Optimized Methylation Matrix Generation ===
Batch size: 1 samples
Chunk size: 100,000 CpGs per chunk
Loading DNA methylation annotation files...
Processing all DNA methylation regions...
Loaded 96497 unique regions
Processing significant DMR regions...
All DNA methylation regions: 96497
Promoter regions: 38079
Significant promoter DMR regions: 69553
Starting methylation expression matrix generation...

=== Matrix 1: All DNA Methylation ===
Building All DNA methylation...
Processing batch 1/50
Loading sample: P64d
  Processed 1,000,000 sites...
  Processed 2,000,000 sites...
  Processed 3,000,000 sites...
Sample P64d: 3,900,440 CpG sites loaded
  Sample P64d: 2,271,384 CpG-region matches
Processing batch 2/50
Loading sample: P257d
  Processed 1,000,000 sites...
  Processed 2,000,000 sites...
  Processed 3,000,000 sites...
  Processed 4,000,000 sites...
Sample P257d: 4,075,533 CpG sites loaded
  Sample P257d: 2,346,850 CpG-region matches
Processing batch 3/50
Loading sample: P

In [5]:
#!/usr/bin/env python3
"""
P26 Sample Matrix Generation Test
Test P26 sample processing using the same logic as the original matrix generator
"""

import pandas as pd
import numpy as np
import os
import gc
from collections import defaultdict
from intervaltree import IntervalTree

class P26MatrixTester:
    def __init__(self):
        # Path configuration
        self.raw_data_dir = "/Users/heweilin/Desktop/P056_Code_2/Raw_Data"
        self.samples_dir = os.path.join(self.raw_data_dir, "MethylationLevels_samples")
        self.output_dir = "/Users/heweilin/Desktop/P056_Code_2/Processed_Data"
        
        # P26 specific file
        self.p26_file = os.path.join(self.samples_dir, "P26d.mC_level_Identification_stat.txt")
        
        # Memory optimization settings
        self.chunk_size = 100000  # Process 100k CpGs per chunk
        
        # Annotation file paths
        self.all_dmr_file = os.path.join(self.raw_data_dir, "3DNA_all.csv")
        self.significant_dmr_file = os.path.join(self.raw_data_dir, "4DNA_DMRs.csv")
        
        # Data storage using IntervalTree
        self.all_region_trees = defaultdict(IntervalTree)
        self.promoter_region_trees = defaultdict(IntervalTree)
        self.significant_region_trees = defaultdict(IntervalTree)
        
    def get_gene_name(self, row):
        """Extract gene name with fallback strategy"""
        gene_name = str(row['HGNC.symbol']).strip()
        if not gene_name or gene_name == 'nan':
            if 'annot.symbol' in row:
                gene_name = str(row['annot.symbol']).strip()
        if not gene_name or gene_name == 'nan':
            return None
        return gene_name
    
    def get_region_priority(self, annot_type):
        """Get region type priority for deduplication"""
        annot_type = str(annot_type).lower()
        if any(keyword in annot_type for keyword in ['promoter', 'hg38_genes_promoters', '1to5kb']):
            return 3
        elif 'exon' in annot_type:
            return 2
        elif 'intron' in annot_type:
            return 1
        else:
            return 0
    
    def is_promoter_region(self, annot_type):
        """Check if annotation type indicates promoter region"""
        annot_type = str(annot_type).lower()
        promoter_keywords = ['promoter', '1to5kb', 'hg38_genes_promoters']
        return any(keyword in annot_type for keyword in promoter_keywords)
    
    def load_dmr_annotations(self):
        """Load DNA methylation annotation files - same as original"""
        print("=" * 80)
        print("LOADING DMR ANNOTATIONS FOR P26 TEST")
        print("=" * 80)
        
        # Check if annotation files exist
        if not os.path.exists(self.all_dmr_file):
            print(f"❌ All DMR file not found: {self.all_dmr_file}")
            return False
        
        if not os.path.exists(self.significant_dmr_file):
            print(f"❌ Significant DMR file not found: {self.significant_dmr_file}")
            return False
        
        print("Loading all DNA methylation regions...")
        all_regions_dict = {}
        
        chunksize = 50000
        processed_chunks = 0
        
        try:
            for chunk in pd.read_csv(self.all_dmr_file, chunksize=chunksize, low_memory=False):
                processed_chunks += 1
                print(f"  Processing chunk {processed_chunks}...")
                
                for _, row in chunk.iterrows():
                    try:
                        gene_name = self.get_gene_name(row)
                        if not gene_name:
                            continue
                            
                        chr_info = str(row['seqnames'])
                        if not chr_info.startswith('chr'):
                            chr_info = 'chr' + chr_info
                            
                        start_pos = int(row['start'])
                        end_pos = int(row['end'])
                        annot_type = row['annot.type']
                        
                        region_key = (gene_name, chr_info, start_pos, end_pos)
                        current_priority = self.get_region_priority(annot_type)
                        
                        if region_key not in all_regions_dict:
                            all_regions_dict[region_key] = (current_priority, annot_type)
                        else:
                            existing_priority, existing_annot = all_regions_dict[region_key]
                            if current_priority > existing_priority:
                                all_regions_dict[region_key] = (current_priority, annot_type)
                                
                    except Exception as e:
                        continue
                
                gc.collect()
                
                # Limit processing for testing (first 5 chunks = ~250k regions)
                if processed_chunks >= 5:
                    print(f"  Limited to first {processed_chunks} chunks for testing")
                    break
        
        except Exception as e:
            print(f"❌ Error loading all DMR file: {e}")
            return False
        
        print(f"Loaded {len(all_regions_dict)} unique regions from {processed_chunks} chunks")
        
        # Build IntervalTrees
        promoter_count = 0
        for (gene_name, chr_info, start_pos, end_pos), (priority, annot_type) in all_regions_dict.items():
            self.all_region_trees[chr_info].addi(start_pos, end_pos + 1, gene_name)
            
            if self.is_promoter_region(annot_type):
                self.promoter_region_trees[chr_info].addi(start_pos, end_pos + 1, gene_name)
                promoter_count += 1
        
        del all_regions_dict
        gc.collect()
        
        # Load significant DMRs (also limited for testing)
        print("Loading significant DMR regions...")
        significant_count = 0
        processed_sig_chunks = 0
        
        try:
            for chunk in pd.read_csv(self.significant_dmr_file, chunksize=chunksize, low_memory=False):
                processed_sig_chunks += 1
                
                for _, row in chunk.iterrows():
                    try:
                        gene_name = self.get_gene_name(row)
                        if not gene_name:
                            continue
                            
                        chr_info = str(row['seqnames'])
                        if not chr_info.startswith('chr'):
                            chr_info = 'chr' + chr_info
                            
                        start_pos = int(row['start'])
                        end_pos = int(row['end'])
                        annot_type = row['annot.type']
                        
                        if self.is_promoter_region(annot_type):
                            self.significant_region_trees[chr_info].addi(start_pos, end_pos + 1, gene_name)
                            significant_count += 1
                            
                    except Exception as e:
                        continue
                
                gc.collect()
                
                # Limit for testing
                if processed_sig_chunks >= 3:
                    break
        
        except Exception as e:
            print(f"❌ Error loading significant DMR file: {e}")
        
        all_count = sum(len(tree) for tree in self.all_region_trees.values())
        
        print(f"📊 Annotation Summary:")
        print(f"   All DNA methylation regions: {all_count:,}")
        print(f"   Promoter regions: {promoter_count:,}")
        print(f"   Significant promoter DMR regions: {significant_count:,}")
        
        return True
    
    def load_p26_sample_data(self):
        """Load P26 sample data using same logic as original"""
        print("\n" + "=" * 80)
        print("LOADING P26 SAMPLE DATA")
        print("=" * 80)
        
        if not os.path.exists(self.p26_file):
            print(f"❌ P26 file not found: {self.p26_file}")
            return {}
        
        print(f"📁 Loading P26 from: {self.p26_file}")
        
        sample_cpgs = {}
        total_sites = 0
        valid_cg_sites = 0
        invalid_values = 0
        out_of_range = 0
        parse_errors = 0
        
        try:
            # Read file in chunks exactly like the original code
            chunk_count = 0
            for chunk in pd.read_csv(self.p26_file, delimiter='\t', header=None, 
                                   chunksize=self.chunk_size, on_bad_lines='skip', low_memory=False):
                
                chunk_count += 1
                print(f"  Processing chunk {chunk_count}...")
                
                for _, row in chunk.iterrows():
                    try:
                        total_sites += 1
                        
                        # Check minimum columns (same as original)
                        if len(row) < 7:
                            parse_errors += 1
                            continue
                        
                        # Filter for CG context only (same as original)
                        context = str(row.iloc[6]).strip()
                        if context != 'CG':
                            continue
                        
                        valid_cg_sites += 1
                        
                        # Get chromosome name (same as original)
                        chr_name = str(row.iloc[0]).strip()
                        if chr_name == '' or chr_name == 'nan':
                            invalid_values += 1
                            continue
                            
                        if not chr_name.startswith('chr'):
                            chr_name = 'chr' + chr_name
                        
                        # Get position and methylation level (same as original)
                        pos = int(row.iloc[1])
                        ml_value = float(row.iloc[5])
                        
                        # Validate methylation level range (same as original)
                        if 0 <= ml_value <= 1:
                            sample_cpgs[(chr_name, pos)] = ml_value
                        else:
                            out_of_range += 1
                        
                    except (ValueError, IndexError, TypeError) as e:
                        parse_errors += 1
                        continue
                
                # Show progress (same as original)
                if total_sites % (self.chunk_size * 10) == 0:
                    print(f"    Processed {total_sites:,} sites, found {len(sample_cpgs):,} valid CpGs...")
                    gc.collect()
                
                # For testing, limit to first few chunks
                if chunk_count >= 10:  # Process ~1M sites for testing
                    print(f"  Limited to first {chunk_count} chunks for testing")
                    break
            
            print(f"\n📊 P26 Data Loading Results:")
            print(f"   Total sites processed: {total_sites:,}")
            print(f"   CG context sites: {valid_cg_sites:,}")
            print(f"   Valid CpG sites loaded: {len(sample_cpgs):,}")
            print(f"   Invalid values: {invalid_values:,}")
            print(f"   Out of range values: {out_of_range:,}")
            print(f"   Parse errors: {parse_errors:,}")
            
            if valid_cg_sites > 0:
                success_rate = len(sample_cpgs) / valid_cg_sites * 100
                print(f"   Success rate: {success_rate:.2f}%")
            
            return sample_cpgs
            
        except Exception as e:
            print(f"❌ Error loading P26 sample: {e}")
            return {}
    
    def process_p26_against_regions(self, sample_cpgs, region_trees, region_name):
        """Process P26 against region trees using same logic as original"""
        print(f"\n📍 Processing P26 against {region_name}...")
        
        if not sample_cpgs:
            print(f"   ❌ No CpG data to process")
            return {}
        
        gene_values = defaultdict(list)
        matches = 0
        chromosomes_checked = set()
        
        # Group by chromosome for efficiency (same as original)
        chr_grouped = defaultdict(list)
        for (chr_name, pos), ml_value in sample_cpgs.items():
            chr_grouped[chr_name].append((pos, ml_value))
        
        print(f"   Found CpGs on {len(chr_grouped)} chromosomes: {sorted(chr_grouped.keys())}")
        
        for chr_name, pos_values in chr_grouped.items():
            chromosomes_checked.add(chr_name)
            
            if chr_name not in region_trees:
                print(f"   ⚠️  No regions defined for {chr_name}")
                continue
            
            tree = region_trees[chr_name]
            chr_matches = 0
            
            for pos, ml_value in pos_values:
                overlapping_intervals = tree[pos]
                if overlapping_intervals:
                    matches += len(overlapping_intervals)
                    chr_matches += len(overlapping_intervals)
                    for interval in overlapping_intervals:
                        gene_values[interval.data].append(ml_value)
            
            if chr_matches > 0:
                print(f"   {chr_name}: {chr_matches:,} matches from {len(pos_values):,} CpGs")
        
        # Calculate averages (same as original)
        gene_averages = {}
        for gene, values in gene_values.items():
            gene_averages[gene] = np.mean(values)
        
        print(f"   📊 Processing Results:")
        print(f"      Total CpG-region matches: {matches:,}")
        print(f"      Unique genes with data: {len(gene_averages):,}")
        print(f"      Chromosomes with matches: {len([c for c in chromosomes_checked if c in region_trees])}")
        
        # Show some example genes
        if gene_averages:
            example_genes = list(gene_averages.items())[:10]
            print(f"   🧬 Example genes and methylation levels:")
            for gene, avg_value in example_genes:
                cpg_count = len(gene_values[gene])
                print(f"      {gene}: {avg_value:.4f} (from {cpg_count} CpGs)")
        
        return gene_averages
    
    def test_p26_matrix_generation(self):
        """Test P26 matrix generation for all three types"""
        print("\n" + "=" * 80)
        print("TESTING P26 MATRIX GENERATION")
        print("=" * 80)
        
        # Load P26 data
        p26_cpgs = self.load_p26_sample_data()
        
        if not p26_cpgs:
            print("❌ Cannot proceed without P26 data")
            return
        
        # Test against all three region types
        results = {}
        
        # 1. All DNA methylation regions
        print(f"\n=== Test 1: All DNA Methylation Regions ===")
        all_gene_data = self.process_p26_against_regions(
            p26_cpgs, self.all_region_trees, "All DNA methylation regions")
        results['all_regions'] = all_gene_data
        
        # 2. Promoter regions
        print(f"\n=== Test 2: Promoter Regions ===")
        promoter_gene_data = self.process_p26_against_regions(
            p26_cpgs, self.promoter_region_trees, "Promoter regions")
        results['promoter_regions'] = promoter_gene_data
        
        # 3. Significant promoter DMRs
        print(f"\n=== Test 3: Significant Promoter DMRs ===")
        significant_gene_data = self.process_p26_against_regions(
            p26_cpgs, self.significant_region_trees, "Significant promoter DMRs")
        results['significant_regions'] = significant_gene_data
        
        return results
    
    def generate_p26_test_matrices(self, results):
        """Generate test matrices with only P26 data"""
        print("\n" + "=" * 80)
        print("GENERATING P26 TEST MATRICES")
        print("=" * 80)
        
        matrices = {}
        output_files = {}
        
        matrix_configs = [
            ("all_regions", "P26_test_AllCpGs.csv", "All DNA Methylation"),
            ("promoter_regions", "P26_test_PromoterCpGs.csv", "Promoter Regions"),
            ("significant_regions", "P26_test_SignificantCpGs.csv", "Significant Promoter DMRs")
        ]
        
        for result_key, filename, description in matrix_configs:
            print(f"\n📊 Creating {description} matrix...")
            
            gene_data = results.get(result_key, {})
            
            if gene_data:
                # Create single-sample matrix
                genes = sorted(gene_data.keys())
                values = [gene_data[gene] for gene in genes]
                
                df = pd.DataFrame({
                    'P26d': values
                }, index=genes)
                df.index.name = 'Gene_Symbol'
                
                # Save matrix
                output_path = os.path.join(self.output_dir, filename)
                df.to_csv(output_path)
                
                matrices[result_key] = df
                output_files[result_key] = output_path
                
                print(f"   ✅ Matrix created: {df.shape[0]} genes × 1 sample")
                print(f"   💾 Saved to: {output_path}")
                
                # Show statistics
                print(f"   📈 Data statistics:")
                print(f"      Mean methylation: {df['P26d'].mean():.4f}")
                print(f"      Std deviation: {df['P26d'].std():.4f}")
                print(f"      Min value: {df['P26d'].min():.4f}")
                print(f"      Max value: {df['P26d'].max():.4f}")
                print(f"      Missing values: {df['P26d'].isna().sum()}")
            else:
                print(f"   ❌ No data available for {description}")
                matrices[result_key] = pd.DataFrame()
        
        return matrices, output_files
    
    def compare_with_existing_matrices(self):
        """Compare P26 test results with existing matrices"""
        print("\n" + "=" * 80)
        print("COMPARING WITH EXISTING MATRICES")
        print("=" * 80)
        
        existing_files = [
            ("1_PD_AllGenes_AllCpGs.csv", "All DNA Methylation"),
            ("1_PD_PromoterRegion_CpGs.csv", "Promoter Regions"),
            ("1_PD_PromoterRegion_SignificantCpGs.csv", "Significant Promoter DMRs")
        ]
        
        for filename, description in existing_files:
            file_path = os.path.join(self.output_dir, filename)
            print(f"\n📋 Checking {description}:")
            print(f"   File: {filename}")
            
            if os.path.exists(file_path):
                try:
                    df = pd.read_csv(file_path, index_col=0)
                    print(f"   ✅ Matrix exists: {df.shape}")
                    
                    if 'P26d' in df.columns:
                        p26_data = df['P26d']
                        missing_rate = p26_data.isna().sum() / len(p26_data) * 100
                        print(f"   📊 P26d in existing matrix:")
                        print(f"      Missing rate: {missing_rate:.2f}%")
                        if missing_rate < 100:
                            valid_data = p26_data.dropna()
                            print(f"      Valid values: {len(valid_data):,}")
                            print(f"      Mean: {valid_data.mean():.4f}")
                    else:
                        print(f"   ❌ P26d not found in existing matrix")
                        print(f"   Sample columns: {list(df.columns[:10])}")
                
                except Exception as e:
                    print(f"   ❌ Error reading existing matrix: {e}")
            else:
                print(f"   ⚠️  Matrix file not found")
    
    def generate_summary_report(self, results, matrices):
        """Generate comprehensive summary report"""
        print("\n" + "=" * 80)
        print("GENERATING SUMMARY REPORT")
        print("=" * 80)
        
        report_path = os.path.join(self.output_dir, "P26_matrix_test_report.txt")
        
        with open(report_path, 'w', encoding='utf-8') as f:
            f.write("P26 Matrix Generation Test Report\n")
            f.write("=" * 50 + "\n\n")
            
            f.write("OBJECTIVE:\n")
            f.write("Test P26 sample processing using the exact same logic as the original\n")
            f.write("matrix generation pipeline to identify data quality issues.\n\n")
            
            f.write("RESULTS SUMMARY:\n")
            for result_key, gene_data in results.items():
                f.write(f"{result_key.replace('_', ' ').title()}:\n")
                f.write(f"  Genes with data: {len(gene_data):,}\n")
                if gene_data:
                    values = list(gene_data.values())
                    f.write(f"  Mean methylation: {np.mean(values):.4f}\n")
                    f.write(f"  Std deviation: {np.std(values):.4f}\n")
                f.write("\n")
            
            f.write("CONCLUSION:\n")
            total_genes = sum(len(data) for data in results.values())
            if total_genes > 0:
                f.write("✅ P26 sample processed successfully\n")
                f.write("   Data quality appears acceptable for matrix inclusion\n")
                f.write("   Exclusion may be due to other factors (batch effects, etc.)\n")
            else:
                f.write("❌ P26 sample failed to generate meaningful data\n")
                f.write("   This explains exclusion from final matrices\n")
        
        print(f"📄 Report saved to: {report_path}")
    
    def run_complete_test(self):
        """Run complete P26 matrix generation test"""
        print("🧪 P26 MATRIX GENERATION TEST")
        print("=" * 80)
        print("Testing P26 sample using exact same logic as original matrix generator")
        print("=" * 80)
        
        # Step 1: Load annotations
        if not self.load_dmr_annotations():
            print("❌ Failed to load annotations")
            return
        
        # Step 2: Test P26 matrix generation
        results = self.test_p26_matrix_generation()
        
        if not results:
            print("❌ Failed to process P26 data")
            return
        
        # Step 3: Generate test matrices
        matrices, output_files = self.generate_p26_test_matrices(results)
        
        # Step 4: Compare with existing matrices
        self.compare_with_existing_matrices()
        
        # Step 5: Generate summary report
        self.generate_summary_report(results, matrices)
        
        # Final summary
        print("\n" + "=" * 80)
        print("TEST COMPLETE - SUMMARY")
        print("=" * 80)
        
        total_genes = sum(len(data) for data in results.values())
        
        if total_genes > 0:
            print(f"✅ P26 TEST SUCCESSFUL")
            print(f"   Total genes processed: {total_genes:,}")
            print(f"   Test matrices generated: {len(output_files)}")
            print(f"   📍 P26 data quality appears acceptable")
            print(f"   🔍 Exclusion likely due to downstream processing issues")
        else:
            print(f"❌ P26 TEST FAILED")
            print(f"   No meaningful data generated")
            print(f"   📍 This explains why P26 was excluded from matrices")
        
        print(f"\n📁 Test files saved to: {self.output_dir}")
        return results, matrices

def main():
    """Main function"""
    tester = P26MatrixTester()
    results, matrices = tester.run_complete_test()

if __name__ == "__main__":
    main()

🧪 P26 MATRIX GENERATION TEST
Testing P26 sample using exact same logic as original matrix generator
LOADING DMR ANNOTATIONS FOR P26 TEST
Loading all DNA methylation regions...
  Processing chunk 1...
  Processing chunk 2...
  Processing chunk 3...
  Processing chunk 4...
  Processing chunk 5...
  Limited to first 5 chunks for testing
Loaded 68356 unique regions from 5 chunks
Loading significant DMR regions...
📊 Annotation Summary:
   All DNA methylation regions: 68,356
   Promoter regions: 15,275
   Significant promoter DMR regions: 21,090

TESTING P26 MATRIX GENERATION

LOADING P26 SAMPLE DATA
📁 Loading P26 from: /Users/heweilin/Desktop/P056_Code_2/Raw_Data/MethylationLevels_samples/P26d.mC_level_Identification_stat.txt
  Processing chunk 1...

📊 P26 Data Loading Results:
   Total sites processed: 26,650
   CG context sites: 26,650
   Valid CpG sites loaded: 26,650
   Invalid values: 0
   Out of range values: 0
   Parse errors: 0
   Success rate: 100.00%

=== Test 1: All DNA Methylati