# In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
miRNA-CpG dual regulation cross-integration analysis
Identifies core genes simultaneously regulated by both post-transcriptional (miRNA) 
and epigenetic (DNA methylation) mechanisms

Author: Generated for P056 Project
Date: 2025
"""

import pandas as pd
import numpy as np
from pathlib import Path
import warnings
# NOTE(review): this silences every warning category for the whole process,
# including deprecation notices raised by pandas/numpy during the analysis.
warnings.filterwarnings('ignore')

def load_data(
    mirna_mrna_file="/Users/heweilin/Desktop/P056/10Integrated_miRNA_mRNA.xlsx",
    cpg_neg_file="/Users/heweilin/Desktop/P056_Code_2/Processed_Data/5_CpG_mRNA_negative_regulation.csv",
):
    """Load the miRNA-mRNA and CpG-mRNA negative-regulation tables.

    Args:
        mirna_mrna_file: Path to the Excel workbook that holds the
            ``mirUP_mrnaDown`` and ``mirDown_mrnaUP`` sheets. Defaults to the
            location used by the original analysis.
        cpg_neg_file: Path to the CSV file with CpG-mRNA negative-regulation
            pairs. Defaults to the location used by the original analysis.

    Returns:
        A ``(mirna_negative, cpg_negative)`` tuple of DataFrames.
        ``mirna_negative`` is the row-wise concatenation of both sheets with
        an added ``mirna_regulation_type`` column naming the source sheet's
        direction. ``(None, None)`` is returned when anything fails while
        reading or combining the files; the error is printed, not raised.
    """
    print("=" * 60)
    print("miRNA-CpG Dual Regulation Cross-Integration Analysis")
    print("=" * 60)
    print("Loading data files...")

    up_sheet = "mirUP_mrnaDown"
    down_sheet = "mirDown_mrnaUP"

    try:
        # Load miRNA-mRNA data. Passing a list of sheet names returns a dict
        # of DataFrames, so the workbook is opened and parsed only once.
        print("Loading miRNA-mRNA regulation data...")
        sheets = pd.read_excel(mirna_mrna_file, sheet_name=[up_sheet, down_sheet])
        mirna_up_mrna_down = sheets[up_sheet]
        mirna_down_mrna_up = sheets[down_sheet]

        # Merge both directions, tagging every row with where it came from
        mirna_negative = pd.concat([
            mirna_up_mrna_down.assign(mirna_regulation_type="miRNA_UP_mRNA_DOWN"),
            mirna_down_mrna_up.assign(mirna_regulation_type="miRNA_DOWN_mRNA_UP")
        ], ignore_index=True)

        print(f"   - miRNA upregulated → mRNA downregulated: {len(mirna_up_mrna_down)} pairs")
        print(f"   - miRNA downregulated → mRNA upregulated: {len(mirna_down_mrna_up)} pairs")
        print(f"   - Total miRNA negative regulation: {len(mirna_negative)} pairs")

        # Load CpG-mRNA negative regulation data
        print("Loading CpG-mRNA negative regulation data...")
        cpg_negative = pd.read_csv(cpg_neg_file)
        print(f"   - CpG negative regulation pairs: {len(cpg_negative)} pairs")

        return mirna_negative, cpg_negative

    except Exception as e:
        # Deliberate best-effort boundary: main() checks for the None
        # sentinel and stops, so report the problem instead of raising.
        print(f"Data loading failed: {e}")
        return None, None

def extract_target_genes(mirna_data, cpg_data):
    """Collect the distinct gene names found in each regulation table.

    Args:
        mirna_data: DataFrame with a ``Target.Gene`` column.
        cpg_data: DataFrame with a ``Gene_Symbol`` column.

    Returns:
        ``(mirna_genes, cpg_genes)``: two sets holding the non-missing values
        of the respective gene columns.
    """
    print("\nExtracting target gene information...")

    # Missing entries are filtered out before the values are de-duplicated
    mirna_targets = mirna_data['Target.Gene']
    mirna_genes = set(mirna_targets[mirna_targets.notna()])

    cpg_symbols = cpg_data['Gene_Symbol']
    cpg_genes = set(cpg_symbols[cpg_symbols.notna()])

    for source, genes in (("miRNA", mirna_genes), ("CpG", cpg_genes)):
        print(f"   - Genes regulated by {source}: {len(genes)}")

    return mirna_genes, cpg_genes

def find_dual_regulated_genes(mirna_data, cpg_data):
    """Identify genes present in both the miRNA and the CpG regulation tables.

    Args:
        mirna_data: miRNA-mRNA regulation DataFrame.
        cpg_data: CpG-mRNA regulation DataFrame.

    Returns:
        ``(dual_regulated, mirna_genes, cpg_genes)``: the intersection of the
        two gene sets followed by the two individual gene sets.
    """
    print("\n🔍 Identifying dual-regulated genes...")

    # Extract gene sets
    mirna_genes, cpg_genes = extract_target_genes(mirna_data, cpg_data)

    # Find intersection
    dual_regulated = mirna_genes & cpg_genes

    def overlap_rate(reference_genes):
        # An empty reference set would otherwise raise ZeroDivisionError;
        # this function runs before the caller can check for empty results.
        if not reference_genes:
            return 0.0
        return len(dual_regulated) / len(reference_genes) * 100

    print(f"   - Dual-regulated genes: {len(dual_regulated)}")
    print(f"   - Overlap rate (relative to miRNA): {overlap_rate(mirna_genes):.1f}%")
    print(f"   - Overlap rate (relative to CpG): {overlap_rate(cpg_genes):.1f}%")

    return dual_regulated, mirna_genes, cpg_genes

def _select_most_significant(rows, pvalue_column):
    """Return the row of *rows* with the smallest non-missing p-value.

    Falls back to the first row when *pvalue_column* is absent or contains no
    usable value, because ``idxmin`` cannot pick a row from all-NaN data.
    """
    if pvalue_column in rows.columns:
        # Positional lookup keeps this correct even with repeated index labels
        pvalues = rows[pvalue_column].reset_index(drop=True).dropna()
        if not pvalues.empty:
            return rows.iloc[pvalues.idxmin()]
    return rows.iloc[0]


def analyze_dual_regulation_details(dual_genes, mirna_data, cpg_data):
    """Build one summary row per dual-regulated gene.

    For every gene the miRNA pair with the lowest ``pvalue_mirna`` and the
    CpG pair with the lowest ``p_adj`` are selected (first matching row when
    no p-value is available) and their key statistics are combined.

    Args:
        dual_genes: Iterable of gene names to summarise.
        mirna_data: miRNA-mRNA DataFrame; the gene column is looked up as
            ``Target.Gene``, ``Target Gene`` or ``mrna``, in that order.
        cpg_data: CpG-mRNA DataFrame with a ``Gene_Symbol`` column.

    Returns:
        DataFrame with the columns ``Gene_Symbol``, ``miRNA``,
        ``miRNA_logFC``, ``miRNA_pvalue``, ``miRNA_regulation_type``,
        ``CpG_ID``, ``CpG_beta``, ``CpG_p_adj``, ``CpG_r_squared`` and
        ``CpG_effect_size``, with genes in sorted order. The columns are
        present even when no gene qualifies. Source fields that are absent
        are filled with ``'Unknown'`` or NaN.
    """
    print("\n📋 Detailed analysis of dual-regulated genes...")

    result_columns = [
        'Gene_Symbol',
        'miRNA',
        'miRNA_logFC',
        'miRNA_pvalue',
        'miRNA_regulation_type',
        'CpG_ID',
        'CpG_beta',
        'CpG_p_adj',
        'CpG_r_squared',
        'CpG_effect_size',
    ]

    # The gene column does not change between genes, so resolve it once
    gene_column = next(
        (name for name in ('Target.Gene', 'Target Gene', 'mrna')
         if name in mirna_data.columns),
        None,
    )

    dual_analysis = []

    if gene_column is not None:
        for gene in sorted(dual_genes):
            mirna_info = mirna_data[mirna_data[gene_column] == gene]
            cpg_info = cpg_data[cpg_data['Gene_Symbol'] == gene]

            if mirna_info.empty or cpg_info.empty:
                continue

            # Select the most significant regulation pair on each side
            mirna_best = _select_most_significant(mirna_info, 'pvalue_mirna')
            cpg_best = _select_most_significant(cpg_info, 'p_adj')

            dual_analysis.append({
                'Gene_Symbol': gene,
                'miRNA': mirna_best.get('mirna', 'Unknown'),
                'miRNA_logFC': mirna_best.get('log2FoldChange_mirna', np.nan),
                'miRNA_pvalue': mirna_best.get('pvalue_mirna', np.nan),
                'miRNA_regulation_type': mirna_best.get('mirna_regulation_type', 'Unknown'),
                'CpG_ID': cpg_best.get('CpG_ID', 'Unknown'),
                'CpG_beta': cpg_best.get('beta', np.nan),
                'CpG_p_adj': cpg_best.get('p_adj', np.nan),
                'CpG_r_squared': cpg_best.get('r_squared', np.nan),
                'CpG_effect_size': cpg_best.get('effect_size', 'Unknown')
            })

    # Explicit columns keep downstream column lookups valid for empty results
    return pd.DataFrame(dual_analysis, columns=result_columns)

def classify_dual_regulation_patterns(dual_df, *, significance_threshold=0.01,
                                      logfc_threshold=1.0):
    """Group dual-regulated genes into regulation-pattern buckets.

    Args:
        dual_df: DataFrame with the columns ``Gene_Symbol``,
            ``miRNA_regulation_type``, ``miRNA_pvalue``, ``CpG_p_adj``,
            ``CpG_effect_size`` and ``miRNA_logFC``.
        significance_threshold: A gene is highly significant when both
            ``miRNA_pvalue`` and ``CpG_p_adj`` are strictly below this value.
        logfc_threshold: A gene has a high effect when ``CpG_effect_size`` is
            ``'Large'`` and ``abs(miRNA_logFC)`` is strictly above this value.

    Returns:
        Dict mapping ``'miRNA_UP_CpG_NEG'``, ``'miRNA_DOWN_CpG_NEG'``,
        ``'HIGH_SIGNIFICANCE'`` and ``'HIGH_EFFECT'`` to lists of gene names in
        row order. The two direction buckets are mutually exclusive; the
        significance and effect buckets are assigned independently. Rows with
        NaN statistics never satisfy the threshold comparisons.
    """
    print("\n🔄 Classifying dual regulation patterns...")

    # Direction of the miRNA change -> bucket name. Any other regulation
    # type (e.g. 'Unknown') is placed in neither direction bucket.
    direction_bucket = {
        'miRNA_UP_mRNA_DOWN': 'miRNA_UP_CpG_NEG',    # synergistic mRNA suppression
        'miRNA_DOWN_mRNA_UP': 'miRNA_DOWN_CpG_NEG',  # competitive regulation
    }

    patterns = {
        'miRNA_UP_CpG_NEG': [],
        'miRNA_DOWN_CpG_NEG': [],
        'HIGH_SIGNIFICANCE': [],   # Both regulations highly significant
        'HIGH_EFFECT': []          # Both regulations with large effects
    }

    for _, row in dual_df.iterrows():
        gene = row['Gene_Symbol']

        bucket = direction_bucket.get(row['miRNA_regulation_type'])
        if bucket is not None:
            patterns[bucket].append(gene)

        # High significance (both regulations significant)
        if (row['miRNA_pvalue'] < significance_threshold
                and row['CpG_p_adj'] < significance_threshold):
            patterns['HIGH_SIGNIFICANCE'].append(gene)

        # Large effect (CpG large effect + miRNA high logFC)
        if (row['CpG_effect_size'] == 'Large'
                and abs(row['miRNA_logFC']) > logfc_threshold):
            patterns['HIGH_EFFECT'].append(gene)

    return patterns

def print_top_dual_regulated_genes(dual_df, top_n=15):
    """Print a fixed-width table of the genes with the smallest ``CpG_p_adj``.

    Args:
        dual_df: DataFrame with the columns ``Gene_Symbol``, ``miRNA``,
            ``miRNA_logFC``, ``CpG_beta``, ``CpG_p_adj`` and
            ``miRNA_regulation_type``.
        top_n: Maximum number of rows to print.

    Rows whose regulation type is ``'miRNA_UP_mRNA_DOWN'`` are labelled
    "Synergistic"; every other value is labelled "Competitive".
    """
    print(f"\nTop {top_n} dual-regulated genes (sorted by CpG significance):")
    print("=" * 80)

    # Ascending adjusted p-value puts the most significant genes first
    ranked = dual_df.sort_values('CpG_p_adj').head(top_n)

    header_cells = [
        "Rank".ljust(4),
        "Gene".ljust(15),
        "miRNA".ljust(15),
        "miRNA_FC".ljust(10),
        "CpG_Beta".ljust(10),
        "CpG_p_adj".ljust(12),
        "Pattern".ljust(20),
    ]
    print(" ".join(header_cells))
    print("-" * 95)

    for rank, record in enumerate(ranked.to_dict("records"), start=1):
        if record['miRNA_regulation_type'] == 'miRNA_UP_mRNA_DOWN':
            label = "Synergistic"
        else:
            label = "Competitive"

        # miRNA names are cut to 14 characters so the column stays aligned
        cells = [
            f"{rank:<4}",
            f"{record['Gene_Symbol']:<15}",
            f"{str(record['miRNA'])[:14]:<15}",
            f"{record['miRNA_logFC']:<10.2f}",
            f"{record['CpG_beta']:<10.3f}",
            f"{record['CpG_p_adj']:<12.2e}",
            f"{label:<20}",
        ]
        print(" ".join(cells))

def analyze_regulation_strength(dual_df):
    """Print summary statistics and the strongest combined regulations.

    Args:
        dual_df: DataFrame with the columns ``Gene_Symbol``, ``miRNA_logFC``,
            ``CpG_beta`` and ``CpG_r_squared``.

    Side effects:
        Adds (or overwrites) a ``Combined_Strength`` column on ``dual_df``
        itself, computed as ``|miRNA_logFC| * |CpG_beta|``.
    """
    print("\nRegulation strength analysis...")
    print("=" * 40)

    # Combined strength is written onto the caller's DataFrame in place
    dual_df['Combined_Strength'] = dual_df['miRNA_logFC'].abs() * dual_df['CpG_beta'].abs()

    # (heading, column, whether the standard deviation is reported)
    sections = (
        ("miRNA logFC statistics:", 'miRNA_logFC', True),
        ("\nCpG Beta statistics:", 'CpG_beta', True),
        ("\nCpG R² statistics:", 'CpG_r_squared', False),
    )
    for heading, column, report_std in sections:
        print(heading)
        values = dual_df[column]
        print(f"   - Mean: {values.mean():.3f}")
        print(f"   - Median: {values.median():.3f}")
        if report_std:
            print(f"   - Standard deviation: {values.std():.3f}")

    # Combined strength ranking
    print("\nTop 10 strongest combined regulations:")
    print("-" * 60)
    strongest = dual_df.nlargest(10, 'Combined_Strength')

    for position, record in enumerate(strongest.to_dict("records"), start=1):
        gene = record['Gene_Symbol']
        strength = record['Combined_Strength']
        print(f"{position:2d}. {gene:<15} Combined strength: {strength:.3f}")

def print_pattern_summary(patterns):
    """Print the size of each regulation-pattern bucket with sample genes.

    Args:
        patterns: Dict with the keys ``'miRNA_UP_CpG_NEG'``,
            ``'miRNA_DOWN_CpG_NEG'``, ``'HIGH_SIGNIFICANCE'`` and
            ``'HIGH_EFFECT'``, each mapping to a list of gene names.

    For every non-empty bucket at most the first five genes are listed.
    """
    print("\nRegulation pattern summary:")
    print("=" * 50)

    # (heading, bucket key); every heading after the first starts with a
    # newline so the sections are separated by a blank line.
    sections = (
        ("Synergistic suppression pattern (miRNA upregulated + CpG negative regulation)",
         'miRNA_UP_CpG_NEG'),
        ("\nCompetitive regulation pattern (miRNA downregulated + CpG negative regulation)",
         'miRNA_DOWN_CpG_NEG'),
        ("\nHigh significance dual regulation", 'HIGH_SIGNIFICANCE'),
        ("\nHigh effect dual regulation", 'HIGH_EFFECT'),
    )

    for heading, key in sections:
        genes = patterns[key]
        print(f"{heading}: {len(genes)} genes")
        if genes:
            print(f"   Representative genes: {', '.join(genes[:5])}")

def generate_functional_insights(dual_df):
    """Print distribution summaries for the dual-regulated genes.

    Args:
        dual_df: DataFrame with the columns ``CpG_effect_size``,
            ``CpG_p_adj``, ``miRNA_pvalue`` and ``CpG_r_squared``.

    Reports the share of each ``CpG_effect_size`` category, the number of
    rows with ``CpG_p_adj`` / ``miRNA_pvalue`` below 0.001, and the number of
    rows with ``CpG_r_squared`` above 0.6.
    """
    print("\nFunctional insights:")
    print("=" * 40)

    # Effect size distribution
    category_counts = dual_df['CpG_effect_size'].value_counts()
    total_genes = len(dual_df)
    print("CpG effect size distribution:")
    for category in category_counts.index:
        n_genes = category_counts[category]
        print(f"   - {category}: {n_genes} genes ({n_genes/total_genes*100:.1f}%)")

    # Significance level distribution
    n_sig_cpg = dual_df['CpG_p_adj'].lt(0.001).sum()
    n_sig_mirna = dual_df['miRNA_pvalue'].lt(0.001).sum()

    print("\nSignificance distribution:")
    print(f"   - CpG high significance (p_adj < 0.001): {n_sig_cpg} genes")
    print(f"   - miRNA high significance (p < 0.001): {n_sig_mirna} genes")

    # R² distribution
    n_high_r2 = dual_df['CpG_r_squared'].gt(0.6).sum()
    print(f"   - High explanatory power (R² > 0.6): {n_high_r2} genes")

def main():
    """Run the dual-regulation analysis from loading through the summary.

    Stops early, without raising, when the input data could not be loaded or
    when no gene appears in both the miRNA and the CpG gene sets.
    """
    # Load data; load_data() signals failure with None placeholders
    mirna_data, cpg_data = load_data()
    if mirna_data is None or cpg_data is None:
        return

    # Identify dual-regulated genes
    dual_genes, mirna_genes, cpg_genes = find_dual_regulated_genes(mirna_data, cpg_data)
    if not dual_genes:
        print("❌ No dual-regulated genes found")
        return

    # Per-gene details, then pattern classification
    dual_df = analyze_dual_regulation_details(dual_genes, mirna_data, cpg_data)
    patterns = classify_dual_regulation_patterns(dual_df)

    # Print results
    print_top_dual_regulated_genes(dual_df)
    analyze_regulation_strength(dual_df)
    print_pattern_summary(patterns)
    generate_functional_insights(dual_df)

    # Final summary
    n_mirna = len(mirna_genes)
    n_cpg = len(cpg_genes)
    n_dual = len(dual_genes)

    banner = "=" * 60
    print("\n" + banner)
    print("Dual Regulation Analysis Summary")
    print(banner)
    print("Total gene count statistics:")
    print(f"   - miRNA-regulated genes: {n_mirna}")
    print(f"   - CpG-regulated genes: {n_cpg}")
    print(f"   - Dual-regulated genes: {n_dual}")
    # The ratio is taken against the smaller of the two gene sets
    print(f"   - Dual regulation ratio: {n_dual/min(n_mirna, n_cpg)*100:.1f}%")

    print("\nKey findings:")
    findings = (
        f"Identified {n_dual} core genes simultaneously regulated by both miRNA and CpG",
        "These genes represent key nodes of dual post-transcriptional and epigenetic regulation",
        "Provides important therapeutic target candidates for vitamin B12-related regulatory networks",
    )
    for finding in findings:
        print(f"   - {finding}")

# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()