In [27]:
# DMR-based CpG-mRNA Negative Regulation Analysis
# Author: Generated for epigenetic regulation analysis
# Date: 2025
# Purpose: Identify classic negative regulatory pairs (CpG ↑ → mRNA ↓) based on DMR annotations

import pandas as pd
import numpy as np
from scipy import stats
from statsmodels.stats.multitest import fdrcorrection
from sklearn.linear_model import LinearRegression
import warnings
warnings.filterwarnings('ignore')

# Input tables and output location.
# NOTE: output file names are appended directly to output_dir further down,
# so the trailing slash is significant.
cpg_file = "/Users/heweilin/Desktop/P056_Code_2/Processed_Data/4_CpG_expr_standardized.csv"
mrna_file = "/Users/heweilin/Desktop/P056_Code_2/Processed_Data/4_mRNA_expr_standardized.csv"
clinical_file = "/Users/heweilin/Desktop/P056_Code_2/Raw_Data/7Clinical_data50.csv"
annotation_file = "/Users/heweilin/Desktop/P056_Code_2/Raw_Data/3DNA_all.csv"
output_dir = "/Users/heweilin/Desktop/P056_Code_2/Processed_Data/"

print("DMR-based CpG-mRNA Negative Regulation Analysis")
print("=" * 55)

# ---- Read the four input tables ----
print("Loading data files...")

# CpG matrix: rows are keyed by the 'Gene_Symbol' column, remaining columns are samples
cpg_data = pd.read_csv(cpg_file).set_index('Gene_Symbol')
_n_rows, _n_cols = cpg_data.shape
print(f"CpG data: {_n_rows} features x {_n_cols} samples")

# mRNA matrix: whatever the first column is called becomes the row index
mrna_data = pd.read_csv(mrna_file).pipe(lambda frame: frame.set_index(frame.columns[0]))
_n_rows, _n_cols = mrna_data.shape
print(f"mRNA data: {_n_rows} features x {_n_cols} samples")

# Clinical table: kept with its default integer index (one row per sample)
clinical_data = pd.read_csv(clinical_file)
_n_rows, _n_cols = clinical_data.shape
print(f"Clinical data: {_n_rows} samples x {_n_cols} variables")

# DMR-to-gene annotation table
annotation_data = pd.read_csv(annotation_file)
print(f"Annotation data: {len(annotation_data)} DMR-gene annotations")

# Quick look at the ten most frequent annotation types
print("\nAnnotation data overview:")
print("Available annotation types:")
_annot_type_counts = annotation_data['annot.type'].value_counts()
print(_annot_type_counts.head(10))

# ---- Derive comparable patient IDs for every dataset ----
print("\nMatching samples across datasets...")

# Column labels of the two expression matrices are the sample names
cpg_samples = list(cpg_data.columns)
mrna_samples = list(mrna_data.columns)

# Base patient ID = sample name with trailing 'd' / 'm' / 's' characters removed.
# NOTE(review): rstrip() treats 'dms' as a *set of characters*, not a literal
# suffix, so any run of those letters at the end of a name is stripped —
# confirm this matches the sample naming scheme.
_SUFFIX_CHARS = 'dms'
cpg_base = [name.rstrip(_SUFFIX_CHARS) for name in cpg_samples]
mrna_base = [name.rstrip(_SUFFIX_CHARS) for name in mrna_samples]

# Clinical rows are identified by NTUID; prefix with 'P' to get the same ID form
clinical_base = ['P' + label for label in map(str, clinical_data['NTUID'])]

# Keep only the IDs that occur in all three datasets
_shared_ids = set(cpg_base)
_shared_ids &= set(mrna_base)
_shared_ids &= set(clinical_base)
common_base = list(_shared_ids)
print(f"Common samples: {len(common_base)}")

# Create sample mapping
def get_matched_samples(sample_list, base_list, common_base):
    """
    Pick, for every common base ID, the first sample that carries that ID.

    sample_list and base_list are parallel sequences (base_list[i] is the
    base ID of sample_list[i]). common_base is walked in sorted order; IDs
    with no entry in base_list are skipped.

    Returns a tuple (matched_samples, matched_indices): the selected sample
    names and their positions in sample_list, both in sorted-base-ID order.
    """
    picked_names = []
    picked_positions = []
    for wanted in sorted(common_base):
        # First position whose base ID equals the wanted one, or None if absent
        position = next(
            (pos for pos, candidate in enumerate(base_list) if candidate == wanted),
            None,
        )
        if position is None:
            continue
        picked_names.append(sample_list[position])
        picked_positions.append(position)
    return picked_names, picked_positions

# ---- Align all three datasets on the common patients (sorted by base ID) ----
cpg_matched_samples, cpg_indices = get_matched_samples(cpg_samples, cpg_base, common_base)
mrna_matched_samples, mrna_indices = get_matched_samples(mrna_samples, mrna_base, common_base)

# For each common ID, look up the first clinical row whose NTUID equals the
# numeric part of the ID.
# NOTE(review): these are index *labels* that are used positionally with
# iloc below; that is equivalent only while clinical_data keeps the default
# 0..n-1 index it gets from read_csv.
clinical_indices = []
for base in sorted(common_base):
    ntuid = int(base.replace('P', ''))
    row_labels = clinical_data[clinical_data['NTUID'] == ntuid].index
    clinical_indices.append(row_labels[0])

# Subset each table to the matched samples, in the same patient order
cpg_matched = cpg_data.iloc[:, cpg_indices].copy()
mrna_matched = mrna_data.iloc[:, mrna_indices].copy()
clinical_matched = clinical_data.iloc[clinical_indices].copy()

# Give both expression matrices identical column labels: Sample_1 ... Sample_N
common_names = [f"Sample_{k}" for k in range(1, len(common_base) + 1)]
cpg_matched.columns = common_names
mrna_matched.columns = common_names

print(f"Final matched data: {len(common_names)} samples")

# Prepare clinical covariates
print("\nPreparing clinical covariates...")

# Define covariate strategy
essential_covariates = ['batch_mRNA', 'batch_DNA', 'batch_miRNA']
demographic_covariates = ['age', 'BMI', 'parity', 'v3n_Gender']
excluded_covariates = ['B12supplem', 'v1p_MultivitTab', 'v1p_FolicAcid', 'smoking']
target_variables = ['B12_status', 'B12_mol']

# Select available covariates (only those actually present in the clinical table)
recommended_covariates = essential_covariates + demographic_covariates
available_covariates = [col for col in recommended_covariates if col in clinical_matched.columns]

print(f"✅ Using covariates: {', '.join(available_covariates)}")
print(f"⛔ Excluded variables: {', '.join(excluded_covariates)}")
print(f"❌ Target variables (never as covariates): {', '.join(target_variables)}")

# Prepare covariate matrix (rows follow clinical_matched, i.e. matched-sample order)
covariates = clinical_matched[available_covariates].copy()

# Handle data types
for col in covariates.columns:
    if col in ['batch_mRNA', 'batch_DNA', 'batch_miRNA', 'v3n_Gender']:
        # Convert categorical variables to integer codes.
        # pd.factorize() encodes missing values as -1; turn that sentinel back
        # into NaN so missing categories are caught by the completeness filter
        # below instead of entering the model as a spurious level.
        # NOTE(review): integer codes impose an arbitrary order/spacing on
        # variables with more than two levels; consider one-hot encoding.
        codes = pd.factorize(covariates[col])[0].astype(float)
        codes[codes < 0] = np.nan
        covariates[col] = codes
    else:
        # Convert continuous variables to numeric (unparseable values become NaN)
        covariates[col] = pd.to_numeric(covariates[col], errors='coerce')

# Remove samples with missing covariates
complete_mask = covariates.notna().all(axis=1)
if not complete_mask.all():
    print(f"Removing {(~complete_mask).sum()} samples with missing covariates")
    # complete_mask is indexed by clinical row labels, whereas the expression
    # matrices use 'Sample_k' column labels. Use a positional boolean array so
    # pandas does not try (and fail) to align the two label sets.
    keep = complete_mask.to_numpy()
    covariates = covariates.loc[keep]
    clinical_matched = clinical_matched.loc[keep]
    cpg_matched = cpg_matched.loc[:, keep]
    mrna_matched = mrna_matched.loc[:, keep]

final_n_samples = cpg_matched.shape[1]
print(f"Final sample size: {final_n_samples}")

# ---- Build gene-symbol lookups from the annotation table ----
print("\nProcessing annotation data...")

# Unique, complete (HGNC symbol, Ensembl gene ID) pairs from the annotation
annotation_processed = (
    annotation_data[['HGNC.symbol', 'Gene.stable.ID']]
    .dropna()
    .drop_duplicates()
)

# Ensembl gene ID -> HGNC symbol (if an ID occurs with several symbols, the last one wins)
ensembl_to_symbol = {
    gene_id: symbol
    for gene_id, symbol in zip(annotation_processed['Gene.stable.ID'],
                               annotation_processed['HGNC.symbol'])
}

print(f"Total unique gene annotations: {len(ensembl_to_symbol)}")

# ---- Attach a gene symbol to every CpG / mRNA feature ----
print("\nMapping features to gene symbols...")

# The CpG matrix is already indexed by gene symbol, so this is an identity map
cpg_gene_mapping = dict(zip(cpg_matched.index, cpg_matched.index))

# mRNA rows are Ensembl IDs; drop any '.version' suffix before the lookup and
# keep only the IDs that the annotation knows about
_versionless = ((full_id, full_id.split('.')[0]) for full_id in mrna_matched.index)
mrna_gene_mapping = {
    full_id: ensembl_to_symbol[core_id]
    for full_id, core_id in _versionless
    if core_id in ensembl_to_symbol
}

print(f"CpG features mapped: {len(cpg_gene_mapping)}")
print(f"mRNA features mapped: {len(mrna_gene_mapping)}")

# Genes that have both a CpG feature and at least one mRNA feature
cpg_genes = set(cpg_gene_mapping.values())
mrna_genes = set(mrna_gene_mapping.values())
common_genes = cpg_genes.intersection(mrna_genes)

print(f"Common genes with both CpG and mRNA data: {len(common_genes)}")

# Create CpG-mRNA pairs based on annotation
print("\nCreating annotated CpG-mRNA pairs...")

# Group the mRNA features by gene symbol once, so each CpG feature needs a
# single dictionary lookup instead of a scan over every mapped mRNA feature.
# Insertion order is preserved, so pairs come out in the same order as a
# per-CpG scan of mrna_gene_mapping would produce.
mrna_ids_by_gene = {}
for mrna_id, gene in mrna_gene_mapping.items():
    mrna_ids_by_gene.setdefault(gene, []).append(mrna_id)

annotated_pairs = []
for cpg_feature, cpg_gene in cpg_gene_mapping.items():
    if cpg_gene not in common_genes:
        continue
    # One pair per mRNA feature annotated to the same gene as the CpG feature
    for mrna_feature in mrna_ids_by_gene.get(cpg_gene, ()):
        annotated_pairs.append({
            'CpG_ID': cpg_feature,
            'mRNA_ID': mrna_feature,
            'Gene_Symbol': cpg_gene
        })

# Explicit columns keep the frame's schema stable even when no pair was found
pairs_df = pd.DataFrame(annotated_pairs, columns=['CpG_ID', 'mRNA_ID', 'Gene_Symbol'])
print(f"Total annotated CpG-mRNA pairs: {len(pairs_df)}")

# Function to perform linear regression
def perform_regression_analysis(cpg_expr, mrna_expr, covariates_df):
    """
    Fit the ordinary least-squares model  mRNA ~ intercept + CpG + covariates
    and report inference for the CpG coefficient.

    Parameters
    ----------
    cpg_expr : 1-D array-like (e.g. pandas Series)
        CpG values, one per sample.
    mrna_expr : 1-D array-like (e.g. pandas Series)
        mRNA values, one per sample, in the same sample order.
    covariates_df : 2-D array-like (e.g. pandas DataFrame)
        Numeric covariates, one row per sample, in the same sample order.

    Returns
    -------
    dict or None
        Keys 'beta', 'se', 't_stat', 'p_value' (two-sided), 'r_squared' and
        'n_obs', all referring to the fit on samples without missing values.
        None is returned when fewer than 10 complete samples remain, when no
        residual degrees of freedom are left, when the design matrix is
        rank-deficient (e.g. a constant or duplicated covariate), or when the
        inputs cannot be converted/combined into a numeric design matrix.
    """
    try:
        # Prepare data as float arrays
        y = np.asarray(mrna_expr, dtype=float).ravel()
        x_cpg = np.asarray(cpg_expr, dtype=float).reshape(-1, 1)
        x_cov = np.asarray(covariates_df, dtype=float)

        # Design matrix: intercept column first, CpG second, then covariates.
        # The intercept must be part of the matrix used for the standard
        # errors, otherwise (X'X)^-1 does not belong to the fitted model.
        design = np.column_stack([np.ones(len(y)), x_cpg, x_cov])

        # Keep only samples with no missing value in the response or predictors
        valid_mask = ~(np.isnan(y) | np.isnan(design).any(axis=1))
        if valid_mask.sum() < 10:  # Need at least 10 samples
            return None

        y_clean = y[valid_mask]
        design = design[valid_mask]
        n, n_params = design.shape  # n_params counts the intercept too

        # Residual degrees of freedom; without any, no variance estimate exists
        df = n - n_params
        if df <= 0:
            return None

        # Least-squares fit; a rank-deficient design has no unique solution
        coef, _, rank, _ = np.linalg.lstsq(design, y_clean, rcond=None)
        if rank < n_params:
            return None

        residuals = y_clean - design @ coef

        # Calculate R-squared
        ss_res = float(residuals @ residuals)
        ss_tot = float(np.sum((y_clean - y_clean.mean()) ** 2))
        r_squared = 1 - (ss_res / ss_tot) if ss_tot > 0 else 0

        # Standard error of the CpG coefficient (index 1, after the intercept)
        mse = ss_res / df
        xtx_inv = np.linalg.inv(design.T @ design)
        beta_cpg = float(coef[1])
        se_cpg = float(np.sqrt(mse * xtx_inv[1, 1]))
        t_stat = beta_cpg / se_cpg if se_cpg > 0 else 0

        # Two-sided p-value; the survival function keeps precision for very
        # small p-values where 1 - cdf would round to exactly 0
        p_value = float(2 * stats.t.sf(abs(t_stat), df))

        return {
            'beta': beta_cpg,
            'se': se_cpg,
            't_stat': t_stat,
            'p_value': p_value,
            'r_squared': r_squared,
            'n_obs': int(n)
        }

    except (ValueError, TypeError, np.linalg.LinAlgError):
        # Non-numeric input, mismatched lengths or a numerically singular
        # design: treat the pair as not analysable rather than aborting the run
        return None

# Perform regression analysis for all annotated pairs
print("\nPerforming regression analysis...")
print("This may take several minutes...")

results = []
total_pairs = len(pairs_df)

for i, row in pairs_df.iterrows():
    # Progress report every 1000 pairs (i is the row label of pairs_df)
    if i % 1000 == 0:
        print(f"Progress: {i}/{total_pairs} pairs processed")

    cpg_id = row['CpG_ID']
    mrna_id = row['mRNA_ID']
    gene_symbol = row['Gene_Symbol']

    # Get expression data (one value per matched sample).
    # NOTE(review): assumes the row labels of both matrices are unique; a
    # duplicated label makes .loc return a DataFrame instead of a Series.
    cpg_expr = cpg_matched.loc[cpg_id]
    mrna_expr = mrna_matched.loc[mrna_id]

    # Perform regression; None means the pair could not be analysed
    reg_result = perform_regression_analysis(cpg_expr, mrna_expr, covariates)
    if reg_result is None:
        continue

    results.append({
        'CpG_ID': cpg_id,
        'mRNA_ID': mrna_id,
        'Gene_Symbol': gene_symbol,
        **reg_result
    })

# Convert to DataFrame
results_df = pd.DataFrame(results)
print(f"\nValid regression results: {len(results_df)}")

if len(results_df) == 0:
    print("No valid regression results obtained. Exiting.")
    # exit() is an interactive helper injected by the `site` module and is not
    # guaranteed to exist in every runtime; SystemExit is the portable way to
    # stop a script (status 0, as before).
    raise SystemExit(0)

# ---- Multiple-testing correction across all fitted pairs ----
print("Applying FDR correction...")
_rejected, _adjusted = fdrcorrection(results_df['p_value'])
results_df['p_adj'] = _adjusted

# ---- Keep classic negative regulation: negative slope and FDR-significant ----
fdr_threshold = 0.05
_is_negative = results_df['beta'] < 0
_is_significant = results_df['p_adj'] < fdr_threshold
negative_significant = results_df.loc[_is_negative & _is_significant].copy()

print(f"\nClassic negative regulatory pairs (β < 0, FDR < {fdr_threshold}):")
print(f"Total pairs: {len(negative_significant)}")

if len(negative_significant) > 0:
    # Most significant pairs first
    negative_significant = negative_significant.sort_values('p_adj')

    # Bin |β| into Small (0, 0.3], Medium (0.3, 0.5] and Large (0.5, inf)
    negative_significant['abs_beta'] = negative_significant['beta'].abs()
    _effect_bins = [0, 0.3, 0.5, float('inf')]
    _effect_labels = ['Small', 'Medium', 'Large']
    negative_significant['effect_size'] = pd.cut(
        negative_significant['abs_beta'],
        bins=_effect_bins,
        labels=_effect_labels
    )

    # How many pairs fall into each effect-size bin
    print("\nEffect size distribution:")
    effect_counts = negative_significant['effect_size'].value_counts()
    for effect, count in effect_counts.items():
        print(f"  {effect} effect: {count} pairs")

    # Headline statistics over the selected pairs
    _beta_mean = negative_significant['beta'].mean()
    _beta_std = negative_significant['beta'].std()
    _abs_beta_mean = negative_significant['abs_beta'].mean()
    _r2_mean = negative_significant['r_squared'].mean()
    _p_adj_median = negative_significant['p_adj'].median()
    print("\nStatistical summary:")
    print(f"  Mean β coefficient: {_beta_mean:.3f} ± {_beta_std:.3f}")
    print(f"  Mean |β|: {_abs_beta_mean:.3f}")
    print(f"  Mean R²: {_r2_mean:.3f}")
    print(f"  Median p_adj: {_p_adj_median:.2e}")

    # Per-CpG summary: number of mRNA targets, mean |β| and best adjusted p-value
    cpg_summary = (
        negative_significant
        .groupby('CpG_ID')
        .agg(
            n_targets=('mRNA_ID', 'count'),
            abs_beta=('abs_beta', 'mean'),
            p_adj=('p_adj', 'min'),
        )
        .sort_values('n_targets', ascending=False)
    )

    print("\nTop 10 CpG sites with most targets:")
    print(cpg_summary.head(10).to_string())

# ---- Write all result tables next to the processed data ----
print("\nSaving results...")

# True when at least one negative, FDR-significant pair was found
_has_hits = len(negative_significant) > 0

# 1. Every regression result, significant or not
all_results_file = output_dir + "5_CpG_mRNA_DMR_all_results.csv"
results_df.to_csv(all_results_file, index=False)

negative_results_file = output_dir + "5_CpG_mRNA_negative_regulation.csv"
if _has_hits:
    # 2. Classic negative regulatory pairs
    negative_significant.to_csv(negative_results_file, index=False)

    # 3. Per-CpG regulatory network summary
    network_file = output_dir + "5_CpG_mRNA_regulatory_network.csv"
    cpg_summary.reset_index().to_csv(network_file, index=False)

    # 4. Per-gene summary: distinct CpG sites / mRNA features, mean |β|, best p_adj
    gene_summary = (
        negative_significant
        .groupby('Gene_Symbol')
        .agg({
            'CpG_ID': 'nunique',
            'mRNA_ID': 'nunique',
            'abs_beta': 'mean',
            'p_adj': 'min'
        })
        .rename(columns={'CpG_ID': 'n_cpg_sites', 'mRNA_ID': 'n_mrna_isoforms'})
        .sort_values('n_cpg_sites', ascending=False)
    )
    gene_summary_file = output_dir + "5_CpG_mRNA_gene_summary.csv"
    gene_summary.reset_index().to_csv(gene_summary_file, index=False)
else:
    # 2. No hits: still write a header-only file with the expected columns
    _header_columns = results_df.columns.tolist() + ['abs_beta', 'effect_size']
    pd.DataFrame(columns=_header_columns).to_csv(negative_results_file, index=False)

# 5. Key/value record of how the analysis was run
_summary_rows = [
    ('Total_annotated_pairs', len(pairs_df)),
    ('Valid_regression_results', len(results_df)),
    ('Negative_significant_pairs', len(negative_significant)),
    ('FDR_threshold', fdr_threshold),
    ('Covariates_used', ';'.join(available_covariates)),
    ('Final_sample_size', final_n_samples),
    ('Analysis_focus', 'Classic_negative_regulation_CpG_up_mRNA_down'),
    ('Analysis_date', str(pd.Timestamp.now().date())),
]
summary_info = pd.DataFrame({
    'Parameter': [name for name, _ in _summary_rows],
    'Value': [value for _, value in _summary_rows],
})

summary_file = output_dir + "5_CpG_mRNA_analysis_summary.csv"
summary_info.to_csv(summary_file, index=False)

# Tell the user where everything went (files 3 and 4 exist only when there are hits)
print("\nOutput files saved:")
print(f"1. All results: {all_results_file}")
print(f"2. Negative regulation pairs: {negative_results_file}")
if _has_hits:
    print(f"3. CpG regulatory network: {network_file}")
    print(f"4. Gene-level summary: {gene_summary_file}")
print(f"5. Analysis summary: {summary_file}")

print("\n🎯 Analysis completed successfully!")
print(f"📊 Found {len(negative_significant)} classic negative regulatory relationships")
print(f"📈 Based on {len(pairs_df)} annotated CpG-mRNA pairs from DMR annotations")

DMR-based CpG-mRNA Negative Regulation Analysis
Loading data files...
CpG data: 17584 features x 49 samples
mRNA data: 58735 features x 49 samples
Clinical data: 50 samples x 21 variables
Annotation data: 1046209 DMR-gene annotations

Annotation data overview:
Available annotation types:
annot.type
hg38_genes_introns      519134
hg38_genes_exons        200607
hg38_genes_1to5kb       105667
hg38_cpg_inter           73292
hg38_genes_promoters     48139
hg38_genes_3UTRs         29908
hg38_cpg_shores          27132
hg38_genes_5UTRs         17665
hg38_cpg_islands         13065
hg38_cpg_shelves         11600
Name: count, dtype: int64

Matching samples across datasets...
Common samples: 49
Final matched data: 49 samples

Preparing clinical covariates...
✅ Using covariates: batch_mRNA, batch_DNA, batch_miRNA, age, BMI, parity, v3n_Gender
⛔ Excluded variables: B12supplem, v1p_MultivitTab, v1p_FolicAcid, smoking
❌ Target variables (never as covariates): B12_status, B12_mol
Final sample size: 49
