In [16]:
import pandas as pd

file_paths = {
    "CpG Methylation Matrix": "/Users/heweilin/Desktop/P056_Code_2/Processed_Data/1_PD_PromoterRegion_CpGs.csv",
    "miRNA Expression Matrix": "/Users/heweilin/Desktop/P056_Code_2/Raw_Data/6miRNA_TPM.csv",
    "mRNA Expression Matrix": "/Users/heweilin/Desktop/P056_Code_2/Raw_Data/5mRNA_TPM.csv",
    "Covariate Matrix": "/Users/heweilin/Desktop/P056_Code_2/Raw_Data/7Clinical_data50.csv"
}

for name, path in file_paths.items():
    print(f"\n=== {name} ===")
    try:
        df = pd.read_csv(path, index_col=0)
        print(df.head())
    except Exception as e:
        print(f"Error reading {name}: {e}")




=== CpG Methylation Matrix ===
                P102d     P105d     P111d     P113d     P117d     P118d  \
Gene_Symbol                                                               
A1BG         0.582906  0.456747  0.535035  0.638074  0.615732  0.505413   
A1BG-AS1     0.500771  0.376418  0.397345  0.521447  0.550180  0.339344   
A2M-AS1      0.666854  0.713421  0.561839  0.573033  0.694567  0.603831   
A2ML1        0.808487  0.788236  0.829482  0.878738  0.859099  0.831974   
A2ML1-AS1    0.834131  0.876004  0.814006  0.841255  0.826644  0.819913   

                P123d     P127d     P128d     P131d  ...      P66d      P68d  \
Gene_Symbol                                          ...                       
A1BG         0.554163  0.541218  0.586789  0.689665  ...  0.646065  0.626514   
A1BG-AS1     0.444468  0.453535  0.412788  0.600901  ...  0.563106  0.554267   
A2M-AS1      0.629768  0.699405  0.587279  0.601171  ...  0.729268  0.492997   
A2ML1        0.850209  0.821716  0.862571 

In [27]:
import pandas as pd

# === Step 1: Load input files ===
# CpG–mRNA regulation data (classical negative regulation)
path_dna = "/Users/heweilin/Desktop/P056_Code_2/Raw_Data/9Integrated_DNA_mRNA.xlsx"
df_dna_hypo = pd.read_excel(path_dna, sheet_name="Hypo_Up")      # CpG ↓ → mRNA ↑
df_dna_hyper = pd.read_excel(path_dna, sheet_name="Hyper_Down")  # CpG ↑ → mRNA ↓

# miRNA–mRNA regulation data (classical negative regulation)
path_mirna = "/Users/heweilin/Desktop/P056_Code_2/Raw_Data/10Integrated_miRNA_mRNA.xlsx"
df_mirna_up = pd.read_excel(path_mirna, sheet_name="mirUP_mrnaDown")    # miRNA ↑ → mRNA ↓
df_mirna_down = pd.read_excel(path_mirna, sheet_name="mirDown_mrnaUP")  # miRNA ↓ → mRNA ↑

# Filtered significant CpG–gene pairs from promoter region
path_cpg_genes = "/Users/heweilin/Desktop/P056_Code_2/Processed_Data/1_PD_PromoterRegion_SignificantCpGs.csv"
df_cpg = pd.read_csv(path_cpg_genes)
selected_genes = df_cpg["Gene_Symbol"].dropna().unique()

# === Step 2: Prepare CpG–Gene and miRNA–Gene pairs ===
# Combine CpG–Gene regulation pairs
df_dna_all = pd.concat([
    df_dna_hypo[["name", "SYMBOL"]],
    df_dna_hyper[["name", "SYMBOL"]]
]).rename(columns={"name": "CpG", "SYMBOL": "Gene"})

# Keep only CpG–Gene pairs where Gene is among significant CpG genes
df_dna_all = df_dna_all[df_dna_all["Gene"].isin(selected_genes)]

# Combine miRNA–Gene regulation pairs
df_mirna_all = pd.concat([
    df_mirna_up[["mirna", "Target.Gene"]],
    df_mirna_down[["mirna", "Target.Gene"]]
]).rename(columns={"Target.Gene": "Gene"})

# === Step 3: Construct CpG–miRNA–mRNA triplets ===
# Only retain genes regulated by both CpG and miRNA
shared_genes = set(df_dna_all["Gene"]) & set(df_mirna_all["Gene"])

triplets = []
for gene in shared_genes:
    # In the original data, CpG column actually stores the gene name (not CpG ID)
    # So we explicitly rename it as "CpG_<Gene>" to distinguish it
    cpg_label = f"CpG_{gene}"
    mirnas = df_mirna_all[df_mirna_all["Gene"] == gene]["mirna"].unique()
    for mir in mirnas:
        triplets.append([cpg_label, mir, gene])

# === Step 4: Save the output triplet table ===
df_triplets = pd.DataFrame(triplets, columns=["CpG", "miRNA", "mRNA"])
output_path = "/Users/heweilin/Desktop/P056_Code_2/Processed_Data/3_candidate_triplets.csv"
df_triplets.to_csv(output_path, index=False)

print(f"Successfully generated {len(df_triplets)} triplets.")
print(f"Saved to: {output_path}")


Successfully generated 282 triplets.
Saved to: /Users/heweilin/Desktop/P056_Code_2/Processed_Data/3_candidate_triplets.csv


In [31]:
import pandas as pd
import numpy as np
import os

# Set file paths
base_dir = "/Users/heweilin/Desktop/P056_Code_2"
raw_data_dir = os.path.join(base_dir, "Raw_Data")
processed_data_dir = os.path.join(base_dir, "Processed_Data")

# Define all files to check
files_to_check = {
    "mRNA Expression Data": os.path.join(raw_data_dir, "5mRNA_TPM.csv"),
    "miRNA Expression Data": os.path.join(raw_data_dir, "6miRNA_TPM.csv"),
    "CpG Methylation Data": os.path.join(processed_data_dir, "1_PD_PromoterRegion_CpGs.csv"),
    "Clinical Data": os.path.join(raw_data_dir, "7Clinical_data50.csv"),
    "Candidate Triplets": os.path.join(processed_data_dir, "3_candidate_triplets.csv"),
    "DNA-mRNA Integration": os.path.join(raw_data_dir, "9Integrated_DNA_mRNA.xlsx"),
    "miRNA-mRNA Integration": os.path.join(raw_data_dir, "10Integrated_miRNA_mRNA.xlsx"),
    "3DNA Annotation": os.path.join(raw_data_dir, "3DNA_all.csv")
}

def preview_file(file_path, file_name, max_cols=10):
    """Preview basic information and first 5 rows of a single file"""
    print(f"\n{'='*80}")
    print(f"File: {file_name}")
    print(f"Path: {file_path}")
    print(f"{'='*80}")
    
    if not os.path.exists(file_path):
        print("ERROR: File does not exist!")
        return
    
    try:
        # Choose reading method based on file extension
        if file_path.endswith('.xlsx'):
            data = pd.read_excel(file_path)
        elif file_path.endswith('.csv'):
            data = pd.read_csv(file_path)
        else:
            print("ERROR: Unsupported file format!")
            return
        
        # Display basic information
        print(f"Dimensions: {data.shape[0]} rows x {data.shape[1]} columns")
        print(f"Column names: {list(data.columns[:max_cols])}")
        if data.shape[1] > max_cols:
            print(f"              ... (and {data.shape[1] - max_cols} more columns)")
        
        # Display data types
        print(f"Data types:")
        dtype_summary = data.dtypes.value_counts()
        for dtype, count in dtype_summary.items():
            print(f"    {dtype}: {count} columns")
        
        # Display first 5 rows (limit columns for display)
        print(f"\nFirst 5 rows:")
        display_data = data.iloc[:5, :max_cols] if data.shape[1] > max_cols else data.head()
        
        # Format display
        pd.set_option('display.max_columns', max_cols)
        pd.set_option('display.width', 120)
        pd.set_option('display.max_colwidth', 15)
        
        print(display_data.to_string())
        
        # Check missing values
        missing_summary = data.isnull().sum()
        missing_cols = missing_summary[missing_summary > 0]
        if len(missing_cols) > 0:
            print(f"\nMissing values:")
            for col, missing_count in missing_cols.head(10).items():
                missing_pct = (missing_count / len(data)) * 100
                print(f"    {col}: {missing_count} ({missing_pct:.1f}%)")
            if len(missing_cols) > 10:
                print(f"    ... (and {len(missing_cols) - 10} more columns with missing values)")
        else:
            print(f"\nNo missing values found.")
        
        # Additional statistics for numeric columns
        numeric_cols = data.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) > 0:
            print(f"\nNumeric columns summary:")
            numeric_summary = data[numeric_cols].describe().iloc[:3]  # Only count, mean, std
            print(numeric_summary.to_string())
        
    except Exception as e:
        print(f"ERROR reading file: {str(e)}")

def check_sample_consistency():
    """Check sample ID consistency across expression matrices"""
    print(f"\n{'='*80}")
    print("SAMPLE CONSISTENCY CHECK")
    print(f"{'='*80}")
    
    sample_info = {}
    
    # Check expression matrices
    expr_files = {
        "mRNA": os.path.join(raw_data_dir, "5mRNA_TPM.csv"),
        "miRNA": os.path.join(raw_data_dir, "6miRNA_TPM.csv"),
        "CpG": os.path.join(processed_data_dir, "1_PD_PromoterRegion_CpGs.csv")
    }
    
    for data_type, file_path in expr_files.items():
        if os.path.exists(file_path):
            try:
                data = pd.read_csv(file_path, index_col=0)
                sample_info[data_type] = set(data.columns)
                print(f"{data_type} samples: {len(data.columns)} (first 5: {list(data.columns[:5])})")
            except Exception as e:
                print(f"Error reading {data_type}: {e}")
    
    # Find common samples
    if len(sample_info) > 1:
        common_samples = set.intersection(*sample_info.values())
        print(f"\nCommon samples across all matrices: {len(common_samples)}")
        
        # Show sample differences
        for data_type, samples in sample_info.items():
            unique_samples = samples - common_samples
            if unique_samples:
                print(f"{data_type} unique samples: {len(unique_samples)}")

def check_triplet_coverage():
    """Check if triplet features are present in expression data"""
    print(f"\n{'='*80}")
    print("TRIPLET COVERAGE CHECK")
    print(f"{'='*80}")
    
    # Load triplets
    triplets_path = os.path.join(processed_data_dir, "3_candidate_triplets.csv")
    if not os.path.exists(triplets_path):
        print("Triplets file not found!")
        return
    
    triplets = pd.read_csv(triplets_path)
    print(f"Total triplets: {len(triplets)}")
    
    # Check coverage for each data type
    coverage_info = {}
    
    # Check mRNA coverage
    mrna_path = os.path.join(raw_data_dir, "5mRNA_TPM.csv")
    if os.path.exists(mrna_path):
        mrna_data = pd.read_csv(mrna_path, index_col=0)
        mrna_genes = set(mrna_data.index)
        triplet_mrnas = set(triplets['mRNA'])
        coverage_info['mRNA'] = {
            'available': len(mrna_genes),
            'required': len(triplet_mrnas),
            'covered': len(triplet_mrnas & mrna_genes),
            'missing': triplet_mrnas - mrna_genes
        }
    
    # Check miRNA coverage
    mirna_path = os.path.join(raw_data_dir, "6miRNA_TPM.csv")
    if os.path.exists(mirna_path):
        mirna_data = pd.read_csv(mirna_path, index_col=0)
        mirna_features = set(mirna_data.index)
        triplet_mirnas = set(triplets['miRNA'])
        coverage_info['miRNA'] = {
            'available': len(mirna_features),
            'required': len(triplet_mirnas),
            'covered': len(triplet_mirnas & mirna_features),
            'missing': triplet_mirnas - mirna_features
        }
    
    # Check CpG coverage
    cpg_path = os.path.join(processed_data_dir, "1_PD_PromoterRegion_CpGs.csv")
    if os.path.exists(cpg_path):
        cpg_data = pd.read_csv(cpg_path, index_col=0)
        cpg_features = set(cpg_data.index)
        triplet_cpgs = set(triplets['CpG'])
        coverage_info['CpG'] = {
            'available': len(cpg_features),
            'required': len(triplet_cpgs),
            'covered': len(triplet_cpgs & cpg_features),
            'missing': triplet_cpgs - cpg_features
        }
    
    # Display coverage results
    for data_type, info in coverage_info.items():
        coverage_pct = (info['covered'] / info['required']) * 100 if info['required'] > 0 else 0
        print(f"\n{data_type} Coverage:")
        print(f"    Available features: {info['available']}")
        print(f"    Required by triplets: {info['required']}")
        print(f"    Covered: {info['covered']} ({coverage_pct:.1f}%)")
        print(f"    Missing: {len(info['missing'])}")
        
        if len(info['missing']) > 0 and len(info['missing']) <= 10:
            print(f"    Missing features: {list(info['missing'])}")
        elif len(info['missing']) > 10:
            print(f"    Missing features (first 10): {list(info['missing'])[:10]}")

def main():
    """Main function to preview all data files"""
    print("DATA FILES PREVIEW REPORT")
    print("=" * 80)
    
    # Preview each file
    for file_name, file_path in files_to_check.items():
        preview_file(file_path, file_name)
    
    # Check sample consistency
    check_sample_consistency()
    
    # Check triplet coverage
    check_triplet_coverage()
    
    print(f"\n{'='*80}")
    print("PREVIEW COMPLETE")
    print(f"{'='*80}")

if __name__ == "__main__":
    main()

DATA FILES PREVIEW REPORT

File: mRNA Expression Data
Path: /Users/heweilin/Desktop/P056_Code_2/Raw_Data/5mRNA_TPM.csv
Dimensions: 58735 rows x 51 columns
Column names: ['Unnamed: 0', 'P26m', 'P31m', 'P33m', 'P37m', 'P42m', 'P51m', 'P52m', 'P55m', 'P59m']
              ... (and 41 more columns)
Data types:
    float64: 50 columns
    object: 1 columns

First 5 rows:
        Unnamed: 0          P26m          P31m          P33m          P37m          P42m          P51m          P52m           P55m          P59m
0  ENSG00000213218  13031.394620  54628.551700  64073.976140  125395.84930  84797.883570  85677.974950  91199.705530  108029.726600  16549.916290
1  ENSG00000136488  19798.206670  55595.539630  61209.326870  104521.52130  65546.348340  92972.924610  76486.532730   85963.344440  28180.767680
2  ENSG00000156508   8237.236546   8638.347144   4620.087343    3563.53485   6642.815691   4994.102332   8710.949269    3904.417469   7456.400737
3  ENSG00000105825   5160.342163  28885.268020 

In [36]:
import pandas as pd
import numpy as np
import os
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import warnings
warnings.filterwarnings('ignore')

# Set file paths
base_dir = "/Users/heweilin/Desktop/P056_Code_2"
raw_data_dir = os.path.join(base_dir, "Raw_Data")
processed_data_dir = os.path.join(base_dir, "Processed_Data")

# Input file paths
mrna_path = os.path.join(raw_data_dir, "5mRNA_TPM.csv")
mirna_path = os.path.join(raw_data_dir, "6miRNA_TPM.csv")
cpg_path = os.path.join(processed_data_dir, "1_PD_PromoterRegion_CpGs.csv")
clinical_path = os.path.join(raw_data_dir, "7Clinical_data50.csv")
triplets_path = os.path.join(processed_data_dir, "3_candidate_triplets.csv")
dna_mrna_path = os.path.join(raw_data_dir, "9Integrated_DNA_mRNA.xlsx")

# Ensure output directory exists
os.makedirs(processed_data_dir, exist_ok=True)

def load_and_prepare_data():
    """Load and preprocess all data files"""
    print("=" * 80)
    print("DATA LOADING AND PREPROCESSING")
    print("=" * 80)
    
    # 1. Load mRNA expression data
    print("1. Loading mRNA expression data...")
    mrna_data = pd.read_csv(mrna_path)
    # Handle first column as gene ID
    if 'Unnamed: 0' in mrna_data.columns:
        mrna_data = mrna_data.set_index('Unnamed: 0')
    else:
        mrna_data = mrna_data.set_index(mrna_data.columns[0])
    print(f"   mRNA data: {mrna_data.shape}")
    print(f"   Sample examples: {list(mrna_data.columns[:5])}")
    
    # 2. Load miRNA expression data
    print("2. Loading miRNA expression data...")
    mirna_data = pd.read_csv(mirna_path)
    if 'sRNA' in mirna_data.columns:
        mirna_data = mirna_data.set_index('sRNA')
    else:
        mirna_data = mirna_data.set_index(mirna_data.columns[0])
    print(f"   miRNA data: {mirna_data.shape}")
    print(f"   Sample examples: {list(mirna_data.columns[:5])}")
    
    # 3. Load CpG methylation data
    print("3. Loading CpG methylation data...")
    cpg_data = pd.read_csv(cpg_path)
    if 'Gene_Symbol' in cpg_data.columns:
        cpg_data = cpg_data.set_index('Gene_Symbol')
    else:
        cpg_data = cpg_data.set_index(cpg_data.columns[0])
    print(f"   CpG data: {cpg_data.shape}")
    print(f"   Sample examples: {list(cpg_data.columns[:5])}")
    
    # 4. Load clinical data
    print("4. Loading clinical data...")
    clinical_data = pd.read_csv(clinical_path)
    print(f"   Clinical data: {clinical_data.shape}")
    print(f"   Key columns: {list(clinical_data.columns[:10])}")
    
    # 5. Load candidate triplets
    print("5. Loading candidate triplets...")
    triplets = pd.read_csv(triplets_path)
    print(f"   Number of triplets: {triplets.shape[0]}")
    
    # 6. Load DNA-mRNA integration data (for gene ID conversion)
    print("6. Loading DNA-mRNA integration data...")
    dna_mrna_integration = pd.read_excel(dna_mrna_path)
    print(f"   Integration data: {dna_mrna_integration.shape}")
    
    return mrna_data, mirna_data, cpg_data, clinical_data, triplets, dna_mrna_integration

def standardize_sample_ids():
    """Standardize sample ID mapping"""
    print("\n" + "=" * 50)
    print("STANDARDIZING SAMPLE IDs")
    print("=" * 50)
    
    # Create sample ID mapping dictionary
    # Based on ID correspondence in clinical data
    clinical_data = pd.read_csv(clinical_path)
    
    sample_mapping = {}
    for _, row in clinical_data.iterrows():
        base_id = str(row['NTUID'])
        sample_mapping[f"P{base_id}m"] = f"P{base_id}"  # mRNA
        sample_mapping[f"P{base_id}s"] = f"P{base_id}"  # miRNA
        sample_mapping[f"P{base_id}d"] = f"P{base_id}"  # CpG
    
    print(f"Created {len(sample_mapping)} sample ID mappings")
    return sample_mapping

def create_gene_id_mapping(dna_mrna_integration):
    """Create ENSEMBL ID to gene symbol mapping"""
    print("\nCreating gene ID mapping...")
    
    # Extract gene ID mapping from integration data
    gene_mapping = {}
    
    if 'Gene.stable.ID' in dna_mrna_integration.columns and 'annot.symbol' in dna_mrna_integration.columns:
        mapping_df = dna_mrna_integration[['Gene.stable.ID', 'annot.symbol']].dropna()
        gene_mapping = dict(zip(mapping_df['Gene.stable.ID'], mapping_df['annot.symbol']))
    elif 'Gene.stable.ID' in dna_mrna_integration.columns and 'SYMBOL' in dna_mrna_integration.columns:
        mapping_df = dna_mrna_integration[['Gene.stable.ID', 'SYMBOL']].dropna()
        gene_mapping = dict(zip(mapping_df['Gene.stable.ID'], mapping_df['SYMBOL']))
    
    print(f"Created {len(gene_mapping)} gene ID mappings")
    return gene_mapping

def harmonize_sample_names(data_dict, sample_mapping):
    """Harmonize sample names across all datasets"""
    print("\nHarmonizing sample names...")
    
    harmonized_data = {}
    
    for data_type, data in data_dict.items():
        print(f"Processing {data_type} data...")
        
        # Rename columns
        new_columns = []
        for col in data.columns:
            if col in sample_mapping:
                new_columns.append(sample_mapping[col])
            else:
                new_columns.append(col)
        
        data_renamed = data.copy()
        data_renamed.columns = new_columns
        
        # Remove samples with excessive missing values (e.g., P26)
        missing_threshold = 0.8  # Remove if >80% values are missing
        samples_to_keep = []
        
        for col in data_renamed.columns:
            missing_rate = data_renamed[col].isna().sum() / len(data_renamed)
            if missing_rate < missing_threshold:
                samples_to_keep.append(col)
        
        data_clean = data_renamed[samples_to_keep]
        harmonized_data[data_type] = data_clean
        
        print(f"   {data_type}: {data.shape} -> {data_clean.shape}")
        print(f"   Retained samples: {samples_to_keep[:5]}...")
    
    return harmonized_data

def create_cpg_gene_mapping(cpg_data, triplets):
    """Create gene to CpG mapping for CpG data"""
    print("\nCreating CpG-gene mapping...")
    
    # Extract CpG-gene relationships from triplets
    cpg_gene_mapping = {}
    
    for _, row in triplets.iterrows():
        cpg_id = row['CpG']
        gene_symbol = row['mRNA']
        
        # Try different CpG ID formats
        possible_cpg_names = [
            cpg_id,
            cpg_id.replace('CpG_', ''),
            gene_symbol,
            f"CpG_{gene_symbol}"
        ]
        
        found_cpg = None
        for possible_name in possible_cpg_names:
            if possible_name in cpg_data.index:
                found_cpg = possible_name
                break
        
        if found_cpg:
            cpg_gene_mapping[cpg_id] = found_cpg
        else:
            # If no exact match, try partial matching
            for cpg_index in cpg_data.index:
                if gene_symbol.lower() in str(cpg_index).lower():
                    cpg_gene_mapping[cpg_id] = cpg_index
                    break
    
    print(f"Mapped {len(cpg_gene_mapping)} CpG sites")
    return cpg_gene_mapping

def filter_triplet_features(harmonized_data, triplets, gene_mapping, cpg_gene_mapping):
    """Filter features relevant to triplets"""
    print("\n" + "=" * 50)
    print("FILTERING TRIPLET-RELEVANT FEATURES")
    print("=" * 50)
    
    filtered_data = {}
    
    # 1. Filter mRNA data
    print("1. Filtering mRNA features...")
    mrna_data = harmonized_data['mRNA']
    
    # Get required genes
    required_genes = set(triplets['mRNA'].unique())
    
    # Try direct gene symbol matching
    available_genes_direct = set(gene_mapping.values()) & required_genes
    
    # Create reverse mapping (gene symbol -> ENSEMBL ID)
    reverse_gene_mapping = {v: k for k, v in gene_mapping.items()}
    
    # Filter available mRNA features
    mrna_features_to_keep = []
    gene_symbol_mapping = {}
    
    for gene in required_genes:
        if gene in reverse_gene_mapping:
            ensembl_id = reverse_gene_mapping[gene]
            if ensembl_id in mrna_data.index:
                mrna_features_to_keep.append(ensembl_id)
                gene_symbol_mapping[ensembl_id] = gene
    
    filtered_mrna = mrna_data.loc[mrna_features_to_keep]
    # Rename index to gene symbols
    filtered_mrna.index = [gene_symbol_mapping[idx] for idx in filtered_mrna.index]
    
    filtered_data['mRNA'] = filtered_mrna
    print(f"   mRNA: {len(required_genes)} required -> {len(filtered_mrna)} available")
    
    # 2. Filter miRNA data
    print("2. Filtering miRNA features...")
    mirna_data = harmonized_data['miRNA']
    required_mirnas = set(triplets['miRNA'].unique())
    available_mirnas = set(mirna_data.index) & required_mirnas
    
    filtered_mirna = mirna_data.loc[list(available_mirnas)]
    filtered_data['miRNA'] = filtered_mirna
    print(f"   miRNA: {len(required_mirnas)} required -> {len(filtered_mirna)} available")
    
    # 3. Filter CpG data
    print("3. Filtering CpG features...")
    cpg_data = harmonized_data['CpG']
    
    cpg_features_to_keep = []
    cpg_rename_mapping = {}
    
    for cpg_id, actual_cpg in cpg_gene_mapping.items():
        if actual_cpg in cpg_data.index:
            cpg_features_to_keep.append(actual_cpg)
            cpg_rename_mapping[actual_cpg] = cpg_id
    
    if cpg_features_to_keep:
        filtered_cpg = cpg_data.loc[cpg_features_to_keep]
        # Rename index
        filtered_cpg.index = [cpg_rename_mapping[idx] for idx in filtered_cpg.index]
    else:
        # If no matching CpG found, create empty DataFrame
        filtered_cpg = pd.DataFrame()
    
    filtered_data['CpG'] = filtered_cpg
    print(f"   CpG: {len(set(triplets['CpG'].unique()))} required -> {len(filtered_cpg)} available")
    
    return filtered_data

def prepare_clinical_covariates(clinical_data, common_samples):
    """Prepare clinical covariates"""
    print("\nPreparing covariate data...")
    
    # Create sample ID mapping
    clinical_sample_mapping = {}
    for _, row in clinical_data.iterrows():
        base_id = f"P{row['NTUID']}"
        clinical_sample_mapping[base_id] = row
    
    # Build covariate matrix
    covariate_data = []
    
    for sample in common_samples:
        if sample in clinical_sample_mapping:
            row_data = clinical_sample_mapping[sample]
            covariate_data.append({
                'sample_id': sample,
                'age': row_data.get('age', np.nan),
                'BMI': row_data.get('BMI', np.nan),
                'B12_status': row_data.get('B12_status', 'Unknown'),
                'parity': row_data.get('parity', np.nan),
                'batch_miRNA': row_data.get('batch_miRNA', 1),
                'batch_mRNA': row_data.get('batch_mRNA', 1)
            })
    
    covariate_df = pd.DataFrame(covariate_data).set_index('sample_id')
    
    # Handle categorical variables
    if 'B12_status' in covariate_df.columns:
        b12_dummies = pd.get_dummies(covariate_df['B12_status'], prefix='B12', drop_first=True)
        covariate_df = pd.concat([covariate_df.drop('B12_status', axis=1), b12_dummies], axis=1)
    
    # Remove columns with all NaN values
    covariate_df = covariate_df.dropna(axis=1, how='all')
    
    print(f"   Covariate matrix: {covariate_df.shape}")
    print(f"   Covariate columns: {list(covariate_df.columns)}")
    
    return covariate_df

def standardize_expression_data(data, method='log2_zscore'):
    """Standardize expression data"""
    print(f"\nStandardizing data using {method} method...")
    
    if method == 'log2_zscore':
        # log2(TPM + 1) + Z-score
        log_data = np.log2(data + 1)
        standardized = pd.DataFrame(
            stats.zscore(log_data, axis=1, nan_policy='omit'),
            index=data.index,
            columns=data.columns
        )
    elif method == 'zscore_only':
        # Z-score only (suitable for CpG data)
        standardized = pd.DataFrame(
            stats.zscore(data, axis=1, nan_policy='omit'),
            index=data.index,
            columns=data.columns
        )
    else:
        standardized = data.copy()
    
    return standardized

def adjust_for_covariates(expression_data, covariate_df):
    """Covariate adjustment"""
    print(f"Adjusting {len(expression_data)} features for covariates...")
    
    if covariate_df.empty or covariate_df.shape[1] == 0:
        print("   No available covariates, skipping adjustment")
        return expression_data
    
    # Ensure sample matching
    common_samples = list(set(expression_data.columns) & set(covariate_df.index))
    if len(common_samples) < 10:
        print("   Insufficient sample overlap, skipping adjustment")
        return expression_data
    
    expr_subset = expression_data[common_samples]
    cov_subset = covariate_df.loc[common_samples]
    
    # Remove covariates with NaN values
    cov_clean = cov_subset.dropna(axis=1)
    if cov_clean.shape[1] == 0:
        print("   Covariates contain too many missing values, skipping adjustment")
        return expression_data
    
    adjusted_data = pd.DataFrame(index=expr_subset.index, columns=common_samples)
    
    for feature in expr_subset.index:
        y = expr_subset.loc[feature].values
        X = cov_clean.values
        
        # Remove NaN samples
        valid_mask = ~np.isnan(y)
        if valid_mask.sum() < 10:
            adjusted_data.loc[feature] = y
            continue
        
        try:
            reg = LinearRegression().fit(X[valid_mask], y[valid_mask])
            residuals = y[valid_mask] - reg.predict(X[valid_mask])
            
            adjusted_values = np.full_like(y, np.nan)
            adjusted_values[valid_mask] = residuals
            adjusted_data.loc[feature] = adjusted_values
        except:
            adjusted_data.loc[feature] = y
    
    return adjusted_data

def save_processed_data(processed_data, output_dir):
    """Save processed data"""
    print("\n" + "=" * 50)
    print("SAVING PROCESSED DATA")
    print("=" * 50)
    
    output_files = {}
    
    for data_type, data in processed_data.items():
        if not data.empty:
            filename = f"3_{data_type}_expr_adjusted.csv"
            filepath = os.path.join(output_dir, filename)
            data.to_csv(filepath, encoding='utf-8-sig')
            output_files[data_type] = filepath
            print(f"Saved {data_type}: {filepath} ({data.shape})")
        else:
            print(f"Skipped {data_type}: empty data")
    
    # Generate processing report
    report_path = os.path.join(output_dir, "3_processing_report.txt")
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write("Expression Data Standardization Processing Report\n")
        f.write("=" * 50 + "\n\n")
        
        for data_type, data in processed_data.items():
            f.write(f"{data_type} Data:\n")
            f.write(f"  Dimensions: {data.shape}\n")
            f.write(f"  Features: {len(data)}\n")
            f.write(f"  Samples: {len(data.columns) if len(data) > 0 else 0}\n")
            if len(data) > 0:
                f.write(f"  Sample IDs: {list(data.columns[:5])}\n")
            f.write("\n")
    
    print(f"Processing report: {report_path}")
    return output_files

def main():
    """Main processing pipeline"""
    print("=" * 80)
    print("EXPRESSION DATA STANDARDIZATION AND COVARIATE ADJUSTMENT")
    print("=" * 80)
    
    # 1. Load data
    mrna_data, mirna_data, cpg_data, clinical_data, triplets, dna_mrna_integration = load_and_prepare_data()
    
    # 2. Create mappings
    sample_mapping = standardize_sample_ids()
    gene_mapping = create_gene_id_mapping(dna_mrna_integration)
    
    # 3. Harmonize data format
    data_dict = {
        'mRNA': mrna_data,
        'miRNA': mirna_data,
        'CpG': cpg_data
    }
    
    harmonized_data = harmonize_sample_names(data_dict, sample_mapping)
    
    # 4. Get common samples
    common_samples = None
    for data_type, data in harmonized_data.items():
        if common_samples is None:
            common_samples = set(data.columns)
        else:
            common_samples &= set(data.columns)
    
    common_samples = list(common_samples)
    print(f"\nCommon sample count: {len(common_samples)}")
    
    # 5. Create CpG mapping
    cpg_gene_mapping = create_cpg_gene_mapping(cpg_data, triplets)
    
    # 6. Filter relevant features
    filtered_data = filter_triplet_features(harmonized_data, triplets, gene_mapping, cpg_gene_mapping)
    
    # 7. Prepare covariates
    covariate_df = prepare_clinical_covariates(clinical_data, common_samples)
    
    # 8. Standardize and adjust
    processed_data = {}
    
    for data_type, data in filtered_data.items():
        if data.empty:
            processed_data[data_type] = data
            continue
        
        print(f"\nProcessing {data_type} data...")
        
        # Ensure using common samples
        data_common = data[common_samples] if len(data) > 0 else data
        
        # Standardization
        if data_type in ['mRNA', 'miRNA']:
            standardized = standardize_expression_data(data_common, 'log2_zscore')
        else:  # CpG
            standardized = standardize_expression_data(data_common, 'zscore_only')
        
        # Covariate adjustment
        if data_type in ['mRNA', 'miRNA']:  # CpG usually not adjusted for covariates
            adjusted = adjust_for_covariates(standardized, covariate_df)
        else:
            adjusted = standardized
        
        processed_data[data_type] = adjusted
    
    # 9. Save results
    output_files = save_processed_data(processed_data, processed_data_dir)
    
    print("\n" + "=" * 80)
    print("PROCESSING COMPLETE!")
    print("Output files:")
    for data_type, filepath in output_files.items():
        print(f"  {data_type}: {filepath}")
    print("=" * 80)

if __name__ == "__main__":
    main()

EXPRESSION DATA STANDARDIZATION AND COVARIATE ADJUSTMENT
DATA LOADING AND PREPROCESSING
1. Loading mRNA expression data...
   mRNA data: (58735, 50)
   Sample examples: ['P26m', 'P31m', 'P33m', 'P37m', 'P42m']
2. Loading miRNA expression data...
   miRNA data: (2201, 50)
   Sample examples: ['P102s', 'P105s', 'P111s', 'P113s', 'P117s']
3. Loading CpG methylation data...
   CpG data: (17584, 50)
   Sample examples: ['P102d', 'P105d', 'P111d', 'P113d', 'P117d']
4. Loading clinical data...
   Clinical data: (50, 21)
   Key columns: ['NTUID', 'DNA_ID', 'mRNA_ID', 'sRNA_ID', 'age', 'BMI', 'BMI_cat', 'B12_status', 'B12_mol', 'B12supplem']
5. Loading candidate triplets...
   Number of triplets: 282
6. Loading DNA-mRNA integration data...
   Integration data: (2762, 38)

STANDARDIZING SAMPLE IDs
Created 150 sample ID mappings

Creating gene ID mapping...
Created 111 gene ID mappings

Harmonizing sample names...
Processing mRNA data...
   mRNA: (58735, 50) -> (58735, 50)
   Retained samples: ['

In [39]:
import pandas as pd
import os

# Define paths
base_path = "/Users/heweilin/Desktop/P056_Code_2/Processed_Data"
path_mrna   = os.path.join(base_path, "3_mRNA_expr_adjusted.csv")
path_mirna  = os.path.join(base_path, "3_miRNA_expr_adjusted.csv")
path_cpg    = os.path.join(base_path, "3_CpG_expr_adjusted.csv")
path_triplet = os.path.join(base_path, "3_candidate_triplets.csv")

# Load data
df_mrna = pd.read_csv(path_mrna, index_col=0)
df_mirna = pd.read_csv(path_mirna, index_col=0)
df_cpg = pd.read_csv(path_cpg, index_col=0)
df_triplets = pd.read_csv(path_triplet)

# Display first 5 rows of each
print(">>> mRNA Expression (first 5 rows):")
print(df_mrna.head(), "\n")

print(">>> miRNA Expression (first 5 rows):")
print(df_mirna.head(), "\n")

print(">>> CpG Expression (first 5 rows):")
print(df_cpg.head(), "\n")

print(">>> Triplet List (first 5 rows):")
print(df_triplets.head())


>>> mRNA Expression (first 5 rows):
              P87       P42      P131      P204       P64  ...       P68       P86      P191      P207      P246
PLEKHG4  0.142834  0.000198  0.895434 -0.583812 -1.013475  ... -1.661380 -0.581513 -0.907411 -0.385621  0.753601
SCML2    1.086286  1.113681 -0.020616  0.871553 -0.042929  ...  0.462531  0.029497  0.441360 -0.411833  0.112399
LEP     -0.019547 -0.615577 -0.089677  2.017611  0.454004  ...  1.354351  0.397417 -0.284881 -0.355310 -0.704216
LHX3     3.152699 -0.711416 -0.374365 -0.462718 -0.141049  ... -0.301088  0.217298  0.006024 -0.426089  2.896053
FBXO17  -0.559441 -0.227297  1.382265 -0.247219 -1.358195  ...  0.146625 -0.144769 -0.952598  0.192881 -0.481858

[5 rows x 49 columns] 

>>> miRNA Expression (first 5 rows):
                      P87       P42      P131      P204       P64  ...       P68       P86      P191      P207  \
sRNA                                                               ...                                        