In [7]:
# Multi-omics Raw Data Processing
# Author: Generated for omics data processing
# Date: 2025
# Note: No standardization - keeping original raw data

import pandas as pd
import numpy as np
import os

# Destination folder for every file this script writes; created up front so
# the later to_csv calls cannot fail on a missing directory.
output_dir = "/Users/heweilin/Desktop/P056_Code_2/Processed_Data/"
os.makedirs(output_dir, exist_ok=True)

# Input tables, one per omics layer.
cpg_file = "/Users/heweilin/Desktop/P056_Code_2/Processed_Data/1_PD_PromoterRegion_CpGs.csv"
mirna_file = "/Users/heweilin/Desktop/P056_Code_2/Raw_Data/6miRNA_TPM.csv"
mrna_file = "/Users/heweilin/Desktop/P056_Code_2/Raw_Data/5mRNA_TPM.csv"

print("Processing multi-omics raw data files (no standardization)...")

# 1. Process mRNA data (5mRNA_TPM.csv)
print("\nProcessing mRNA data...")
mrna_raw = pd.read_csv(mrna_file)

print(f"mRNA raw data dimensions: {mrna_raw.shape[0]} x {mrna_raw.shape[1]}")
print(f"First few column names: {list(mrna_raw.columns[:5])}")

# Work on a copy so the table as read from disk stays available unchanged.
mrna_processed = mrna_raw.copy()

# The first column holds the gene identifiers. When its header cell is blank,
# pandas.read_csv does not return '' or NaN but a placeholder label of the form
# "Unnamed: <position>" (the recorded run shows 'Unnamed: 0'), so that
# placeholder has to be recognised as "unnamed" as well.
first_col_label = mrna_processed.columns[0]
first_col_unnamed = (
    pd.isna(first_col_label)
    or str(first_col_label).strip() == ''
    or str(first_col_label).startswith('Unnamed:')
)
if first_col_unnamed:
    # Assign by position rather than rename-by-label so only the first column
    # is touched even if a label happens to be duplicated.
    mrna_processed.columns = ['Gene_Symbol'] + list(mrna_processed.columns[1:])

# Save mRNA raw data (values untouched; only the first header may be renamed)
output_file_mrna = os.path.join(output_dir, "1_OR_mRNA_expr_raw.csv")
mrna_processed.to_csv(output_file_mrna, index=False)

# "- 1" discounts the gene identifier column from the sample count.
print(f"mRNA raw data saved: {mrna_processed.shape[0]} features x {mrna_processed.shape[1]-1} samples")

# 2. miRNA layer (6miRNA_TPM.csv): load, report, and write back unchanged.
print("\nProcessing miRNA data...")
mirna_raw = pd.read_csv(mirna_file)

mirna_n_rows, mirna_n_cols = mirna_raw.shape
mirna_header_preview = list(mirna_raw.columns[:5])
print(f"miRNA raw data dimensions: {mirna_n_rows} x {mirna_n_cols}")
print(f"Column names preview: {mirna_header_preview}")

# The identifier column is expected to be named 'sRNA'; no renaming is done.
mirna_processed = mirna_raw.copy()

# Write the untouched copy next to the other processed tables.
output_file_mirna = os.path.join(output_dir, "1_OR_miRNA_expr_raw.csv")
mirna_processed.to_csv(output_file_mirna, index=False)

# One column is the identifier, the remaining ones are samples.
mirna_feature_count, mirna_column_count = mirna_processed.shape
print(f"miRNA raw data saved: {mirna_feature_count} features x {mirna_column_count - 1} samples")

# 3. CpG methylation layer (1_PD_PromoterRegion_CpGs.csv): load, report, and
# write back unchanged.
print("\nProcessing CpG methylation data...")
cpg_raw = pd.read_csv(cpg_file)

cpg_n_rows, cpg_n_cols = cpg_raw.shape
cpg_header_preview = list(cpg_raw.columns[:5])
print(f"CpG raw data dimensions: {cpg_n_rows} x {cpg_n_cols}")
print(f"Column names preview: {cpg_header_preview}")

# The identifier column is expected to be named 'Gene_Symbol'; nothing is renamed.
cpg_processed = cpg_raw.copy()

# Write the untouched copy next to the other processed tables.
output_file_cpg = os.path.join(output_dir, "1_OR_CpG_expr_raw.csv")
cpg_processed.to_csv(output_file_cpg, index=False)

# One column is the identifier, the remaining ones are samples.
cpg_feature_count, cpg_column_count = cpg_processed.shape
print(f"CpG raw data saved: {cpg_feature_count} features x {cpg_column_count - 1} samples")

# Data summary
# Header uses the same rule / title / rule layout as the other sections.
print("\n" + "="*50)
print("DATA PROCESSING SUMMARY")
print("="*50)

# Report the mapping from the path variables actually used for reading and
# writing, so the summary cannot drift from the real inputs/outputs.
print("Input files with full paths:")
print(f"1. {mrna_file} -> {os.path.basename(output_file_mrna)}")
print(f"2. {mirna_file} -> {os.path.basename(output_file_mirna)}")
print(f"3. {cpg_file} -> {os.path.basename(output_file_cpg)}")

# "- 1" discounts the feature identifier column from each sample count.
print("\nOutput dimensions:")
print(f"- mRNA: {mrna_processed.shape[0]} genes x {mrna_processed.shape[1]-1} samples")
print(f"- miRNA: {mirna_processed.shape[0]} miRNAs x {mirna_processed.shape[1]-1} samples")
print(f"- CpG: {cpg_processed.shape[0]} CpG sites x {cpg_processed.shape[1]-1} samples")

# Check sample overlap
def _strip_omics_suffix(sample_name, suffixes=("m", "s", "d")):
    """Return ``sample_name`` without one trailing omics-type suffix.

    In the recorded run the sample columns carry a one-letter suffix per
    table ('P26m' in mRNA, 'P102s' in miRNA, 'P102d' in CpG). Exactly one
    suffix is removed; ``str.rstrip('msd')`` would instead strip *every*
    trailing 'm'/'s'/'d' character and could merge distinct sample IDs.
    """
    sample_name = str(sample_name)
    for suffix in suffixes:
        if suffix and sample_name.endswith(suffix):
            return sample_name[:-len(suffix)]
    return sample_name


# Get sample column names (excluding the first column which contains feature names)
mrna_samples = list(mrna_processed.columns[1:])
mirna_samples = list(mirna_processed.columns[1:])
cpg_samples = list(cpg_processed.columns[1:])

print("\nSample information:")
print(f"- mRNA samples: {len(mrna_samples)}")
print(f"- miRNA samples: {len(mirna_samples)}")
print(f"- CpG samples: {len(cpg_samples)}")

# Convert sample names to a common format for comparison (remove suffixes)
mrna_base = [_strip_omics_suffix(sample) for sample in mrna_samples]
mirna_base = [_strip_omics_suffix(sample) for sample in mirna_samples]
cpg_base = [_strip_omics_suffix(sample) for sample in cpg_samples]

# Find common base sample names; sorted so the list order is reproducible
# between runs (set iteration order is not).
common_base_samples = sorted(set(mrna_base) & set(mirna_base) & set(cpg_base))
print(f"Common samples (base names): {len(common_base_samples)}")

# Display some example sample names
print("\nExample sample names:")
print(f"- mRNA: {mrna_samples[:5]}")
print(f"- miRNA: {mirna_samples[:5]}")
print(f"- CpG: {cpg_samples[:5]}")

# Section banner for the per-dataset value statistics printed below.
value_range_rule = "=" * 50
print(f"\n{value_range_rule}")
print("DATA VALUE RANGES (RAW DATA)")
print(value_range_rule)

def show_data_range(df, name, feature_col):
    """Print pooled summary statistics for the measurement values of a table.

    Range, mean and standard deviation are computed over every individual
    value in the numeric columns taken together (NaNs skipped), not as an
    aggregate of per-column aggregates. The standard deviation is the sample
    standard deviation (ddof=1), matching pandas' default.

    Args:
        df: Table with one feature-identifier column plus measurement columns.
        name: Label used in the printed header line.
        feature_col: Name of the feature-identifier column. When present in
            ``df`` it is excluded from the statistics even if it is numeric.

    Returns:
        None. The statistics are written to stdout.
    """
    has_feature_col = feature_col in df.columns
    candidates = df.drop(columns=[feature_col]) if has_feature_col else df

    numeric_data = candidates.select_dtypes(include=[np.number])
    if numeric_data.shape[1] == 0:
        # Nothing was parsed as numeric: coerce the non-feature columns
        # instead (falling back to "all but the first column" when the
        # feature column name is not in the table). Unparseable cells
        # become NaN and are counted as missing below.
        fallback = candidates if has_feature_col else df.iloc[:, 1:]
        numeric_data = fallback.apply(pd.to_numeric, errors='coerce')

    # Flatten to one series so every statistic is taken over all values at once.
    flat_values = pd.Series(
        numeric_data.to_numpy(dtype=float, na_value=np.nan).ravel()
    )

    print(f"\n{name} data statistics:")
    print(f"- Value range: [{flat_values.min():.6f}, {flat_values.max():.6f}]")
    print(f"- Mean of all values: {flat_values.mean():.6f}")
    print(f"- Standard deviation: {flat_values.std():.6f}")

    # Count missing values; guard the percentage against an empty table.
    missing_values = int(flat_values.isna().sum())
    total_values = int(flat_values.size)
    missing_pct = 100 * missing_values / total_values if total_values else 0.0
    print(f"- Missing values: {missing_values} out of {total_values} ({missing_pct:.2f}%)")

# Print value statistics for each of the three tables.
show_data_range(mrna_processed, "mRNA", "Gene_Symbol")
show_data_range(mirna_processed, "miRNA", "sRNA")
show_data_range(cpg_processed, "CpG", "Gene_Symbol")

print("\n" + "="*50)
print("PROCESSING COMPLETE")
print("="*50)
# f-string so the actual output directory is interpolated into the message.
print(f"All raw data files saved to: {output_dir}")
print("Note: No standardization applied - original TPM/methylation values preserved")

# Additional file information
print("\nOutput files created:")
for filename in ["1_OR_mRNA_expr_raw.csv", "1_OR_miRNA_expr_raw.csv", "1_OR_CpG_expr_raw.csv"]:
    filepath = os.path.join(output_dir, filename)
    # Only files that are actually on disk are listed.
    if os.path.exists(filepath):
        file_size = os.path.getsize(filepath) / (1024*1024)  # Size in MB
        print(f"- {filename}: {file_size:.2f} MB")

print("\nReady for downstream analysis!")

Processing multi-omics raw data files (no standardization)...

Processing mRNA data...
mRNA raw data dimensions: 58735 x 51
First few column names: ['Unnamed: 0', 'P26m', 'P31m', 'P33m', 'P37m']
mRNA raw data saved: 58735 features x 50 samples

Processing miRNA data...
miRNA raw data dimensions: 2201 x 51
Column names preview: ['sRNA', 'P102s', 'P105s', 'P111s', 'P113s']
miRNA raw data saved: 2201 features x 50 samples

Processing CpG methylation data...
CpG raw data dimensions: 17584 x 51
Column names preview: ['Gene_Symbol', 'P102d', 'P105d', 'P111d', 'P113d']
CpG raw data saved: 17584 features x 50 samples

DATA PROCESSING SUMMARY
Input files with full paths:
1. /Users/heweilin/Desktop/P056_Code_2/Raw_Data/5mRNA_TPM.csv -> 1_OR_mRNA_expr_raw.csv
2. /Users/heweilin/Desktop/P056_Code_2/Raw_Data/6miRNA_TPM.csv -> 1_OR_miRNA_expr_raw.csv
3. /Users/heweilin/Desktop/P056_Code_2/Processed_Data/1_PD_PromoterRegion_CpGs.csv -> 1_OR_CpG_expr_raw.csv

Output dimensions:
- mRNA: 58735 genes x 5