In [5]:
# test_data_generator.ipynb

import pandas as pd
import numpy as np
import os
from pathlib import Path

# Project root is taken to be the parent of the current working directory
current_dir = Path.cwd()
project_root = current_dir.parent  # go up one directory

# Absolute data locations (plain strings produced by os.path.join)
DATA_DIR = os.path.join(project_root, "data")
COHORT_DATA_DIR = os.path.join(DATA_DIR, "cohort_data")
MERGED_DATA_DIR = os.path.join(DATA_DIR, "merged_data")

# Every output folder, described as (base directory, sub-folder components)
output_dirs = [
    os.path.join(base_dir, *parts)
    for base_dir, parts in (
        (COHORT_DATA_DIR, ("exprs",)),
        (COHORT_DATA_DIR, ("pData", "original")),
        (COHORT_DATA_DIR, ("pData", "imputed")),
        (MERGED_DATA_DIR, ("exprs", "common_genes")),
        (MERGED_DATA_DIR, ("pData", "imputed")),
    )
]

# Create the folders; already-existing ones are left untouched
for dir_path in output_dirs:
    os.makedirs(dir_path, exist_ok=True)

# Fix the global NumPy seed so a full top-to-bottom run is repeatable
np.random.seed(42)

# Cohort name -> number of samples to generate for that cohort
# (insertion order is the order in which cohorts are generated and merged)
_cohort_sizes = [
    ('Belfast_2018_Jain', 248),
    ('CPC_GENE_2017_Fraser', 73),
    ('DKFZ_2018_Gerhauser', 82),
    ('CancerMap_2017_Luca', 133),
    ('MSKCC_2010_Taylor', 131),
    ('Atlanta_2014_Long', 100),
    ('CamCap_2016_Ross_Adams', 112),
    ('Stockholm_2016_Ross_Adams', 92),
    ('CPGEA_2020_Li', 120),
]
cohorts = dict(_cohort_sizes)



In [6]:
# 100 synthetic gene identifiers: "ENSG" followed by an 11-digit zero-padded counter
genes = ['ENSG' + format(gene_idx, '011d') for gene_idx in range(100)]

def generate_expression_data(n_samples, n_genes=100):
    """Generate random expression data.

    Parameters
    ----------
    n_samples : int
        Number of rows (samples) to generate.
    n_genes : int, default 100
        Number of columns (genes) to generate.

    Returns
    -------
    pandas.DataFrame
        Frame of shape ``(n_samples, n_genes)`` filled with draws from a
        normal distribution (mean 0, standard deviation 1) taken from the
        global NumPy random state. Columns are named ``ENSG`` followed by
        the zero-padded (11 digits) column position; rows carry the default
        integer index.
    """
    # Build the column labels from n_genes. Previously the fixed 100-entry
    # module-level gene list was always used, so any n_genes other than 100
    # failed with a shape mismatch. For the default the labels are unchanged.
    gene_names = [f'ENSG{str(i).zfill(11)}' for i in range(n_genes)]
    values = np.random.normal(0, 1, (n_samples, n_genes))
    return pd.DataFrame(values, columns=gene_names)

def generate_clinical_data(n_samples):
    """Generate random clinical data.

    Each column is drawn independently from the global NumPy random state,
    in the order the columns appear in the result. No relationship between
    columns is enforced (for example, GLEASON_SCORE is not tied to the two
    GLEASON_SCORE_1 / GLEASON_SCORE_2 columns).

    Parameters
    ----------
    n_samples : int
        Number of rows to generate.

    Returns
    -------
    pandas.DataFrame
        Frame with ``n_samples`` rows and 11 clinical columns, using the
        default integer index.
    """
    uniform = np.random.uniform
    choice = np.random.choice

    # Draw order matters for reproducibility under a fixed seed, so the
    # columns are filled one by one in their final order.
    columns = {}
    columns['AGE'] = uniform(45, 80, n_samples)
    columns['CLIN_T_STAGE'] = choice(['T1', 'T1C', 'T2', 'T2A', 'T2B', 'T3'], n_samples)
    columns['PATH_T_STAGE'] = choice(['T2', 'T2A', 'T2B', 'T3', 'T3A', 'T3B'], n_samples)
    columns['GLEASON_SCORE'] = choice([6, 7, 8, 9], n_samples)
    columns['GLEASON_SCORE_1'] = choice([3, 4, 5], n_samples)
    columns['GLEASON_SCORE_2'] = choice([3, 4, 5], n_samples)
    columns['PRE_OPERATIVE_PSA'] = uniform(0, 100, n_samples)
    columns['MONTH_TO_BCR'] = uniform(0, 120, n_samples)
    columns['BCR_STATUS'] = choice([0, 1], n_samples)
    columns['MONTH_TO_CEP'] = uniform(0, 120, n_samples)
    columns['CEP_STATUS'] = choice([0, 1], n_samples)

    return pd.DataFrame(columns)



In [7]:
# Generate and save cohort data
#
# Re-seed at the top of this cell: all random draws happen here, so without
# this a re-run of the cell alone (without re-running the setup cell) would
# continue the random stream and silently write different CSV files.
# Keep this value identical to the seed in the setup cell; a full
# top-to-bottom run then produces exactly the same data as before.
np.random.seed(42)

# Per-cohort frames collected for the merged data set
all_exprs = []
all_pdata = []

for cohort, n_samples in cohorts.items():
    # Generate expression data; rows are labelled S0000, S0001, ...
    exprs = generate_expression_data(n_samples)
    exprs.index = [f"S{str(i).zfill(4)}" for i in range(n_samples)]
    exprs.to_csv(os.path.join(COHORT_DATA_DIR, "exprs", f"{cohort}.csv"))

    # Generate clinical data with the same row labels as the expression data
    pdata = generate_clinical_data(n_samples)
    pdata.index = exprs.index

    # Save original and imputed (same for test data)
    pdata.to_csv(os.path.join(COHORT_DATA_DIR, "pData", "original", f"{cohort}.csv"))
    pdata.to_csv(os.path.join(COHORT_DATA_DIR, "pData", "imputed", f"{cohort}.csv"))

    # Store for merged data: prefix the row labels with the cohort name
    # ("<cohort>.<sample id>") after the per-cohort files have been written,
    # so the saved per-cohort CSVs keep the short labels.
    exprs.index = [f"{cohort}.{idx}" for idx in exprs.index]
    pdata.index = exprs.index
    all_exprs.append(exprs)
    all_pdata.append(pdata)



In [8]:
# Stack the per-cohort frames row-wise into one expression and one clinical table
merged_exprs = pd.concat(all_exprs)
merged_pdata = pd.concat(all_pdata)

# Tissue type per sample: rows whose label contains 'Atlanta' are FFPE,
# all other rows are fresh frozen
tissue_labels = [
    'FFPE' if 'Atlanta' in sample_label else 'Fresh_frozen'
    for sample_label in merged_pdata.index
]
merged_pdata['TISSUE'] = tissue_labels

# Write both merged tables to their target files
merged_exprs_path = os.path.join(
    MERGED_DATA_DIR, "exprs", "common_genes", "common_genes_knn_imputed.csv"
)
merged_pdata_path = os.path.join(
    MERGED_DATA_DIR, "pData", "imputed", "merged_imputed_pData.csv"
)
merged_exprs.to_csv(merged_exprs_path)
merged_pdata.to_csv(merged_pdata_path)

# Summary of what was written, printed as one newline-joined report
summary_lines = [
    "Generated test data files with 100 genes:",
    "\nCohort-wise files:",
    *(f"- {cohort}.csv (exprs and pData)" for cohort in cohorts),
    "\nMerged files:",
    "- common_genes_knn_imputed.csv",
    "- merged_imputed_pData.csv",
]
print("\n".join(summary_lines))

Generated test data files with 100 genes:

Cohort-wise files:
- Belfast_2018_Jain.csv (exprs and pData)
- CPC_GENE_2017_Fraser.csv (exprs and pData)
- DKFZ_2018_Gerhauser.csv (exprs and pData)
- CancerMap_2017_Luca.csv (exprs and pData)
- MSKCC_2010_Taylor.csv (exprs and pData)
- Atlanta_2014_Long.csv (exprs and pData)
- CamCap_2016_Ross_Adams.csv (exprs and pData)
- Stockholm_2016_Ross_Adams.csv (exprs and pData)
- CPGEA_2020_Li.csv (exprs and pData)

Merged files:
- common_genes_knn_imputed.csv
- merged_imputed_pData.csv
