In [None]:
import os
import random

import numpy as np
import pandas as pd

# Seed both random number generators for reproducibility.
# NumPy's global generator drives the gene matrix and the labels, while the
# standard-library `random` module drives the clinical table; seeding only
# one of them would leave the other output different on every run.
np.random.seed(42)
random.seed(42)

# Anonymised gene identifiers; these become the columns of the gene table.
modified_gene_names = [
    'YQ47', 'ZJQI', 'ZLHS', 'QUL2KD', 'PYMZ3B', 'VX9', 'RWB',
    'CKNF3', 'HNCG', 'TWK', 'NEX', 'MZPD8', 'LVR', 'JHTCUQ',
    'FQ7', 'XYWZ6', 'UHV', 'GZPKL8', 'HUVQX2', 'JPLS', 'RQXT',
    'LGRP8', 'QYPL9', 'ALBF6', 'ZNEX3', 'PFVG7', 'MNS', 'KQOZ4',
]

# Column headers of the clinical table, in output order.
clinical_names = [
    'Gender',
    'Age',
    'Tobacco Use',
    'Pathology',
    'treatment',
    'PD-L1',
    'Line of IO_conden',
]

# Patient IDs for 100 patients: the prefix 'MDA_100' followed by a
# zero-padded three-digit running number (001 .. 100).
patient_ids = [f'MDA_100{number:03d}' for number in range(1, 101)]

# 1. Create data_gene.csv (gene values for the first 80% of the patients)
# NOTE(review): the comments here previously said "60%", but the fraction
# actually used is 0.8. The code's value is kept; change GENE_FRACTION if
# 60% was the intended coverage.
GENE_FRACTION = 0.8
num_patients_gene = int(GENE_FRACTION * len(patient_ids))

# Uniform random floats in [0, 1): one row per patient, one column per gene.
gene_data = np.random.rand(num_patients_gene, len(modified_gene_names))

# Gene rows belong to the FIRST num_patients_gene patient IDs.
df_gene = pd.DataFrame(gene_data, columns=modified_gene_names)
df_gene['Patient_ID'] = patient_ids[:num_patients_gene]
df_gene = df_gene.set_index('Patient_ID')

# 2. Create data_clinic.csv (80% of 100 patients)
num_patients_clinic = int(0.8 * len(patient_ids))


def _draw_clinic_row():
    """Return one random clinical record, ordered like `clinical_names`.

    The draws happen left to right in the list below, so the sequence of
    calls into the `random` module per row is fixed.
    """
    return [
        random.choice(['Male', 'Female']),                                # Gender
        random.randint(18, 90),                                           # Age, 18..90 inclusive
        random.choice([0, 1]),                                            # Tobacco Use: 0 = no, 1 = yes
        random.choice(['Cancer', 'Non-Cancer']),                          # Pathology
        random.choice(['Surgery', 'Chemotherapy', 'Radiation', 'None']),  # treatment
        round(random.uniform(0, 100), 2),                                 # PD-L1 score, 2 decimals
        random.choice(['1st Line', '2nd Line', '3rd Line', 'No IO']),     # Line of IO_conden
    ]


clinic_data = [_draw_clinic_row() for _ in range(num_patients_clinic)]

# Clinical rows belong to the LAST num_patients_clinic patient IDs.
df_clinic = pd.DataFrame(clinic_data, columns=clinical_names)
df_clinic['Patient_ID'] = patient_ids[-num_patients_clinic:]
df_clinic = df_clinic.set_index('Patient_ID')

# 3. Create data_label.csv (one binary label per patient ID)
# Each label is drawn from {0, 1} with equal probability.
labels = np.random.choice([0, 1], size=len(patient_ids), p=[0.5, 0.5])

# Build the frame directly on a 'Patient_ID' index.
patient_index = pd.Index(patient_ids, name='Patient_ID')
df_label = pd.DataFrame({'Label': labels}, index=patient_index)

# Save to CSV files
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (no-op if it is already there).
os.makedirs('to_pengjie', exist_ok=True)
df_gene.to_csv('to_pengjie/data_gene.csv')
df_clinic.to_csv('to_pengjie/data_clinic.csv')
df_label.to_csv('to_pengjie/data_label.csv')

# Print the head of each table as a quick sanity check.
# The first two previews are followed by a trailing "\n" argument to leave
# space before the next table; the last preview is printed without it.
for heading, frame in (
    ("data_gene.csv:", df_gene),
    ("data_clinic.csv:", df_clinic),
):
    print(heading)
    print(frame.head(), "\n")

print("data_label.csv:")
print(df_label.head())
