In [29]:
import ehrapy as ep
import pandas as pd
import numpy as np
import anndata as ad

In [30]:
df = pd.read_csv("data-CAT.csv")

In [39]:
ordinal_columns = [
    "pT_status", "pN_status", "G_status", "L_status", "V_status", "Pn_status", "R_status", "M_status_postop"
]
nominal_columns = [
    "Status_OS", "PFS_sum", "Status_PFS", "sex", "Weight_loss_>10%_6months_before_OP", "DM",
    "Long_term_steroid_medication","C2_abusus", "CEA (ng/ml)","Localisation", "Abdominal pain", 
    "icterus", "new_onset_DM", "Episode_acute_pancreatitis", "Episode_chronic_pancreatitis", 
    "OP_technique", "Vessel_resection", "Resection_liver_met_intraop", "histology",
    "Clavien-Dindo",  "ICD-10 Diagnosis",
]   
numerical_columns = [
    "ID_scRNAseq", "NIssc", "OS", "PFS", "Liver_met_sum", "Lung_met_sum", "other_met_sum", "local_rec_sum", 
    "age_at_diagnosis", "Size_cm", "Weight_kg", "BMI", "ASA", "Serumalbumin (g/dl)", "Serum-CRP (mg/l)", 
    "Leucocytes (n*1000/?l)", "Hb (g/dl)", "Creatinine (mg/dl)", "Smoker_py", "Bilirubin (mg/dl)", 
    "Gamma-GT", "AP (U/l)", "Amylase (U/l)", "Lipase (U/l)","CA 19-9 (U/ml)",  
    "Number_resected_lymphnodes","Number_tumor_infiltrated_lymphnodes", "Ratio_tumorLN/resectedLN", "Tumorsize_mm"   
]

In [40]:
df[ordinal_columns] = df[ordinal_columns].astype("category")
print("Ordinal columns converted to categorical.")

Ordinal columns converted to categorical.


In [41]:
df_nominal = pd.get_dummies(df[nominal_columns], drop_first=True)
print("Nominal columns one-hot encoded.") 

Nominal columns one-hot encoded.


In [42]:
df_numerical = df[numerical_columns].values

In [43]:
obs_data = pd.concat([df[ordinal_columns], df_nominal], axis=1)

In [44]:
X_data = df_numerical

In [45]:
adata = ad.AnnData(X=X_data, obs=obs_data)

print("AnnData object created successfully.")

AnnData object created successfully.


In [48]:
adata.write("processed.h5ad")
print("AnnData object saved as 'processed.h5ad'.")

AnnData object saved as 'processed.h5ad'.


In [49]:
adata_loaded = ad.read("processed.h5ad")
print("AnnData object loaded.")

AnnData object loaded.


In [50]:
print(adata_loaded)

AnnData object with n_obs × n_vars = 23 × 29
    obs: 'pT_status', 'pN_status', 'G_status', 'L_status', 'V_status', 'Pn_status', 'R_status', 'M_status_postop', 'Status_OS_dead', 'PFS_sum_03.07.2023', 'PFS_sum_12.02.2018', 'PFS_sum_13.06.2022', 'PFS_sum_18.10.2022', 'PFS_sum_21.05.2021', 'PFS_sum_24.05.2021', 'PFS_sum_24.05.2023', 'PFS_sum_28.04.2023', 'PFS_sum_29.08.2023', 'PFS_sum_30.07.2018', 'PFS_sum_Data still not available (OP< 6 months)', 'PFS_sum_Periop. Tod', 'Status_PFS_recurrence', 'sex_male', 'Weight_loss_>10%_6months_before_OP_Yes', 'DM_Yes', 'DM_yes', 'Long_term_steroid_medication_Yes', 'CEA (ng/ml)_1.95', 'CEA (ng/ml)_10.53', 'CEA (ng/ml)_2.2', 'CEA (ng/ml)_2.8', 'CEA (ng/ml)_5', 'CEA (ng/ml)_6.45', 'CEA (ng/ml)_8.27', 'CEA (ng/ml)_<1,80', 'Localisation_Cauda', 'Localisation_Corpus', 'Abdominal pain_No', 'icterus_No', 'new_onset_DM_No', 'OP_technique_Traverso-Longmire OP', 'OP_technique_total pancreatectomy', 'Resection_liver_met_intraop_No', 'Clavien-Dindo_grade 1', 'Cla