In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [4]:
# Load the merged clinical table from CSV and display its (rows, columns) shape.
# NOTE(review): absolute, machine-specific path — consider a configurable data
# directory so the notebook runs on other machines.
clinical_csv_path = "/Users/mac1/Desktop/DATAFEST/merged_clinical.csv"
overall_clinical = pd.read_csv(clinical_csv_path)
overall_clinical.shape

(1980, 35)

In [5]:
# Show the dtype pandas assigned to each column of the loaded table.
overall_clinical.dtypes

PATIENT_ID                        object
LYMPH_NODES_EXAMINED_POSITIVE    float64
NPI                              float64
CELLULARITY                       object
CHEMOTHERAPY                      object
COHORT                           float64
ER_IHC                            object
HER2_SNP6                         object
HORMONE_THERAPY                   object
INFERRED_MENOPAUSAL_STATE         object
SEX                               object
INTCLUST                          object
AGE_AT_DIAGNOSIS                 float64
OS_MONTHS                        float64
OS_STATUS                         object
CLAUDIN_SUBTYPE                   object
THREEGENE                         object
VITAL_STATUS                      object
LATERALITY                        object
RADIO_THERAPY                     object
HISTOLOGICAL_SUBTYPE              object
BREAST_SURGERY                    object
RFS_MONTHS                       float64
RFS_STATUS                        object
CANCER_TYPE     

In [6]:
# Variable groups used to organise the clinical table.

# Continuous variables (cast to float in a later cell).
continue_set = [
    "LYMPH_NODES_EXAMINED_POSITIVE",  # Number of lymph nodes positive
    "NPI",                            # Nottingham prognostic index
    "AGE_AT_DIAGNOSIS",               # Age at diagnosis
    "TUMOR_SIZE",                     # Tumor size
    "TMB_NONSYNONYMOUS",              # TMB nonsynonymous
]
# Categorical variables (cast to object dtype in a later cell).
categorical_set = [
    "CELLULARITY",                # Tumor content
    "CHEMOTHERAPY",               # Chemotherapy
    "COHORT",                     # Cohort
    "HORMONE_THERAPY",            # Hormone therapy
    "INFERRED_MENOPAUSAL_STATE",  # Inferred menopausal state
    "CLAUDIN_SUBTYPE",            # Pam50 Claudin low subtype
    "THREEGENE",                  # X3 Gene classifier subtype
    "PR_STATUS",                  # PR status
    "LATERALITY",                 # Primary tumor laterality
    "RADIO_THERAPY",              # Radio therapy
    "HISTOLOGICAL_SUBTYPE",       # Tumor other histologic subtype
    "BREAST_SURGERY",             # Type of breast surgery
    "CANCER_TYPE_DETAILED",       # Cancer type detailed
    "ER_STATUS",                  # ER status
    "HER2_STATUS",                # HER2 status
    "GRADE",                      # Neoplasm histologic grade
    "ONCOTREE_CODE",              # Oncotree code
    "TUMOR_STAGE",                # Tumor stage
]
# Outcome (survival / relapse) variables.
outcome_set = [
    "OS_MONTHS",     # Overall survival months
    "OS_STATUS",     # Overall survival status
    "RFS_MONTHS",    # Relapse free months
    "RFS_STATUS",    # Relapse free status
    "VITAL_STATUS",  # Patient's vital status
]

In [7]:
# Cast every variable in the continuous set to float64, then write the
# converted columns back onto the frame.
continuous_as_float = overall_clinical.loc[:, continue_set].astype("float64")
overall_clinical[continue_set] = continuous_as_float

In [8]:
# Cast every variable in the categorical set to object dtype, then write the
# converted columns back onto the frame.
categorical_as_object = overall_clinical.loc[:, categorical_set].astype(object)
overall_clinical[categorical_set] = categorical_as_object

In [10]:
## Subset a new dataset that keeps only the patient identifier plus the
## outcome, continuous and categorical variables, in that order.
requested_vars = (
    ["PATIENT_ID"]
    + list(outcome_set)
    + list(continue_set)
    + list(categorical_set)
)
# Drop repeated names (first occurrence wins) so a variable listed in more than
# one set is not selected twice, which would yield duplicate columns.
requested_vars = list(dict.fromkeys(requested_vars))

# Report requested columns that are absent instead of dropping them silently;
# nothing is printed when every requested column is present.
missing_vars = [c for c in requested_vars if c not in overall_clinical.columns]
if missing_vars:
    print(f"Columns not found in overall_clinical (skipped): {missing_vars}")

# keep only columns that actually exist, preserve order
ordered_vars = [c for c in requested_vars if c in overall_clinical.columns]
# .copy() detaches the subset so later edits do not touch overall_clinical.
overall_clinical_selected = overall_clinical[ordered_vars].copy()
overall_clinical_selected.columns.tolist()

['PATIENT_ID',
 'OS_MONTHS',
 'OS_STATUS',
 'RFS_MONTHS',
 'RFS_STATUS',
 'VITAL_STATUS',
 'LYMPH_NODES_EXAMINED_POSITIVE',
 'NPI',
 'AGE_AT_DIAGNOSIS',
 'TUMOR_SIZE',
 'TMB_NONSYNONYMOUS',
 'CELLULARITY',
 'CHEMOTHERAPY',
 'COHORT',
 'HORMONE_THERAPY',
 'INFERRED_MENOPAUSAL_STATE',
 'CLAUDIN_SUBTYPE',
 'THREEGENE',
 'PR_STATUS',
 'LATERALITY',
 'RADIO_THERAPY',
 'HISTOLOGICAL_SUBTYPE',
 'BREAST_SURGERY',
 'CANCER_TYPE_DETAILED',
 'ER_STATUS',
 'HER2_STATUS',
 'GRADE',
 'ONCOTREE_CODE',
 'TUMOR_STAGE']

In [12]:
# Export the selected clinical variables to CSV without the DataFrame index.
# NOTE(review): absolute, machine-specific output path — consider a configurable
# output directory so the notebook runs on other machines.
selected_csv_path = "/Users/mac1/Desktop/clinical.csv"
overall_clinical_selected.to_csv(selected_csv_path, index=False)