In [1]:
#Load libraries
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from ctgan import CTGANSynthesizer

In [2]:
HOME_PATH = '' #home directory of the project
TRAIN_FILE = 'REAL DATASETS/TRAIN DATASETS/A_Diabetes_Data_Real_Train.csv'
SYNTHETIC_FILE = 'SYNTHETIC DATASETS/CTGAN/A_Diabetes_Data_Synthetic_CTGAN.csv'

## 1. Load the dataset

In [3]:
real_data = pd.read_csv(HOME_PATH + TRAIN_FILE)
real_data

Unnamed: 0,encounter_id,patient_nbr,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,max_glu_serum,A1Cresult,change,diabetesMed,readmitted
0,81844290,94788,Female,[70-80),1,1,7,4,48,0,11,0,0,0,9,,Norm,No,No,NO
1,396159158,135023315,Male,[50-60),1,1,7,1,42,0,5,0,0,0,6,,,No,No,>30
2,31258956,18397782,Male,[80-90),1,1,7,4,44,0,10,0,0,0,7,,,No,Yes,NO
3,210691074,67509558,Male,[80-90),1,3,7,3,54,0,8,0,0,0,8,,,Ch,Yes,NO
4,104902980,23272362,Female,[70-80),1,11,7,11,35,3,23,0,0,1,8,,,No,No,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81407,31296060,3344202,Male,[70-80),1,1,7,2,35,0,12,0,0,0,9,,,No,No,>30
81408,159139902,93611655,Male,[60-70),5,1,1,5,63,2,23,0,0,0,9,,,Ch,Yes,<30
81409,232191828,85600899,Male,[70-80),3,3,1,3,55,1,33,0,0,0,9,,,Ch,Yes,NO
81410,6740700,8208234,Female,[60-70),6,25,7,12,77,2,21,0,0,0,9,,,Ch,Yes,>30


In [4]:
real_data.dtypes

encounter_id                 int64
patient_nbr                  int64
gender                      object
age                         object
admission_type_id            int64
discharge_disposition_id     int64
admission_source_id          int64
time_in_hospital             int64
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
number_diagnoses             int64
max_glu_serum               object
A1Cresult                   object
change                      object
diabetesMed                 object
readmitted                  object
dtype: object

In [5]:
discrete_columns = real_data.select_dtypes(include=['object']).columns.tolist()
discrete_columns.append('admission_type_id')
discrete_columns.append('discharge_disposition_id')
discrete_columns.append('admission_source_id')
discrete_columns

['gender',
 'age',
 'max_glu_serum',
 'A1Cresult',
 'change',
 'diabetesMed',
 'readmitted',
 'admission_type_id',
 'discharge_disposition_id',
 'admission_source_id']

In [6]:
ctgan = CTGANSynthesizer()
ctgan.fit(real_data, discrete_columns)

In [7]:
synthetic_data = ctgan.sample(len(real_data))
synthetic_data

Unnamed: 0,encounter_id,patient_nbr,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,max_glu_serum,A1Cresult,change,diabetesMed,readmitted
0,178760744,61269320,Female,[60-70),3,1,1,4,55,1,22,0,0,0,7,,>8,Ch,Yes,NO
1,275127896,37904511,Male,[50-60),3,1,1,3,61,2,17,0,0,0,7,,,Ch,Yes,NO
2,88957358,23026313,Female,[70-80),2,1,10,3,46,0,17,0,0,0,5,,,No,Yes,NO
3,362071111,114265419,Female,[50-60),1,1,7,2,46,0,7,0,0,0,7,,,No,No,NO
4,160779558,92169994,Male,[40-50),1,1,7,4,35,1,12,0,1,2,7,,,No,No,>30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81407,55916783,962667,Female,[70-80),1,6,7,4,69,0,17,0,0,1,9,,>8,No,Yes,NO
81408,383410659,88478903,Female,[50-60),1,1,7,10,45,2,10,0,1,1,8,,Norm,Ch,Yes,>30
81409,163294538,52575141,Female,[80-90),1,3,7,9,45,2,16,0,0,1,9,,,No,No,NO
81410,112081539,4929517,Female,[70-80),1,11,7,6,78,2,17,0,0,0,4,,,No,Yes,NO


In [8]:
real_data.describe()

Unnamed: 0,encounter_id,patient_nbr,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
count,81412.0,81412.0,81412.0,81412.0,81412.0,81412.0,81412.0,81412.0,81412.0,81412.0,81412.0,81412.0,81412.0
mean,165458300.0,54323150.0,2.020562,3.7174,5.753132,4.401513,43.077839,1.338107,16.00834,0.368115,0.198853,0.636122,7.421817
std,102882700.0,38717750.0,1.441261,5.290315,4.06203,2.99373,19.736568,1.704868,8.119336,1.264267,0.928555,1.266526,1.93408
min,15738.0,135.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
25%,85026040.0,23409550.0,1.0,1.0,1.0,2.0,31.0,0.0,10.0,0.0,0.0,0.0,6.0
50%,152529000.0,45499310.0,1.0,1.0,7.0,4.0,44.0,1.0,15.0,0.0,0.0,0.0,8.0
75%,230702400.0,87492710.0,3.0,4.0,7.0,6.0,57.0,2.0,20.0,0.0,0.0,1.0,9.0
max,443857200.0,189481500.0,8.0,28.0,25.0,14.0,132.0,6.0,81.0,40.0,76.0,19.0,16.0


In [9]:
synthetic_data.describe()

Unnamed: 0,encounter_id,patient_nbr,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
count,81412.0,81412.0,81412.0,81412.0,81412.0,81412.0,81412.0,81412.0,81412.0,81412.0,81412.0,81412.0,81412.0
mean,163695900.0,57337030.0,2.21643,3.560544,6.368066,4.143664,41.662384,1.17861,15.966565,0.150715,0.066157,0.627561,6.90069
std,92782370.0,38701810.0,1.678223,5.2816,4.285218,3.014978,18.381279,1.420349,8.751668,0.659219,0.349277,1.064099,1.971926
min,-5936336.0,-4175660.0,1.0,1.0,1.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,91643900.0,25442560.0,1.0,1.0,3.0,2.0,31.0,0.0,9.0,0.0,0.0,0.0,5.0
50%,156551600.0,56696150.0,1.0,1.0,7.0,4.0,42.0,1.0,14.0,0.0,0.0,0.0,8.0
75%,216484500.0,90607370.0,3.0,3.0,7.0,5.0,54.0,2.0,21.0,0.0,0.0,1.0,8.0
max,469978700.0,204881300.0,8.0,28.0,25.0,14.0,111.0,6.0,79.0,5.0,3.0,6.0,9.0


In [10]:
synthetic_data.to_csv(HOME_PATH + SYNTHETIC_FILE, index = False)