In [1]:
#Load libraries
import os
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from copulas.multivariate import GaussianMultivariate
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [2]:
from gaussian_multivariate import DataPreProcessor

In [3]:
# Project root folder. It defaults to the original workstation location but can
# be overridden with the POST_TFM_HOME environment variable so the notebook is
# runnable on another machine without editing this cell.
# os.path.join(..., '') guarantees a trailing separator, which matters because
# the file names below are appended with plain string concatenation.
HOME_PATH = os.path.join(
    os.environ.get('POST_TFM_HOME', '/home/v6operator/work/notebooks/POST TFM/'),
    '',
)
# Input: real training split (relative to HOME_PATH).
TRAIN_FILE = 'REAL DATASETS/TRAIN DATASETS/A_Diabetes_Data_Real_Train.csv'
# Output: synthetic dataset written at the end of the notebook (relative to HOME_PATH).
SYNTHETIC_FILE = 'SYNTHETIC DATASETS/GM/A_Diabetes_Data_Synthetic_GM.csv'

## 1. Read data

In [4]:
# Load the real training split.
real_data = pd.read_csv(HOME_PATH + TRAIN_FILE)

# Columns to treat as categorical: every object-typed column plus the three
# *_id columns, which are read as integers but are cast to category here.
categorical_columns = real_data.select_dtypes(include=['object']).columns.tolist()
categorical_columns += [
    'admission_type_id',
    'discharge_disposition_id',
    'admission_source_id',
]

# Single cast through a column -> dtype mapping instead of mutating the frame
# column by column inside a loop.
real_data = real_data.astype({col: 'category' for col in categorical_columns})

# Hand the training pipeline its own copy. A plain assignment would make
# `data_train` and `real_data` the same object, so any in-place change made
# through `data_train` would also alter the frame summarised later by
# `real_data.describe()`.
data_train = real_data.copy()
real_data

Unnamed: 0,encounter_id,patient_nbr,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,max_glu_serum,A1Cresult,change,diabetesMed,readmitted
0,81844290,94788,Female,[70-80),1,1,7,4,48,0,11,0,0,0,9,,Norm,No,No,NO
1,396159158,135023315,Male,[50-60),1,1,7,1,42,0,5,0,0,0,6,,,No,No,>30
2,31258956,18397782,Male,[80-90),1,1,7,4,44,0,10,0,0,0,7,,,No,Yes,NO
3,210691074,67509558,Male,[80-90),1,3,7,3,54,0,8,0,0,0,8,,,Ch,Yes,NO
4,104902980,23272362,Female,[70-80),1,11,7,11,35,3,23,0,0,1,8,,,No,No,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81407,31296060,3344202,Male,[70-80),1,1,7,2,35,0,12,0,0,0,9,,,No,No,>30
81408,159139902,93611655,Male,[60-70),5,1,1,5,63,2,23,0,0,0,9,,,Ch,Yes,<30
81409,232191828,85600899,Male,[70-80),3,3,1,3,55,1,33,0,0,0,9,,,Ch,Yes,NO
81410,6740700,8208234,Female,[60-70),6,25,7,12,77,2,21,0,0,0,9,,,Ch,Yes,>30


In [5]:
# Check the column dtypes after the category casts and before preprocessing.
data_train.dtypes

encounter_id                   int64
patient_nbr                    int64
gender                      category
age                         category
admission_type_id           category
discharge_disposition_id    category
admission_source_id         category
time_in_hospital               int64
num_lab_procedures             int64
num_procedures                 int64
num_medications                int64
number_outpatient              int64
number_emergency               int64
number_inpatient               int64
number_diagnoses               int64
max_glu_serum               category
A1Cresult                   category
change                      category
diabetesMed                 category
readmitted                  category
dtype: object

In [6]:
# data configuration
preprocessor = DataPreProcessor(data_train)
data_train = preprocessor.preprocess_train_data()
data_train

Unnamed: 0,encounter_id,patient_nbr,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,...,A1Cresult1,A1Cresult2,A1Cresult3,change0,change1,diabetesMed0,diabetesMed1,readmitted0,readmitted1,readmitted2
0,81844290,94788,4,48,0,11,0,0,0,9,...,0.5,0.500000,0.730957,0.500000,0.730957,0.730957,0.500000,0.500000,0.500000,0.730957
1,396159158,135023315,1,42,0,5,0,0,0,6,...,0.5,0.730957,0.500000,0.500000,0.730957,0.730957,0.500000,0.500000,0.730957,0.500000
2,31258956,18397782,4,44,0,10,0,0,0,7,...,0.5,0.730957,0.500000,0.500000,0.730957,0.500000,0.730957,0.500000,0.500000,0.730957
3,210691074,67509558,3,54,0,8,0,0,0,8,...,0.5,0.730957,0.500000,0.730957,0.500000,0.500000,0.730957,0.500000,0.500000,0.730957
4,104902980,23272362,11,35,3,23,0,0,1,8,...,0.5,0.730957,0.500000,0.500000,0.730957,0.730957,0.500000,0.500000,0.500000,0.730957
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81407,31296060,3344202,2,35,0,12,0,0,0,9,...,0.5,0.730957,0.500000,0.500000,0.730957,0.730957,0.500000,0.500000,0.730957,0.500000
81408,159139902,93611655,5,63,2,23,0,0,0,9,...,0.5,0.730957,0.500000,0.730957,0.500000,0.500000,0.730957,0.730957,0.500000,0.500000
81409,232191828,85600899,3,55,1,33,0,0,0,9,...,0.5,0.730957,0.500000,0.730957,0.500000,0.500000,0.730957,0.500000,0.500000,0.730957
81410,6740700,8208234,12,77,2,21,0,0,0,9,...,0.5,0.730957,0.500000,0.730957,0.500000,0.500000,0.730957,0.500000,0.730957,0.500000


## 2. Train the model and generate data

In [7]:
# Fix the model's random state so the rows sampled from it are reproducible
# across "Restart Kernel -> Run All" executions.
RANDOM_SEED = 42
gm = GaussianMultivariate(random_state=RANDOM_SEED)
# Fit the multivariate Gaussian model on the preprocessed training frame.
gm.fit(data_train)

In [None]:
# Draw as many synthetic rows from the fitted model as there are training rows.
generated_samples = gm.sample(len(data_train))
generated_samples

## 3. Transform Generated Data

In [None]:
# Pass the sampled rows through the same preprocessor instance that prepared
# the training data.
# NOTE(review): presumably transform_data maps the encoded columns back to the
# original column layout — confirm in gaussian_multivariate.DataPreProcessor.
synthetic_data = preprocessor.transform_data(generated_samples)
synthetic_data

In [None]:
# Summary statistics of the real data, as a baseline for the synthetic data.
real_data.describe()

In [None]:
# Summary statistics of the synthetic data, to compare against the real data.
synthetic_data.describe()

In [None]:
# Persist the synthetic dataset without the index column.
# The destination folder is created first: the write would otherwise fail on a
# missing directory after the fit and sampling steps have already run.
synthetic_path = HOME_PATH + SYNTHETIC_FILE
os.makedirs(os.path.dirname(synthetic_path), exist_ok=True)
synthetic_data.to_csv(synthetic_path, index = False)