In [1]:
#Load libraries
import os
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import torch
from ctgan import CTGANSynthesizer

In [2]:
# Project root prefix. It is concatenated directly in front of the relative
# paths below, so a non-empty value must end with a path separator.
HOME_PATH = ""
# Real training split that is read in and used to fit the synthesizer.
TRAIN_FILE = "REAL DATASETS/TRAIN DATASETS/B_Cardio_Data_Real_Train.csv"
# Destination CSV for the generated synthetic sample.
SYNTHETIC_FILE = "SYNTHETIC DATASETS/CTGAN/B_Cardio_Data_Synthetic_CTGAN.csv"

## 1. Load the dataset

In [3]:
# Read the real training split; the path is the project prefix followed
# immediately by the relative file name (no separator is inserted).
real_data = pd.read_csv(f"{HOME_PATH}{TRAIN_FILE}")
real_data

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,67617,21876,1,154,80.0,130,90,2,1,0,0,1,1
1,96320,16717,2,162,70.0,140,90,1,1,0,0,0,1
2,17571,21128,2,174,92.0,150,100,1,1,0,0,1,1
3,46492,23366,2,173,76.0,120,82,1,1,0,0,1,1
4,945,20281,1,160,60.0,120,80,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
55995,53137,16001,2,170,75.0,150,80,1,1,1,0,1,1
55996,8918,23209,2,162,73.0,160,90,1,1,0,0,1,1
55997,78302,23589,1,169,74.0,120,80,1,1,0,0,1,0
55998,1197,18227,1,167,70.0,120,80,1,1,0,0,0,0


In [4]:
# Show the dtype of every column in the real data. All columns are numeric,
# so dtypes alone do not reveal which ones are categorical codes.
real_data.dtypes

id               int64
age              int64
gender           int64
height           int64
weight         float64
ap_hi            int64
ap_lo            int64
cholesterol      int64
gluc             int64
smoke            int64
alco             int64
active           int64
cardio           int64
dtype: object

In [5]:
# Columns handed to CTGAN as discrete (categorical) features. Every other
# column in the frame is left for the synthesizer to treat as continuous.
discrete_columns = [
    "gender",
    "cholesterol",
    "gluc",
    "smoke",
    "alco",
    "active",
    "cardio",
]
discrete_columns

['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio']

In [6]:
# Seed the global NumPy and PyTorch RNGs before training so that a
# Restart & Run All produces the same synthesizer and the same synthetic sample.
# (GPU kernels can still introduce small run-to-run differences.)
# NOTE(review): presumably CTGAN draws only from these two global RNGs -
# confirm against the installed ctgan version.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Fit CTGAN on the full real frame; `discrete_columns` names the categorical
# features, everything else is modelled as continuous.
# NOTE(review): `id` is not in `discrete_columns`, so it is modelled as a
# continuous feature; the recorded synthetic summary shows a negative minimum
# id. Consider dropping `id` before fitting if it is only a row identifier -
# confirm what downstream consumers of the CSV expect.
ctgan = CTGANSynthesizer()
ctgan.fit(real_data, discrete_columns)

In [7]:
# Generate one synthetic row per real training row so both frames have the
# same size when they are compared.
synthetic_data = ctgan.sample(real_data.shape[0])
synthetic_data

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,95179,20434,2,171,72.929260,133,84,3,1,0,0,1,1
1,66469,19869,1,159,73.492993,175,95,1,1,1,0,1,1
2,48068,18480,1,169,60.729046,119,76,1,1,0,0,1,1
3,32289,21122,1,162,83.544909,104,74,1,1,0,0,1,1
4,40044,18057,2,171,92.955322,137,78,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
55995,89136,20408,1,171,94.750406,134,80,1,1,0,0,1,1
55996,67066,14787,2,172,52.397482,118,77,1,1,0,0,1,0
55997,39692,19069,2,174,79.765181,92,70,1,1,0,0,1,0
55998,87110,21793,1,162,62.687074,141,91,1,1,0,0,1,1


In [8]:
# CTGAN emits weight with many decimal places; keep one decimal, which is
# how the weight values appear in the real data shown above.
synthetic_data["weight"] = synthetic_data["weight"].round(1)
synthetic_data

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,95179,20434,2,171,72.9,133,84,3,1,0,0,1,1
1,66469,19869,1,159,73.5,175,95,1,1,1,0,1,1
2,48068,18480,1,169,60.7,119,76,1,1,0,0,1,1
3,32289,21122,1,162,83.5,104,74,1,1,0,0,1,1
4,40044,18057,2,171,93.0,137,78,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
55995,89136,20408,1,171,94.8,134,80,1,1,0,0,1,1
55996,67066,14787,2,172,52.4,118,77,1,1,0,0,1,0
55997,39692,19069,2,174,79.8,92,70,1,1,0,0,1,0
55998,87110,21793,1,162,62.7,141,91,1,1,0,0,1,1


In [9]:
# Summary statistics of the real training data, used as the reference when
# reading the synthetic summary in the following cell.
real_data.describe()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
count,56000.0,56000.0,56000.0,56000.0,56000.0,56000.0,56000.0,56000.0,56000.0,56000.0,56000.0,56000.0,56000.0
mean,49971.932589,19464.929107,1.349982,164.348125,74.188586,128.737893,97.025536,1.366286,1.224946,0.08875,0.054839,0.803768,0.499411
std,28875.383115,2469.403825,0.476968,8.187461,14.361142,148.345232,197.976631,0.679328,0.57002,0.284385,0.227668,0.39715,0.500004
min,0.0,10798.0,1.0,55.0,22.0,-140.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,24985.0,17658.0,1.0,159.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
50%,49942.0,19699.0,1.0,165.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
75%,74990.25,21325.0,2.0,170.0,82.0,140.0,90.0,2.0,1.0,0.0,0.0,1.0,1.0
max,99999.0,23713.0,2.0,250.0,200.0,14020.0,11000.0,3.0,3.0,1.0,1.0,1.0,1.0


In [10]:
# Summary statistics of the synthetic sample; compare column by column with
# the real-data summary in the previous cell (count, mean, spread, min/max).
synthetic_data.describe()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
count,56000.0,56000.0,56000.0,56000.0,56000.0,56000.0,56000.0,56000.0,56000.0,56000.0,56000.0,56000.0,56000.0
mean,55598.976125,19519.057554,1.398196,164.811179,75.470066,122.478214,98.265768,1.461518,1.318036,0.167804,0.156571,0.807786,0.632679
std,29048.605428,2072.387195,0.489531,6.657466,13.95573,15.132236,138.470477,0.759413,0.607397,0.373695,0.363399,0.394044,0.482079
min,-1918.0,13976.0,1.0,142.0,42.0,-47.0,29.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,32945.5,18357.0,1.0,160.0,65.1,118.0,73.0,1.0,1.0,0.0,0.0,1.0,0.0
50%,60997.5,19627.0,1.0,164.0,73.3,120.0,79.0,1.0,1.0,0.0,0.0,1.0,1.0
75%,80020.5,21076.0,2.0,170.0,82.9,131.0,85.0,2.0,1.0,0.0,0.0,1.0,1.0
max,103794.0,24174.0,2.0,192.0,184.4,200.0,1223.0,3.0,3.0,1.0,1.0,1.0,1.0


In [11]:
# Write the synthetic sample to disk without the positional index column.
# The parent directory is created first: `to_csv` does not create missing
# directories, and failing here would discard the result of the long fit.
synthetic_path = HOME_PATH + SYNTHETIC_FILE
synthetic_dir = os.path.dirname(synthetic_path)
if synthetic_dir:  # empty when the file is written to the working directory
    os.makedirs(synthetic_dir, exist_ok=True)
synthetic_data.to_csv(synthetic_path, index=False)