In [1]:
#Load libraries
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from ctgan import CTGANSynthesizer

In [2]:
HOME_PATH = '' #home path of the project
TRAIN_FILE = 'REAL DATASETS/TRAIN DATASETS/F_IndianLiverPatient_Real_Train.csv'
SYNTHETIC_FILE = 'SYNTHETIC DATASETS/CTGAN/F_IndianLiverPatient_Synthetic_CTGAN.csv'

## 1. Load the dataset

In [3]:
real_data = pd.read_csv(HOME_PATH + TRAIN_FILE)
real_data

Unnamed: 0,age,gender,TB,DB,alkphos,sgpt,sgot,TP,ALB,A_G,class
0,68,Female,0.6,0.1,1620.0,95.0,127.0,4.6,2.1,0.80,1.0
1,31,Male,1.3,0.5,184.0,29.0,32.0,6.8,3.4,1.00,1.0
2,28,Male,0.8,0.3,190.0,20.0,14.0,4.1,2.4,1.40,1.0
3,60,Male,2.3,0.6,272.0,79.0,51.0,6.6,3.5,1.10,1.0
4,48,Female,0.9,0.2,173.0,26.0,27.0,6.2,3.1,1.00,1.0
...,...,...,...,...,...,...,...,...,...,...,...
461,75,Female,0.8,0.2,188.0,20.0,29.0,4.4,1.8,0.60,1.0
462,36,Male,5.3,2.3,145.0,32.0,92.0,5.1,2.6,1.00,2.0
463,37,Male,0.7,0.2,235.0,96.0,54.0,9.5,4.9,1.00,1.0
464,17,Female,0.5,0.1,206.0,28.0,21.0,7.1,4.5,1.70,2.0


In [4]:
real_data.dtypes

age          int64
gender      object
TB         float64
DB         float64
alkphos    float64
sgpt       float64
sgot       float64
TP         float64
ALB        float64
A_G        float64
class      float64
dtype: object

In [5]:
discrete_columns = ['gender','class']
discrete_columns

['gender', 'class']

In [6]:
ctgan = CTGANSynthesizer()
ctgan.fit(real_data, discrete_columns)

In [7]:
synthetic_data = ctgan.sample(len(real_data))
synthetic_data

Unnamed: 0,age,gender,TB,DB,alkphos,sgpt,sgot,TP,ALB,A_G,class
0,48,Male,0.440782,0.308033,293.252031,90.896484,122.096510,6.890105,4.752066,-373.747204,2.0
1,63,Male,5.085392,1.394731,146.670395,44.377022,767.046877,4.464615,2.621534,-215.704113,2.0
2,46,Male,2.603662,0.240794,279.466418,8.131594,71.510555,3.289337,3.809173,-105.824387,1.0
3,32,Male,4.933504,-0.489432,108.998273,6.876087,25.919481,5.179950,1.349090,-417.342566,2.0
4,54,Male,2.223495,0.306417,118.096090,28.734462,56.360980,7.203344,3.815880,-29.166500,2.0
...,...,...,...,...,...,...,...,...,...,...,...
461,4,Female,0.881327,-0.444146,218.079550,19.334236,7.314730,8.814648,2.467271,-91.134308,2.0
462,59,Male,1.060235,1.671386,94.277504,-3.203115,-2.999015,6.251866,3.013546,-453.237723,2.0
463,68,Female,1.999556,-0.725384,250.928793,47.820754,76.553263,5.980000,4.586086,-331.602326,1.0
464,66,Male,1.789744,1.611520,113.355834,4.252852,843.007034,7.931423,4.868714,-467.791338,1.0


In [8]:
synthetic_data.dtypes

age          int64
gender      object
TB         float64
DB         float64
alkphos    float64
sgpt       float64
sgot       float64
TP         float64
ALB        float64
A_G        float64
class      float64
dtype: object

In [9]:
float_1 = ['TB','DB','alkphos','sgpt','sgot','TP','ALB']
for col in float_1 :
    synthetic_data[col] = abs(np.round(synthetic_data[col],1))
float_2 = ['A_G']
for col in float_2 :
    synthetic_data[col] = abs(np.round(synthetic_data[col],2))
synthetic_data

Unnamed: 0,age,gender,TB,DB,alkphos,sgpt,sgot,TP,ALB,A_G,class
0,48,Male,0.4,0.3,293.3,90.9,122.1,6.9,4.8,373.75,2.0
1,63,Male,5.1,1.4,146.7,44.4,767.0,4.5,2.6,215.70,2.0
2,46,Male,2.6,0.2,279.5,8.1,71.5,3.3,3.8,105.82,1.0
3,32,Male,4.9,0.5,109.0,6.9,25.9,5.2,1.3,417.34,2.0
4,54,Male,2.2,0.3,118.1,28.7,56.4,7.2,3.8,29.17,2.0
...,...,...,...,...,...,...,...,...,...,...,...
461,4,Female,0.9,0.4,218.1,19.3,7.3,8.8,2.5,91.13,2.0
462,59,Male,1.1,1.7,94.3,3.2,3.0,6.3,3.0,453.24,2.0
463,68,Female,2.0,0.7,250.9,47.8,76.6,6.0,4.6,331.60,1.0
464,66,Male,1.8,1.6,113.4,4.3,843.0,7.9,4.9,467.79,1.0


In [10]:
synthetic_data.to_csv(HOME_PATH + SYNTHETIC_FILE, index = False)

In [11]:
real_data.describe()

Unnamed: 0,age,TB,DB,alkphos,sgpt,sgot,TP,ALB,A_G,class
count,466.0,466.0,466.0,466.0,466.0,466.0,466.0,466.0,466.0,466.0
mean,44.212446,3.318455,1.472532,289.995708,84.251073,116.491416,6.503004,3.1603,-428.230429,1.293991
std,15.650069,6.33577,2.774472,245.212815,196.869782,314.013021,1.087422,0.793272,6544.232426,0.456078
min,4.0,0.4,0.1,63.0,10.0,11.0,2.7,0.9,-100000.0,1.0
25%,32.25,0.8,0.2,174.0,24.0,26.0,5.8,2.6,0.7,1.0
50%,45.0,1.0,0.3,206.0,36.0,43.0,6.6,3.1,0.98,1.0
75%,56.0,2.675,1.3,298.0,62.0,88.0,7.2,3.8,1.1,2.0
max,85.0,75.0,19.7,2110.0,2000.0,4929.0,9.6,5.5,2.8,2.0


In [12]:
synthetic_data.describe()

Unnamed: 0,age,TB,DB,alkphos,sgpt,sgot,TP,ALB,A_G,class
count,466.0,466.0,466.0,466.0,466.0,466.0,466.0,466.0,466.0,466.0
mean,56.2897,3.743562,1.061588,313.172532,117.161588,171.149785,5.908584,3.415451,356.731395,1.403433
std,17.084854,5.433921,1.847953,321.95777,227.086296,298.02637,1.735168,0.906925,230.760039,0.491114
min,-3.0,0.0,0.0,2.8,0.1,1.1,1.3,0.8,3.0,1.0
25%,47.0,1.125,0.2,155.625,24.625,51.625,4.7,2.8,158.525,1.0
50%,58.5,2.0,0.4,230.5,51.45,99.5,5.9,3.5,325.19,1.0
75%,68.0,4.775,1.0,313.975,92.475,196.2,7.1,4.1,515.8375,2.0
max,92.0,41.3,15.5,2438.1,2277.2,4882.7,11.1,5.6,1088.53,2.0
