In [1]:
#Load libraries
import os
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from copulas.multivariate import GaussianMultivariate
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [2]:
from gaussian_multivariate import DataPreProcessor

In [3]:
# Root folder of the project; it is prefixed to both dataset paths below.
# Left empty, the paths resolve relative to the current working directory.
HOME_PATH = ""
# CSV holding the real training split that the model is fitted on.
TRAIN_FILE = "REAL DATASETS/TRAIN DATASETS/B_Cardio_Data_Real_Train.csv"
# CSV that receives the synthetic samples produced by this notebook.
SYNTHETIC_FILE = "SYNTHETIC DATASETS/GM/B_Cardio_Data_Synthetic_GM.csv"

## 1. Read data

In [4]:
# Load the real training split and cast the discrete clinical fields to the pandas
# category dtype.
# os.path.join keeps the path valid whether or not HOME_PATH ends with a separator
# (plain string concatenation would glue the folder and file names together).
real_data = pd.read_csv(os.path.join(HOME_PATH, TRAIN_FILE))
categorical_columns = ['gender','cholesterol','gluc','smoke','alco','active','cardio']
real_data = real_data.astype({col: 'category' for col in categorical_columns})
# Hand an independent copy to the modelling steps, so that nothing done to
# data_train can alter the frame that is summarised later for comparison.
data_train = real_data.copy()
real_data

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,67617,21876,1,154,80.0,130,90,2,1,0,0,1,1
1,96320,16717,2,162,70.0,140,90,1,1,0,0,0,1
2,17571,21128,2,174,92.0,150,100,1,1,0,0,1,1
3,46492,23366,2,173,76.0,120,82,1,1,0,0,1,1
4,945,20281,1,160,60.0,120,80,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
55995,53137,16001,2,170,75.0,150,80,1,1,1,0,1,1
55996,8918,23209,2,162,73.0,160,90,1,1,0,0,1,1
55997,78302,23589,1,169,74.0,120,80,1,1,0,0,1,0
55998,1197,18227,1,167,70.0,120,80,1,1,0,0,0,0


In [5]:
# Sanity check: the columns listed in categorical_columns should report the
# category dtype, the remaining ones their numeric dtypes.
data_train.dtypes

id                int64
age               int64
gender         category
height            int64
weight          float64
ap_hi             int64
ap_lo             int64
cholesterol    category
gluc           category
smoke          category
alco           category
active         category
cardio         category
dtype: object

In [6]:
# Data configuration: wrap the training frame in the project's DataPreProcessor and
# rebind data_train to its preprocessed form, which is what the model is fitted on.
# The preprocessor instance is kept because it is reused later to map generated
# samples back to the original layout.
# NOTE(review): judging by the displayed output, every categorical column appears to
# be expanded into one numeric column per category (gender0/gender1, ...) — confirm
# against gaussian_multivariate.DataPreProcessor.
# NOTE(review): the `id` column is still present after preprocessing and is therefore
# modelled like any other feature — confirm that this is intended.
preprocessor = DataPreProcessor(data_train)
data_train = preprocessor.preprocess_train_data()
data_train

Unnamed: 0,id,age,height,weight,ap_hi,ap_lo,gender0,gender1,cholesterol0,cholesterol1,...,gluc1,gluc2,smoke0,smoke1,alco0,alco1,active0,active1,cardio0,cardio1
0,67617,21876,154,80.0,130,90,0.730957,0.500000,0.500000,0.730957,...,0.5,0.5,0.730957,0.500000,0.730957,0.5,0.500000,0.730957,0.500000,0.730957
1,96320,16717,162,70.0,140,90,0.500000,0.730957,0.730957,0.500000,...,0.5,0.5,0.730957,0.500000,0.730957,0.5,0.730957,0.500000,0.500000,0.730957
2,17571,21128,174,92.0,150,100,0.500000,0.730957,0.730957,0.500000,...,0.5,0.5,0.730957,0.500000,0.730957,0.5,0.500000,0.730957,0.500000,0.730957
3,46492,23366,173,76.0,120,82,0.500000,0.730957,0.730957,0.500000,...,0.5,0.5,0.730957,0.500000,0.730957,0.5,0.500000,0.730957,0.500000,0.730957
4,945,20281,160,60.0,120,80,0.730957,0.500000,0.730957,0.500000,...,0.5,0.5,0.730957,0.500000,0.730957,0.5,0.500000,0.730957,0.730957,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55995,53137,16001,170,75.0,150,80,0.500000,0.730957,0.730957,0.500000,...,0.5,0.5,0.500000,0.730957,0.730957,0.5,0.500000,0.730957,0.500000,0.730957
55996,8918,23209,162,73.0,160,90,0.500000,0.730957,0.730957,0.500000,...,0.5,0.5,0.730957,0.500000,0.730957,0.5,0.500000,0.730957,0.500000,0.730957
55997,78302,23589,169,74.0,120,80,0.730957,0.500000,0.730957,0.500000,...,0.5,0.5,0.730957,0.500000,0.730957,0.5,0.500000,0.730957,0.730957,0.500000
55998,1197,18227,167,70.0,120,80,0.730957,0.500000,0.730957,0.500000,...,0.5,0.5,0.730957,0.500000,0.730957,0.5,0.730957,0.500000,0.730957,0.500000


## 2. Train the model and generate data

In [7]:
# Build copulas' GaussianMultivariate model with its default settings and fit it
# on the preprocessed training frame.
gm = GaussianMultivariate()
gm.fit(data_train)

In [8]:
# Draw as many synthetic rows as there are training rows.
# The model was built without a random_state of its own, so sampling falls back to
# NumPy's global random generator; seeding it here makes the synthetic dataset
# reproducible across kernel restarts and across re-runs of this cell.
SAMPLING_SEED = 42
np.random.seed(SAMPLING_SEED)
generated_samples = gm.sample(len(data_train))
generated_samples

Unnamed: 0,id,age,height,weight,ap_hi,ap_lo,gender0,gender1,cholesterol0,cholesterol1,...,gluc1,gluc2,smoke0,smoke1,alco0,alco1,active0,active1,cardio0,cardio1
0,3540.620967,14298.141981,159.362019,68.646934,141.770263,97.168823,0.495155,0.735795,0.773654,0.473365,...,0.496515,0.488927,0.726687,0.504270,0.733678,0.497279,0.510863,0.720109,0.749367,0.481600
1,8670.403579,22551.690140,149.002396,72.531185,131.218155,74.646254,0.738610,0.492352,0.515467,0.498579,...,0.501172,0.508666,0.728849,0.502111,0.726990,0.503971,0.503499,0.727466,0.495564,0.735392
2,26537.090274,22729.934151,148.938922,74.919117,140.742054,108.171900,0.751279,0.479684,0.495208,0.735181,...,0.500243,0.499556,0.722516,0.508448,0.728422,0.502540,0.507559,0.723386,0.516611,0.714328
3,16126.302099,19467.876214,160.394853,86.949278,123.084633,90.042782,0.734024,0.496935,0.751274,0.486529,...,0.504747,0.495567,0.730505,0.500448,0.729927,0.501021,0.732084,0.498888,0.481274,0.749688
4,39966.232570,22059.506343,171.711143,83.258253,116.991010,81.895964,0.714988,0.515983,0.741802,0.491695,...,0.507018,0.500249,0.730404,0.500555,0.727689,0.503269,0.731475,0.499465,0.720479,0.510445
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55995,61086.954953,20098.609602,156.082010,60.735778,128.201449,92.342373,0.748864,0.482095,0.726376,0.506577,...,0.505290,0.490914,0.726209,0.504750,0.506783,0.724135,0.728569,0.502379,0.499990,0.730952
55996,95276.901200,20522.452932,154.868771,88.455547,125.404334,109.362940,0.521611,0.709392,0.493256,0.738298,...,0.504890,0.503958,0.728917,0.502039,0.735399,0.495555,0.495669,0.735297,0.504955,0.726020
55997,49638.744801,18805.802068,165.459608,79.951021,171.912772,138.628011,0.729393,0.501552,0.498857,0.511378,...,0.498451,0.515744,0.725832,0.505117,0.729222,0.501738,0.526949,0.704244,0.507496,0.723469
55998,30191.121656,18800.992682,164.479617,66.637624,109.176553,71.533380,0.487696,0.743256,0.716661,0.725174,...,0.745421,0.493452,0.490571,0.740383,0.490236,0.740715,0.476030,0.754924,0.733465,0.497494


## 3. Transform Generated Data

In [9]:
# Convert the raw model samples back to the layout of the original dataset, using
# the same preprocessor instance that prepared the training data.
# NOTE(review): in the displayed output the categorical fields are single label
# columns again and the integer fields hold whole numbers, while weight keeps its
# decimals — presumably mirroring the source dtypes; confirm in transform_data.
synthetic_data = preprocessor.transform_data(generated_samples)
synthetic_data

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,3540,14298,2,159,68.646934,141,97,1,1,0,0,1,0
1,8670,22551,1,149,72.531185,131,74,3,1,0,0,1,1
2,26537,22729,1,148,74.919117,140,108,2,1,0,0,1,1
3,16126,19467,1,160,86.949278,123,90,1,1,0,0,0,1
4,39966,22059,1,171,83.258253,116,81,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
55995,61086,20098,1,156,60.735778,128,92,1,1,0,1,0,1
55996,95276,20522,2,154,88.455547,125,109,2,1,0,0,1,1
55997,49638,18805,1,165,79.951021,171,138,2,1,0,0,1,1
55998,30191,18800,2,164,66.637624,109,71,2,2,1,1,1,0


In [10]:
# Summary statistics of the real training data, used as the reference for the
# synthetic summary in the next cell.
real_data.describe()

Unnamed: 0,id,age,height,weight,ap_hi,ap_lo
count,56000.0,56000.0,56000.0,56000.0,56000.0,56000.0
mean,49971.932589,19464.929107,164.348125,74.188586,128.737893,97.025536
std,28875.383115,2469.403825,8.187461,14.361142,148.345232,197.976631
min,0.0,10798.0,55.0,22.0,-140.0,0.0
25%,24985.0,17658.0,159.0,65.0,120.0,80.0
50%,49942.0,19699.0,165.0,72.0,120.0,80.0
75%,74990.25,21325.0,170.0,82.0,140.0,90.0
max,99999.0,23713.0,250.0,200.0,14020.0,11000.0


In [11]:
# Summary statistics of the synthetic data, to be compared with the real ones.
# NOTE(review): unlike the real summary, the categorical columns are listed here,
# which suggests they are numeric rather than category dtype after transform_data —
# confirm, and cast them back if downstream code expects categories.
synthetic_data.describe()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
count,56000.0,56000.0,56000.0,56000.0,56000.0,56000.0,56000.0,56000.0,56000.0,56000.0,56000.0,56000.0,56000.0
mean,50024.298857,19473.059143,1.347589,163.859482,74.206613,123.732625,96.120143,1.413,1.272339,0.088696,0.053821,0.804429,0.498482
std,28906.746439,2481.062102,0.476209,8.210312,14.471028,20.048214,187.664152,0.707771,0.616527,0.284308,0.225667,0.396643,0.500002
min,0.0,10483.0,1.0,59.0,19.281064,-325.0,-59.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,24952.0,17773.0,1.0,158.0,64.349831,114.0,65.0,1.0,1.0,0.0,0.0,1.0,0.0
50%,49996.0,19709.0,1.0,164.0,71.944623,124.0,81.0,1.0,1.0,0.0,0.0,1.0,0.0
75%,75025.25,21437.0,2.0,169.0,82.116287,133.0,98.0,2.0,1.0,0.0,0.0,1.0,1.0
max,99998.0,24487.0,2.0,251.0,198.124411,883.0,10951.0,3.0,3.0,1.0,1.0,1.0,1.0


In [12]:
# Persist the synthetic dataset; index=False leaves the row index out of the CSV.
# os.path.join keeps the path valid whether or not HOME_PATH ends with a separator
# (plain string concatenation would glue the folder and file names together).
synthetic_data.to_csv(os.path.join(HOME_PATH, SYNTHETIC_FILE), index = False)