In [1]:
#Load libraries
import os
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from copulas.multivariate import GaussianMultivariate
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [2]:
from gaussian_multivariate import DataPreProcessor

In [3]:
# Project root that is prepended to the dataset paths; leave it empty to resolve
# the paths relative to the current working directory.
HOME_PATH = ""
# Real training data read as input by this notebook.
TRAIN_FILE = "REAL DATASETS/TRAIN DATASETS/C_Obesity_Data_Real_Train.csv"
# Destination file for the synthetic data written at the end of this notebook.
SYNTHETIC_FILE = "SYNTHETIC DATASETS/GM/C_Obesity_Data_Synthetic_GM.csv"

## 1. Read data

In [4]:
# Read the real training dataset and mark the non-numeric columns as categorical.
# os.path.join is used instead of string concatenation so that HOME_PATH works
# with or without a trailing path separator.
real_data = pd.read_csv(os.path.join(HOME_PATH, TRAIN_FILE))
categorical_columns = ['Gender','family_history_with_overweight','FAVC','CAEC','SMOKE','SCC','CALC','MTRANS','Obesity_level']
# Cast all categorical columns in a single call instead of column by column.
real_data = real_data.astype({col: 'category' for col in categorical_columns})
# Use an independent copy for training so that nothing done through data_train
# can modify real_data, which is still needed for the real-vs-synthetic comparison.
data_train = real_data.copy()
real_data

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,Obesity_level
0,Female,21,1.63,60.00,yes,yes,3.000000,3.000000,Always,yes,2.000000,no,2.000000,0.000000,Sometimes,Public_Transportation,Normal_Weight
1,Female,21,1.75,133.62,yes,yes,3.000000,3.000000,Sometimes,no,2.887659,no,1.480919,0.779641,Sometimes,Public_Transportation,Obesity_Type_III
2,Female,23,1.66,82.60,yes,yes,1.203754,1.355354,Sometimes,no,2.765593,no,0.128342,1.659476,Sometimes,Public_Transportation,Obesity_Type_I
3,Female,22,1.59,44.24,no,no,3.000000,1.696080,Frequently,no,2.550307,no,1.098862,0.000000,no,Public_Transportation,Insufficient_Weight
4,Male,26,1.81,106.04,yes,yes,3.000000,3.000000,Sometimes,no,2.858171,no,1.813318,0.680215,Sometimes,Public_Transportation,Obesity_Type_I
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1683,Male,32,1.75,120.10,yes,yes,2.967300,3.000000,Sometimes,no,2.530035,no,0.955317,1.339232,Sometimes,Automobile,Obesity_Type_II
1684,Male,23,1.72,81.67,yes,yes,2.000000,1.729553,Sometimes,no,1.400247,no,0.887923,1.011983,Sometimes,Public_Transportation,Overweight_Level_II
1685,Female,23,1.65,80.00,yes,yes,2.000000,3.000000,Sometimes,no,2.000000,no,0.146919,2.000000,no,Public_Transportation,Overweight_Level_II
1686,Female,23,1.63,84.50,yes,yes,2.058687,2.962004,Sometimes,no,2.010596,no,0.851059,0.630866,no,Public_Transportation,Obesity_Type_I


In [5]:
# Show the dtype of every column to check which ones are categorical and which numeric.
data_train.dtypes

Gender                            category
Age                                  int64
Height                             float64
Weight                             float64
family_history_with_overweight    category
FAVC                              category
FCVC                               float64
NCP                                float64
CAEC                              category
SMOKE                             category
CH2O                               float64
SCC                               category
FAF                                float64
TUE                                float64
CALC                              category
MTRANS                            category
Obesity_level                     category
dtype: object

In [6]:
# data configuration
# Build the preprocessor from a fresh copy of real_data rather than from
# data_train: this cell rebinds data_train to the preprocessed frame, so using it
# as the input would make a re-run of the cell feed already-encoded data back in.
preprocessor = DataPreProcessor(real_data.copy())
# Replace data_train with the model-ready frame produced by the preprocessor.
data_train = preprocessor.preprocess_train_data()
data_train

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,Gender0,Gender1,...,MTRANS2,MTRANS3,MTRANS4,Obesity_level0,Obesity_level1,Obesity_level2,Obesity_level3,Obesity_level4,Obesity_level5,Obesity_level6
0,21,1.63,60.00,3.000000,3.000000,2.000000,2.000000,0.000000,0.730957,0.500000,...,0.5,0.730957,0.5,0.500000,0.730957,0.500000,0.500000,0.500000,0.500000,0.500000
1,21,1.75,133.62,3.000000,3.000000,2.887659,1.480919,0.779641,0.730957,0.500000,...,0.5,0.730957,0.5,0.500000,0.500000,0.500000,0.500000,0.730957,0.500000,0.500000
2,23,1.66,82.60,1.203754,1.355354,2.765593,0.128342,1.659476,0.730957,0.500000,...,0.5,0.730957,0.5,0.500000,0.500000,0.730957,0.500000,0.500000,0.500000,0.500000
3,22,1.59,44.24,3.000000,1.696080,2.550307,1.098862,0.000000,0.730957,0.500000,...,0.5,0.730957,0.5,0.730957,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000
4,26,1.81,106.04,3.000000,3.000000,2.858171,1.813318,0.680215,0.500000,0.730957,...,0.5,0.730957,0.5,0.500000,0.500000,0.730957,0.500000,0.500000,0.500000,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1683,32,1.75,120.10,2.967300,3.000000,2.530035,0.955317,1.339232,0.500000,0.730957,...,0.5,0.500000,0.5,0.500000,0.500000,0.500000,0.730957,0.500000,0.500000,0.500000
1684,23,1.72,81.67,2.000000,1.729553,1.400247,0.887923,1.011983,0.500000,0.730957,...,0.5,0.730957,0.5,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,0.730957
1685,23,1.65,80.00,2.000000,3.000000,2.000000,0.146919,2.000000,0.730957,0.500000,...,0.5,0.730957,0.5,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,0.730957
1686,23,1.63,84.50,2.058687,2.962004,2.010596,0.851059,0.630866,0.730957,0.500000,...,0.5,0.730957,0.5,0.500000,0.500000,0.730957,0.500000,0.500000,0.500000,0.500000


## 2. Train the model and generate data

In [7]:
# Create a GaussianMultivariate copula model with its default settings and fit it
# on the preprocessed training frame.
gm = GaussianMultivariate()
gm.fit(data_train)

In [8]:
# Seed NumPy's global RNG so that re-running this cell (or the whole notebook)
# draws the same synthetic rows.
# NOTE(review): this relies on copulas sampling from NumPy's global random state
# when the model is built without an explicit random_state -- confirm for the
# installed copulas version.
SAMPLING_SEED = 42
np.random.seed(SAMPLING_SEED)
# Draw as many synthetic rows as there are training rows.
generated_samples = gm.sample(len(data_train))
generated_samples

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,Gender0,Gender1,...,MTRANS2,MTRANS3,MTRANS4,Obesity_level0,Obesity_level1,Obesity_level2,Obesity_level3,Obesity_level4,Obesity_level5,Obesity_level6
0,21.555029,1.736983,96.575084,3.043926,3.229494,2.767918,0.559224,0.723408,0.700940,0.529936,...,0.498785,0.740462,0.490062,0.723206,0.483625,0.487299,0.722915,0.519029,0.472188,0.508979
1,27.717045,1.791849,132.568140,2.743510,3.213367,2.543950,-0.718087,0.050567,0.490350,0.740645,...,0.494954,0.695597,0.511068,0.490418,0.503111,0.487742,0.717248,0.698049,0.489211,0.500910
2,16.438841,1.849836,80.521419,2.474037,2.287594,3.005978,0.120772,1.338905,0.505270,0.725687,...,0.500786,0.720587,0.513401,0.536028,0.498622,0.473387,0.487117,0.517468,0.543082,0.504330
3,20.927978,1.753469,107.888290,2.575027,2.686841,2.198808,1.113582,0.739439,0.495213,0.735767,...,0.504591,0.743279,0.491632,0.497724,0.484871,0.717536,0.495338,0.494438,0.482014,0.746841
4,33.640377,1.616082,112.720873,2.779278,1.335992,2.765885,1.447929,1.321864,0.737051,0.493870,...,0.495764,0.653335,0.502197,0.479738,0.502441,0.737681,0.490043,0.515472,0.504148,0.499034
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1683,23.167952,1.729979,107.686391,3.038139,3.429195,1.447293,0.157166,0.764616,0.499642,0.731345,...,0.495085,0.520406,0.498462,0.505896,0.491908,0.498988,0.484653,0.748582,0.515751,0.486529
1684,24.365665,1.722470,122.682563,2.556549,3.763153,2.444046,2.250645,0.188289,0.515016,0.715927,...,0.500017,0.692978,0.511001,0.491593,0.491092,0.511452,0.504492,0.718331,0.503017,0.496478
1685,26.702273,1.761826,88.618928,2.240721,3.604517,2.886618,0.781996,0.615303,0.692810,0.538097,...,0.500382,0.714950,0.480666,0.509600,0.504150,0.537099,0.523850,0.487557,0.493745,0.488825
1686,21.976326,1.508331,102.850821,2.048591,1.967142,2.308040,-0.629877,-0.509730,0.757227,0.473738,...,0.499395,0.755668,0.493571,0.521568,0.460903,0.756174,0.493164,0.728850,0.497319,0.475547


## 3. Transform generated data back to the original format

In [9]:
# Convert the sampled frame back to the original column layout using the same
# preprocessor object that produced data_train.
# NOTE(review): the describe() outputs further down show synthetic values outside
# the real ranges (e.g. negative FAF and TUE minima) -- check whether clipping is wanted.
synthetic_data = preprocessor.transform_data(generated_samples)
synthetic_data

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,Obesity_level
0,Female,21,1.736983,96.575084,yes,yes,3.043926,3.229494,Sometimes,no,2.767918,no,0.559224,0.723408,Sometimes,Public_Transportation,Insufficient_Weight
1,Male,27,1.791849,132.568140,yes,yes,2.743510,3.213367,Sometimes,no,2.543950,no,-0.718087,0.050567,Sometimes,Public_Transportation,Obesity_Type_II
2,Male,16,1.849836,80.521419,no,yes,2.474037,2.287594,Frequently,no,3.005978,no,0.120772,1.338905,Sometimes,Public_Transportation,Overweight_Level_I
3,Male,20,1.753469,107.888290,yes,yes,2.575027,2.686841,Sometimes,no,2.198808,no,1.113582,0.739439,Sometimes,Public_Transportation,Overweight_Level_II
4,Female,33,1.616082,112.720873,yes,yes,2.779278,1.335992,Sometimes,no,2.765885,no,1.447929,1.321864,Sometimes,Public_Transportation,Obesity_Type_I
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1683,Male,23,1.729979,107.686391,yes,yes,3.038139,3.429195,Sometimes,no,1.447293,no,0.157166,0.764616,Sometimes,Automobile,Obesity_Type_III
1684,Male,24,1.722470,122.682563,yes,yes,2.556549,3.763153,Sometimes,no,2.444046,no,2.250645,0.188289,Sometimes,Public_Transportation,Obesity_Type_III
1685,Female,26,1.761826,88.618928,yes,yes,2.240721,3.604517,Sometimes,no,2.886618,no,0.781996,0.615303,Sometimes,Automobile,Obesity_Type_I
1686,Female,21,1.508331,102.850821,yes,yes,2.048591,1.967142,Sometimes,no,2.308040,no,-0.629877,-0.509730,Sometimes,Public_Transportation,Obesity_Type_I


In [10]:
# Summary statistics of the real data, used as the reference for the synthetic data.
real_data.describe()

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
count,1688.0,1688.0,1688.0,1688.0,1688.0,1688.0,1688.0,1688.0
mean,24.449052,1.701564,86.598235,2.42188,2.685886,2.006181,1.004208,0.644357
std,6.486209,0.093323,26.09942,0.537088,0.782557,0.609598,0.839721,0.602679
min,14.0,1.45,39.0,1.0,1.0,1.0,0.0,0.0
25%,20.0,1.63,66.0,2.0,2.658599,1.591831,0.132315,0.0
50%,23.0,1.7,82.975,2.392422,3.0,2.0,1.0,0.609316
75%,26.0,1.77,106.7375,3.0,3.0,2.458165,1.64637,1.0
max,61.0,1.98,173.0,3.0,4.0,3.0,3.0,2.0


In [11]:
# Summary statistics of the synthetic data, to compare against the real-data summary.
synthetic_data.describe()

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
count,1688.0,1688.0,1688.0,1688.0,1688.0,1688.0,1688.0,1688.0
mean,23.904621,1.703164,87.275004,2.431259,2.673412,1.976341,1.012154,0.64978
std,6.588131,0.097911,26.486274,0.538638,0.787192,0.622799,0.83318,0.59661
min,11.0,1.431505,28.518961,0.687839,1.003858,0.584472,-1.44928,-1.194236
25%,19.0,1.631236,66.162896,2.069985,2.066293,1.517491,0.46041,0.251056
50%,22.0,1.707179,85.023344,2.41143,2.732014,1.989249,1.028708,0.642656
75%,27.0,1.773036,108.130431,2.765563,3.3246,2.449859,1.552222,1.055012
max,61.0,1.992629,167.616683,4.373367,3.997278,3.284626,3.527239,2.488937


In [12]:
# Number of columns in the synthetic frame (second entry of its shape).
synthetic_data.shape[1]

17

In [13]:
# Save the synthetic dataset as CSV without the index column.
# os.path.join keeps the path valid whether or not HOME_PATH ends with a separator.
synthetic_path = os.path.join(HOME_PATH, SYNTHETIC_FILE)
synthetic_dir = os.path.dirname(synthetic_path)
# to_csv does not create missing folders, so make sure the destination exists;
# the guard skips makedirs when the file goes to the current directory.
if synthetic_dir:
    os.makedirs(synthetic_dir, exist_ok=True)
synthetic_data.to_csv(synthetic_path, index = False)