# TSTR CTGAN Dataset B - Cardio

In [1]:
#import libraries
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import os
print('Libraries imported!!')

Libraries imported!!


In [2]:
# Project paths: HOME_PATH is the project root prefix, FUNCTIONS_DIR the folder that
# holds utility_evaluation, and ACTUAL_DIR the directory this notebook runs from.
HOME_PATH = '' #home path of the project
FUNCTIONS_DIR = 'EVALUATION FUNCTIONS/UTILITY'
ACTUAL_DIR = os.getcwd()

# The helpers are imported by temporarily switching the working directory to the
# functions folder. The try/finally guarantees the original working directory is
# restored even when the import raises, so the relative data paths used by the
# following cells keep resolving.
os.chdir(HOME_PATH + FUNCTIONS_DIR)
try:
    #import the pre-processing and train/evaluate helpers
    from utility_evaluation import DataPreProcessor
    from utility_evaluation import train_evaluate_model
finally:
    #change directory back to the notebook directory
    os.chdir(ACTUAL_DIR)
print('Functions imported!!')

Functions imported!!


## 1. Read data

In [3]:
# Load the CTGAN-generated (synthetic) cardio dataset; it is used as the training set
train_data = pd.read_csv(HOME_PATH + 'SYNTHETIC DATASETS/CTGAN/B_Cardio_Data_Synthetic_CTGAN.csv')
# Columns cast to the pandas 'category' dtype (the 'cardio' label is included here)
categorical_columns = ['gender','cholesterol','gluc','smoke','alco','active','cardio']
train_data = train_data.astype({name: 'category' for name in categorical_columns})
train_data

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,95179,20434,2,171,72.9,133,84,3,1,0,0,1,1
1,66469,19869,1,159,73.5,175,95,1,1,1,0,1,1
2,48068,18480,1,169,60.7,119,76,1,1,0,0,1,1
3,32289,21122,1,162,83.5,104,74,1,1,0,0,1,1
4,40044,18057,2,171,93.0,137,78,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
55995,89136,20408,1,171,94.8,134,80,1,1,0,0,1,1
55996,67066,14787,2,172,52.4,118,77,1,1,0,0,1,0
55997,39692,19069,2,174,79.8,92,70,1,1,0,0,1,0
55998,87110,21793,1,162,62.7,141,91,1,1,0,0,1,1


In [5]:
# Load the real test dataset
test_data = pd.read_csv(HOME_PATH + 'REAL DATASETS/TEST DATASETS/B_Cardio_Data_Real_Test.csv')
# Cast the columns listed in categorical_columns to the pandas 'category' dtype
test_data = test_data.astype({name: 'category' for name in categorical_columns})
test_data

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,66728,21770,1,156,64.0,140,80,2,1,0,0,1,1
1,69098,21876,1,170,85.0,160,90,1,1,0,0,1,1
2,59185,23270,1,151,90.0,130,80,1,1,0,0,1,1
3,49288,19741,1,159,97.0,120,80,1,1,0,0,1,1
4,62481,18395,1,164,68.0,120,80,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13995,30751,20490,2,172,70.0,120,80,1,1,0,0,0,1
13996,23253,16797,2,174,96.0,120,80,1,2,0,0,1,1
13997,34847,22607,1,165,66.0,110,80,1,1,0,0,0,0
13998,40118,19670,1,157,89.0,120,80,3,3,0,0,1,1


In [6]:
target = 'cardio'

def _show_class_balance(title, frame):
    """Print a title, the frame's shape and the number of rows per value of `target`."""
    print(title)
    print(frame.shape)
    print(frame.groupby(target).size())

# Quick look at the class distribution of the training frame and the test frame
_show_class_balance('Train data', train_data)
print('#####################################')
_show_class_balance('Test data', test_data)

Train data
(56000, 13)
cardio
0    20570
1    35430
dtype: int64
#####################################
Test data
(14000, 13)
cardio
0    6988
1    7012
dtype: int64


## 2. Pre-process training data

In [7]:
target = 'cardio'
# Categorical feature columns (label excluded). A dedicated name is used so that
# `categorical_columns` -- which the data-loading cells iterate over to cast columns,
# label included -- is not silently rebound to a different list when this cell runs.
feature_categorical_columns = ['gender','cholesterol','gluc','smoke','alco','active']
# Numerical features are the columns pandas parsed as int64.
# NOTE(review): judging by the displayed frames this selects `id` and leaves out the
# float-valued `weight` -- confirm that this is the intended feature set.
numerical_columns = train_data.select_dtypes(include=['int64']).columns.tolist()
# One array of category codes per categorical feature, in the same order as
# feature_categorical_columns (sizes 2, 3, 3, 2, 2, 2).
# NOTE(review): these codes start at 0 while the displayed gender/cholesterol/gluc
# values start at 1 -- verify how DataPreProcessor interprets `categories`.
categories = [np.arange(size) for size in (2, 3, 3, 2, 2, 2)]


data_preprocessor = DataPreProcessor(feature_categorical_columns, numerical_columns, categories)
# Features are every column except the label; the label is kept as a separate Series
x_train = data_preprocessor.preprocess_train_data(train_data.loc[:, train_data.columns != target])
y_train = train_data.loc[:, target]

x_train.shape, y_train.shape

((56000, 19), (56000,))

## 3. Pre-process test data

In [8]:
# Run the test features (every column except the label) through the same
# data_preprocessor instance that handled the training data
is_feature = test_data.columns != target
x_test = data_preprocessor.preprocess_test_data(test_data.loc[:, is_feature])
y_test = test_data[target]
x_test.shape, y_test.shape

((14000, 19), (14000,))

## 4. Create a dataset to save the results

In [9]:
# Empty table that collects the metrics of each evaluated model
metric_columns = ['model', 'accuracy', 'precision', 'recall', 'f1']
results = pd.DataFrame(columns=metric_columns)
results

Unnamed: 0,model,accuracy,precision,recall,f1


## 4. Train and evaluate Random Forest Classifier

In [10]:
# Train the 'RF' (Random Forest) model on the training set and evaluate it on the test set
rf_results = train_evaluate_model('RF', x_train, y_train, x_test, y_test)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add rows.
# Assumes train_evaluate_model returns a one-row DataFrame, as the displayed output
# suggests -- TODO confirm.
results = pd.concat([results, rf_results], ignore_index=True)
rf_results

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    2.8s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    6.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.2s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.4s finished


Unnamed: 0,model,accuracy,precision,recall,f1
0,RF,0.644,0.6719,0.644,0.6287


## 5. Train and Evaluate KNeighbors Classifier

In [11]:
# Train the 'KNN' (KNeighbors) model on the training set and evaluate it on the test set
knn_results = train_evaluate_model('KNN', x_train, y_train, x_test, y_test)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add rows.
# Assumes train_evaluate_model returns a one-row DataFrame, as the displayed output
# suggests -- TODO confirm.
results = pd.concat([results, knn_results], ignore_index=True)
knn_results

Unnamed: 0,model,accuracy,precision,recall,f1
0,KNN,0.653,0.6638,0.653,0.647


## 6. Train and evaluate Decision Tree Classifier

In [12]:
# Train the 'DT' (Decision Tree) model on the training set and evaluate it on the test set
dt_results = train_evaluate_model('DT', x_train, y_train, x_test, y_test)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add rows.
# Assumes train_evaluate_model returns a one-row DataFrame, as the displayed output
# suggests -- TODO confirm.
results = pd.concat([results, dt_results], ignore_index=True)
dt_results

Unnamed: 0,model,accuracy,precision,recall,f1
0,DT,0.5859,0.5926,0.5859,0.578


## 7. Train and evaluate Support Vector Machines Classifier

In [13]:
# Train the 'SVM' (Support Vector Machines) model on the training set and evaluate it on the test set
svm_results = train_evaluate_model('SVM', x_train, y_train, x_test, y_test)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add rows.
# Assumes train_evaluate_model returns a one-row DataFrame, as the displayed output
# suggests -- TODO confirm.
results = pd.concat([results, svm_results], ignore_index=True)
svm_results

[LibSVM]

Unnamed: 0,model,accuracy,precision,recall,f1
0,SVM,0.5384,0.5734,0.5384,0.4744


## 8. Train and evaluate Multilayer Perceptron Classifier

In [14]:
# Train the 'MLP' (Multilayer Perceptron) model on the training set and evaluate it on the test set
mlp_results = train_evaluate_model('MLP', x_train, y_train, x_test, y_test)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add rows.
# Assumes train_evaluate_model returns a one-row DataFrame, as the displayed output
# suggests -- TODO confirm.
results = pd.concat([results, mlp_results], ignore_index=True)
mlp_results

Iteration 1, loss = 0.56990161
Iteration 2, loss = 0.55790141
Iteration 3, loss = 0.55203003
Iteration 4, loss = 0.54931797
Iteration 5, loss = 0.54713549
Iteration 6, loss = 0.54500662
Iteration 7, loss = 0.54372376
Iteration 8, loss = 0.54271407
Iteration 9, loss = 0.54161450
Iteration 10, loss = 0.54008094
Iteration 11, loss = 0.54012957
Iteration 12, loss = 0.53943498
Iteration 13, loss = 0.53937687
Iteration 14, loss = 0.53836392
Iteration 15, loss = 0.53757202
Iteration 16, loss = 0.53709136
Iteration 17, loss = 0.53663309
Iteration 18, loss = 0.53597249
Iteration 19, loss = 0.53580286
Iteration 20, loss = 0.53515619
Iteration 21, loss = 0.53534494
Iteration 22, loss = 0.53454818
Iteration 23, loss = 0.53361696
Iteration 24, loss = 0.53276463
Iteration 25, loss = 0.53299493
Iteration 26, loss = 0.53238658
Iteration 27, loss = 0.53145746
Iteration 28, loss = 0.53120933
Iteration 29, loss = 0.53117800
Iteration 30, loss = 0.53004611
Iteration 31, loss = 0.53001969
Iteration 32, los

Unnamed: 0,model,accuracy,precision,recall,f1
0,MLP,0.6174,0.6337,0.6174,0.6051


## 9. Save results file

In [15]:
# Write the collected metrics to CSV (without the index column). The output folder is
# created first so the cell does not fail when 'RESULTS' does not exist yet.
RESULTS_PATH = 'RESULTS/models_results_ctgan.csv'
os.makedirs(os.path.dirname(RESULTS_PATH), exist_ok=True)
results.to_csv(RESULTS_PATH, index=False)
results

Unnamed: 0,model,accuracy,precision,recall,f1
0,RF,0.644,0.6719,0.644,0.6287
1,KNN,0.653,0.6638,0.653,0.647
2,DT,0.5859,0.5926,0.5859,0.578
3,SVM,0.5384,0.5734,0.5384,0.4744
4,MLP,0.6174,0.6337,0.6174,0.6051
