# TSTR WGANGP Dataset B - Cardio

In [1]:
#import libraries
import warnings
#silence every warning issued from here on
#NOTE(review): this also hides deprecation warnings coming from the libraries used below - consider narrowing the filter
warnings.filterwarnings("ignore")
import numpy as np  #used to build the category arrays for the preprocessor
import pandas as pd  #used to read the CSV files and to collect the results
import os  #used to switch the working directory while importing the helpers
print('Libraries imported!!')

Libraries imported!!


In [2]:
#define directory of functions and remember the current working directory
HOME_PATH ='' #home path of the project
FUNCTIONS_DIR = 'EVALUATION FUNCTIONS/UTILITY'
ACTUAL_DIR = os.getcwd()

#temporarily change to the functions directory so that utility_evaluation
#can be imported from there
os.chdir(HOME_PATH + FUNCTIONS_DIR)
try:
    #import the helpers used for preprocessing and for model training/evaluation
    from utility_evaluation import DataPreProcessor
    from utility_evaluation import train_evaluate_model
finally:
    #always go back to the original directory, even when the import fails;
    #otherwise a re-run of this cell would store the functions directory in
    #ACTUAL_DIR and every later relative path would be resolved from there
    os.chdir(ACTUAL_DIR)
print('Functions imported!!')

Functions imported!!


## 1. Read data

In [3]:
#read the synthetic (WGANGP) dataset that is used as training data
train_data = pd.read_csv(HOME_PATH + 'SYNTHETIC DATASETS/WGANGP/B_Cardio_Data_Synthetic_WGANGP.csv')
#columns cast to the pandas 'category' dtype (the target 'cardio' included)
categorical_columns = ['gender','cholesterol','gluc','smoke','alco','active','cardio']
#cast all of them with a single astype call instead of column by column
train_data = train_data.astype({name: 'category' for name in categorical_columns})
train_data

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,56228,18085,1,165,92.330704,131,89,1,1,0,0,0,1
1,53329,18063,1,165,90.155136,130,79,1,1,0,0,0,1
2,47872,18119,1,165,78.922653,121,98,1,1,0,0,0,0
3,61915,18502,1,165,85.397675,135,102,1,1,0,0,1,1
4,52196,17734,1,165,92.626076,128,84,1,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
55995,55651,17762,1,165,106.259781,134,76,1,1,0,0,0,1
55996,61439,18372,1,165,78.966408,128,72,1,1,0,0,1,0
55997,56393,17824,1,166,92.484062,125,82,1,1,0,0,0,1
55998,51686,17814,1,166,94.494011,128,82,1,1,0,0,0,0


In [4]:
#read the real test dataset
test_data = pd.read_csv(HOME_PATH + 'REAL DATASETS/TEST DATASETS/B_Cardio_Data_Real_Test.csv')
#give the test frame the same 'category' columns as the training frame
category_dtypes = dict.fromkeys(categorical_columns, 'category')
test_data = test_data.astype(category_dtypes)
test_data

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,66728,21770,1,156,64.0,140,80,2,1,0,0,1,1
1,69098,21876,1,170,85.0,160,90,1,1,0,0,1,1
2,59185,23270,1,151,90.0,130,80,1,1,0,0,1,1
3,49288,19741,1,159,97.0,120,80,1,1,0,0,1,1
4,62481,18395,1,164,68.0,120,80,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13995,30751,20490,2,172,70.0,120,80,1,1,0,0,0,1
13996,23253,16797,2,174,96.0,120,80,1,2,0,0,1,1
13997,34847,22607,1,165,66.0,110,80,1,1,0,0,0,0
13998,40118,19670,1,157,89.0,120,80,3,3,0,0,1,1


In [5]:
target = 'cardio'
#print the shape and the number of rows per class for both splits,
#with a separator line between the two reports
splits = [('Train data', train_data), ('Test data', test_data)]
for position, (title, frame) in enumerate(splits):
    if position > 0:
        print('#####################################')
    print(title)
    print(frame.shape)
    print(frame.groupby(target).size())

Train data
(56000, 13)
cardio
0    31723
1    24277
dtype: int64
#####################################
Test data
(14000, 13)
cardio
0    6988
1    7012
dtype: int64


## 2. Pre-process training data

In [6]:
target = 'cardio'
#categorical features only (the target is split off further below)
categorical_columns = ['gender','cholesterol','gluc','smoke','alco','active']
#numerical features are selected by dtype: every int64 column of train_data
#NOTE(review): judging by the displayed frame this picks up 'id' and skips the float 'weight' column - confirm that this is intended
numerical_columns = train_data.select_dtypes(include=['int64']).columns.tolist()
#number of levels of each categorical feature, in the order of categorical_columns
levels_per_column = [2, 3, 3, 2, 2, 2]
#NOTE(review): the arrays are 0-based while the displayed data holds codes such as gender 1/2 - verify against what DataPreProcessor expects
categories = [np.array(range(n_levels)) for n_levels in levels_per_column]


data_preprocessor = DataPreProcessor(categorical_columns, numerical_columns, categories)
#features are all columns except the target
feature_mask = train_data.columns != target
x_train = data_preprocessor.preprocess_train_data(train_data.loc[:, feature_mask])
y_train = train_data.loc[:, target]

x_train.shape, y_train.shape

((56000, 19), (56000,))

## 3. Pre-process test data

In [7]:
#pass the test features (all columns except the target) through the same
#data_preprocessor object that handled the training data
test_features = test_data.loc[:, test_data.columns != target]
x_test = data_preprocessor.preprocess_test_data(test_features)
y_test = test_data.loc[:, target]
x_test.shape, y_test.shape

((14000, 19), (14000,))

## 4. Create a dataset to save the results

In [8]:
#empty table that collects one row of metrics per classifier
metric_columns = ['model','accuracy','precision','recall','f1']
results = pd.DataFrame(columns = metric_columns)
results

Unnamed: 0,model,accuracy,precision,recall,f1


## 4. Train and evaluate Random Forest Classifier

In [9]:
#run the 'RF' model: training data comes from the synthetic set, test data from the real set
rf_results = train_evaluate_model('RF', x_train, y_train, x_test, y_test)
#DataFrame.append was removed in pandas 2.0, pd.concat is the supported replacement;
#rows of an earlier run of this cell are dropped first so that re-running it
#does not duplicate the 'RF' entry
results = pd.concat([results[results['model'] != 'RF'], rf_results], ignore_index=True)
rf_results

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    6.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:   12.8s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.2s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.3s finished


Unnamed: 0,model,accuracy,precision,recall,f1
0,RF,0.6319,0.6611,0.6319,0.6148


## 5. Train and Evaluate KNeighbors Classifier

In [10]:
#run the 'KNN' model: training data comes from the synthetic set, test data from the real set
knn_results = train_evaluate_model('KNN', x_train, y_train, x_test, y_test)
#DataFrame.append was removed in pandas 2.0, pd.concat is the supported replacement;
#rows of an earlier run of this cell are dropped first so that re-running it
#does not duplicate the 'KNN' entry
results = pd.concat([results[results['model'] != 'KNN'], knn_results], ignore_index=True)
knn_results

Unnamed: 0,model,accuracy,precision,recall,f1
0,KNN,0.5211,0.5258,0.5211,0.4999


## 6. Train and evaluate Decision Tree Classifier

In [11]:
#run the 'DT' model: training data comes from the synthetic set, test data from the real set
dt_results = train_evaluate_model('DT', x_train, y_train, x_test, y_test)
#DataFrame.append was removed in pandas 2.0, pd.concat is the supported replacement;
#rows of an earlier run of this cell are dropped first so that re-running it
#does not duplicate the 'DT' entry
results = pd.concat([results[results['model'] != 'DT'], dt_results], ignore_index=True)
dt_results

Unnamed: 0,model,accuracy,precision,recall,f1
0,DT,0.6574,0.6729,0.6574,0.6498


## 7. Train and evaluate Support Vector Machines Classifier

In [12]:
#run the 'SVM' model: training data comes from the synthetic set, test data from the real set
svm_results = train_evaluate_model('SVM', x_train, y_train, x_test, y_test)
#relabel the row in case the helper reports it as 'MLP'
#NOTE(review): this looks like a workaround for a wrong label returned by train_evaluate_model - confirm and fix it in the helper
svm_results['model'] = svm_results['model'].replace(['MLP'],'SVM')
#DataFrame.append was removed in pandas 2.0, pd.concat is the supported replacement;
#rows of an earlier run of this cell are dropped first so that re-running it
#does not duplicate the 'SVM' entry
results = pd.concat([results[results['model'] != 'SVM'], svm_results], ignore_index=True)
svm_results

[LibSVM]

Unnamed: 0,model,accuracy,precision,recall,f1
0,SVM,0.4411,0.4288,0.4411,0.4151


## 8. Train and evaluate Multilayer Perceptron Classifier

In [13]:
#run the 'MLP' model: training data comes from the synthetic set, test data from the real set
mlp_results = train_evaluate_model('MLP', x_train, y_train, x_test, y_test)
#DataFrame.append was removed in pandas 2.0, pd.concat is the supported replacement;
#rows of an earlier run of this cell are dropped first so that re-running it
#does not duplicate the 'MLP' entry
results = pd.concat([results[results['model'] != 'MLP'], mlp_results], ignore_index=True)
mlp_results

Iteration 1, loss = 0.26819569
Iteration 2, loss = 0.23105497
Iteration 3, loss = 0.22841676
Iteration 4, loss = 0.22683700
Iteration 5, loss = 0.22657570
Iteration 6, loss = 0.22540795
Iteration 7, loss = 0.22561905
Iteration 8, loss = 0.22422081
Iteration 9, loss = 0.22489462
Iteration 10, loss = 0.22525040
Iteration 11, loss = 0.22436458
Iteration 12, loss = 0.22446932
Iteration 13, loss = 0.22405444
Iteration 14, loss = 0.22426129
Iteration 15, loss = 0.22424545
Iteration 16, loss = 0.22344121
Iteration 17, loss = 0.22366558
Iteration 18, loss = 0.22376931
Iteration 19, loss = 0.22359271
Iteration 20, loss = 0.22378650
Iteration 21, loss = 0.22299282
Iteration 22, loss = 0.22354987
Iteration 23, loss = 0.22260238
Iteration 24, loss = 0.22270431
Iteration 25, loss = 0.22274642
Iteration 26, loss = 0.22269832
Iteration 27, loss = 0.22261701
Iteration 28, loss = 0.22233405
Iteration 29, loss = 0.22207702
Iteration 30, loss = 0.22232760
Iteration 31, loss = 0.22213365
Iteration 32, los

Unnamed: 0,model,accuracy,precision,recall,f1
0,MLP,0.5451,0.5707,0.5451,0.5011


## 9. Save results file

In [14]:
#write the metrics of all classifiers to disk; the output folder is created
#first because to_csv does not create missing directories
RESULTS_FILE = 'RESULTS/models_results_wgangp.csv'
os.makedirs(os.path.dirname(RESULTS_FILE), exist_ok=True)
results.to_csv(RESULTS_FILE, index=False)
results

Unnamed: 0,model,accuracy,precision,recall,f1
0,RF,0.6319,0.6611,0.6319,0.6148
1,KNN,0.5211,0.5258,0.5211,0.4999
2,DT,0.6574,0.6729,0.6574,0.6498
3,SVM,0.4411,0.4288,0.4411,0.4151
4,MLP,0.5451,0.5707,0.5451,0.5011
