# TRTR Dataset B - Cardio

In [1]:
#import libraries
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import os
print('Libraries imported!!')

Libraries imported!!


In [2]:
#define directory of functions and actual directory
HOME_PATH = '' #home path of the project
FUNCTIONS_DIR = 'EVALUATION FUNCTIONS/UTILITY'
ACTUAL_DIR = os.getcwd()

#change directory to functions directory
os.chdir(HOME_PATH + FUNCTIONS_DIR)

#import functions for data labelling analisys
from utility_evaluation import DataPreProcessor
from utility_evaluation import train_evaluate_model

#change directory to actual directory
os.chdir(ACTUAL_DIR)
print('Functions imported!!')

Functions imported!!


## 1. Read data

In [3]:
#read real dataset
train_data = pd.read_csv(HOME_PATH + 'REAL DATASETS/TRAIN DATASETS/B_Cardio_Data_Real_Train.csv')
categorical_columns = ['gender','cholesterol','gluc','smoke','alco','active','cardio']
for col in categorical_columns :
    train_data[col] = train_data[col].astype('category')
train_data

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,67617,21876,1,154,80.0,130,90,2,1,0,0,1,1
1,96320,16717,2,162,70.0,140,90,1,1,0,0,0,1
2,17571,21128,2,174,92.0,150,100,1,1,0,0,1,1
3,46492,23366,2,173,76.0,120,82,1,1,0,0,1,1
4,945,20281,1,160,60.0,120,80,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
55995,53137,16001,2,170,75.0,150,80,1,1,1,0,1,1
55996,8918,23209,2,162,73.0,160,90,1,1,0,0,1,1
55997,78302,23589,1,169,74.0,120,80,1,1,0,0,1,0
55998,1197,18227,1,167,70.0,120,80,1,1,0,0,0,0


In [4]:
#read test data
test_data = pd.read_csv(HOME_PATH + 'REAL DATASETS/TEST DATASETS/B_Cardio_Data_Real_Test.csv')
for col in categorical_columns :
    test_data[col] = test_data[col].astype('category')
test_data

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,66728,21770,1,156,64.0,140,80,2,1,0,0,1,1
1,69098,21876,1,170,85.0,160,90,1,1,0,0,1,1
2,59185,23270,1,151,90.0,130,80,1,1,0,0,1,1
3,49288,19741,1,159,97.0,120,80,1,1,0,0,1,1
4,62481,18395,1,164,68.0,120,80,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13995,30751,20490,2,172,70.0,120,80,1,1,0,0,0,1
13996,23253,16797,2,174,96.0,120,80,1,2,0,0,1,1
13997,34847,22607,1,165,66.0,110,80,1,1,0,0,0,0
13998,40118,19670,1,157,89.0,120,80,3,3,0,0,1,1


In [5]:
target = 'cardio'
#quick look at the breakdown of class values
print('Train data')
print(train_data.shape)
print(train_data.groupby(target).size())
print('#####################################')
print('Test data')
print(test_data.shape)
print(test_data.groupby(target).size())

Train data
(56000, 13)
cardio
0    28033
1    27967
dtype: int64
#####################################
Test data
(14000, 13)
cardio
0    6988
1    7012
dtype: int64


## 2. Pre-process training data

In [6]:
target = 'cardio'
categorical_columns = ['gender','cholesterol','gluc','smoke','alco','active']
numerical_columns = numerical_columns = train_data.select_dtypes(include=['int64']).columns.tolist()
categories = [np.array(range(2)), np.array(range(3)), np.array(range(3)), np.array(range(2)), np.array(range(2)), 
            np.array(range(2))]


data_preprocessor = DataPreProcessor(categorical_columns, numerical_columns, categories)
x_train = data_preprocessor.preprocess_train_data(train_data.loc[:, train_data.columns != target])
y_train = train_data.loc[:, target]

x_train.shape, y_train.shape

((56000, 19), (56000,))

## 3. Preprocess test data

In [7]:
x_test = data_preprocessor.preprocess_test_data(test_data.loc[:, test_data.columns != target])
y_test = test_data.loc[:, target]
x_test.shape, y_test.shape

((14000, 19), (14000,))

## 4. Create a dataset to save the results

In [8]:
results = pd.DataFrame(columns = ['model','accuracy','precision','recall','f1'])
results

Unnamed: 0,model,accuracy,precision,recall,f1


## 4. Train and evaluate Random Forest Classifier

In [9]:
rf_results = train_evaluate_model('RF', x_train, y_train, x_test, y_test)
results = results.append(rf_results, ignore_index=True)
rf_results

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    1.6s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    3.5s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.2s finished


Unnamed: 0,model,accuracy,precision,recall,f1
0,RF,0.7139,0.7141,0.7139,0.7138


## 5. Train and Evaluate KNeighbors Classifier

In [10]:
knn_results = train_evaluate_model('KNN', x_train, y_train, x_test, y_test)
results = results.append(knn_results, ignore_index=True)
knn_results

Unnamed: 0,model,accuracy,precision,recall,f1
0,KNN,0.6504,0.6568,0.6504,0.6469


## 6. Train and evaluate Decision Tree Classifier

In [11]:
dt_results = train_evaluate_model('DT', x_train, y_train, x_test, y_test)
results = results.append(dt_results, ignore_index=True)
dt_results

Unnamed: 0,model,accuracy,precision,recall,f1
0,DT,0.6343,0.6343,0.6343,0.6343


## 7. Train and evaluate Support Vector Machines Classifier

In [12]:
svm_results = train_evaluate_model('SVM', x_train, y_train, x_test, y_test)
results = results.append(svm_results, ignore_index=True)
svm_results

[LibSVM]

Unnamed: 0,model,accuracy,precision,recall,f1
0,SVM,0.5006,0.6118,0.5006,0.3373


## 8. Train and evaluate Multilayer Perceptron Classifier

In [13]:
mlp_results = train_evaluate_model('MLP', x_train, y_train, x_test, y_test)
results = results.append(mlp_results, ignore_index=True)
mlp_results

Iteration 1, loss = inf
Iteration 2, loss = inf
Iteration 3, loss = 0.54951042
Iteration 4, loss = 0.54691666
Iteration 5, loss = 0.54630366
Iteration 6, loss = 0.54527458
Iteration 7, loss = 0.54498814
Iteration 8, loss = 0.54388112
Iteration 9, loss = 0.54433403
Iteration 10, loss = 0.54266721
Iteration 11, loss = 0.54157524
Iteration 12, loss = 0.54118756
Iteration 13, loss = 0.54170565
Iteration 14, loss = 0.54031526
Iteration 15, loss = 0.54085046
Iteration 16, loss = 0.54022206
Iteration 17, loss = 0.54023308
Iteration 18, loss = 0.53892197
Iteration 19, loss = 0.53861417
Iteration 20, loss = 0.53825106
Iteration 21, loss = 0.53879218
Iteration 22, loss = 0.53764884
Iteration 23, loss = 0.53753162
Iteration 24, loss = 0.53693925
Iteration 25, loss = 0.53655625
Iteration 26, loss = 0.53695441
Iteration 27, loss = 0.53538187
Iteration 28, loss = 0.53566733
Iteration 29, loss = 0.53524642
Iteration 30, loss = 0.53486497
Iteration 31, loss = 0.53445025
Iteration 32, loss = 0.53437499

Unnamed: 0,model,accuracy,precision,recall,f1
0,MLP,0.7036,0.7054,0.7036,0.703


## 9. Save results file

In [14]:
results.to_csv('RESULTS/models_results_real.csv', index=False)
results

Unnamed: 0,model,accuracy,precision,recall,f1
0,RF,0.7139,0.7141,0.7139,0.7138
1,KNN,0.6504,0.6568,0.6504,0.6469
2,DT,0.6343,0.6343,0.6343,0.6343
3,SVM,0.5006,0.6118,0.5006,0.3373
4,MLP,0.7036,0.7054,0.7036,0.703
