# TSTR (Train on Synthetic, Test on Real) - WGANGP - Dataset A - Diabetes

In [1]:
#import libraries
#the warnings filter is installed before numpy/pandas are imported, so warnings
#raised while importing them (and any later ones, e.g. deprecation notices) are hidden
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import os
#confirmation message shown in the cell output
print('Libraries imported!!')

Libraries imported!!


In [2]:
#define the functions directory and remember the current working directory
HOME_PATH = '' #home directory of the project
FUNCTIONS_DIR = 'EVALUATION FUNCTIONS/UTILITY'
ACTUAL_DIR = os.getcwd()

#temporarily switch to the functions directory so utility_evaluation can be imported from it;
#os.path.join also accepts a HOME_PATH that has no trailing path separator
os.chdir(os.path.join(HOME_PATH, FUNCTIONS_DIR))
try:
    #import the pre-processing and train/evaluate helpers used in the rest of the notebook
    from utility_evaluation import DataPreProcessor
    from utility_evaluation import train_evaluate_model
finally:
    #always go back to the original directory, even if the import fails, so that the
    #relative data paths used by the following cells keep resolving
    os.chdir(ACTUAL_DIR)
print('Functions imported!!')

Functions imported!!


## 1. Read data

In [3]:
#read the synthetic dataset generated with WGANGP (it is used as training data)
train_data = pd.read_csv(HOME_PATH + 'SYNTHETIC DATASETS/WGANGP/A_Diabetes_Data_Synthetic_WGANGP.csv')
categorical_columns = ['gender','age','admission_type_id','discharge_disposition_id','admission_source_id','max_glu_serum',
                      'A1Cresult','change','diabetesMed','readmitted']
#cast every listed column (target included) to the category dtype in a single call
train_data = train_data.astype({name: 'category' for name in categorical_columns})
train_data

Unnamed: 0,encounter_id,patient_nbr,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,max_glu_serum,A1Cresult,change,diabetesMed,readmitted
0,160291792,30203270,Female,[60-70),5,1,1,2,30,0,15,1,1,1,6,,,Ch,Yes,>30
1,150234576,23529820,Female,[60-70),5,1,1,1,24,0,15,2,2,2,5,,,Ch,Yes,>30
2,157173424,32108070,Female,[60-70),5,1,1,2,30,1,16,1,1,1,6,,,Ch,Yes,>30
3,170411776,42212280,Male,[60-70),5,1,1,2,33,1,15,0,1,1,7,,,Ch,Yes,NO
4,162151744,32820374,Female,[60-70),5,1,1,2,34,0,15,1,1,1,6,,,Ch,Yes,>30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81407,157678000,37311180,Female,[60-70),5,1,1,3,35,0,15,1,1,1,6,,,Ch,Yes,>30
81408,150013616,62489392,Male,[60-70),2,1,7,6,61,2,19,0,0,1,8,,,Ch,Yes,<30
81409,102509624,9322925,Male,[60-70),3,1,1,3,36,1,18,0,0,0,7,,,Ch,Yes,NO
81410,154289120,29546988,Female,[60-70),5,1,1,2,27,1,16,2,2,2,5,,,Ch,Yes,>30


In [4]:
#read the real test dataset
test_data = pd.read_csv(HOME_PATH + 'REAL DATASETS/TEST DATASETS/A_Diabetes_Data_Real_Test.csv')
#apply the same category casting that was applied to the training data
test_data = test_data.astype({name: 'category' for name in categorical_columns})
test_data

Unnamed: 0,encounter_id,patient_nbr,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,max_glu_serum,A1Cresult,change,diabetesMed,readmitted
0,110939484,19274094,Female,[70-80),1,1,6,11,68,0,20,0,0,0,5,,,No,Yes,NO
1,170328306,65634327,Male,[50-60),1,1,1,1,20,0,7,0,0,0,8,,,No,Yes,NO
2,245688426,100657359,Female,[60-70),3,6,1,4,21,3,23,1,0,2,7,,,No,Yes,NO
3,150826224,83144448,Male,[30-40),2,1,1,12,28,0,19,0,0,1,7,,,No,Yes,>30
4,135993852,65234214,Female,[60-70),1,2,7,1,21,0,6,0,0,0,7,,,No,Yes,<30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20349,61022328,93447369,Male,[80-90),1,1,7,6,51,1,17,0,0,0,8,,,No,Yes,<30
20350,189128682,97296336,Male,[40-50),3,1,1,2,49,1,16,0,0,2,9,,,Ch,Yes,<30
20351,251922018,84358062,Female,[70-80),1,1,7,9,41,2,17,3,0,0,9,,,No,Yes,>30
20352,214461468,98468460,Male,[70-80),2,1,7,2,22,0,5,0,0,0,7,,,No,Yes,>30


In [5]:
target = 'readmitted'
#quick look at the shape and the per-class row counts of both datasets
for position, (label, frame) in enumerate([('Train data', train_data), ('Test data', test_data)]):
    if position > 0:
        #visual separator between the two reports
        print('#####################################')
    print(label)
    print(frame.shape)
    print(frame.groupby(target).size())

Train data
(81412, 20)
readmitted
<30    14552
>30    42112
NO     24748
dtype: int64
#####################################
Test data
(20354, 20)
readmitted
<30     2285
>30     7117
NO     10952
dtype: int64


## 2. Pre-process training data

In [6]:
target = 'readmitted'
#categorical features handed to the pre-processor (the target column is not part of this list)
categorical_columns = ['gender','age','admission_type_id','discharge_disposition_id','admission_source_id','max_glu_serum',
                    'A1Cresult','change','diabetesMed']
#every int64/float64 column of the training frame is treated as a numerical feature
#NOTE(review): per the frame displayed above this also picks up encounter_id and patient_nbr - confirm this is intended
numerical_columns = train_data.select_dtypes(include=['int64','float64']).columns.tolist()
#one array of integer codes 0..size-1 per categorical feature
#presumably aligned positionally with categorical_columns - verify against DataPreProcessor
category_sizes = [3, 10, 9, 29, 21, 4, 4, 2, 2]
categories = [np.array(range(size)) for size in category_sizes]


#fit the pre-processor on the training features (all columns except the target)
data_preprocessor = DataPreProcessor(categorical_columns, numerical_columns, categories)
train_features = train_data.drop(columns=target)
x_train = data_preprocessor.preprocess_train_data(train_features)
y_train = train_data[target]

x_train.shape, y_train.shape

((81412, 94), (81412,))

## 3. Pre-process test data

In [7]:
#run the test features (all columns except the target) through the pre-processor created for the training data
test_features = test_data.drop(columns=target)
x_test = data_preprocessor.preprocess_test_data(test_features)
y_test = test_data[target]
x_test.shape, y_test.shape

((20354, 94), (20354,))

## 4. Create a dataset to save the results

In [8]:
#empty frame that collects one row of metrics per evaluated model
metric_columns = ['model','accuracy','precision','recall','f1']
results = pd.DataFrame(columns=metric_columns)
results

Unnamed: 0,model,accuracy,precision,recall,f1


## 4. Train and evaluate Random Forest Classifier

In [9]:
#train a Random Forest on the synthetic training data and evaluate it on the real test data
rf_results = train_evaluate_model('RF', x_train, y_train, x_test, y_test)
#DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the supported
#way to add the one-row metrics frame to the accumulated results
results = pd.concat([results, rf_results], ignore_index=True)
rf_results

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    1.9s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    4.3s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.1s finished


Unnamed: 0,model,accuracy,precision,recall,f1
0,RF,0.4917,0.4541,0.4917,0.461


## 5. Train and Evaluate KNeighbors Classifier

In [10]:
#train a K-Neighbors classifier on the synthetic training data and evaluate it on the real test data
knn_results = train_evaluate_model('KNN', x_train, y_train, x_test, y_test)
#DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the supported
#way to add the one-row metrics frame to the accumulated results
results = pd.concat([results, knn_results], ignore_index=True)
knn_results

Unnamed: 0,model,accuracy,precision,recall,f1
0,KNN,0.4248,0.4528,0.4248,0.4316


## 6. Train and evaluate Decision Tree Classifier

In [11]:
#train a Decision Tree on the synthetic training data and evaluate it on the real test data
dt_results = train_evaluate_model('DT', x_train, y_train, x_test, y_test)
#DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the supported
#way to add the one-row metrics frame to the accumulated results
results = pd.concat([results, dt_results], ignore_index=True)
dt_results

Unnamed: 0,model,accuracy,precision,recall,f1
0,DT,0.4208,0.4414,0.4208,0.4289


## 7. Train and evaluate Support Vector Machines Classifier

In [12]:
#train a Support Vector Machine on the synthetic training data and evaluate it on the real test data
svm_results = train_evaluate_model('SVM', x_train, y_train, x_test, y_test)
#relabel any row reported as 'MLP' to 'SVM'
#NOTE(review): presumably works around the model label returned by the helper - confirm it is still needed
svm_results['model'] = svm_results['model'].replace(['MLP'],'SVM')
#DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the supported
#way to add the one-row metrics frame to the accumulated results
results = pd.concat([results, svm_results], ignore_index=True)
svm_results

[LibSVM]

Unnamed: 0,model,accuracy,precision,recall,f1
0,SVM,0.2249,0.4742,0.2249,0.2386


## 8. Train and evaluate Multilayer Perceptron Classifier

In [13]:
#train a Multilayer Perceptron on the synthetic training data and evaluate it on the real test data
mlp_results = train_evaluate_model('MLP', x_train, y_train, x_test, y_test)
#DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the supported
#way to add the one-row metrics frame to the accumulated results
results = pd.concat([results, mlp_results], ignore_index=True)
mlp_results

Iteration 1, loss = 0.16884682
Iteration 2, loss = 0.11760759
Iteration 3, loss = 0.11087512
Iteration 4, loss = 0.10780863
Iteration 5, loss = 0.10604144
Iteration 6, loss = 0.10372577
Iteration 7, loss = 0.10244279
Iteration 8, loss = 0.10085110
Iteration 9, loss = 0.10068159
Iteration 10, loss = 0.09907646
Iteration 11, loss = 0.09849422
Iteration 12, loss = 0.09752210
Iteration 13, loss = 0.09802663
Iteration 14, loss = 0.09621681
Iteration 15, loss = 0.09506780
Iteration 16, loss = 0.09471739
Iteration 17, loss = 0.09493640
Iteration 18, loss = 0.09381768
Iteration 19, loss = 0.09360797
Iteration 20, loss = 0.09312458
Iteration 21, loss = 0.09295738
Iteration 22, loss = 0.09144284
Iteration 23, loss = 0.09148732
Iteration 24, loss = 0.09269785
Iteration 25, loss = 0.09100285
Iteration 26, loss = 0.09023553
Iteration 27, loss = 0.08957925
Iteration 28, loss = 0.08907526
Iteration 29, loss = 0.08843624
Iteration 30, loss = 0.08845586
Iteration 31, loss = 0.08936051
Iteration 32, los

Unnamed: 0,model,accuracy,precision,recall,f1
0,MLP,0.4685,0.4333,0.4685,0.4283


## 9. Save results file

In [14]:
#write the accumulated metrics to disk; the output directory is created first because
#to_csv fails when the target directory does not exist
results_path = 'RESULTS/models_results_wgangp.csv'
os.makedirs(os.path.dirname(results_path), exist_ok=True)
results.to_csv(results_path, index=False)
results

Unnamed: 0,model,accuracy,precision,recall,f1
0,RF,0.4917,0.4541,0.4917,0.461
1,KNN,0.4248,0.4528,0.4248,0.4316
2,DT,0.4208,0.4414,0.4208,0.4289
3,SVM,0.2249,0.4742,0.2249,0.2386
4,MLP,0.4685,0.4333,0.4685,0.4283
