# TSTR (Train on Synthetic, Test on Real) - GM Dataset A - Diabetes

In [1]:
#import libraries
#the warning filter is installed before numpy/pandas are imported, so it is already active while they load
#NOTE(review): "ignore" silences every warning for the rest of the session, deprecation notices included - confirm this is intended
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import os
print('Libraries imported!!')

Libraries imported!!


In [2]:
#define directory of functions and actual directory
HOME_PATH = '' #home directory of the project
FUNCTIONS_DIR = 'EVALUATION FUNCTIONS/UTILITY'
ACTUAL_DIR = os.getcwd()

#temporarily switch to the functions directory so that utility_evaluation can be imported from it
os.chdir(HOME_PATH + FUNCTIONS_DIR)
try:
    #import the pre-processing and model train/evaluate helpers
    from utility_evaluation import DataPreProcessor
    from utility_evaluation import train_evaluate_model
finally:
    #always go back to the original directory, even when the import fails;
    #otherwise every relative path used later would resolve against the wrong folder
    os.chdir(ACTUAL_DIR)
print('Functions imported!!')

Functions imported!!


## 1. Read data

In [3]:
#read the synthetic (GM) dataset that is used for training
train_data = pd.read_csv(HOME_PATH + 'SYNTHETIC DATASETS/GM/A_Diabetes_Data_Synthetic_GM.csv')
#columns to be stored with the pandas 'category' dtype (the target 'readmitted' is one of them)
categorical_columns = [
    'gender', 'age', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
    'max_glu_serum', 'A1Cresult', 'change', 'diabetesMed', 'readmitted',
]
#cast all of them in a single call
train_data = train_data.astype({name: 'category' for name in categorical_columns})
train_data

Unnamed: 0,encounter_id,patient_nbr,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,max_glu_serum,A1Cresult,change,diabetesMed,readmitted
0,55263844,73425914,Male,[70-80),2,1,1,4,46,1,11,0,0,0,7,,,No,Yes,>30
1,70034002,36749733,Female,[80-90),2,13,1,5,44,2,21,2,0,1,8,,,No,Yes,<30
2,428180152,136154611,Male,[70-80),3,1,1,3,60,0,18,0,0,0,6,,,No,No,<30
3,36215451,5314375,Female,[70-80),6,25,7,2,48,0,17,0,0,2,8,,,Ch,Yes,NO
4,132810221,70592062,Female,[70-80),5,3,6,2,12,0,15,0,0,0,5,,,No,Yes,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81407,52044663,14996419,Female,[90-100),1,1,7,7,70,0,27,0,0,0,8,,,Ch,Yes,NO
81408,156727385,105999805,Female,[50-60),1,6,17,2,55,0,14,0,0,0,9,,,No,No,NO
81409,81094768,31322560,Female,[40-50),3,3,17,0,23,0,10,2,1,0,4,,,No,Yes,>30
81410,10611239,25019256,Male,[40-50),2,1,7,13,43,1,13,0,0,0,6,,,No,Yes,<30


In [5]:
#read the real test split
test_data = pd.read_csv(HOME_PATH + 'REAL DATASETS/TEST DATASETS/A_Diabetes_Data_Real_Test.csv')
#same categorical casting as for the training data, done in a single call
test_data = test_data.astype({name: 'category' for name in categorical_columns})
test_data

Unnamed: 0,encounter_id,patient_nbr,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,max_glu_serum,A1Cresult,change,diabetesMed,readmitted
0,110939484,19274094,Female,[70-80),1,1,6,11,68,0,20,0,0,0,5,,,No,Yes,NO
1,170328306,65634327,Male,[50-60),1,1,1,1,20,0,7,0,0,0,8,,,No,Yes,NO
2,245688426,100657359,Female,[60-70),3,6,1,4,21,3,23,1,0,2,7,,,No,Yes,NO
3,150826224,83144448,Male,[30-40),2,1,1,12,28,0,19,0,0,1,7,,,No,Yes,>30
4,135993852,65234214,Female,[60-70),1,2,7,1,21,0,6,0,0,0,7,,,No,Yes,<30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20349,61022328,93447369,Male,[80-90),1,1,7,6,51,1,17,0,0,0,8,,,No,Yes,<30
20350,189128682,97296336,Male,[40-50),3,1,1,2,49,1,16,0,0,2,9,,,Ch,Yes,<30
20351,251922018,84358062,Female,[70-80),1,1,7,9,41,2,17,3,0,0,9,,,No,Yes,>30
20352,214461468,98468460,Male,[70-80),2,1,7,2,22,0,5,0,0,0,7,,,No,Yes,>30


In [6]:
target = 'readmitted'
#shape and number of rows per target class for each split, one item per output line
print('Train data', train_data.shape, train_data.groupby(target).size(), sep='\n')
print('#####################################')
print('Test data', test_data.shape, test_data.groupby(target).size(), sep='\n')

Train data
(81412, 20)
readmitted
<30     9183
>30    29458
NO     42771
dtype: int64
#####################################
Test data
(20354, 20)
readmitted
<30     2285
>30     7117
NO     10952
dtype: int64


## 2. Pre-process training data

In [7]:
target = 'readmitted'
#categorical predictors only: the target is not part of this list
categorical_columns = [
    'gender', 'age', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
    'max_glu_serum', 'A1Cresult', 'change', 'diabetesMed',
]
#every column still typed int64/float64 is treated as numerical
#NOTE(review): encounter_id and patient_nbr look like identifiers and presumably end up in this list - confirm they are meant to be features
numerical_columns = train_data.select_dtypes(include=['int64','float64']).columns.tolist()
#one array of integer codes 0..n-1 per categorical column
#presumably listed in the same order as categorical_columns - TODO confirm against DataPreProcessor
categories = [np.arange(n_levels) for n_levels in (3, 10, 9, 29, 21, 4, 4, 2, 2)]

#build the pre-processor and transform the training predictors with it
data_preprocessor = DataPreProcessor(categorical_columns, numerical_columns, categories)
x_train = data_preprocessor.preprocess_train_data(train_data.drop(columns=[target]))
y_train = train_data[target]

x_train.shape, y_train.shape

((81412, 94), (81412,))

## 3. Pre-process test data

In [8]:
#transform the test predictors with the pre-processor created for the training data
x_test = data_preprocessor.preprocess_test_data(test_data.drop(columns=[target]))
y_test = test_data[target]
x_test.shape, y_test.shape

((20354, 94), (20354,))

## 4. Create a dataset to save the results

In [9]:
#empty table that collects one row of metrics per evaluated model
results = pd.DataFrame(
    columns=[
        'model',
        'accuracy',
        'precision',
        'recall',
        'f1',
    ],
)
results

Unnamed: 0,model,accuracy,precision,recall,f1


## 4. Train and evaluate Random Forest Classifier

In [10]:
#train the Random Forest on the synthetic data and evaluate it on the real test data
rf_results = train_evaluate_model('RF', x_train, y_train, x_test, y_test)
#DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add the new row(s)
results = pd.concat([results, rf_results], ignore_index=True)
rf_results

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    3.8s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    8.4s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.2s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.4s finished


Unnamed: 0,model,accuracy,precision,recall,f1
0,RF,0.5562,0.4932,0.5562,0.5097


## 5. Train and evaluate KNeighbors Classifier

In [11]:
#train the K-Neighbors classifier on the synthetic data and evaluate it on the real test data
knn_results = train_evaluate_model('KNN', x_train, y_train, x_test, y_test)
#DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add the new row(s)
results = pd.concat([results, knn_results], ignore_index=True)
knn_results

Unnamed: 0,model,accuracy,precision,recall,f1
0,KNN,0.4995,0.4606,0.4995,0.4744


## 6. Train and evaluate Decision Tree Classifier

In [12]:
#train the Decision Tree on the synthetic data and evaluate it on the real test data
dt_results = train_evaluate_model('DT', x_train, y_train, x_test, y_test)
#DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add the new row(s)
results = pd.concat([results, dt_results], ignore_index=True)
dt_results

Unnamed: 0,model,accuracy,precision,recall,f1
0,DT,0.4296,0.4424,0.4296,0.4351


## 7. Train and evaluate Support Vector Machines Classifier

In [13]:
#train the Support Vector Machine on the synthetic data and evaluate it on the real test data
svm_results = train_evaluate_model('SVM', x_train, y_train, x_test, y_test)
#DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add the new row(s)
results = pd.concat([results, svm_results], ignore_index=True)
svm_results

[LibSVM]

Unnamed: 0,model,accuracy,precision,recall,f1
0,SVM,0.2608,0.4459,0.2608,0.2883


## 8. Train and evaluate Multilayer Perceptron Classifier

In [14]:
#train the Multilayer Perceptron on the synthetic data and evaluate it on the real test data
mlp_results = train_evaluate_model('MLP', x_train, y_train, x_test, y_test)
#DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add the new row(s)
results = pd.concat([results, mlp_results], ignore_index=True)
mlp_results

Iteration 1, loss = 0.94057579
Iteration 2, loss = 0.93284634
Iteration 3, loss = 0.93137452
Iteration 4, loss = 0.92961596
Iteration 5, loss = 0.92743647
Iteration 6, loss = 0.92556973
Iteration 7, loss = 0.92340373
Iteration 8, loss = 0.92044318
Iteration 9, loss = 0.91794586
Iteration 10, loss = 0.91447144
Iteration 11, loss = 0.91083015
Iteration 12, loss = 0.90655639
Iteration 13, loss = 0.90284928
Iteration 14, loss = 0.89903177
Iteration 15, loss = 0.89378237
Iteration 16, loss = 0.88981780
Iteration 17, loss = 0.88472541
Iteration 18, loss = 0.88075516
Iteration 19, loss = 0.87597363
Iteration 20, loss = 0.87204065
Iteration 21, loss = 0.86738098
Iteration 22, loss = 0.86345999
Iteration 23, loss = 0.85979932
Iteration 24, loss = 0.85589837
Iteration 25, loss = 0.85140336
Iteration 26, loss = 0.84815942
Iteration 27, loss = 0.84447657
Iteration 28, loss = 0.84125881
Iteration 29, loss = 0.83775880
Iteration 30, loss = 0.83463125
Iteration 31, loss = 0.83163838
Iteration 32, los

Unnamed: 0,model,accuracy,precision,recall,f1
0,MLP,0.4918,0.4654,0.4918,0.476


## 9. Save results file

In [15]:
#to_csv does not create missing folders, so make sure the output directory exists first
os.makedirs('RESULTS', exist_ok=True)
#write the metrics of all evaluated models to disk, without the row index
results.to_csv('RESULTS/models_results_gm.csv', index=False)
results

Unnamed: 0,model,accuracy,precision,recall,f1
0,RF,0.5562,0.4932,0.5562,0.5097
1,KNN,0.4995,0.4606,0.4995,0.4744
2,DT,0.4296,0.4424,0.4296,0.4351
3,SVM,0.2608,0.4459,0.2608,0.2883
4,MLP,0.4918,0.4654,0.4918,0.476
