# TSTR SDV Dataset C - Obesity

In [1]:
#import libraries
import warnings
#silence every warning for the rest of the session (set before numpy/pandas are imported)
#NOTE(review): this also hides library deprecation warnings raised by later cells
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import os
print('Libraries imported!!')

Libraries imported!!


In [2]:
#define the directory that holds the evaluation functions and remember the current one
HOME_PATH = '' #home path of the project
FUNCTIONS_DIR = 'EVALUATION FUNCTIONS/UTILITY'
ACTUAL_DIR = os.getcwd()

#temporarily switch to the functions directory so utility_evaluation can be imported from it
os.chdir(HOME_PATH + FUNCTIONS_DIR)
try:
    #import the helpers used for pre-processing and for training/evaluating the models
    from utility_evaluation import DataPreProcessor
    from utility_evaluation import train_evaluate_model
finally:
    #always return to the original directory, even when the import fails,
    #so the relative data paths used by the following cells keep resolving
    os.chdir(ACTUAL_DIR)
print('Functions imported!!')

Functions imported!!


## 1. Read data

In [3]:
#read the synthetic (SDV) dataset that is used as training data
synthetic_csv = HOME_PATH + 'SYNTHETIC DATASETS/SDV/C_Obesity_Data_Synthetic_SDV.csv'
categorical_columns = ['Gender','family_history_with_overweight','FAVC','CAEC','SMOKE','SCC','CALC','MTRANS','Obesity_level']
#cast every listed column (target included) to the pandas category dtype in one step
category_dtypes = {name: 'category' for name in categorical_columns}
train_data = pd.read_csv(synthetic_csv).astype(category_dtypes)
train_data

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,Obesity_level
0,Male,28,1.76,123.43,yes,yes,3.205596,2.903116,Sometimes,no,1.904073,no,0.448286,0.313930,Sometimes,Public_Transportation,Obesity_Type_II
1,Male,12,1.69,40.75,yes,yes,2.239509,2.653353,Sometimes,no,2.042313,no,0.645014,0.601065,no,Public_Transportation,Overweight_Level_I
2,Female,18,1.63,94.78,yes,yes,3.094983,3.169795,Sometimes,no,2.692556,no,1.037283,0.856946,Sometimes,Public_Transportation,Overweight_Level_I
3,Male,21,1.82,137.00,no,yes,2.095592,3.177185,Sometimes,no,2.276122,no,1.221715,0.845391,Sometimes,Public_Transportation,Obesity_Type_I
4,Female,38,1.65,84.02,yes,yes,2.726584,1.015046,Sometimes,no,0.278486,no,1.055731,0.248985,no,Public_Transportation,Obesity_Type_II
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1683,Male,31,1.48,76.02,yes,yes,2.602491,3.355760,Sometimes,no,2.240679,no,1.903530,0.172436,Sometimes,Public_Transportation,Overweight_Level_II
1684,Female,21,1.67,45.28,yes,yes,2.534703,3.542202,Sometimes,no,1.935781,no,-0.184959,0.306923,Sometimes,Public_Transportation,Normal_Weight
1685,Male,32,1.69,103.85,yes,yes,2.682470,2.334944,Sometimes,no,3.093748,no,1.181994,0.758202,Sometimes,Public_Transportation,Obesity_Type_I
1686,Male,22,1.77,115.65,yes,yes,3.076754,3.583925,Sometimes,no,2.792898,no,1.703388,1.141771,Sometimes,Public_Transportation,Obesity_Type_II


In [4]:
#read the real test dataset and cast the same categorical columns to the category dtype
real_test_csv = HOME_PATH + 'REAL DATASETS/TEST DATASETS/C_Obesity_Data_Real_Test.csv'
test_data = pd.read_csv(real_test_csv).astype(dict.fromkeys(categorical_columns, 'category'))
test_data

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,Obesity_level
0,Female,20,1.76,53.70,yes,yes,2.000000,3.891994,Frequently,no,1.863930,no,2.870127,2.000000,no,Public_Transportation,Insufficient_Weight
1,Female,26,1.62,111.00,yes,yes,3.000000,3.000000,Sometimes,no,2.704315,no,0.000000,0.322666,Sometimes,Public_Transportation,Obesity_Type_III
2,Male,18,1.85,60.00,yes,yes,3.000000,4.000000,Sometimes,no,2.000000,yes,2.000000,0.000000,Sometimes,Automobile,Insufficient_Weight
3,Female,21,1.52,42.00,no,yes,3.000000,1.000000,Frequently,no,1.000000,no,0.000000,0.000000,Sometimes,Public_Transportation,Insufficient_Weight
4,Male,22,1.75,74.00,yes,no,2.000000,3.000000,Sometimes,no,2.000000,no,1.000000,2.000000,Sometimes,Bike,Normal_Weight
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
418,Male,19,1.80,87.00,yes,yes,2.000000,4.000000,Sometimes,no,2.000000,no,2.000000,1.000000,Sometimes,Public_Transportation,Overweight_Level_I
419,Male,31,1.65,101.14,yes,yes,2.913452,2.269799,Sometimes,no,1.000000,no,1.889937,0.378818,no,Public_Transportation,Obesity_Type_II
420,Male,19,1.85,65.00,yes,no,2.000000,3.000000,Sometimes,no,3.000000,no,2.000000,1.000000,Sometimes,Bike,Normal_Weight
421,Male,29,1.76,113.50,yes,yes,2.320201,3.000000,Sometimes,no,2.164784,no,0.000000,1.465479,Sometimes,Automobile,Obesity_Type_II


In [5]:
target = 'Obesity_level'
#quick look at the shape and the per-class row counts of both datasets
for position, (title, frame) in enumerate([('Train data', train_data), ('Test data', test_data)]):
    if position > 0:
        #visual separator between the two reports
        print('#####################################')
    print(title)
    print(frame.shape)
    print(frame.groupby(target).size())

Train data
(1688, 17)
Obesity_level
Insufficient_Weight    135
Normal_Weight          203
Obesity_Type_I         185
Obesity_Type_II        311
Obesity_Type_III       267
Overweight_Level_I     297
Overweight_Level_II    290
dtype: int64
#####################################
Test data
(423, 17)
Obesity_level
Insufficient_Weight    56
Normal_Weight          62
Obesity_Type_I         78
Obesity_Type_II        58
Obesity_Type_III       63
Overweight_Level_I     56
Overweight_Level_II    50
dtype: int64


## 2. Pre-process training data

In [6]:
target = 'Obesity_level'
#feature columns only: the target is not part of the categorical list used from here on
categorical_columns = ['Gender','family_history_with_overweight','FAVC','CAEC','SMOKE','SCC','CALC','MTRANS']
numerical_columns = train_data.select_dtypes(include=['int64','float64']).columns.tolist()
#presumably one array of integer codes per categorical column, in the same order
#(8 arrays for 8 columns) - TODO confirm against DataPreProcessor
categories = [
    np.array([0, 1]),
    np.array([0, 1]),
    np.array([0, 1]),
    np.array([0, 1, 2, 3]),
    np.array([0, 1]),
    np.array([0, 1]),
    np.array([0, 1, 2, 3]),
    np.array([0, 1, 2, 3, 4]),
]

data_preprocessor = DataPreProcessor(categorical_columns, numerical_columns, categories)

#split the training frame into the feature columns and the target column
is_feature = train_data.columns != target
x_train = data_preprocessor.preprocess_train_data(train_data.loc[:, is_feature])
y_train = train_data[target]

x_train.shape, y_train.shape

((1688, 31), (1688,))

## 3. Pre-process test data

In [7]:
#reuse the data_preprocessor instance that processed the training data on the test features
test_features = test_data.loc[:, test_data.columns != target]
x_test = data_preprocessor.preprocess_test_data(test_features)
y_test = test_data[target]
x_test.shape, y_test.shape

((423, 31), (423,))

## 4. Create a DataFrame to save the results

In [8]:
#empty table with one column per reported metric; the model cells add their rows to it
metric_columns = ['model', 'accuracy', 'precision', 'recall', 'f1']
results = pd.DataFrame(columns=metric_columns)
results

Unnamed: 0,model,accuracy,precision,recall,f1


## 4. Train and evaluate Random Forest Classifier

In [9]:
#train and evaluate the 'RF' model with the pre-processed train and test sets
rf_results = train_evaluate_model('RF', x_train, y_train, x_test, y_test)
#DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
#(rf_results is displayed as a one-row table, so it is presumably a DataFrame)
results = pd.concat([results, rf_results], ignore_index=True)
rf_results

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.0s finished


Unnamed: 0,model,accuracy,precision,recall,f1
0,RF,0.1702,0.2082,0.1702,0.175


## 5. Train and Evaluate KNeighbors Classifier

In [10]:
#train and evaluate the 'KNN' model with the pre-processed train and test sets
knn_results = train_evaluate_model('KNN', x_train, y_train, x_test, y_test)
#DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
#(knn_results is displayed as a one-row table, so it is presumably a DataFrame)
results = pd.concat([results, knn_results], ignore_index=True)
knn_results

Unnamed: 0,model,accuracy,precision,recall,f1
0,KNN,0.1513,0.1667,0.1513,0.1502


## 6. Train and evaluate Decision Tree Classifier

In [11]:
#train and evaluate the 'DT' model with the pre-processed train and test sets
dt_results = train_evaluate_model('DT', x_train, y_train, x_test, y_test)
#DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
#(dt_results is displayed as a one-row table, so it is presumably a DataFrame)
results = pd.concat([results, dt_results], ignore_index=True)
dt_results

Unnamed: 0,model,accuracy,precision,recall,f1
0,DT,0.1915,0.2394,0.1915,0.2069


## 7. Train and evaluate Support Vector Machines Classifier

In [12]:
#train and evaluate the 'SVM' model with the pre-processed train and test sets
svm_results = train_evaluate_model('SVM', x_train, y_train, x_test, y_test)
#DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
#(svm_results is displayed as a one-row table, so it is presumably a DataFrame)
results = pd.concat([results, svm_results], ignore_index=True)
svm_results

[LibSVM]

Unnamed: 0,model,accuracy,precision,recall,f1
0,SVM,0.1702,0.1973,0.1702,0.1665


## 8. Train and evaluate Multilayer Perceptron Classifier

In [13]:
#train and evaluate the 'MLP' model with the pre-processed train and test sets
mlp_results = train_evaluate_model('MLP', x_train, y_train, x_test, y_test)
#DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
#(mlp_results is displayed as a one-row table, so it is presumably a DataFrame)
results = pd.concat([results, mlp_results], ignore_index=True)
mlp_results

Iteration 1, loss = 1.98768556
Iteration 2, loss = 1.91251759
Iteration 3, loss = 1.89066202
Iteration 4, loss = 1.86943417
Iteration 5, loss = 1.84944906
Iteration 6, loss = 1.82883847
Iteration 7, loss = 1.80722832
Iteration 8, loss = 1.78624045
Iteration 9, loss = 1.76701335
Iteration 10, loss = 1.74486098
Iteration 11, loss = 1.72592534
Iteration 12, loss = 1.71056509
Iteration 13, loss = 1.69266133
Iteration 14, loss = 1.68214506
Iteration 15, loss = 1.66310682
Iteration 16, loss = 1.64686186
Iteration 17, loss = 1.63708379
Iteration 18, loss = 1.62216268
Iteration 19, loss = 1.61024017
Iteration 20, loss = 1.60407851
Iteration 21, loss = 1.58635509
Iteration 22, loss = 1.58079131
Iteration 23, loss = 1.56837718
Iteration 24, loss = 1.55490047
Iteration 25, loss = 1.54824971
Iteration 26, loss = 1.53653047
Iteration 27, loss = 1.52243205
Iteration 28, loss = 1.51201122
Iteration 29, loss = 1.50651508
Iteration 30, loss = 1.49880188
Iteration 31, loss = 1.48424175
Iteration 32, los

Unnamed: 0,model,accuracy,precision,recall,f1
0,MLP,0.1939,0.1907,0.1939,0.1769


## 9. Save results file

In [14]:
#make sure the output folder exists; to_csv does not create missing directories
os.makedirs('RESULTS', exist_ok=True)
#write the collected metrics (one row per model) without the index column
results.to_csv('RESULTS/models_results_sdv.csv', index=False)
results

Unnamed: 0,model,accuracy,precision,recall,f1
0,RF,0.1702,0.2082,0.1702,0.175
1,KNN,0.1513,0.1667,0.1513,0.1502
2,DT,0.1915,0.2394,0.1915,0.2069
3,SVM,0.1702,0.1973,0.1702,0.1665
4,MLP,0.1939,0.1907,0.1939,0.1769
