# TRTR Dataset C - Obesity

In [1]:
# Import the libraries used throughout the notebook.
# All warnings are silenced globally *before* the third-party imports, so
# warnings raised while importing numpy/pandas, and by any later cell, are hidden.
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import os
# Confirmation message so the cell produces visible output when it succeeds.
print('Libraries imported!!')

Libraries imported!!


In [2]:
#define directory of functions and actual directory
HOME_PATH = '' #home path of the project
FUNCTIONS_DIR = 'EVALUATION FUNCTIONS/UTILITY'
ACTUAL_DIR = os.getcwd()

#change directory to functions directory so that `utility_evaluation` can be
#imported from the current working directory
os.chdir(HOME_PATH + FUNCTIONS_DIR)
try:
    #import functions for data labelling analysis
    from utility_evaluation import DataPreProcessor
    from utility_evaluation import train_evaluate_model
finally:
    #always change back to the actual directory, even when the import fails;
    #otherwise every later relative path (datasets, RESULTS/) would resolve
    #against the functions directory and a re-run would capture the wrong ACTUAL_DIR
    os.chdir(ACTUAL_DIR)
print('Functions imported!!')

Functions imported!!


## 1. Read data

In [3]:
# Load the real training split and cast every categorical column
# (the target 'Obesity_level' included) to the pandas 'category' dtype.
train_data = pd.read_csv(HOME_PATH + 'REAL DATASETS/TRAIN DATASETS/C_Obesity_Data_Real_Train.csv')
categorical_columns = [
    'Gender',
    'family_history_with_overweight',
    'FAVC',
    'CAEC',
    'SMOKE',
    'SCC',
    'CALC',
    'MTRANS',
    'Obesity_level',
]
# One astype call with a per-column mapping instead of a column-by-column loop.
train_data = train_data.astype({name: 'category' for name in categorical_columns})
train_data

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,Obesity_level
0,Female,21,1.63,60.00,yes,yes,3.000000,3.000000,Always,yes,2.000000,no,2.000000,0.000000,Sometimes,Public_Transportation,Normal_Weight
1,Female,21,1.75,133.62,yes,yes,3.000000,3.000000,Sometimes,no,2.887659,no,1.480919,0.779641,Sometimes,Public_Transportation,Obesity_Type_III
2,Female,23,1.66,82.60,yes,yes,1.203754,1.355354,Sometimes,no,2.765593,no,0.128342,1.659476,Sometimes,Public_Transportation,Obesity_Type_I
3,Female,22,1.59,44.24,no,no,3.000000,1.696080,Frequently,no,2.550307,no,1.098862,0.000000,no,Public_Transportation,Insufficient_Weight
4,Male,26,1.81,106.04,yes,yes,3.000000,3.000000,Sometimes,no,2.858171,no,1.813318,0.680215,Sometimes,Public_Transportation,Obesity_Type_I
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1683,Male,32,1.75,120.10,yes,yes,2.967300,3.000000,Sometimes,no,2.530035,no,0.955317,1.339232,Sometimes,Automobile,Obesity_Type_II
1684,Male,23,1.72,81.67,yes,yes,2.000000,1.729553,Sometimes,no,1.400247,no,0.887923,1.011983,Sometimes,Public_Transportation,Overweight_Level_II
1685,Female,23,1.65,80.00,yes,yes,2.000000,3.000000,Sometimes,no,2.000000,no,0.146919,2.000000,no,Public_Transportation,Overweight_Level_II
1686,Female,23,1.63,84.50,yes,yes,2.058687,2.962004,Sometimes,no,2.010596,no,0.851059,0.630866,no,Public_Transportation,Obesity_Type_I


In [4]:
# Load the real test split and cast the columns listed in
# `categorical_columns` (defined when the training split was read) to 'category'.
test_data = pd.read_csv(HOME_PATH + 'REAL DATASETS/TEST DATASETS/C_Obesity_Data_Real_Test.csv')
test_data = test_data.astype({name: 'category' for name in categorical_columns})
test_data

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,Obesity_level
0,Female,20,1.76,53.70,yes,yes,2.000000,3.891994,Frequently,no,1.863930,no,2.870127,2.000000,no,Public_Transportation,Insufficient_Weight
1,Female,26,1.62,111.00,yes,yes,3.000000,3.000000,Sometimes,no,2.704315,no,0.000000,0.322666,Sometimes,Public_Transportation,Obesity_Type_III
2,Male,18,1.85,60.00,yes,yes,3.000000,4.000000,Sometimes,no,2.000000,yes,2.000000,0.000000,Sometimes,Automobile,Insufficient_Weight
3,Female,21,1.52,42.00,no,yes,3.000000,1.000000,Frequently,no,1.000000,no,0.000000,0.000000,Sometimes,Public_Transportation,Insufficient_Weight
4,Male,22,1.75,74.00,yes,no,2.000000,3.000000,Sometimes,no,2.000000,no,1.000000,2.000000,Sometimes,Bike,Normal_Weight
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
418,Male,19,1.80,87.00,yes,yes,2.000000,4.000000,Sometimes,no,2.000000,no,2.000000,1.000000,Sometimes,Public_Transportation,Overweight_Level_I
419,Male,31,1.65,101.14,yes,yes,2.913452,2.269799,Sometimes,no,1.000000,no,1.889937,0.378818,no,Public_Transportation,Obesity_Type_II
420,Male,19,1.85,65.00,yes,no,2.000000,3.000000,Sometimes,no,3.000000,no,2.000000,1.000000,Sometimes,Bike,Normal_Weight
421,Male,29,1.76,113.50,yes,yes,2.320201,3.000000,Sometimes,no,2.164784,no,0.000000,1.465479,Sometimes,Automobile,Obesity_Type_II


In [5]:
target = 'Obesity_level'


def print_class_breakdown(title, frame):
    """Print the split title, the frame's shape and the row count per target class."""
    print(title)
    print(frame.shape)
    print(frame.groupby(target).size())


# Quick look at the class balance of both splits, separated by a ruler line.
print_class_breakdown('Train data', train_data)
print('#####################################')
print_class_breakdown('Test data', test_data)

Train data
(1688, 17)
Obesity_level
Insufficient_Weight    216
Normal_Weight          225
Obesity_Type_I         273
Obesity_Type_II        239
Obesity_Type_III       261
Overweight_Level_I     234
Overweight_Level_II    240
dtype: int64
#####################################
Test data
(423, 17)
Obesity_level
Insufficient_Weight    56
Normal_Weight          62
Obesity_Type_I         78
Obesity_Type_II        58
Obesity_Type_III       63
Overweight_Level_I     56
Overweight_Level_II    50
dtype: int64


## 2. Pre-process training data

In [6]:
target = 'Obesity_level'

# Feature columns handled as categorical (the target is deliberately left out here).
categorical_columns = [
    'Gender',
    'family_history_with_overweight',
    'FAVC',
    'CAEC',
    'SMOKE',
    'SCC',
    'CALC',
    'MTRANS',
]
# Every int64/float64 column of the training frame is treated as numerical.
numerical_columns = list(train_data.select_dtypes(include=['int64', 'float64']).columns)

# One integer array 0..k-1 per entry of `categorical_columns`.
# NOTE(review): presumably these are the category codes DataPreProcessor expects
# for each categorical column, in the same order -- confirm in utility_evaluation.
category_sizes = [2, 2, 2, 4, 2, 2, 4, 5]
categories = [np.arange(size) for size in category_sizes]

data_preprocessor = DataPreProcessor(categorical_columns, numerical_columns, categories)

# Split the training frame into features (everything but the target) and labels.
train_feature_mask = train_data.columns != target
x_train = data_preprocessor.preprocess_train_data(train_data.loc[:, train_feature_mask])
y_train = train_data[target]

(x_train.shape, y_train.shape)

((1688, 31), (1688,))

## 3. Pre-process test data

In [7]:
# Run the test features (every column except the target) through the
# preprocessor created for the training data, and keep the target as labels.
test_feature_mask = test_data.columns != target
x_test = data_preprocessor.preprocess_test_data(test_data.loc[:, test_feature_mask])
y_test = test_data[target]
(x_test.shape, y_test.shape)

((423, 31), (423,))

## 4. Create a dataset to save the results

In [8]:
# Empty table that collects one row of metrics per evaluated model.
result_columns = ['model', 'accuracy', 'precision', 'recall', 'f1']
results = pd.DataFrame(columns=result_columns)
results

Unnamed: 0,model,accuracy,precision,recall,f1


## 4. Train and evaluate Random Forest Classifier

In [9]:
# Train the Random Forest model on the real training data and score it on the test data.
rf_results = train_evaluate_model('RF', x_train, y_train, x_test, y_test)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add rows.
# NOTE(review): assumes train_evaluate_model returns a one-row DataFrame (as the
# displayed output suggests) -- confirm in utility_evaluation.
results = pd.concat([results, rf_results], ignore_index=True)
rf_results

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.0s finished


Unnamed: 0,model,accuracy,precision,recall,f1
0,RF,0.922,0.9238,0.922,0.922


## 5. Train and evaluate KNeighbors Classifier

In [10]:
# Train the K-Neighbors model on the real training data and score it on the test data.
knn_results = train_evaluate_model('KNN', x_train, y_train, x_test, y_test)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add rows.
# NOTE(review): assumes train_evaluate_model returns a one-row DataFrame (as the
# displayed output suggests) -- confirm in utility_evaluation.
results = pd.concat([results, knn_results], ignore_index=True)
knn_results

Unnamed: 0,model,accuracy,precision,recall,f1
0,KNN,0.7778,0.7799,0.7778,0.7622


## 6. Train and evaluate Decision Tree Classifier

In [11]:
# Train the Decision Tree model on the real training data and score it on the test data.
dt_results = train_evaluate_model('DT', x_train, y_train, x_test, y_test)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add rows.
# NOTE(review): assumes train_evaluate_model returns a one-row DataFrame (as the
# displayed output suggests) -- confirm in utility_evaluation.
results = pd.concat([results, dt_results], ignore_index=True)
dt_results

Unnamed: 0,model,accuracy,precision,recall,f1
0,DT,0.9314,0.9326,0.9314,0.9315


## 7. Train and evaluate Support Vector Machines Classifier

In [12]:
# Train the Support Vector Machine model on the real training data and score it on the test data.
svm_results = train_evaluate_model('SVM', x_train, y_train, x_test, y_test)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add rows.
# NOTE(review): assumes train_evaluate_model returns a one-row DataFrame (as the
# displayed output suggests) -- confirm in utility_evaluation.
results = pd.concat([results, svm_results], ignore_index=True)
svm_results

[LibSVM]

Unnamed: 0,model,accuracy,precision,recall,f1
0,SVM,0.8771,0.8813,0.8771,0.8754


## 8. Train and evaluate Multilayer Perceptron Classifier

In [13]:
# Train the Multilayer Perceptron model on the real training data and score it on the test data.
mlp_results = train_evaluate_model('MLP', x_train, y_train, x_test, y_test)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add rows.
# NOTE(review): assumes train_evaluate_model returns a one-row DataFrame (as the
# displayed output suggests) -- confirm in utility_evaluation.
results = pd.concat([results, mlp_results], ignore_index=True)
mlp_results

Iteration 1, loss = 1.95034385
Iteration 2, loss = 1.76753732
Iteration 3, loss = 1.59483317
Iteration 4, loss = 1.38260335
Iteration 5, loss = 1.15953547
Iteration 6, loss = 0.97567828
Iteration 7, loss = 0.82863316
Iteration 8, loss = 0.70599043
Iteration 9, loss = 0.60975385
Iteration 10, loss = 0.53218663
Iteration 11, loss = 0.46247078
Iteration 12, loss = 0.39744864
Iteration 13, loss = 0.34708805
Iteration 14, loss = 0.30448384
Iteration 15, loss = 0.26718559
Iteration 16, loss = 0.23620416
Iteration 17, loss = 0.20909579
Iteration 18, loss = 0.18645448
Iteration 19, loss = 0.16761698
Iteration 20, loss = 0.15101342
Iteration 21, loss = 0.13742970
Iteration 22, loss = 0.12528573
Iteration 23, loss = 0.11229585
Iteration 24, loss = 0.10042314
Iteration 25, loss = 0.09145454
Iteration 26, loss = 0.08512425
Iteration 27, loss = 0.07639695
Iteration 28, loss = 0.07016740
Iteration 29, loss = 0.06498716
Iteration 30, loss = 0.05889680
Iteration 31, loss = 0.05411972
Iteration 32, los

Unnamed: 0,model,accuracy,precision,recall,f1
0,MLP,0.9362,0.9361,0.9362,0.9356


## 9. Save results file

In [14]:
# Write the collected metrics to disk without the row index.
# The output folder is created first: to_csv does not create missing
# directories and would otherwise fail on a fresh checkout.
os.makedirs('RESULTS', exist_ok=True)
results.to_csv('RESULTS/models_results_real.csv', index=False)
results

Unnamed: 0,model,accuracy,precision,recall,f1
0,RF,0.922,0.9238,0.922,0.922
1,KNN,0.7778,0.7799,0.7778,0.7622
2,DT,0.9314,0.9326,0.9314,0.9315
3,SVM,0.8771,0.8813,0.8771,0.8754
4,MLP,0.9362,0.9361,0.9362,0.9356
