## Body Fat Prediction

In [1]:
# Imports

import os
import tarfile
from six.moves import urllib
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import SCORERS
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split, cross_validate, KFold

In [2]:
print(SCORERS.keys())

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'top_k_accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_wei

In [3]:
'neg_mean_absolute_error'
'neg_mean_absolute_percentage_error'
'neg_root_mean_squared_error'
'max_error'

'neg_root_mean_squared_error'

Getting Data

In [25]:
bodyfat = pd.read_csv("bodyfat.csv")
df = pd.DataFrame(bodyfat)
df['Weight'] = df['Weight'] * 0.45

In [26]:
df = df.sort_values(by=['Weight'])
# df_split = np.array_split(df, 5)

In [38]:
df_70 = df[df['Weight'] <= 70]
df_75 = df[(df['Weight'] > 70) & (df['Weight'] <= 75)]
df_80 = df[(df['Weight'] > 75) & (df['Weight'] <= 80)]
df_90 = df[(df['Weight'] > 80) & (df['Weight'] <= 90)]
heavy = df[df['Weight'] > 90]

In [39]:
print(len(df_70))
print(len(df_75))
print(len(df_80))
print(len(df_90))
print(len(heavy))

51
36
46
61
55


In [4]:
# healthy = df[df['Weight'] <= 100]
# heavy = df[df['Weight'] > 100]

New DataFrames

In [5]:
labels = df['BodyFat']
features = df.drop(columns=['Density', 'BodyFat'])

labels_healthy = healthy['BodyFat']
features_healthy = healthy.drop(columns=['Density', 'BodyFat'])

labels_heavy = heavy['BodyFat']
features_heavy = heavy.drop(columns=['Density', 'BodyFat'])

Labeling Data

In [6]:
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler
from sklearn_pandas import DataFrameMapper

In [7]:
mapper = DataFrameMapper([(features.columns, StandardScaler())])
x = mapper.fit_transform(features)
x = pd.DataFrame(x, index=features.index, columns=features.columns)
y = labels

mapper = DataFrameMapper([(features_healthy.columns, StandardScaler())])
x_n = mapper.fit_transform(features_healthy)
x_n = pd.DataFrame(x_n, index=features_healthy.index, columns=features_healthy.columns)
y_n = labels_healthy

mapper = DataFrameMapper([(features_heavy.columns, StandardScaler())])
x_h = mapper.fit_transform(features_heavy)
x_h = pd.DataFrame(x_h, index=features_heavy.index, columns=features_heavy.columns)
y_h = labels_heavy

### Models

In [8]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR

Linear Regression

In [9]:
def linRegModel(x_train, y_train):
    lin_reg = LinearRegression()
    lin_reg.fit(x_train, y_train)

    return lin_reg

Decision Tree

In [10]:
def treeRegModel(x_train, y_train):
    tree_reg = DecisionTreeRegressor()
    tree_reg.fit(x_train, y_train)
    # scores = cross_val_score(tree_reg, x_train, y_train, scoring = 'r2', cv = 10)

    return tree_reg

Random Forest

In [11]:
def rfModel(x_train, y_train):
    rf = RandomForestRegressor(random_state = 42)
    rf.fit(x_train, y_train)
    
    return rf

XGBoost

In [12]:
def xgbModel(x_train, y_train):
    xgboost = XGBRegressor(max_depth = 5)
    xgboost.fit(x_train, y_train)
    
    return xgboost

Support Vector Machine

In [13]:
def svmModel(x_train, y_train):
    svm = SVR(kernel = 'rbf') #???
    svm.fit(x, y)

    return svm

## Feature Selection

Pearson

In [14]:

pearson = ['Abdomen', 'Chest', 'Hip', 'Weight', 'Thigh', 'Knee', 'Biceps', 'Neck', 'Forearm', 'Wrist', 'Age', 'Ankle', 'Height']

Lasso

In [15]:
lasso = ['Abdomen', 'Wrist', 'Weight', 'Neck', 'Age', 'Forearm', 'Height', 'Hip', 'Thigh', 'Biceps', 'Chest', 'Knee', 'Ankle']

RFE

In [16]:
rfe = ['Age', 'Weight', 'Height', 'Neck', 'Chest', 'Abdomen', 'Hip', 'Thigh', 'Knee', 'Ankle', 'Biceps', 'Forearm', 'Wrist']

### Evaluating

In [17]:
x_train_list = []
x_test_list = []
y_train_list = []
y_test_list = []

Splitting Data

In [18]:
def splitDataFrame(x, y, feature_selection):
    x_train_list.clear()
    x_test_list.clear()
    y_train_list.clear()
    y_test_list.clear()

    for i in range(1, 14):
        x_feat = x[feature_selection[:i]]
        # x_split = x.iloc[:, 0:i]
        x_train, x_test, y_train, y_test = train_test_split(x_feat, y, test_size=0.3, random_state=42)
        x_train_list.append(x_train)
        x_test_list.append(x_test)
        y_train_list.append(y_train)
        y_test_list.append(y_test)

    return x_train_list, x_test_list, y_train_list, y_test_list

Cross-Validation

In [19]:
def cvScores(model, x_train, y_train, criteria):
    # lin_reg = LinearRegression()
    # lin_reg.fit(x_train, y_train)
    scores = cross_val_score(model, x_train, y_train, scoring=criteria, cv=5)

    return scores.mean()

In [20]:
def evaluatingModel(model, x_train, y_train):
    mean_mape = cvScores(model, x_train, y_train, 'neg_mean_absolute_percentage_error')
    mean_mape = abs(mean_mape)
    mean_mae = cvScores(model, x_train, y_train, 'neg_mean_absolute_error')
    mean_mae = abs(mean_mae)
    mean_rmse = cvScores(model, x_train, y_train, 'neg_root_mean_squared_error')
    mean_rmse = abs(mean_rmse)

    return mean_mape, mean_mae, mean_rmse

In [21]:
table = [['feat_selection','modelo','data_partition', 'feat_set','MAPE','MAE','RMSE']]
model_names = ['LR', 'DT', 'RF', 'XGB', 'RF']

In [22]:
def results(feat_selection, model_name, data_set, table, model, x_train, y_train):
    for i in range(13):
        row = [feat_selection]
        row.append(model_name)
        row.append(data_set)
        row.append(i+1)
        mean_mape, mean_mae, mean_rmse = evaluatingModel(model[i], x_train[i], y_train[i])
        row.append(mean_mape.round(4))
        row.append(mean_mae.round(4))
        row.append(mean_rmse.round(4))
        table.append(row)
    
    return table

## Running

#### Feature Selection: Pearson


Data: Whole dataset

In [23]:
x_train_list, x_test_list, y_train_list, y_test_list = splitDataFrame(x, y, pearson)

In [24]:
lin_reg = []
tree_reg = []
rf = []
xgb = []
svm = []

for i in range(13):
    lin_reg.append(linRegModel(x_train_list[i], y_train_list[i]))
    tree_reg.append(treeRegModel(x_train_list[i], y_train_list[i]))
    rf.append(rfModel(x_train_list[i], y_train_list[i]))
    xgb.append(xgbModel(x_train_list[i], y_train_list[i]))
    svm.append(svmModel(x_train_list[i], y_train_list[i]))

In [25]:
table = results('Pearson', 'LR', 'padrão', table, lin_reg, x_train_list, y_train_list)
table = results('Pearson', 'DT', 'padrão',table, tree_reg, x_train_list, y_train_list)
table = results('Pearson', 'RF', 'padrão',table, rf, x_train_list, y_train_list)
table = results('Pearson', 'XGB', 'padrão',table, xgb, x_train_list, y_train_list)
table = results('Pearson', 'SVM', 'padrão',table, svm, x_train_list, y_train_list)

Data: Weight <= 100 Kg

In [26]:
x_train_list, x_test_list, y_train_list, y_test_list = splitDataFrame(x_n, y_n, pearson)

In [27]:
lin_reg = []
tree_reg = []
rf = []
xgb = []
svm = []

for i in range(13):
    lin_reg.append(linRegModel(x_train_list[i], y_train_list[i]))
    tree_reg.append(treeRegModel(x_train_list[i], y_train_list[i]))
    rf.append(rfModel(x_train_list[i], y_train_list[i]))
    xgb.append(xgbModel(x_train_list[i], y_train_list[i]))
    svm.append(svmModel(x_train_list[i], y_train_list[i]))

In [28]:
table = results('Pearson', 'LR', '<=100', table, lin_reg, x_train_list, y_train_list)
table = results('Pearson', 'DT', '<=100',table, tree_reg, x_train_list, y_train_list)
table = results('Pearson', 'RF', '<=100',table, rf, x_train_list, y_train_list)
table = results('Pearson', 'XGB', '<=100',table, xgb, x_train_list, y_train_list)
table = results('Pearson', 'SVM', '<=100',table, svm, x_train_list, y_train_list)

Data: Weight > 100 Kg

In [29]:
x_train_list, x_test_list, y_train_list, y_test_list = splitDataFrame(x_h, y_h, pearson)

In [30]:
lin_reg = []
tree_reg = []
rf = []
xgb = []
svm = []

for i in range(13):
    lin_reg.append(linRegModel(x_train_list[i], y_train_list[i]))
    tree_reg.append(treeRegModel(x_train_list[i], y_train_list[i]))
    rf.append(rfModel(x_train_list[i], y_train_list[i]))
    xgb.append(xgbModel(x_train_list[i], y_train_list[i]))
    svm.append(svmModel(x_train_list[i], y_train_list[i]))

In [31]:
table = results('Pearson', 'LR', '>100', table, lin_reg, x_train_list, y_train_list)
table = results('Pearson', 'DT', '>100',table, tree_reg, x_train_list, y_train_list)
table = results('Pearson', 'RF', '>100',table, rf, x_train_list, y_train_list)
table = results('Pearson', 'XGB', '>100',table, xgb, x_train_list, y_train_list)
table = results('Pearson', 'SVM', '>100',table, svm, x_train_list, y_train_list)

#### Feature Selection: Lasso

Data: Whole dataset

In [32]:
x_train_list, x_test_list, y_train_list, y_test_list = splitDataFrame(x, y, lasso)

In [33]:
lin_reg = []
tree_reg = []
rf = []
xgb = []
svm = []

for i in range(13):
    lin_reg.append(linRegModel(x_train_list[i], y_train_list[i]))
    tree_reg.append(treeRegModel(x_train_list[i], y_train_list[i]))
    rf.append(rfModel(x_train_list[i], y_train_list[i]))
    xgb.append(xgbModel(x_train_list[i], y_train_list[i]))
    svm.append(svmModel(x_train_list[i], y_train_list[i]))

In [34]:
table = results('Lasso', 'LR', 'padrão', table, lin_reg, x_train_list, y_train_list)
table = results('Lasso', 'DT', 'padrão',table, tree_reg, x_train_list, y_train_list)
table = results('Lasso', 'RF', 'padrão',table, rf, x_train_list, y_train_list)
table = results('Lasso', 'XGB', 'padrão',table, xgb, x_train_list, y_train_list)
table = results('Lasso', 'SVM', 'padrão',table, svm, x_train_list, y_train_list)

Data: Weight <= 100 Kg

In [35]:
x_train_list, x_test_list, y_train_list, y_test_list = splitDataFrame(x_n, y_n, lasso)

In [36]:
lin_reg = []
tree_reg = []
rf = []
xgb = []
svm = []

for i in range(13):
    lin_reg.append(linRegModel(x_train_list[i], y_train_list[i]))
    tree_reg.append(treeRegModel(x_train_list[i], y_train_list[i]))
    rf.append(rfModel(x_train_list[i], y_train_list[i]))
    xgb.append(xgbModel(x_train_list[i], y_train_list[i]))
    svm.append(svmModel(x_train_list[i], y_train_list[i]))

In [37]:
table = results('Lasso', 'LR', '<=100', table, lin_reg, x_train_list, y_train_list)
table = results('Lasso', 'DT', '<=100',table, tree_reg, x_train_list, y_train_list)
table = results('Lasso', 'RF', '<=100',table, rf, x_train_list, y_train_list)
table = results('Lasso', 'XGB', '<=100',table, xgb, x_train_list, y_train_list)
table = results('Lasso', 'SVM', '<=100',table, svm, x_train_list, y_train_list)

Data: Weight > 100 Kg

In [38]:
x_train_list, x_test_list, y_train_list, y_test_list = splitDataFrame(x_h, y_h, lasso)

In [39]:
lin_reg = []
tree_reg = []
rf = []
xgb = []
svm = []

for i in range(13):
    lin_reg.append(linRegModel(x_train_list[i], y_train_list[i]))
    tree_reg.append(treeRegModel(x_train_list[i], y_train_list[i]))
    rf.append(rfModel(x_train_list[i], y_train_list[i]))
    xgb.append(xgbModel(x_train_list[i], y_train_list[i]))
    svm.append(svmModel(x_train_list[i], y_train_list[i]))

In [40]:
table = results('Lasso', 'LR', '>100', table, lin_reg, x_train_list, y_train_list)
table = results('Lasso', 'DT', '>100',table, tree_reg, x_train_list, y_train_list)
table = results('Lasso', 'RF', '>100',table, rf, x_train_list, y_train_list)
table = results('Lasso', 'XGB', '>100',table, xgb, x_train_list, y_train_list)
table = results('Lasso', 'SVM', '>100',table, svm, x_train_list, y_train_list)

#### Feature Selection: RFE

Data: Whole dataset

In [41]:
x_train_list, x_test_list, y_train_list, y_test_list = splitDataFrame(x, y, rfe)

In [42]:
lin_reg = []
tree_reg = []
rf = []
xgb = []
svm = []

for i in range(13):
    lin_reg.append(linRegModel(x_train_list[i], y_train_list[i]))
    tree_reg.append(treeRegModel(x_train_list[i], y_train_list[i]))
    rf.append(rfModel(x_train_list[i], y_train_list[i]))
    xgb.append(xgbModel(x_train_list[i], y_train_list[i]))
    svm.append(svmModel(x_train_list[i], y_train_list[i]))

In [43]:
table = results('RFE', 'LR', 'padrão', table, lin_reg, x_train_list, y_train_list)
table = results('RFE', 'DT', 'padrão',table, tree_reg, x_train_list, y_train_list)
table = results('RFE', 'RF', 'padrão',table, rf, x_train_list, y_train_list)
table = results('RFE', 'XGB', 'padrão',table, xgb, x_train_list, y_train_list)
table = results('RFE', 'SVM', 'padrão',table, svm, x_train_list, y_train_list)

Data: Weight <= 100 Kg

In [44]:
x_train_list, x_test_list, y_train_list, y_test_list = splitDataFrame(x_n, y_n, rfe)

In [45]:
lin_reg = []
tree_reg = []
rf = []
xgb = []
svm = []

for i in range(13):
    lin_reg.append(linRegModel(x_train_list[i], y_train_list[i]))
    tree_reg.append(treeRegModel(x_train_list[i], y_train_list[i]))
    rf.append(rfModel(x_train_list[i], y_train_list[i]))
    xgb.append(xgbModel(x_train_list[i], y_train_list[i]))
    svm.append(svmModel(x_train_list[i], y_train_list[i]))

In [46]:
table = results('RFE', 'LR', '<=100', table, lin_reg, x_train_list, y_train_list)
table = results('RFE', 'DT', '<=100',table, tree_reg, x_train_list, y_train_list)
table = results('RFE', 'RF', '<=100',table, rf, x_train_list, y_train_list)
table = results('RFE', 'XGB', '<=100',table, xgb, x_train_list, y_train_list)
table = results('RFE', 'SVM', '<=100',table, svm, x_train_list, y_train_list)

Data: Weight > 100 Kg

In [47]:
x_train_list, x_test_list, y_train_list, y_test_list = splitDataFrame(x_h, y_h, rfe)

In [48]:
lin_reg = []
tree_reg = []
rf = []
xgb = []
svm = []

for i in range(13):
    lin_reg.append(linRegModel(x_train_list[i], y_train_list[i]))
    tree_reg.append(treeRegModel(x_train_list[i], y_train_list[i]))
    rf.append(rfModel(x_train_list[i], y_train_list[i]))
    xgb.append(xgbModel(x_train_list[i], y_train_list[i]))
    svm.append(svmModel(x_train_list[i], y_train_list[i]))

In [49]:
table = results('RFE', 'LR', '>100', table, lin_reg, x_train_list, y_train_list)
table = results('RFE', 'DT', '>100',table, tree_reg, x_train_list, y_train_list)
table = results('RFE', 'RF', '>100',table, rf, x_train_list, y_train_list)
table = results('RFE', 'XGB', '>100',table, xgb, x_train_list, y_train_list)
table = results('RFE', 'SVM', '>100',table, svm, x_train_list, y_train_list)

In [50]:
tabela_resultados = pd.DataFrame(table)
print(tabela_resultados)

                  0       1               2         3       4       5       6
0    feat_selection  modelo  data_partition  feat_set     MAE    MAPE    RMSE
1           Pearson      LR          padrão         1  0.2913  3.9994  4.8699
2           Pearson      LR          padrão         2  0.2887  3.9507  4.7465
3           Pearson      LR          padrão         3  0.2818  3.8314  4.6691
4           Pearson      LR          padrão         4  0.2751  3.6953  4.4827
..              ...     ...             ...       ...     ...     ...     ...
581             RFE     SVM            >100         9  0.2393  6.1075   6.717
582             RFE     SVM            >100        10  0.2427  6.1975  6.7811
583             RFE     SVM            >100        11  0.2485  6.3595  6.9027
584             RFE     SVM            >100        12  0.2515  6.4518  6.9939
585             RFE     SVM            >100        13  0.2514  6.4465  6.9843

[586 rows x 7 columns]


In [51]:
tabela_resultados.to_csv('resultados.csv')

#### Outlier Detection