# Estimation of obesity levels based on eating habits and physical condition

# This dataset includes data for estimating obesity levels in individuals from Mexico, Peru and Colombia, based on their eating habits and physical condition.

# Dataset Characteristics
# Multivariate
# ===============================================
# Subject Area
# Health and Medicine
# ==============================================
# Associated Tasks
# Classification
# ==============================================

# Feature Type
# Integer
# ==============================================

# Instances
# 2111
# ==============================================

# Features
# 16

# Additional Information

# This dataset includes data for estimating obesity levels in individuals from Mexico, Peru and Colombia, based on their eating habits and physical condition. The data contains 17 attributes and 2111 records. The records are labeled with the class variable NObeyesdad (Obesity Level), which allows classification of the data using the values Insufficient Weight, Normal Weight, Overweight Level I, Overweight Level II, Obesity Type I, Obesity Type II and Obesity Type III. 77% of the data was generated synthetically using the Weka tool and the SMOTE filter; the remaining 23% was collected directly from users through a web platform.

# Main Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as mis
import os
import plotly.express as px

import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split

## metric
from sklearn.metrics import accuracy_score , confusion_matrix , classification_report , mean_squared_error

## preprocessing
from sklearn.preprocessing import StandardScaler , LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score , cross_val_predict , GridSearchCV , RandomizedSearchCV

## pipeline
from sklearn.pipeline import Pipeline ,FeatureUnion
from sklearn_features.transformers import DataFrameSelector

## Ensemble
from sklearn.ensemble import RandomForestClassifier

import xgboost as xgp


# ANN Neural Network
from keras import layers , Sequential 
from tensorflow.keras.layers import Dense

In [None]:
# Location of the raw CSV file (raw string so the backslashes stay literal).
path_data = r"D:\ML_data\estimation+of+obesity+levels+based+on+eating+habits+and+physical+condition\ObesityDataSet_raw_and_data_sinthetic.csv"

# Report a missing file up front; otherwise load the data and preview it.
if not os.path.exists(path_data):
    print("Sorry , your path data not found.")
else:
    df = pd.read_csv(path_data)
    print(df.head())

In [None]:
df.tail(10)

In [None]:
df.info()

In [None]:
df.columns

In [None]:
df.describe().T

In [None]:
df.describe(include=object).T

In [None]:
df.isna().sum()

In [None]:
mis.matrix(df)

In [None]:
mis.bar(df)

# EDA (Exploratory data analysis)

# Variable Name	Role	Type	Demographic	Description	Units	Missing Values
# Gender	Feature	Categorical	Gender			no
# Age	Feature	Continuous	Age			no
# Height	Feature	Continuous				no
# Weight	Feature	Continuous				no
# family_history_with_overweight	Feature	Binary		Has a family member suffered or suffers from overweight?		no
# FAVC	Feature	Binary		Do you eat high caloric food frequently?		no
# (Frequent Consumption of High-Caloric Food)
# FCVC	Feature	Integer		Do you usually eat vegetables in your meals?		no
# (Frequency of Consumption of Vegetables)
# NCP	Feature	Continuous		How many main meals do you have daily?		no
# (Number of Consumed Portions)
# CAEC	Feature	Categorical		Do you eat any food between meals?		no
# (Consumption of Food Between Meals)
# SMOKE	Feature	Binary		Do you smoke?		no
# CH2O	Feature	Continuous		How much water do you drink daily?		no
# SCC	Feature	Binary		Do you monitor the calories you eat daily?		no
# (Self-Control of Caloric Intake)
# FAF	Feature	Continuous		How often do you have physical activity?		no
# ("Frequency of Physical Activity")
# TUE	Feature	Integer		How much time do you use technological devices such as cell phone, videogames, television, computer and others?		no
# (Time of Use of Electronic Devices)
# CALC	Feature	Categorical		How often do you drink alcohol?		no
# (Consumption of Alcohol)
# MTRANS	Feature	Categorical		Which transportation do you usually use?		no
# NObeyesdad	Target	Categorical		Obesity level		no

In [None]:
df["Gender"].value_counts()

In [None]:
df["NObeyesdad"].value_counts()

In [None]:
# Count each gender straight from the DataFrame so the chart always reflects
# the data that is actually loaded (the counts were previously typed in by hand).
label_counts = df["Gender"].value_counts().to_dict()

# Create a Pie chart using Plotly
fig = px.pie(values=list(label_counts.values()), names=list(label_counts.keys()), title='Label Distribution Male and Female')

# hole=0.5 turns the pie into a donut; show percent, label and raw count inside each slice.
fig.update_traces(textposition='inside', textinfo='percent+label+value', hole=0.5)

fig.show()


In [None]:
# Count each obesity class straight from the DataFrame instead of hard-coding
# the numbers; value_counts() orders the classes from most to least frequent.
label_counts = df["NObeyesdad"].value_counts().to_dict()

# Create a Pie chart using Plotly
fig = px.pie(values=list(label_counts.values()), names=list(label_counts.keys()), title='Label Distribution of Target (NObeyesdad) ')

# hole=0.5 turns the pie into a donut; show percent, label and raw count inside each slice.
fig.update_traces(textposition='inside', textinfo='percent+label+value', hole=0.5)

fig.show()


In [None]:
import pandas as pd
import plotly.express as px

# Turn the {label: count} mapping into a two-column table for plotting.
df_target = pd.DataFrame({
    'Label': list(label_counts.keys()),
    'Count': list(label_counts.values()),
})

# One bar per label, bar height = number of records.
fig = px.bar(df_target, x='Label', y='Count', title='Distribution of Obesity Levels')
fig.show()


In [None]:
plt.figure(figsize=(15,12))
sns.boxplot(data=df , x=df["NObeyesdad"] , y=df["Age"])
plt.title("Boxplot of Target (NObeyesdad) with Age " , fontsize=25 , color="r")
plt.xlabel("NObeyesdad" , fontsize=25 , color="m" )
plt.ylabel("Age" , fontsize=25 , color="m" )
plt.show()

In [None]:
sns.pairplot(df , hue="NObeyesdad")
plt.show()

In [None]:
df.columns

In [None]:
df._get_numeric_data()

In [None]:
# Plot the distribution of every numeric column, one figure per column.
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale is
# its replacement. select_dtypes is the public way to pick the numeric columns.
for column in df.select_dtypes(include="number").columns:
    sns.histplot(df[column], kde=True, stat="density")
    plt.title(f'Distribution of {column}')
    plt.xlabel(column)
    plt.ylabel('Density')
    plt.show()


In [None]:
sns.barplot(df)
plt.show()

In [None]:
df.hist(bins=25 , figsize=(15,10))
plt.show()

In [None]:
plt.figure(figsize=(20,10))
plt.subplot(131)
sns.countplot(x= 'Age', data = df, palette="GnBu_d",edgecolor="m")
plt.subplot(132)
sns.countplot(x= 'Height', data = df, palette="flag",edgecolor="m")
plt.subplot(133)
sns.countplot(x= 'Weight', data = df, palette="Greens_r",edgecolor="m")
plt.show()

In [None]:
plt.figure(figsize=(16, 4))
plt.subplot(121)
sns.boxplot(x = 'Weight', y = 'SCC', data = df)
plt.subplot(122)
sns.boxplot(x = 'Weight', y = 'CAEC', data = df)
plt.show()

In [None]:
plt.figure(figsize=(16, 4))
plt.subplot(121)
sns.boxplot(x = 'Height', y = 'SCC', data = df)
plt.subplot(122)
sns.boxplot(x = 'Height', y = 'CAEC', data = df)
plt.show()

In [None]:
plt.figure(figsize=(16, 4))
plt.subplot(121)
sns.boxplot(x = 'Age', y = 'SCC', data = df)
plt.subplot(122)
sns.boxplot(x = 'Age', y = 'CAEC', data = df)
plt.show()

In [None]:
vc = df["NObeyesdad"].value_counts()
vc

In [None]:
plt.figure(figsize=(12,8))
ax = sns.countplot(x=df['NObeyesdad'], color='mediumseagreen')
plt.title('Nobeyesdad Class Counts', fontsize=16)
plt.ylabel('Class Counts', fontsize=16)
plt.xlabel('Class Label', fontsize=16)
plt.xticks(rotation='vertical')

for p in ax.patches:
    ax.annotate(f'{p.get_height()}', (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', fontsize=15, color='r', xytext=(0, 5),
                textcoords='offset points')


plt.show()

In [None]:
# Correlation is only defined for numeric columns; selecting them explicitly
# avoids the ValueError that newer pandas raises on the string columns.
df.select_dtypes(include="number").corr()

In [None]:
# Restrict to numeric columns first: df.corr() on a frame that still contains
# string columns raises in newer pandas versions.
numeric_corr = df.select_dtypes(include="number").corr()

plt.figure(figsize=(16, 8))
sns.heatmap(numeric_corr, annot=True, cmap="Blues")
plt.title("Correlation of Features", fontsize=25, color="m")
plt.show()


In [None]:
df.duplicated().sum()

In [None]:
df = df.drop_duplicates()
df

In [None]:
# Separate the feature columns from the target column.
X = df.drop(columns=["NObeyesdad"])
y = df["NObeyesdad"]

# Keep the fitted encoder around so integer predictions can later be mapped
# back to class names with label_encoder.inverse_transform(...).
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [None]:
X_train , X_test ,y_train , y_test = train_test_split(X , y , test_size=0.2 , shuffle=True , random_state=123)

In [None]:
print(f"the shape of X_train is {X_train.shape}")
print(f"the shape of X_test is {X_test.shape}")
print(f"the shape of y_train is {y_train.shape}")
print(f"the shape of y_test is {y_test.shape}")

In [None]:
# Split the feature columns into numeric and categorical groups by dtype.
numeric_dtypes = ["int32", "int64", "float32", "float64"]

col_nums = [col for col in X_train.columns if df[col].dtype in numeric_dtypes]

col_catg = [col for col in X_train.columns if df[col].dtype not in numeric_dtypes]

# Report whichever groups are present. The second branch used to reference an
# undefined name (catg_cols) and the last one mislabelled the numeric columns.
if col_nums and col_catg:
    print(f"nums columns is \n {col_nums}")
    print(f"catg columns is \n {col_catg}")

elif col_catg:
    print(f"catg columns is \n {col_catg}")

else:
    print(f"nums columns is \n {col_nums}")

In [None]:
# Numeric branch of the preprocessing: select the numeric columns, fill
# missing values with the column median, then standardise.
numeric_steps = [
    ("select", DataFrameSelector(col_nums)),
    ("Simple", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
pip_num = Pipeline(steps=numeric_steps)

In [None]:
# LabelEncoder is meant for a 1-D target (its fit/transform accept only y), so
# it cannot be used as a feature step inside a Pipeline. OneHotEncoder is the
# encoder intended for categorical feature columns.
from sklearn.preprocessing import OneHotEncoder

pip_catg = Pipeline(steps=[
                 ("selector", DataFrameSelector(col_catg)),  # select the categorical columns from the data
                 ("Imputer", SimpleImputer(strategy="constant", fill_value="missing")),
                 # handle_unknown="ignore": categories unseen during fit encode as all zeros
                 ("one_hot", OneHotEncoder(handle_unknown="ignore"))
                ])


In [None]:
def Preprocessing(cols_nums, col_catg, X_train, X_test):
    """Fit the numeric pipeline on the training split and transform both splits.

    Args:
        cols_nums: Names of the numeric feature columns. Kept for backward
            compatibility; the value does not change the result.
        col_catg: Names of the categorical feature columns. Kept for backward
            compatibility; the value does not change the result.
        X_train: Training features; the pipeline is fitted on this split only.
        X_test: Test features, transformed with what was learned from X_train.

    Returns:
        A ``(X_train_final, X_test_final)`` tuple holding the output of
        ``pip_num.transform`` for the training and test split respectively.
    """
    # Every branch of the previous implementation ran exactly these steps; the
    # FeatureUnion it built in the first branch was never fitted or applied.
    # NOTE(review): only the module-level ``pip_num`` is used, so categorical
    # columns are not part of the returned matrices. Later cells pair the
    # model's feature importances with the eight numeric column names, so
    # adding encoded categorical features here requires updating them too.
    pip_num.fit(X_train)
    X_train_final = pip_num.transform(X_train)
    X_test_final = pip_num.transform(X_test)

    return X_train_final, X_test_final



X_train_final, X_test_final = Preprocessing(col_nums, col_catg, X_train, X_test)
print(f"X_train_final is = {X_train_final}")
print("*"*50)
print(f"X_test_final is = {X_test_final}")


In [None]:
df.hist(bins=25 , figsize=(15,10))
plt.show()

# Reduce skewness with a log transform

In [None]:
colms = (df._get_numeric_data()).columns
colms

In [None]:
# Log-transform every numeric column to reduce skew.
# log1p(x) = log(1 + x) maps 0 to 0; the previous log(x + 1e-10) mapped zeros
# to roughly -23 and so created extreme outliers in columns that contain zeros.
colms = list(df.select_dtypes(include="number").columns)
for col in colms:
    df[col] = np.log1p(df[col]).astype(float)

In [None]:
df.hist(bins=20 , figsize=(10,8))
plt.show()

# RandomForestClassifier

In [None]:
Random = RandomForestClassifier()

In [None]:
# Define the hyperparameter grid
hyper_params = {
    "n_estimators": list(range(100, 500, 100)),
    "max_depth": list(range(3, 15, 3)),
    "criterion": ['entropy', 'gini', 'log_loss'],
}

# Create GridSearchCV object.
# The target is a label-encoded nominal class, so candidates are ranked by
# accuracy; a squared-error score would treat the arbitrary integer codes as
# distances between classes.
random_hyper = GridSearchCV(
    estimator=Random,
    param_grid=hyper_params,
    scoring="accuracy",
    cv=5,
    n_jobs=-1
)

# Fit the model
random_model = random_hyper.fit(X_train_final, y_train)


In [None]:
print("Best parameters : \n", random_model.best_params_)


In [None]:
Rand_sea = random_model.best_estimator_
Rand_sea

In [None]:
corss_val = cross_val_score(estimator=Rand_sea ,X = X_train_final , y=y_train ,
                            scoring="accuracy" , cv=5 , n_jobs=-1)

print("Score is ==>>\n ",abs(corss_val))

print("cross val score mean is ==>> ",corss_val.mean())

In [None]:
# Out-of-fold predictions for every training sample (5-fold CV).
cross_val_pred = cross_val_predict(estimator=Rand_sea ,X = X_train_final , y=y_train ,
                                  method="predict" , cv=5 , n_jobs=-1)


# Keep the two quantities apart: mse is the mean squared error, rmse its
# square root. The square root was previously reported as the "mean squared error".
mse = mean_squared_error(y_train, cross_val_pred)
rmse = np.sqrt(mse)

# Print the RMSE
print(f"The root mean squared error is {rmse:.4f}")

In [None]:
X_train_final.shape[1]

In [None]:
df.columns

In [None]:
mylist =[col for col in df.columns if df[col].dtype in ["int32","int64","float32","float64"]]
mylist

In [None]:
# Numerical Columns only 
mylist = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']

importance = random_model.best_estimator_.feature_importances_

for feature, importance_score in zip(mylist, importance):
    print(feature, importance_score)


In [None]:
# Horizontal bar chart of the feature importances.
plt.figure(figsize=(10, 8))
plt.barh(mylist, importance)
# barh puts the categories on the y-axis and the bar lengths on the x-axis,
# so the importance values belong to the x label and the names to the y label.
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.title("Feature Importances")
plt.xticks(rotation='vertical' ,c="k" , fontsize=12)
plt.yticks(c="k" , fontsize=12)
plt.grid(True)
plt.show()

In [None]:
y_pred = random_model.predict(X_test_final)
y_pred[:5]

In [None]:
confusion = confusion_matrix(y_test , y_pred)
confusion

In [None]:
# The matrix has one row/column per class that occurs in y_test or y_pred, not
# two, so label the axes with the real class names. LabelEncoder assigns codes
# in sorted order of the class names, which sorted() reproduces here.
present_codes = np.unique(np.concatenate([y_test, y_pred]))
class_names = np.array(sorted(df["NObeyesdad"].unique()))[present_codes]

plt.figure(figsize=(14,8))
sns.heatmap(confusion , fmt="g" , annot=True , cbar=True , vmin=0 , cmap="Blues" ,
            xticklabels=class_names , yticklabels=class_names)
plt.xlabel("Predicted" , fontsize=14 , color="b")
plt.ylabel("Actual" , fontsize=14 , color="b")
plt.title("Confusion Matrix" , fontsize=20 , color="m")
plt.show()

In [None]:
print(f" some details \n {classification_report(y_test , y_pred)}")

# xgboost

In [None]:
xg_clas = xgp.XGBClassifier()
xg_clas.fit(X_train_final , y_train)

In [None]:
cross_val_xgb = cross_val_score(estimator=xg_clas , X=X_train_final , y=y_train ,
                            cv=5 , scoring="accuracy" , n_jobs=-1)

 
print("Score is ==>>\n ",abs(cross_val_xgb))

print("cross val score mean is ==>> ",cross_val_xgb.mean())

In [None]:
# Make cross-validation predictions (out-of-fold, 5-fold CV)
cross_val_pred = cross_val_predict(estimator=xg_clas, X=X_train_final, y=y_train,
                                   cv=5, method="predict", n_jobs=-1)

# Keep the two quantities apart: mse is the mean squared error, rmse its
# square root. The square root was previously reported as the "mean squared error".
mse = mean_squared_error(y_train, cross_val_pred)
rmse = np.sqrt(mse)

# Print the RMSE
print(f"The root mean squared error is {rmse:.4f}")

# Using Deep Learning 

In [None]:
X_train_final[0].shape

In [None]:
# setting up the layers of Neural Network

model = Sequential([  
                          layers.Flatten(input_shape=(X_train_final[0].shape)),
                          layers.Dense(64, activation='relu'),
                          layers.Dense(7, activation='softmax')
])

model.summary()

In [None]:
# compiling the Neural Network

model.compile(optimizer="adam" ,
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

In [None]:
# training the Neural Network
history = model.fit(X_train_final , y_train , epochs=50)
history

In [None]:
# Evaluating the model

loss, accuracy = model.evaluate(X_test_final, y_test)
print("Accuracy is == " , accuracy)
print("loss is == " , np.round(loss , 4))

In [None]:
# The style
sns.set(style='whitegrid')

# Plotting training history
plt.figure(figsize=(15, 8))

# Plot accuracy
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Training Accuracy', linestyle='-', marker='X', markersize=5)
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Training Accuracy')
plt.legend()

# Plot loss
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Training Loss', linestyle='-', marker='o', markersize=5)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss')
plt.legend()

plt.tight_layout()
plt.show()

In [None]:
# The Prediction
prediction = model.predict(X_test_final)
prediction

In [None]:
len(prediction) , len(y_test)

In [None]:
import seaborn as sns

# model.predict returns one score per class for every sample; reduce each row
# to its most likely class so it is comparable with the integer labels in y_test.
predicted_labels = np.argmax(prediction, axis=1)

sns.histplot(y_test, color='blue', label='Actual', kde=True)
sns.histplot(predicted_labels, color='red', label='Predicted', kde=True)
plt.xlabel("Values")
plt.ylabel("Frequency")
plt.title("Actual vs. Predicted Distribution")
plt.legend()
plt.show()


In [None]:
plt.figure(figsize=(10,7))
class_index = 0
# Model output for the selected class, one value per test sample.
predictions_class = prediction[:, class_index]

# Calculate residuals against a 0/1 indicator of the same class. Subtracting
# the class score from the raw integer label (0-6) mixed two different scales.
actual_class = (np.asarray(y_test) == class_index).astype(float)
residuals = actual_class - predictions_class

# Plotting residuals
plt.scatter(predictions_class, residuals)
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.title("Residual Plot")
plt.axhline(y=0, color='k', linestyle='--')
plt.show()


# END 

# BY : SAYED ALI