# 1st Approach: Naive Bayes

### Step 1. Import libraries and load the dataset

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


# Load the obesity dataset from a CSV file located in the working directory.
data = pd.read_csv('ObesityDataSet.csv')

### Step 2. Display basic information about the dataset

In [9]:
# Preview the first rows to check column names and value formats.
data.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad,BMI
0,Female,21.0,63.8,141.1,1,0,2.0,3.0,1,0,2.0,0,0.0,1.0,0,Public_Transportation,Normal_Weight,24.4
1,Female,21.0,59.8,123.5,1,0,3.0,3.0,1,1,3.0,1,3.0,0.0,1,Public_Transportation,Normal_Weight,24.3
2,Male,23.0,70.9,169.8,1,0,2.0,3.0,1,0,2.0,0,2.0,1.0,2,Public_Transportation,Normal_Weight,23.7
3,Male,27.0,70.9,191.8,0,0,3.0,3.0,1,0,2.0,0,2.0,0.0,2,Walking,Overweight_Level_I,26.8
4,Male,22.0,70.1,198.0,0,0,2.0,1.0,1,0,2.0,0,0.0,0.0,1,Public_Transportation,Overweight_Level_II,28.3


In [10]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(2111, 18)

### Step 3. Prepare the data

In [11]:
# Target vector: the obesity class; feature matrix: every other column.
y = data['NObeyesdad']
X = data.drop(columns='NObeyesdad')

# GaussianNB needs numeric inputs, so replace each text (object-dtype) column
# with integer codes. The encoder is re-fitted for every column.
le = LabelEncoder()
object_cols = [name for name in X.columns if X[name].dtype == 'object']
for name in object_cols:
    X[name] = le.fit_transform(X[name])


### Step 4. Split the data into training and testing sets

In [12]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


### Step 5. Initialize and train the Naive Bayes classifier

In [13]:
# Gaussian Naive Bayes with default settings.
model = GaussianNB()

# Fit on the training split only; the test split stays unseen until evaluation.
model.fit(X_train, y_train)


### Step 6. Make predictions and calculate accuracy

In [14]:
# Predict the obesity class for every held-out row.
predictions = model.predict(X_test)

# Compute both summaries first, then print them: overall accuracy followed by
# the per-class precision / recall / F1 table.
accuracy = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions)

print(f'Accuracy: {accuracy}')
print(report)


Accuracy: 0.8959810874704491
                     precision    recall  f1-score   support

Insufficient_Weight       0.82      1.00      0.90        56
      Normal_Weight       0.88      0.73      0.80        62
     Obesity_Type_I       0.96      0.82      0.88        78
    Obesity_Type_II       0.86      0.98      0.92        58
   Obesity_Type_III       0.97      1.00      0.98        63
 Overweight_Level_I       0.91      0.86      0.88        56
Overweight_Level_II       0.87      0.92      0.89        50

           accuracy                           0.90       423
          macro avg       0.90      0.90      0.89       423
       weighted avg       0.90      0.90      0.89       423



# 2nd Approach: SVC

### Step 1. Import libraries and load the dataset

In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score
# Silence all Python warnings from here on in this kernel session.
# NOTE(review): this also hides library deprecation warnings in later cells.
warnings.filterwarnings("ignore")

# Re-read the CSV so this approach starts from the dataset as stored on disk.
data = pd.read_csv('ObesityDataSet.csv')

### Step 2. Define categorical and continuous features

In [16]:
# Columns treated as categorical; these are the ones label-encoded for the SVC.
categorical_features = [
    'Gender',
    'CALC',
    'FAVC',
    'SCC',
    'SMOKE',
    'family_history_with_overweight',
    'CAEC',
    'MTRANS',
]

# Columns treated as continuous measurements.
continuous_features = [
    'Age',
    'Height',
    'Weight',
    'FCVC',
    'NCP',
    'CH2O',
    'FAF',
    'TUE',
]


### Step 3. Encode target and categorical features

In [17]:
# The target gets its own encoder, which is never re-fitted afterwards, so
# `label_encoder.classes_` and `label_encoder.inverse_transform` can still map
# the integer codes back to the original class names after this cell.
label_encoder = LabelEncoder()
data['NObeyesdad'] = label_encoder.fit_transform(data['NObeyesdad'])

# Each categorical feature is coded with a fresh, throwaway encoder. Re-using
# `label_encoder` here would overwrite the target mapping with the categories
# of the last feature column. The resulting integer codes are unchanged.
data[categorical_features] = data[categorical_features].apply(
    lambda column: LabelEncoder().fit_transform(column)
)


### Step 4. Prepare feature matrix X and target vector y

In [18]:
# Target vector: the integer-coded obesity class.
y = data['NObeyesdad']
# Feature matrix: every remaining column.
X = data.drop(columns='NObeyesdad')


### Step 5. Split the data into training and testing sets

In [19]:
# Hold out half of the rows for evaluation, with a fixed seed.
# NOTE(review): the Naive Bayes and ensemble sections use test_size=0.2, so the
# scores reported here come from a different split and are not directly
# comparable with theirs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=42,
)


### Step 6. Scale the features

In [20]:
# Learn the per-column mean and standard deviation from the training split
# only, then apply that same transformation to both splits so no test-set
# statistics leak into training.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)


### Step 7. Initialize and train the SVM classifier

In [21]:
# Support vector classifier with a linear kernel and a fixed seed.
classifier = SVC(kernel='linear', random_state=0)
# Fit on the standardized training features.
classifier.fit(X_train, y_train)


### Step 8. Make predictions and evaluate the model

In [22]:
# Predict on the standardized hold-out rows.
predictions = classifier.predict(X_test)

# Per-class precision / recall / F1. The class labels are the integer codes
# produced when the target column was label-encoded.
report = classification_report(y_true=y_test, y_pred=predictions)
print(report)



              precision    recall  f1-score   support

           0       0.90      0.99      0.94       135
           1       0.98      0.85      0.91       151
           2       0.99      0.94      0.96       170
           3       0.95      1.00      0.97       153
           4       1.00      0.99      1.00       163
           5       0.87      0.96      0.91       141
           6       0.94      0.91      0.93       143

    accuracy                           0.95      1056
   macro avg       0.95      0.95      0.95      1056
weighted avg       0.95      0.95      0.95      1056



# Best Approach: Ensemble Method with Support Vector, Random Forest, and Gradient Boosting Classifiers

### Step 1. Import libraries and load the dataset

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer

# Re-read the raw CSV: the SVC section label-encoded `data` in place, and this
# approach selects columns by their original (object) dtype.
data = pd.read_csv('ObesityDataSet.csv')

### Step 2. Select categorical and numerical columns

In [24]:
# Numeric predictors: every float64 / int64 column.
numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns.tolist()

# Categorical predictors: every object-dtype column except the target, which is
# also stored as text and must not be fed to the model as a feature.
object_columns = data.select_dtypes(include=['object']).columns
categorical_cols = object_columns.tolist()
categorical_cols.remove('NObeyesdad')


### Step 3. Create a preprocessing transformer

In [25]:
# Column-wise preprocessing applied before the ensemble.
preprocessor = ColumnTransformer(
    transformers=[
        # Numeric columns are forwarded to the estimators unchanged.
        ('num', 'passthrough', numerical_cols),
        # handle_unknown='ignore' encodes a category that was absent from the
        # training split as an all-zero block instead of raising at predict
        # time; results are identical whenever every category was seen in fit.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])


### Step 4. Define individual classifiers and the ensemble model

In [26]:
# Base learners, all seeded for reproducibility. SVC needs probability=True so
# that it exposes predict_proba, which soft voting relies on.
rf = RandomForestClassifier(random_state=42)
svc = SVC(probability=True, random_state=42)
gb = GradientBoostingClassifier(random_state=42)

# Soft voting picks the class with the highest averaged predicted probability
# across the three base learners.
base_estimators = [('rf', rf), ('svc', svc), ('gb', gb)]
ensemble_model = VotingClassifier(estimators=base_estimators, voting='soft')


### Step 5. Create the pipeline with preprocessing and ensemble model

In [27]:
# Chain preprocessing and the voting ensemble into one estimator, so the
# encoder is fitted only on whatever data is passed to fit().
ensemble_pipeline = make_pipeline(preprocessor, ensemble_model)


### Step 6. Split the data into training and testing sets

In [28]:
# Target vector (class names as text) and feature matrix (all other columns).
y = data['NObeyesdad']
X = data.drop(columns='NObeyesdad')

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


### Step 7. Train the ensemble model

In [29]:
# Fit the preprocessing step and all three base learners on the training split.
ensemble_pipeline.fit(X_train, y_train)


### Step 8. Predict and evaluate the ensemble model

In [30]:
# Raw test rows go straight in: the pipeline applies the fitted preprocessing
# before the ensemble votes.
y_pred = ensemble_pipeline.predict(X_test)

# Per-class precision / recall / F1 on the held-out rows.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)


                     precision    recall  f1-score   support

Insufficient_Weight       0.98      0.95      0.96        56
      Normal_Weight       0.92      0.98      0.95        62
     Obesity_Type_I       1.00      0.99      0.99        78
    Obesity_Type_II       0.98      1.00      0.99        58
   Obesity_Type_III       1.00      1.00      1.00        63
 Overweight_Level_I       1.00      0.95      0.97        56
Overweight_Level_II       0.98      1.00      0.99        50

           accuracy                           0.98       423
          macro avg       0.98      0.98      0.98       423
       weighted avg       0.98      0.98      0.98       423

