In [None]:
# PDAN8411 - ICE 1
# Done by: Kiashen Maharajh (ST10055763)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load your dataset
df = pd.read_csv("datasets/ObesityDataSet_raw_and_data_sinthetic.csv")


In [3]:
# Display the first few rows
print(df.head())

   Gender  Age  Height  Weight family_history_with_overweight FAVC  FCVC  NCP  \
0  Female   21    1.62    64.0                            yes   no   2.0  3.0   
1  Female   21    1.52    56.0                            yes   no   3.0  3.0   
2    Male   23    1.80    77.0                            yes   no   2.0  3.0   
3    Male   27    1.80    87.0                             no   no   3.0  3.0   
4    Male   22    1.78    89.8                             no   no   2.0  1.0   

        CAEC SMOKE  CH2O  SCC  FAF  TUE        CALC                 MTRANS  \
0  Sometimes    no   2.0   no  0.0  1.0          no  Public_Transportation   
1  Sometimes   yes   3.0  yes  3.0  0.0   Sometimes  Public_Transportation   
2  Sometimes    no   2.0   no  2.0  1.0  Frequently  Public_Transportation   
3  Sometimes    no   2.0   no  2.0  0.0  Frequently                Walking   
4  Sometimes    no   2.0   no  0.0  0.0   Sometimes  Public_Transportation   

            NObeyesdad  
0        Normal_Wei

In [4]:
# Check for missing values
print(df.isnull().sum())

Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
NObeyesdad                        0
dtype: int64


In [5]:
# Check the distribution of the target variable
print(df['NObeyesdad'].value_counts())


NObeyesdad
Obesity_Type_I         351
Obesity_Type_III       324
Obesity_Type_II        297
Overweight_Level_I     290
Overweight_Level_II    290
Normal_Weight          287
Insufficient_Weight    272
Name: count, dtype: int64


In [10]:
# Encode the target variable (NObeyesdad)
label_encoder = LabelEncoder()
df['NObeyesdad'] = label_encoder.fit_transform(df['NObeyesdad'])

In [12]:
# List of categorical features to encode
categorical_features = ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']

# Check if the categorical columns exist in the DataFrame
existing_categorical_features = [col for col in categorical_features if col in df.columns]

# Apply one-hot encoding only if the columns exist
if existing_categorical_features:
    df = pd.get_dummies(df, columns=existing_categorical_features, drop_first=True)
else:
    print("Categorical columns have already been encoded.")

Categorical columns have already been encoded.


In [13]:
# Define numerical features
numerical_features = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']

# Normalize numerical features
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

In [14]:
# Define features (X) and target (y)
X = df.drop('NObeyesdad', axis=1)
y = df['NObeyesdad']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
# Initialize the KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)  # Start with k=5

# Train the model
knn.fit(X_train, y_train)

In [16]:
# Make predictions
y_pred = knn.predict(X_test)

In [17]:
# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nAccuracy Score:")
print(accuracy_score(y_test, y_pred))

Confusion Matrix:
[[52  1  0  0  0  3  0]
 [12 25  6  1  0  8 10]
 [ 0  0 72  2  0  0  4]
 [ 0  0  3 55  0  0  0]
 [ 0  0  0  0 63  0  0]
 [ 1  5  0  0  0 47  3]
 [ 0  1  5  3  1  2 38]]

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.93      0.86        56
           1       0.78      0.40      0.53        62
           2       0.84      0.92      0.88        78
           3       0.90      0.95      0.92        58
           4       0.98      1.00      0.99        63
           5       0.78      0.84      0.81        56
           6       0.69      0.76      0.72        50

    accuracy                           0.83       423
   macro avg       0.83      0.83      0.82       423
weighted avg       0.83      0.83      0.82       423


Accuracy Score:
0.8321513002364066


In [18]:
# Example new data point
new_data = {
    'Age': 25,
    'Height': 1.75,
    'Weight': 80,
    'family_history_with_overweight': 'yes',
    'FAVC': 'yes',
    'FCVC': 3,
    'NCP': 3,
    'CAEC': 'Sometimes',
    'SMOKE': 'no',
    'CH2O': 2,
    'SCC': 'no',
    'FAF': 1,
    'TUE': 0,
    'CALC': 'Sometimes',
    'MTRANS': 'Public_Transportation'
}

# Convert the dictionary to a DataFrame
new_df = pd.DataFrame([new_data])

# Display the new data
print(new_df)

   Age  Height  Weight family_history_with_overweight FAVC  FCVC  NCP  \
0   25    1.75      80                            yes  yes     3    3   

        CAEC SMOKE  CH2O SCC  FAF  TUE       CALC                 MTRANS  
0  Sometimes    no     2  no    1    0  Sometimes  Public_Transportation  


In [19]:
# Encode categorical variables (same as during training)
categorical_features = ['family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']
new_df = pd.get_dummies(new_df, columns=categorical_features, drop_first=True)

# Ensure the new data has the same columns as the training data
# Add missing columns (if any) and set their values to 0
missing_cols = set(X_train.columns) - set(new_df.columns)
for col in missing_cols:
    new_df[col] = 0

# Reorder columns to match the training data
new_df = new_df[X_train.columns]

# Scale numerical features (same as during training)
numerical_features = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
new_df[numerical_features] = scaler.transform(new_df[numerical_features])

# Display the preprocessed new data
print(new_df)

        Age    Height   Weight      FCVC       NCP      CH2O       FAF  \
0  0.107628  0.518284 -0.25152  1.088307  0.404102 -0.013141 -0.012127   

        TUE  Gender_Male  family_history_with_overweight_yes  ...  CAEC_no  \
0 -1.080619            0                                   0  ...        0   

   SMOKE_yes  SCC_yes  CALC_Frequently  CALC_Sometimes  CALC_no  MTRANS_Bike  \
0          0        0                0               0        0            0   

   MTRANS_Motorbike  MTRANS_Public_Transportation  MTRANS_Walking  
0                 0                             0               0  

[1 rows x 23 columns]


In [20]:
# Make a prediction
prediction = knn.predict(new_df)

# Decode the prediction (if you encoded the target variable)
predicted_obesity_level = label_encoder.inverse_transform(prediction)

# Display the prediction
print(f"Predicted Obesity Level: {predicted_obesity_level[0]}")

Predicted Obesity Level: 5


In [23]:
# Make a prediction
prediction = knn.predict(new_df)

# Decode the prediction (if you encoded the target variable)
predicted_obesity_level = label_encoder.inverse_transform(prediction)

# Display the prediction
print(f"Predicted Obesity Level: {predicted_obesity_level[0]}")

# Re-fit the LabelEncoder to the original NObeyesdad column
label_encoder.fit(df['NObeyesdad'])

# Check the mapping of encoded values
print("Encoded Classes:", label_encoder.classes_)

Predicted Obesity Level: 5
Encoded Classes: [0 1 2 3 4 5 6]


In [22]:
# Decode the prediction
predicted_obesity_level = label_encoder.inverse_transform(prediction)

# Display the prediction
print(f"Predicted Obesity Level: {predicted_obesity_level[0]}")

Predicted Obesity Level: 5
