In [56]:
#import packages

import numpy as np
import pandas as pd
import os
import pickle


from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report


In [38]:
#import dataset

df=pd.read_csv(r"C:\Users\annap\Desktop\ML_Projects_Deploy\3-Obesity_KNN_deploy\Obesity_DS.csv")
df.head()

Unnamed: 0,Gender,Age,Height,Height_cm,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,162.0,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,152.0,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,180.0,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,180.0,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,178.0,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [39]:
#Drop duplicates and unwanted column

df=df.drop_duplicates()

In [40]:
df=df.drop('Height',axis=1)

In [41]:
df['Age'] = df['Age'].astype(int)

In [42]:
df.describe()

Unnamed: 0,Age,Height_cm,Weight,FCVC,NCP,CH2O,FAF,TUE
count,2087.0,2087.0,2087.0,2087.0,2087.0,2087.0,2087.0,2087.0
mean,24.009104,170.267412,86.85873,2.421466,2.701179,2.004749,1.012812,0.663035
std,6.333297,9.318594,26.190847,0.534737,0.764614,0.608284,0.853475,0.608153
min,14.0,145.0,39.0,1.0,1.0,1.0,0.0,0.0
25%,19.0,163.01785,66.0,2.0,2.697467,1.590922,0.124505,0.0
50%,22.0,170.1584,83.1011,2.396265,3.0,2.0,1.0,0.630866
75%,26.0,176.94915,108.015907,3.0,3.0,2.466193,1.678102,1.0
max,61.0,198.0,173.0,3.0,4.0,3.0,3.0,2.0


In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2087 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2087 non-null   object 
 1   Age                             2087 non-null   int64  
 2   Height_cm                       2087 non-null   float64
 3   Weight                          2087 non-null   float64
 4   family_history_with_overweight  2087 non-null   object 
 5   FAVC                            2087 non-null   object 
 6   FCVC                            2087 non-null   float64
 7   NCP                             2087 non-null   float64
 8   CAEC                            2087 non-null   object 
 9   SMOKE                           2087 non-null   object 
 10  CH2O                            2087 non-null   float64
 11  SCC                             2087 non-null   object 
 12  FAF                             2087 no

In [44]:
df.isna().sum()

Gender                            0
Age                               0
Height_cm                         0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
NObeyesdad                        0
dtype: int64

In [45]:
df.duplicated().sum()

np.int64(0)

In [46]:
#define numeric columns and category columns

numeric_col=['Age','Height_cm','Weight','FCVC','NCP','CH2O','FAF','TUE']
categorical_col=['Gender','family_history_with_overweight','FAVC','CAEC','SMOKE','SCC','CALC','MTRANS']


In [47]:
# create directory to store encoders and pickles
os.makedirs("model_assets",exist_ok=True)

In [48]:
#scale numeric columns

scale=StandardScaler()
df[numeric_col]=scale.fit_transform(df[numeric_col])

pickle.dump(scale,open('model_assets/scaler.pkl','wb'))


In [49]:
#encode categorical columns

lebel_Encoder={}

for col in categorical_col:
    le=LabelEncoder()
    df[col]=le.fit_transform(df[col])
    lebel_Encoder[col]=le

    pickle.dump(le,open(f'model_assets/{col}_lebelEncoder.pkl','wb'))



In [50]:
#encode target column

target_encoder=LabelEncoder()

df['NObeyesdad']=target_encoder.fit_transform(df['NObeyesdad'])

pickle.dump(target_encoder,open('model_assets/targetEncoder.pkl','wb'))

In [51]:
#split features and target

x=df.drop(columns='NObeyesdad')
y=df['NObeyesdad']
x.shape,y.shape

((2087, 16), (2087,))

In [52]:
# train test split

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=21)
display(x_train.shape,y_train.shape,x_test.shape,y_test.shape)

(1669, 16)

(1669,)

(418, 16)

(418,)

In [53]:
#grid search for better model

param_grid={
    'n_neighbors':list(range(1,50)),
    'weights':['uniform','distance'],
    'metric':['euclidean','manhattan']
}

knn=KNeighborsClassifier()
grid_search=GridSearchCV(knn,param_grid,cv=5,scoring='accuracy',verbose=1,n_jobs=-1)
grid_search.fit(x_train,y_train)

Fitting 5 folds for each of 196 candidates, totalling 980 fits


In [54]:
#best model

best_knn=grid_search.best_estimator_


In [59]:
#evaluate:

y_pred=best_knn.predict(x_test)
print(y_pred)

[6 4 5 2 5 2 5 1 5 1 1 0 3 0 6 2 3 4 5 6 0 1 1 0 6 2 4 0 0 3 0 3 3 1 1 4 2
 5 5 1 6 0 0 3 3 4 4 4 2 0 6 6 1 6 2 0 5 6 4 0 1 4 6 1 2 1 0 6 0 6 6 1 2 1
 4 6 3 3 5 6 5 4 5 5 0 4 4 1 6 1 5 3 6 0 6 2 3 1 1 6 6 4 4 2 1 4 6 0 3 0 6
 2 3 2 3 1 5 6 6 6 4 3 3 5 3 0 2 6 0 3 4 1 4 5 1 6 2 2 4 1 6 0 5 5 0 0 0 4
 5 4 2 0 5 4 3 4 6 6 4 5 4 4 6 0 3 2 0 0 2 0 1 0 3 2 6 5 3 6 2 4 0 2 4 4 0
 1 6 1 6 3 4 6 5 5 1 6 4 1 5 6 3 2 2 4 1 3 2 5 0 5 0 5 4 6 2 1 2 5 1 4 1 5
 4 3 6 2 4 4 3 1 6 6 5 2 2 5 6 6 4 5 6 0 0 0 2 4 0 5 4 3 4 2 3 0 5 0 2 6 2
 3 5 0 2 4 6 6 6 6 2 0 4 6 4 5 4 3 3 0 4 3 1 4 4 3 6 4 4 3 6 5 0 1 0 4 3 3
 3 6 0 2 6 2 3 3 3 4 5 6 1 2 3 6 4 0 3 0 6 2 0 6 1 2 6 2 2 2 4 0 2 2 3 5 3
 4 1 2 5 5 0 3 0 6 6 1 6 1 0 5 2 1 4 2 6 4 3 4 4 5 5 6 0 4 4 4 2 1 4 0 2 2
 3 5 4 5 2 5 6 5 4 3 3 3 4 4 0 5 4 2 5 0 2 4 4 4 2 6 2 4 6 5 1 4 6 6 3 3 0
 1 4 2 0 0 1 5 3 6 0 3]


In [61]:
con_mat=metrics.confusion_matrix(y_test,y_pred)
print("Confusion Matrix:\n",con_mat)

Confusion Matrix:
 [[54  1  0  0  0  0  0]
 [ 7 35  4  0  0  2  6]
 [ 0  1 50  1  0  0  2]
 [ 0  0  0 54  0  1  0]
 [ 0  0  0  0 75  0  0]
 [ 0  7  3  0  0 49  3]
 [ 0  1  1  0  0  1 60]]


In [63]:
accu_score=metrics.accuracy_score(y_test,y_pred)
print("Accuracy Score:\n",accu_score)
print("Accuracy score in percentage:\n",accu_score)

Accuracy Score:
 0.9019138755980861
Accuracy score in percentage:
 0.9019138755980861


In [64]:
report=classification_report(y_test,y_pred)
print("Classification Report:\n",report)

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.98      0.93        55
           1       0.78      0.65      0.71        54
           2       0.86      0.93      0.89        54
           3       0.98      0.98      0.98        55
           4       1.00      1.00      1.00        75
           5       0.92      0.79      0.85        62
           6       0.85      0.95      0.90        63

    accuracy                           0.90       418
   macro avg       0.90      0.90      0.89       418
weighted avg       0.90      0.90      0.90       418



In [67]:
pickle.dump(best_knn,open("model_assets/Obesity_KNN.pkl",'wb'))

In [70]:
Gender=input("Enter your Gender'Female'/ 'Male':")
Age=float(input("Enter your Age"))
Height_cm=float(input("Enter your height:"))
Weight=float(input("Enter your weight:"))
family_history_with_overweight=input("Enter your family_history_with_overweight 'yes'/ 'no':")
FAVC=input("Enter your Frequent consumption of high-calorie food 'no'/ 'yes':")
FCVC=float(input("Enter your Frequency of vegetable consumption:"))
NCP=float(input("Enter your Number of main meals:"))
CAEC=input("Enter your Eating between meals 'Sometimes', 'Frequently', 'Always', 'no':")
SMOKE=input("Do you have  SMOKING habit 'no', 'yes':")
CH2O=float(input("Enter your Daily water intake:"))
SCC=input("Enter your Calorie consumption monitoring 'no', 'yes':")
FAF=float(input("Enter your Physical activity frequency:"))
TUE=float(input("Enter your Technology use per day:"))
CALC=input("Enter your Alcohol consumption 'no', 'Sometimes', 'Frequently', 'Always':")
MTRANS=input("Enter your Transportation method 'Public_Transportation', 'Walking', 'Automobile', 'Motorbike','Bike':")

Gender = lebel_Encoder['Gender'].transform([Gender])[0]
family_history_with_overweight = lebel_Encoder['family_history_with_overweight'].transform([family_history_with_overweight])[0]
FAVC = lebel_Encoder['FAVC'].transform([FAVC])[0]
CAEC = lebel_Encoder['CAEC'].transform([CAEC])[0]
SMOKE = lebel_Encoder['SMOKE'].transform([SMOKE])[0]
SCC = lebel_Encoder['SCC'].transform([SCC])[0]
CALC = lebel_Encoder['CALC'].transform([CALC])[0]
MTRANS = lebel_Encoder['MTRANS'].transform([MTRANS])[0]

p=pickle.load(open('model_assets/Obesity_KNN.pkl','rb'))
p.predict([[Gender,Age,Height_cm,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS]])



array([3])