In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as  plt
import seaborn as sns

from sklearn.model_selection import train_test_split,cross_validate
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.svm import SVC
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [3]:
df =pd.read_csv('/content/ObesityDataSet_raw_and_data_sinthetic.csv')
df.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   object 
 1   Age                             2111 non-null   float64
 2   Height                          2111 non-null   float64
 3   Weight                          2111 non-null   float64
 4   family_history_with_overweight  2111 non-null   object 
 5   FAVC                            2111 non-null   object 
 6   FCVC                            2111 non-null   float64
 7   NCP                             2111 non-null   float64
 8   CAEC                            2111 non-null   object 
 9   SMOKE                           2111 non-null   object 
 10  CH2O                            2111 non-null   float64
 11  SCC                             2111 non-null   object 
 12  FAF                             21

In [9]:
df.describe()

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
count,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0
mean,24.3126,1.701677,86.586058,2.419043,2.685628,2.008011,1.010298,0.657866
std,6.345968,0.093305,26.191172,0.533927,0.778039,0.612953,0.850592,0.608927
min,14.0,1.45,39.0,1.0,1.0,1.0,0.0,0.0
25%,19.947192,1.63,65.473343,2.0,2.658738,1.584812,0.124505,0.0
50%,22.77789,1.700499,83.0,2.385502,3.0,2.0,1.0,0.62535
75%,26.0,1.768464,107.430682,3.0,3.0,2.47742,1.666678,1.0
max,61.0,1.98,173.0,3.0,4.0,3.0,3.0,2.0


In [5]:
df = df.rename(columns = {'family_history_with_overweight': 'FHWO',
                           'NObeyesdad' : 'BMI'})

In [6]:
#Lets see all the object related datatype
object_cols = [col for col in df.columns if df[col].dtype == "object"]

good_label_cols = [col for col in object_cols if set(df[col]).issubset(set(df[col]))]
good_label_cols.remove('BMI')

print('Categorical columns that will be ordinal encoded:', good_label_cols)


Categorical columns that will be ordinal encoded: ['Gender', 'FHWO', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']


In [10]:
data_transform = df.copy()
ordinal_encoder = OrdinalEncoder()
data_transform[good_label_cols] = ordinal_encoder.fit_transform(data_transform[good_label_cols])


In [11]:
data_transform.head()

Unnamed: 0,Gender,Age,Height,Weight,FHWO,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,BMI
0,0.0,21.0,1.62,64.0,1.0,0.0,2.0,3.0,2.0,0.0,2.0,0.0,0.0,1.0,3.0,3.0,Normal_Weight
1,0.0,21.0,1.52,56.0,1.0,0.0,3.0,3.0,2.0,1.0,3.0,1.0,3.0,0.0,2.0,3.0,Normal_Weight
2,1.0,23.0,1.8,77.0,1.0,0.0,2.0,3.0,2.0,0.0,2.0,0.0,2.0,1.0,1.0,3.0,Normal_Weight
3,1.0,27.0,1.8,87.0,0.0,0.0,3.0,3.0,2.0,0.0,2.0,0.0,2.0,0.0,1.0,4.0,Overweight_Level_I
4,1.0,22.0,1.78,89.8,0.0,0.0,2.0,1.0,2.0,0.0,2.0,0.0,0.0,0.0,2.0,3.0,Overweight_Level_II


In [12]:
dic_to_replace = {"BMI": {"Insufficient_Weight": 0, 
                          "Normal_Weight": 1,
                         'Overweight_Level_I': 2,
                          'Overweight_Level_II': 3,
                          'Obesity_Type_I': 4,
                          'Obesity_Type_II': 5,
                          'Obesity_Type_III': 6,}
                         }
data_transform.replace(dic_to_replace, inplace=True)

In [13]:
data_input = df.copy()
data_input = data_input.drop(["Gender",'NCP','SMOKE','MTRANS'],axis=1)


In [15]:
#Remove the unnecessary columns
good_label_cols.remove('Gender')
good_label_cols.remove('SMOKE')
good_label_cols.remove('MTRANS')

In [16]:
data_grouped_by = data_input.groupby(['BMI'])
data_balanced = data_grouped_by.apply(lambda x: x.sample(data_grouped_by.size().min()).reset_index(drop=True))
data_bal = data_balanced.droplevel(['BMI'])
data_bal

Unnamed: 0,Age,Height,Weight,FHWO,FAVC,FCVC,CAEC,CH2O,SCC,FAF,TUE,CALC,BMI
0,22.991668,1.740295,54.166453,yes,yes,3.000000,Frequently,2.025279,no,2.000000,0.152985,no,Insufficient_Weight
1,20.345161,1.534385,41.965250,no,yes,2.888530,Frequently,1.000000,no,0.000000,0.196224,Sometimes,Insufficient_Weight
2,19.833682,1.699464,49.676046,no,yes,1.270448,Frequently,1.876915,no,2.000000,1.000000,Sometimes,Insufficient_Weight
3,16.270434,1.818268,47.124717,no,yes,3.000000,Sometimes,2.148146,no,2.458237,1.273333,Sometimes,Insufficient_Weight
4,18.000000,1.763465,50.279053,no,yes,1.567101,Sometimes,1.994139,no,0.107981,1.000000,Sometimes,Insufficient_Weight
...,...,...,...,...,...,...,...,...,...,...,...,...,...
267,47.706100,1.743935,84.729197,yes,yes,2.535315,Sometimes,1.146595,no,0.313810,0.000000,no,Overweight_Level_II
268,22.000000,1.650000,80.000000,yes,no,2.000000,Sometimes,2.000000,no,3.000000,2.000000,no,Overweight_Level_II
269,28.770852,1.532897,65.031879,yes,no,2.000000,Sometimes,1.000000,no,0.262171,0.000000,no,Overweight_Level_II
270,18.000000,1.770000,87.000000,yes,yes,3.000000,Sometimes,2.000000,no,1.000000,1.000000,Frequently,Overweight_Level_II


In [17]:
data_preprocess = pd.get_dummies(data_bal, columns = good_label_cols)
dic_to_replace = {"BMI": {"Insufficient_Weight": 0, 
                          "Normal_Weight": 1,
                         'Overweight_Level_I': 2,
                          'Overweight_Level_II': 3,
                          'Obesity_Type_I': 4,
                          'Obesity_Type_II': 5,
                          'Obesity_Type_III': 6,}
                         }
data_preprocess.replace(dic_to_replace, inplace=True)

In [18]:
target_name = 'BMI'
labels = data_preprocess[target_name]
features = data_preprocess.drop(target_name, axis=1)

In [19]:
from sklearn.preprocessing import MinMaxScaler
features = MinMaxScaler().fit_transform(features)

In [20]:
from sklearn.model_selection import train_test_split

train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, train_size=0.8, random_state=123
)

**RandomForest Classifier**

In [21]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()

rfc.fit(train_features,train_labels)

In [22]:
train_score = rfc.score(train_features, train_labels)
test_score = rfc.score(test_features, test_labels)

print(f"Classical Random Forest Classifier on the training dataset: {train_score:.2f}")
print(f"Classical Random Forest Classifier on the test dataset: {test_score:.2f}")


Classical Random Forest Classifier on the training dataset: 1.00
Classical Random Forest Classifier on the test dataset: 0.95


**Logistic Regression**

In [23]:
lr=LogisticRegression()
lr.fit(train_features,train_labels)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
train_score = lr.score(train_features, train_labels)
test_score = lr.score(test_features, test_labels)

print(f"Classical Logistic Regression on the training dataset: {train_score:.2f}")
print(f"Classical Logistic Regression on the test dataset: {test_score:.2f}")


Classical Logistic Regression on the training dataset: 0.71
Classical Logistic Regression on the test dataset: 0.71


**SVC(Support vector classifier)**

In [25]:
svc=SVC()
svc.fit(train_features,train_labels)

In [26]:
train_score = svc.score(train_features, train_labels)
test_score = svc.score(test_features, test_labels)

print(f"Classical Logistic Regression on the training dataset: {train_score:.2f}")
print(f"Classical Logistic Regression on the test dataset: {test_score:.2f}")


Classical Logistic Regression on the training dataset: 0.74
Classical Logistic Regression on the test dataset: 0.73


**KNN**

In [27]:
knn=KNeighborsClassifier()
knn.fit(train_features,train_labels)

In [28]:
train_score = knn.score(train_features, train_labels)
test_score = knn.score(test_features, test_labels)

print(f"Classical Logistic Regression on the training dataset: {train_score:.2f}")
print(f"Classical Logistic Regression on the test dataset: {test_score:.2f}")


Classical Logistic Regression on the training dataset: 0.82
Classical Logistic Regression on the test dataset: 0.76


**Decision tree**

In [29]:
dt=DecisionTreeClassifier()
dt.fit(train_features,train_labels)


In [30]:
train_score = dt.score(train_features, train_labels)
test_score = dt.score(test_features, test_labels)

print(f"Classical Logistic Regression on the training dataset: {train_score:.2f}")
print(f"Classical Logistic Regression on the test dataset: {test_score:.2f}")


Classical Logistic Regression on the training dataset: 1.00
Classical Logistic Regression on the test dataset: 0.93
