In [75]:
# importing libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [76]:
# Reading the dataset
df=pd.read_csv('ObesitydataSet_raw_and_data_sinthetic.csv')
df.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [77]:
#Renaming the columns
df.rename(columns={'family_history_with_overweight':'FHWO','NObeyesdad':'BMI'},inplace=True)

In [78]:
#Categorical columns to be encoded
cat_col=list(df.select_dtypes(include='object').columns)
cat_col.remove('BMI')
print(f'Columns to be encoded: {cat_col}')

Columns to be encoded: ['Gender', 'FHWO', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']


In [79]:
#Checking for the unique values in each categorical column
print_format='Unique values of {}:\n{} \n'
for count,i in enumerate(cat_col):
    val_count=df['{}'.format(i)].value_counts()
    print(count,print_format.format(i,val_count))

0 Unique values of Gender:
Male      1068
Female    1043
Name: Gender, dtype: int64 

1 Unique values of FHWO:
yes    1726
no      385
Name: FHWO, dtype: int64 

2 Unique values of FAVC:
yes    1866
no      245
Name: FAVC, dtype: int64 

3 Unique values of CAEC:
Sometimes     1765
Frequently     242
Always          53
no              51
Name: CAEC, dtype: int64 

4 Unique values of SMOKE:
no     2067
yes      44
Name: SMOKE, dtype: int64 

5 Unique values of SCC:
no     2015
yes      96
Name: SCC, dtype: int64 

6 Unique values of CALC:
Sometimes     1401
no             639
Frequently      70
Always           1
Name: CALC, dtype: int64 

7 Unique values of MTRANS:
Public_Transportation    1580
Automobile                457
Walking                    56
Motorbike                  11
Bike                        7
Name: MTRANS, dtype: int64 



In [80]:
# Copying the dataset
df_copy=df.copy()

In [81]:
import category_encoders as ce

In [82]:
# Converting target categorical column to numeric
dic_to_replace = {"BMI": {"Insufficient_Weight": 0, 
                          "Normal_Weight": 1,
                         'Overweight_Level_I': 2,
                          'Overweight_Level_II': 3,
                          'Obesity_Type_I': 4,
                          'Obesity_Type_II': 5,
                          'Obesity_Type_III': 6,}
                         }
df_copy.replace(dic_to_replace, inplace=True)

In [83]:
df_copy['BMI']

0       1
1       1
2       1
3       2
4       3
       ..
2106    6
2107    6
2108    6
2109    6
2110    6
Name: BMI, Length: 2111, dtype: int64

In [84]:
#Applying target encoding
tar_enc=ce.TargetEncoder(cols=cat_col)
df_copy[cat_col]=tar_enc.fit_transform(df_copy[cat_col],df_copy['BMI'])

In [85]:
# Splitting data into X and y
X=df_copy.drop('BMI',axis=1)
y=df_copy.BMI

In [86]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier,GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [87]:
#Splitting data training and test data
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.33)

In [88]:
# r_forest=LogisticRegression(max_iter=60000)
# r_forest.fit(X_train,y_train)
# pred_train=r_forest.predict(X_train)
# print(accuracy_score(y_train,pred_train))
# y_pred=r_forest.predict(X_test)
# print(accuracy_score(y_test,y_pred))

0.8168316831683168
0.8292682926829268


In [89]:
models={'Bagging Classifier':BaggingClassifier(),
        'Logistic Regression':LogisticRegression(max_iter=20000),
        'Random Forest':RandomForestClassifier(),
        'Gradient Boosting Classifier':GradientBoostingClassifier()}

In [90]:
def model_training(X_train,y_train,X_test,y_test,model):
    model=model.fit(X_train,y_train)
    y_pred=model.predict(X_test)
    score=accuracy_score(y_test,y_pred)
    return score

In [91]:
for i in models:
    score = model_training(X_train,y_train,X_test,y_test,models[i])
    print(f'Accuracy score for {i} ==> {score*100}')

Accuracy score for Bagging Classifier ==> 95.55236728837878
Accuracy score for Logistic Regression ==> 82.92682926829268
Accuracy score for Random Forest ==> 95.55236728837878
Accuracy score for Gradient Boosting Classifier ==> 95.1219512195122
