In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import confusion_matrix , accuracy_score , classification_report
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.preprocessing import LabelEncoder , StandardScaler

import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv(r"/kaggle/input/playground-series-s4e2/train.csv")
train.head()

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


In [3]:
train.tail()

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
20753,20753,Male,25.137087,1.766626,114.187096,yes,yes,2.919584,3.0,Sometimes,no,2.151809,no,1.330519,0.19668,Sometimes,Public_Transportation,Obesity_Type_II
20754,20754,Male,18.0,1.71,50.0,no,yes,3.0,4.0,Frequently,no,1.0,no,2.0,1.0,Sometimes,Public_Transportation,Insufficient_Weight
20755,20755,Male,20.101026,1.819557,105.580491,yes,yes,2.407817,3.0,Sometimes,no,2.0,no,1.15804,1.198439,no,Public_Transportation,Obesity_Type_II
20756,20756,Male,33.852953,1.7,83.520113,yes,yes,2.671238,1.971472,Sometimes,no,2.144838,no,0.0,0.973834,no,Automobile,Overweight_Level_II
20757,20757,Male,26.680376,1.816547,118.134898,yes,yes,3.0,3.0,Sometimes,no,2.003563,no,0.684487,0.713823,Sometimes,Public_Transportation,Obesity_Type_II


In [4]:
train.shape

(20758, 18)

In [5]:
train.columns

Index(['id', 'Gender', 'Age', 'Height', 'Weight',
       'family_history_with_overweight', 'FAVC', 'FCVC', 'NCP', 'CAEC',
       'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS', 'NObeyesdad'],
      dtype='object')

In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20758 entries, 0 to 20757
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              20758 non-null  int64  
 1   Gender                          20758 non-null  object 
 2   Age                             20758 non-null  float64
 3   Height                          20758 non-null  float64
 4   Weight                          20758 non-null  float64
 5   family_history_with_overweight  20758 non-null  object 
 6   FAVC                            20758 non-null  object 
 7   FCVC                            20758 non-null  float64
 8   NCP                             20758 non-null  float64
 9   CAEC                            20758 non-null  object 
 10  SMOKE                           20758 non-null  object 
 11  CH2O                            20758 non-null  float64
 12  SCC                             

In [7]:
train.isnull().sum()

id                                0
Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
NObeyesdad                        0
dtype: int64

In [8]:
train.describe()

Unnamed: 0,id,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
count,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0
mean,10378.5,23.841804,1.700245,87.887768,2.445908,2.761332,2.029418,0.981747,0.616756
std,5992.46278,5.688072,0.087312,26.379443,0.533218,0.705375,0.608467,0.838302,0.602113
min,0.0,14.0,1.45,39.0,1.0,1.0,1.0,0.0,0.0
25%,5189.25,20.0,1.631856,66.0,2.0,3.0,1.792022,0.008013,0.0
50%,10378.5,22.815416,1.7,84.064875,2.393837,3.0,2.0,1.0,0.573887
75%,15567.75,26.0,1.762887,111.600553,3.0,3.0,2.549617,1.587406,1.0
max,20757.0,61.0,1.975663,165.057269,3.0,4.0,3.0,3.0,2.0


In [9]:
train.duplicated().sum()

0

In [10]:
la = LabelEncoder()

In [11]:
obj = train.select_dtypes(exclude = ['int64' , 'float64'])

In [12]:
obj.columns

Index(['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE',
       'SCC', 'CALC', 'MTRANS', 'NObeyesdad'],
      dtype='object')

In [13]:
target_col = 'NObeyesdad'

feature_categorical_cols = [col for col in obj if col != target_col]

label_encoders = {}

for col in feature_categorical_cols:
    la = LabelEncoder()
    train[col] = la.fit_transform(train[col].astype(str))
    label_encoders[col] = la 

la_target = LabelEncoder()
train[target_col] = la_target.fit_transform(train[target_col].astype(str))
label_encoders[target_col] = la_target

In [14]:
train.head()

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,1,24.443011,1.699998,81.66995,1,1,2.0,2.983297,2,0,2.763573,0,0.0,0.976473,1,3,6
1,1,0,18.0,1.56,57.0,1,1,2.0,3.0,1,0,2.0,0,1.0,1.0,2,0,1
2,2,0,18.0,1.71146,50.165754,1,1,1.880534,1.411685,2,0,1.910378,0,0.866045,1.673584,2,3,0
3,3,0,20.952737,1.71073,131.274851,1,1,3.0,3.0,2,0,1.674061,0,1.467863,0.780199,1,3,4
4,4,1,31.641081,1.914186,93.798055,1,1,2.679664,1.971472,2,0,1.979848,0,1.967973,0.931721,1,3,6


In [15]:
test = pd.read_csv(r"/kaggle/input/playground-series-s4e2/test.csv")
test.head()

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
0,20758,Male,26.899886,1.848294,120.644178,yes,yes,2.938616,3.0,Sometimes,no,2.825629,no,0.8554,0.0,Sometimes,Public_Transportation
1,20759,Female,21.0,1.6,66.0,yes,yes,2.0,1.0,Sometimes,no,3.0,no,1.0,0.0,Sometimes,Public_Transportation
2,20760,Female,26.0,1.643355,111.600553,yes,yes,3.0,3.0,Sometimes,no,2.621877,no,0.0,0.250502,Sometimes,Public_Transportation
3,20761,Male,20.979254,1.553127,103.669116,yes,yes,2.0,2.977909,Sometimes,no,2.786417,no,0.094851,0.0,Sometimes,Public_Transportation
4,20762,Female,26.0,1.627396,104.835346,yes,yes,3.0,3.0,Sometimes,no,2.653531,no,0.0,0.741069,Sometimes,Public_Transportation


In [16]:
obj_test = test.select_dtypes(include = 'object')

In [17]:
for col in obj_test.columns:
    test[col] = la.fit_transform(test[col])


In [18]:
scaler = StandardScaler()

In [19]:
scal1 = train[['Age']]
scal2 = train[['Weight']]

train['Age'] = scaler.fit_transform(scal1)
train['Weight'] = scaler.fit_transform(scal2)

In [20]:
scal1 = test[['Age']]
scal2 = test[['Weight']]

test['Age'] = scaler.fit_transform(scal1)
test['Weight'] = scaler.fit_transform(scal2)

In [21]:
test.head()

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
0,20758,1,0.508163,1.848294,1.273786,1,1,2.938616,3.0,2,0,2.825629,0,0.8554,0.0,2,3
1,20759,0,-0.509128,1.6,-0.818988,1,1,2.0,1.0,2,0,3.0,0,1.0,0.0,2,3
2,20760,0,0.353,1.643355,0.927432,1,1,3.0,3.0,2,0,2.621877,0,0.0,0.250502,2,3
3,20761,1,-0.512705,1.553127,0.623672,1,1,2.0,2.977909,2,0,2.786417,0,0.094851,0.0,2,3
4,20762,0,0.353,1.627396,0.668336,1,1,3.0,3.0,2,0,2.653531,0,0.0,0.741069,2,3


In [22]:
X = train.drop(['id','NObeyesdad'] , axis = 1)
y = train['NObeyesdad']

In [23]:
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size = .2 , random_state = 42)

In [24]:
model1 = LogisticRegression()
model2 = SVC()
model3 = RandomForestClassifier()
model4 = GradientBoostingClassifier(learning_rate = 0.1, max_depth = 3, n_estimators = 200)
model5 = GaussianNB()
model6 = DecisionTreeClassifier()

In [25]:
def pred(model):
    model.fit(X_train , y_train)
    pre = model.predict(X_test)
    print(classification_report(y_test , pre))

In [26]:
pred(model1)

              precision    recall  f1-score   support

           0       0.78      0.86      0.82       524
           1       0.70      0.67      0.69       626
           2       0.77      0.79      0.78       543
           3       0.95      0.97      0.96       657
           4       0.99      1.00      1.00       804
           5       0.63      0.59      0.61       484
           6       0.64      0.61      0.62       514

    accuracy                           0.80      4152
   macro avg       0.78      0.78      0.78      4152
weighted avg       0.80      0.80      0.80      4152



In [27]:
pred(model2)

              precision    recall  f1-score   support

           0       0.87      0.94      0.90       524
           1       0.81      0.77      0.79       626
           2       0.82      0.84      0.83       543
           3       0.97      0.97      0.97       657
           4       1.00      1.00      1.00       804
           5       0.69      0.66      0.67       484
           6       0.70      0.69      0.69       514

    accuracy                           0.85      4152
   macro avg       0.83      0.84      0.84      4152
weighted avg       0.85      0.85      0.85      4152



In [28]:
pred(model3)

              precision    recall  f1-score   support

           0       0.95      0.92      0.93       524
           1       0.84      0.88      0.86       626
           2       0.89      0.87      0.88       543
           3       0.97      0.98      0.97       657
           4       1.00      1.00      1.00       804
           5       0.76      0.75      0.75       484
           6       0.79      0.78      0.78       514

    accuracy                           0.89      4152
   macro avg       0.88      0.88      0.88      4152
weighted avg       0.89      0.89      0.89      4152



In [29]:
pred(model4)

              precision    recall  f1-score   support

           0       0.95      0.94      0.94       524
           1       0.88      0.89      0.89       626
           2       0.88      0.88      0.88       543
           3       0.98      0.98      0.98       657
           4       1.00      1.00      1.00       804
           5       0.78      0.79      0.78       484
           6       0.80      0.79      0.80       514

    accuracy                           0.91      4152
   macro avg       0.89      0.89      0.89      4152
weighted avg       0.91      0.91      0.91      4152



In [30]:
pred(model5)

              precision    recall  f1-score   support

           0       0.70      0.85      0.77       524
           1       0.65      0.47      0.54       626
           2       0.37      0.61      0.46       543
           3       0.71      0.93      0.81       657
           4       0.96      1.00      0.98       804
           5       0.61      0.30      0.40       484
           6       0.53      0.24      0.33       514

    accuracy                           0.66      4152
   macro avg       0.65      0.63      0.61      4152
weighted avg       0.67      0.66      0.64      4152



In [31]:
pred(model6)

              precision    recall  f1-score   support

           0       0.89      0.89      0.89       524
           1       0.79      0.76      0.78       626
           2       0.84      0.83      0.84       543
           3       0.95      0.96      0.96       657
           4       1.00      1.00      1.00       804
           5       0.65      0.67      0.66       484
           6       0.70      0.71      0.71       514

    accuracy                           0.85      4152
   macro avg       0.83      0.83      0.83      4152
weighted avg       0.85      0.85      0.85      4152



In [32]:
testx = test.drop('id' , axis = 1)

In [33]:
prex = model4.predict(testx)

In [34]:
final_predicted_labels = label_encoders[target_col].inverse_transform(prex)

submission = pd.DataFrame({'id': test['id'], "NObeyesdad": final_predicted_labels})

In [35]:
submission

Unnamed: 0,id,NObeyesdad
0,20758,Obesity_Type_II
1,20759,Overweight_Level_I
2,20760,Obesity_Type_III
3,20761,Obesity_Type_I
4,20762,Obesity_Type_III
...,...,...
13835,34593,Overweight_Level_II
13836,34594,Normal_Weight
13837,34595,Insufficient_Weight
13838,34596,Normal_Weight
