<a href="https://colab.research.google.com/github/Venura-Shiromal/Weight-Classifier/blob/main/CatBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing Modules

In [51]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

# Input

## Loading Training Data

In [55]:
df = pd.read_csv("data/train.csv")

In [67]:
df.head()

Unnamed: 0,Age_Years,High_Calorie_Food,Vegetable_Intake,Meal_Frequency,Water_Intake,Screen_Time_Hours,Family_Risk,Activity_Level_Score,Gender,Family_History,Snack_Frequency,Smoking_Habit,Alcohol_Consumption,Commute_Mode,Weight_Category,Leisure Time Activity,BMI
0,26.0,1,3.0,3.0,2.679137,0.479348,1.0,0.479348,Female,yes,1,no,1,3,6,Reading,41.756023
1,25.483381,0,2.0,1.0,1.0,0.0,1.0,0.740633,Female,yes,1,no,0,3,3,Reading,26.467461
2,26.0,1,3.0,3.0,2.57721,0.402075,1.0,0.402075,Female,yes,1,no,1,3,6,Reading,39.931666
3,21.715198,1,3.031308,3.322455,1.983531,-0.005858,0.987933,0.360441,Male,yes,1,no,0,3,1,Reading,38.333337
4,17.511767,1,1.964873,1.052297,2.025586,0.981999,1.004136,1.162488,Female,yes,1,no,0,3,1,Gaming,41.558981


# Preprocessing

### Combined Fields

In [57]:
df["BMI"] = df["Weight_Kg"]/(df["Height_cm"]/100)**2

### Removing unwanted fields

In [58]:
drop_cols = [
    "PersonID", 
    "Weight_Kg", 
    #"Commute_Mode", 
    "Height_cm" 
    #"Water_Intake", 
    #"Activity_Level_Score", 
    #"Family_History"
]

In [59]:
df = df.drop(drop_cols, axis=1)

### Removing missing cells

In [60]:
df.isna().sum()[df.isna().sum() > 0]

Gender                       30
Alcohol_Consumption          37
Physical_Activity_Level    1498
dtype: int64

In [61]:
df = df.drop(["Physical_Activity_Level"], axis=1)

In [62]:
missing_cols = ["Gender", "Alcohol_Consumption"]
df = df.dropna(subset=missing_cols)

### Mapping

In [63]:
map_X1 = {
    "Yes" : 1,
    "yes" : 1,
    "yess" : 1,
    "No" : 0,
    "no" : 0
}

map_X2 = {
    "Never" : 0,
    "Occasionally" : 1,
    "Often" : 2,
    "Always" : 3
}

map_X3 = {
    "no" : 0,
    "Sometimes" : 1,
    "Frequently" : 2,
    "Always" : 3
}

map_X4 = {
    "Bike": 1,
    "Walking": 2,
    "Public_Transportation": 3,
    "Motorbike": 4,
    "Automobile": 5
}

map_Y = {
    'Insufficient_Weight': 0,
    'Normal_Weight': 1,
    'Overweight_Level_I': 2,
    'Overweight_Level_II': 3,
    'Obesity_Type_I': 4,
    'Obesity_Type_II': 5,
    'Obesity_Type_III': 6
}

cat_cols = [
    "Gender", 
    "Family_History", 
    "Smoking_Habit", 
    "Leisure Time Activity"
]

In [64]:
df["High_Calorie_Food"] = df["High_Calorie_Food"].map(map_X1)
df["Snack_Frequency"] = df["Snack_Frequency"].map(map_X2)
df["Alcohol_Consumption"] = df["Alcohol_Consumption"].map(map_X3)
df["Commute_Mode"] = df["Commute_Mode"].map(map_X4)
df["Weight_Category"] = df["Weight_Category"].map(map_Y)

## Defining X,Y

In [65]:
x = df.drop(["Weight_Category"], axis=1)
y = df["Weight_Category"]

## Data Splitting

In [66]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

## Class Weights

In [68]:
weights = compute_class_weight("balanced", classes=np.unique(y_train), y=y_train)
class_weights = dict(zip(np.unique(y_train), weights))

# Training

In [69]:
model = CatBoostClassifier(
    learning_rate=0.08, 
    iterations=500, 
    depth=6, 
    verbose=50, 
    early_stopping_rounds=50,
    l2_leaf_reg=3,
    class_weights=class_weights
)

In [70]:
model.fit(x_train, y_train,
          cat_features=cat_cols,
          eval_set=(x_test, y_test)
)

0:	learn: 1.8412631	test: 1.8361149	best: 1.8361149 (0)	total: 78.4ms	remaining: 39.1s
50:	learn: 0.7782623	test: 0.8325887	best: 0.8325887 (50)	total: 3.89s	remaining: 34.2s
100:	learn: 0.6370903	test: 0.7710448	best: 0.7710448 (100)	total: 10.5s	remaining: 41.6s
150:	learn: 0.5302317	test: 0.7303753	best: 0.7303753 (150)	total: 17.2s	remaining: 39.8s
200:	learn: 0.4544356	test: 0.7090339	best: 0.7090339 (200)	total: 23.7s	remaining: 35.3s
250:	learn: 0.3973945	test: 0.7011439	best: 0.6996192 (237)	total: 30.2s	remaining: 29.9s
300:	learn: 0.3554107	test: 0.6998798	best: 0.6991720 (288)	total: 36.6s	remaining: 24.2s
350:	learn: 0.3159902	test: 0.6964421	best: 0.6956176 (343)	total: 43.2s	remaining: 18.3s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.6956175886
bestIteration = 343

Shrink model to first 344 iterations.


<catboost.core.CatBoostClassifier at 0x1a8ca56a350>

In [71]:
y_pred = model.predict(x_test)

# Evaluation

In [72]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[36  2  1  1  1  1  3]
 [ 2 38  1  4  3  3  0]
 [ 0  2 45  3  1  2  1]
 [ 0  4  3 33  3  7  4]
 [ 1  2  1  2 54  3  2]
 [ 0  1  4  2  2 44  2]
 [ 1  2  4  1  0  3 51]]


In [74]:
acc = accuracy_score(y_test, y_pred)
print(f"{acc:.2%}")

77.98%
