<a href="https://colab.research.google.com/github/Venura-Shiromal/Weight-Classifier/blob/main/CatBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing Modules

In [1]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

# Input

## Loading Training Data

In [2]:
df = pd.read_csv("data/train.csv")

In [3]:
df.head()

Unnamed: 0,PersonID,Age_Years,Weight_Kg,High_Calorie_Food,Vegetable_Intake,Meal_Frequency,Water_Intake,Screen_Time_Hours,Family_Risk,Activity_Level_Score,Gender,Family_History,Snack_Frequency,Smoking_Habit,Alcohol_Consumption,Commute_Mode,Weight_Category,Height_cm,Physical_Activity_Level,Leisure Time Activity
0,P1810,26.0,109.959714,yes,3.0,3.0,2.679137,0.479348,1.0,0.479348,Female,yes,Occasionally,no,Sometimes,Public_Transportation,Obesity_Type_III,162.2771,,Reading
1,P1021,25.483381,64.848627,no,2.0,1.0,1.0,0.0,1.0,0.740633,Female,yes,Occasionally,no,no,Public_Transportation,Overweight_Level_II,156.5288,,Reading
2,P2036,26.0,104.947703,yes,3.0,3.0,2.57721,0.402075,1.0,0.402075,Female,yes,Occasionally,no,Sometimes,Public_Transportation,Obesity_Type_III,162.1167,,Reading
3,P2201,21.715198,107.868047,Yes,3.031308,3.322455,1.983531,-0.005858,0.987933,0.360441,Male,yes,Occasionally,no,no,Public_Transportation,Normal_Weight,167.748287,,Reading
4,P2649,17.511767,121.460361,yes,1.964873,1.052297,2.025586,0.981999,1.004136,1.162488,Female,yes,Occasionally,no,no,Public_Transportation,Normal_Weight,170.956194,,Gaming


# Preprocessing

### Combined Fields

In [4]:
df["BMI"] = df["Weight_Kg"]/(df["Height_cm"]/100)**2

### Removing unwanted fields

In [5]:
drop_cols = [
    "PersonID", 
    "Weight_Kg",
    "Height_cm"
]

In [6]:
df = df.drop(drop_cols, axis=1)

### Removing missing cells

In [7]:
df.isna().sum()[df.isna().sum() > 0]

Gender                       30
Alcohol_Consumption          37
Physical_Activity_Level    1498
dtype: int64

In [8]:
df = df.drop(["Physical_Activity_Level"], axis=1)

In [9]:
missing_cols = ["Gender", "Alcohol_Consumption"]
df = df.dropna(subset=missing_cols)

### Mapping

In [10]:
map_YN = {
    "Yes" : 1,
    "yes" : 1,
    "yess" : 1,
    "No" : 0,
    "no" : 0
}

map_Gender = {
    "Male": 1,
    "Female": 0
}

map_Frq = {
    "no" : 0,
    "Never" : 0,
    "Sometimes" : 1,
    "Occasionally" : 1,
    "Frequently" : 2,
    "Often" : 2,
    "Always" : 3
}

map_Cat = {
    'Insufficient_Weight': 0,
    'Normal_Weight': 1,
    'Overweight_Level_I': 2,
    'Overweight_Level_II': 3,
    'Obesity_Type_I': 4,
    'Obesity_Type_II': 5,
    'Obesity_Type_III': 6
}

In [11]:
df["Gender"] = df["Gender"].map(map_Gender)
df["High_Calorie_Food"] = df["High_Calorie_Food"].map(map_YN)
df["Family_History"] = df["Family_History"].map(map_YN)
df["Smoking_Habit"] = df["Smoking_Habit"].map(map_YN)
df["Snack_Frequency"] = df["Snack_Frequency"].map(map_Frq)
df["Alcohol_Consumption"] = df["Alcohol_Consumption"].map(map_Frq)
df["Weight_Category"] = df["Weight_Category"].map(map_Cat)

### One Hot Encoding

In [12]:
df = pd.get_dummies(df, columns=['Commute_Mode','Leisure Time Activity'])

## Defining X,Y

In [13]:
x = df.drop(["Weight_Category"], axis=1)
y = df["Weight_Category"]

## Data Splitting

In [14]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

## Class Weights

In [15]:
weights = compute_class_weight("balanced", classes=np.unique(y_train), y=y_train)
class_weights = dict(zip(np.unique(y_train), weights))

# Training

In [91]:
model = CatBoostClassifier(
    learning_rate=0.08, 
    iterations=300, 
    depth=8, 
    verbose=20, 
    early_stopping_rounds=50,
    l2_leaf_reg=5,
    class_weights=class_weights
)

In [92]:
model.fit(x_train, y_train,
          eval_set=(x_test, y_test)
)

0:	learn: 1.8677045	test: 1.8683779	best: 1.8683779 (0)	total: 23.3ms	remaining: 6.98s
20:	learn: 0.9476510	test: 0.9523006	best: 0.9523006 (20)	total: 417ms	remaining: 5.53s
40:	learn: 0.7536300	test: 0.8148941	best: 0.8148941 (40)	total: 803ms	remaining: 5.07s
60:	learn: 0.6598899	test: 0.7678718	best: 0.7678718 (60)	total: 1.19s	remaining: 4.67s
80:	learn: 0.5866515	test: 0.7376108	best: 0.7376108 (80)	total: 1.58s	remaining: 4.27s
100:	learn: 0.5287718	test: 0.7179816	best: 0.7179816 (100)	total: 1.96s	remaining: 3.86s
120:	learn: 0.4840449	test: 0.7076320	best: 0.7075678 (118)	total: 2.36s	remaining: 3.49s
140:	learn: 0.4443821	test: 0.6999949	best: 0.6999949 (140)	total: 2.75s	remaining: 3.1s
160:	learn: 0.4085500	test: 0.6939061	best: 0.6939061 (160)	total: 3.13s	remaining: 2.7s
180:	learn: 0.3802244	test: 0.6903962	best: 0.6903962 (180)	total: 3.55s	remaining: 2.33s
200:	learn: 0.3580723	test: 0.6893077	best: 0.6893077 (200)	total: 3.94s	remaining: 1.94s
220:	learn: 0.3346216	t

<catboost.core.CatBoostClassifier at 0x22def935a90>

In [96]:
y_pred = model.predict(x_test)

# Evaluation

In [97]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[37  1  3  0  1  2  1]
 [ 2 39  1  1  4  2  2]
 [ 0  0 45  4  1  3  1]
 [ 0  8  1 33  5  7  0]
 [ 2  2  1  4 53  2  1]
 [ 0  1  3  2  3 44  2]
 [ 1  3  1  1  0  3 53]]


In [98]:
acc = accuracy_score(y_test, y_pred)
print(f"{acc:.2%}")

78.76%
