<a href="https://colab.research.google.com/github/Venura-Shiromal/Ai-session-term1/blob/main/CatBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing Modules

In [129]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

# Input

## Loading Training Data

In [130]:
df = pd.read_csv("data/train.csv")

In [131]:
df.head()

Unnamed: 0,PersonID,Age_Years,Weight_Kg,High_Calorie_Food,Vegetable_Intake,Meal_Frequency,Water_Intake,Screen_Time_Hours,Family_Risk,Activity_Level_Score,Gender,Family_History,Snack_Frequency,Smoking_Habit,Alcohol_Consumption,Commute_Mode,Weight_Category,Height_cm,Physical_Activity_Level,Leisure Time Activity
0,P1810,26.0,109.959714,yes,3.0,3.0,2.679137,0.479348,1.0,0.479348,Female,yes,Occasionally,no,Sometimes,Public_Transportation,Obesity_Type_III,162.2771,,Reading
1,P1021,25.483381,64.848627,no,2.0,1.0,1.0,0.0,1.0,0.740633,Female,yes,Occasionally,no,no,Public_Transportation,Overweight_Level_II,156.5288,,Reading
2,P2036,26.0,104.947703,yes,3.0,3.0,2.57721,0.402075,1.0,0.402075,Female,yes,Occasionally,no,Sometimes,Public_Transportation,Obesity_Type_III,162.1167,,Reading
3,P2201,21.715198,107.868047,Yes,3.031308,3.322455,1.983531,-0.005858,0.987933,0.360441,Male,yes,Occasionally,no,no,Public_Transportation,Normal_Weight,167.748287,,Reading
4,P2649,17.511767,121.460361,yes,1.964873,1.052297,2.025586,0.981999,1.004136,1.162488,Female,yes,Occasionally,no,no,Public_Transportation,Normal_Weight,170.956194,,Gaming


# Preprocessing

### Removing unwanted fields

In [132]:
drop_cols = [
    "PersonID", 
    #"Weight_Kg", 
    #"Commute_Mode", 
    #"Height_cm", 
    #"Water_Intake", 
    #"Activity_Level_Score", 
    #"Family_History"
]

In [133]:
df = df.drop(drop_cols, axis=1)

### Removing missing cells

In [134]:
df.isna().sum()[df.isna().sum() > 0]

Gender                       30
Alcohol_Consumption          37
Physical_Activity_Level    1498
dtype: int64

In [135]:
df = df.drop(["Physical_Activity_Level"], axis=1)

In [136]:
missing_cols = ["Gender", "Alcohol_Consumption"]
df = df.dropna(subset=missing_cols)

### Mapping

In [144]:
map_X1 = {
    "Yes" : 1,
    "yes" : 1,
    "yess" : 1,
    "No" : 0,
    "no" : 0
}

map_X2 = {
    "Never" : 0,
    "Occasionally" : 1,
    "Often" : 2,
    "Always" : 3
}

map_X3 = {
    "no" : 0,
    "Sometimes" : 1,
    "Frequently" : 2,
    "Always" : 3
}

map_Y = {
    'Insufficient_Weight': 0,
    'Normal_Weight': 1,
    'Overweight_Level_I': 2,
    'Overweight_Level_II': 3,
    'Obesity_Type_I': 4,
    'Obesity_Type_II': 5,
    'Obesity_Type_III': 6
}

cat_cols = [
    "Gender", 
    "Family_History", 
    "Smoking_Habit", 
    "Leisure Time Activity",
    "Commute_Mode"
]

In [138]:
df["High_Calorie_Food"] = df["High_Calorie_Food"].map(map_X1)
df["Snack_Frequency"] = df["Snack_Frequency"].map(map_X2)
df["Alcohol_Consumption"] = df["Alcohol_Consumption"].map(map_X3)
df["Weight_Category"] = df["Weight_Category"].map(map_Y)

## Defining X,Y

In [139]:
x = df.drop(["Weight_Category"], axis=1)
y = df["Weight_Category"]

## Data Splitting

In [140]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

## Class Weights

In [141]:
weights = compute_class_weight("balanced", classes=np.unique(y_train), y=y_train)
class_weights = dict(zip(np.unique(y_train), weights))

# Training

In [167]:
model = CatBoostClassifier(
    learning_rate=0.08, 
    iterations=500, 
    depth=6, 
    verbose=50, 
    early_stopping_rounds=50,
    l2_leaf_reg=3,
    class_weights=class_weights
)

In [168]:
model.fit(x_train, y_train,
          cat_features=cat_cols,
          eval_set=(x_test, y_test)
)

0:	learn: 1.8592854	test: 1.8558496	best: 1.8558496 (0)	total: 130ms	remaining: 1m 4s
50:	learn: 0.9565121	test: 1.0517738	best: 1.0517738 (50)	total: 4.8s	remaining: 42.3s
100:	learn: 0.7957025	test: 0.9699493	best: 0.9699493 (100)	total: 9.47s	remaining: 37.4s
150:	learn: 0.6323375	test: 0.8818781	best: 0.8818781 (150)	total: 14.3s	remaining: 33.1s
200:	learn: 0.5377818	test: 0.8472956	best: 0.8472956 (200)	total: 18.8s	remaining: 27.9s
250:	learn: 0.4841380	test: 0.8344550	best: 0.8339626 (248)	total: 23.5s	remaining: 23.3s
300:	learn: 0.4252436	test: 0.8174198	best: 0.8165428 (298)	total: 28s	remaining: 18.5s
350:	learn: 0.3830704	test: 0.8060591	best: 0.8055482 (346)	total: 32.9s	remaining: 14s
400:	learn: 0.3473402	test: 0.8021365	best: 0.8021365 (400)	total: 37.5s	remaining: 9.26s
450:	learn: 0.3158332	test: 0.7972679	best: 0.7970016 (448)	total: 42.3s	remaining: 4.6s
499:	learn: 0.2865060	test: 0.7927389	best: 0.7927389 (499)	total: 46.9s	remaining: 0us

bestTest = 0.7927388683

<catboost.core.CatBoostClassifier at 0x1c518ad5d90>

In [169]:
y_pred = model.predict(x_test)

# Evaluation

In [170]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[37  1  2  1  2  2  0]
 [ 4 33  4  2  3  4  1]
 [ 0  4 43  1  5  0  1]
 [ 1  5  2 33  6  4  3]
 [ 2  3  1  1 56  1  1]
 [ 2  2  4  1  2 42  2]
 [ 0  1  1  1  2  4 53]]


In [171]:
acc = accuracy_score(y_test, y_pred)
print(f"{acc:.2%}")

76.94%
