# TODO: Add a cool Title here

## 1. Briefing

|            |          |                     |
|------------|----------|---------------------|
| Subject    |          | Machine Learning    |
| Supervisor |          | Prof. Aswin Kannan  |
|            |          |                     |
| Team Name  |          | MT2025724_MT2025732 |
| Member 1   | Name:    | Shreg               |
|            | Roll No: | MT2025724           |
| Member 2   | Name:    | Anisha              |
|            | Roll No: | MT2025732           |
|            |          |                     |

TODO: Add some numericals like the number of models compared, number of data processing techniques applied...

### Keywords

## 2. Summary

* 4–8 bullet points summarizing highest-level conclusions (best model(s), tradeoffs, recommended next steps).
* Short comparative table of winners by criteria (accuracy, latency, model size, fairness, etc.)

## 3. Introduction

* Problem statement and motivation
* Objectives of the comparative study (what questions you will answer)
* Scope (datasets, model families, compute constraints)
* Contributions (what unique analyses you provide — e.g., timing, memory profiling, calibration, fairness tests)


In [None]:
# Notification system for slow steps - I multitask, so a ping really helps when long steps are completed to bring back my attention.
# Remember, _attention is all you need_! xD
import chime

# The universe loves chaos, but reproducibility doesn’t.
# Imported mainly to set seed for other libraries which depend on it.
import random

# Because time waits for no one, but at least we can measure how fast it runs away.
import time

# Metadata serialization
# Question for you, what's the difference between JS and JSON?
import json

# Big fan of Java's streams. A programmer who can't make an easy task difficult is not a programmer!
import functools, itertools

# The Swiss army knife of data
import pandas as pd

# Where math meets meth
import numpy as np

import scipy as sp

# Bag of all ML models
import sklearn as skl

# An artist bringing data to life
import matplotlib.pyplot as plt

# The artist's wardrobe
import matplotlib as mpl

# Pyplot's quirkier cousin
import seaborn as sns

# Never going to use it, but it makes me feel prepared
from pprint import pprint

# If I can’t see it moving, it’s not working.
from tqdm import tqdm

In [None]:
chime.theme("material")
chime.info()

In [None]:
# Candidate locations for the training CSV, tried in order. The test CSV is
# expected to sit next to it under the name 'test.csv'.
for option in ['../../data/ait-511-course-project-1-obesity-risk/train.csv', 'train.csv', 'data/train.csv']:
    try:
        # Read both files into temporaries first, so that a missing test file
        # cannot leave ds_source bound without a matching ds_test.
        _train = pd.read_csv(option)
        _test = pd.read_csv(option.replace('train.csv', 'test.csv'))
    except FileNotFoundError:
        # Only a missing file means "try the next location"; any other error
        # (parse failure, permissions, ...) should surface instead of being hidden.
        print('No training data found at', option)
    else:
        ds_source, ds_test = _train, _test
        print('Training data found at', option)
        break
else:
    # The loop ran out of candidates without hitting `break`.
    print('No training data found anywhere')

chime.info()

In [None]:
ds_source.shape

In [None]:
ds_source.info()

In [None]:
ds_source.head(5)

In [None]:
ds_source.describe(include='all')

In [None]:
# Column groups referenced by the preprocessing steps further down.
dropable_features = ['id', 'WeightCategory']
binary_features = [
    'Gender',
    'family_history_with_overweight',
    'FAVC',
    'SMOKE',
    'SCC',
]
multi_cat_features = ['CAEC', 'CALC', 'MTRANS']
numerical_features = [
    'Age', 'Height', 'Weight',
    'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE',
]

# Distinct target classes, plus lookup tables in both directions
# (integer code -> label and label -> integer code).
labels = ds_source['WeightCategory'].unique()
labels_decoder = dict(enumerate(labels))
labels_encoder = {label: index for index, label in labels_decoder.items()}

## Missing Value

In [None]:
ds_source.isnull().sum()

In [None]:
ds_source[numerical_features].hist(bins=20, figsize=(12, 6), layout=(2, -1))
plt.tight_layout()
plt.show()

123

In [None]:
# Round FCVC and NCP to the nearest whole number in one vectorised assignment.
_rounded_columns = ['FCVC', 'NCP']
ds_source[_rounded_columns] = ds_source[_rounded_columns].round()

In [None]:
plt.hist(sp.stats.boxcox(ds_source['CH2O'])[0], bins=20)

In [None]:
# CH2O distribution per target class. No `ds_train` is defined in the cells
# above; the loaded training frame is `ds_source`, so plot from that.
sns.boxplot(x='WeightCategory', y='CH2O', data=ds_source)

In [None]:
sns.histplot(np.log(np.log(ds_source['Age'])), bins=20)

In [None]:
# plt.plot(sp.stats.boxcox(ds_train['Age']))

In [None]:
# Box-Cox transform of Age: scipy hands back the transformed values together
# with the lambda it selected.
_age_data = sp.stats.boxcox(ds_source['Age'])
_age_transformed, _age_lambda = _age_data
plt.hist(_age_transformed, bins=30)
plt.title('Box-Cox Transformed Data')
print("Optimal Lambda", _age_lambda)

---
We will be creating two dataframes: the original one, and a copy with the distribution and outlier handling applied.
The latter is required for algorithms involving regression.


In [None]:
ds_source_lr = ds_source.copy()
ds_source_lr['Age'] = sp.stats.boxcox(ds_source['Age'])[0]

## Outliers

In [None]:
# One boxplot per numerical feature, side by side, to eyeball outliers.
plt.figure(figsize=(10, 5))
for position, feature in enumerate(numerical_features, start=1):
    plt.subplot(1, len(numerical_features), position)
    sns.boxplot(y=feature, data=ds_source_lr, color='purple')
# Lay the figure out once, after every subplot exists, rather than on each pass.
plt.tight_layout()

In [None]:
# Count the number of outliers
def count_outliers(col):
    """Return how many values of `col` fall outside the 1.5 * IQR fences.

    `col` is expected to offer `.quantile()` and element-wise comparisons
    (e.g. a pandas Series). A value counts as an outlier when it lies below
    Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR.
    """
    lower_quartile = col.quantile(0.25)
    upper_quartile = col.quantile(0.75)
    whisker = 1.5 * (upper_quartile - lower_quartile)

    too_low = col < (lower_quartile - whisker)
    too_high = col > (upper_quartile + whisker)
    return (too_low | too_high).sum()


outliers_count = ds_source[numerical_features].apply(count_outliers)
outliers_count

In [None]:
# Check if outliers impact mean: print mean and median (rounded to 2 decimals)
# for every numerical feature so the two can be compared side by side.
for i in numerical_features:
    print(f"Mean of {i} is {ds_source[i].mean().round(2)}")
    print(f"Median of {i} is {ds_source[i].median().round(2)}")
print(" ")

Processing steps for categoricals

In [None]:
ds_source[multi_cat_features]

In [None]:
# Count plot for each categorical feature on a 2x4 grid
# (binary_features + multi_cat_features, one feature per axis).
fig, axs = plt.subplots(2, 4, figsize=(20, 10))
for ax, column in zip(axs.flat, binary_features + multi_cat_features):
    # Pass the column as `x=` explicitly: the first positional parameter of
    # countplot is `data` in current seaborn releases (it was `x` in older
    # ones), so a positional Series is not reliably read as the category axis.
    sns.countplot(x=ds_source[column], ax=ax)

In [None]:
# Keep 'Public_Transportation' and 'Automobile' as they are and fold every
# other MTRANS value into a single 'Others' bucket.
_kept_transport = ds_source['MTRANS'].isin(['Public_Transportation', 'Automobile'])
ds_source['MTRANS'] = ds_source['MTRANS'].where(_kept_transport, 'Others')

In [None]:
pop_drop_columns = ('dropper', 'drop', dropable_features)

In [None]:
pop_stan_scaler = ('stan_scaler', skl.preprocessing.StandardScaler(), numerical_features)

In [None]:
pop_binarizer = ('binarizer', skl.preprocessing.OneHotEncoder(drop='if_binary'), binary_features)

In [None]:
pop_one_hotter = ('one_hotter', skl.preprocessing.OneHotEncoder(), ['MTRANS'])

In [None]:
pop_label_enc = ('label_enc', skl.preprocessing.OrdinalEncoder(), ['CAEC', 'CALC'])

In [None]:
def _pop_derived_features(_ds_in: pd.DataFrame) -> pd.DataFrame:
    """Build the engineered feature columns from the raw survey columns.

    Reads Weight, Height, CH2O, NCP, FAF, TUE, FCVC, FAVC, SCC and
    family_history_with_overweight from `_ds_in` and returns a new frame
    holding only the six derived columns; `_ds_in` itself is not modified.
    """
    yes_no = {"yes": 1, "no": 0}
    favc_flag = _ds_in["FAVC"].map(yes_no)
    scc_flag = _ds_in["SCC"].map(yes_no)
    family_risk_and_favc = (_ds_in["family_history_with_overweight"] == "yes") & (_ds_in["FAVC"] == "yes")

    derived = pd.DataFrame()
    derived["BMI"] = _ds_in["Weight"] / (_ds_in["Height"] ** 2)
    # Small offsets in the denominators keep the ratios finite when NCP or TUE is 0.
    derived["Water_Intake_per_Meal"] = _ds_in["CH2O"] / (_ds_in["NCP"] + 1e-6)
    derived["Activity_to_Tech_Ratio"] = _ds_in["FAF"] / (_ds_in["TUE"] + 0.1)
    # Weighted sum of FCVC, FAF and CH2O, minus one point when FAVC is "yes".
    derived["Healthy_Lifestyle_Score"] = (
        _ds_in["FCVC"] * 0.3 + _ds_in["FAF"] * 0.3 + _ds_in["CH2O"] * 0.4 - favc_flag
    )
    derived["Has_FamilyRisk_and_FAVC"] = np.where(family_risk_and_favc, 1, 0)
    derived["Calorie_Monitoring_Interaction"] = scc_flag * favc_flag
    return derived


pop_derived_features = ('derived', skl.preprocessing.FunctionTransformer(_pop_derived_features), slice(None))

In [None]:
pop_everything = skl.compose.ColumnTransformer([pop_stan_scaler, pop_derived_features, pop_binarizer, pop_one_hotter])

In [None]:
# Dense (sparse_output=False) variants of the categorical encoders, used
# together with the pandas output requested below. Unlike pop_one_hotter,
# pop_one_hotter2 one-hot encodes every column in multi_cat_features.
pop_binarizer2 = ('binarizer', skl.preprocessing.OneHotEncoder(drop='if_binary', sparse_output=False), binary_features)
pop_one_hotter2 = ('one_hotter', skl.preprocessing.OneHotEncoder(sparse_output=False), multi_cat_features)
# NOTE: this rebinds pop_everything to a new transformer built for this plot.
pop_everything = skl.compose.ColumnTransformer([pop_stan_scaler, pop_derived_features, pop_binarizer2, pop_one_hotter2])
pop_everything.set_output(transform='pandas')

# Pairwise correlation of all transformed features, drawn as a heatmap on a
# fixed [-1, 1] colour scale centred at 0.
corr = pop_everything.fit_transform(ds_source).corr(numeric_only=True)
plt.figure(figsize=(10, 8))
sns.heatmap(corr, cmap='coolwarm', center=0, annot=False, vmin=-1, vmax=1)
plt.title("Feature Correlation Heatmap")
plt.show()

Train / Validation / Test Split

Model

In [141]:

# Final preprocessing: scaling, derived features, one-hot and ordinal encodings.
pop_everything = skl.compose.ColumnTransformer(
    [pop_stan_scaler, pop_derived_features, pop_binarizer, pop_one_hotter, pop_label_enc])

# Fit on the training features only. The target is dropped first so that the
# `slice(None)` selector of the derived-features step does not make
# 'WeightCategory' a required input column when transforming the test frame.
data_x = pop_everything.fit_transform(ds_source.drop(columns=['WeightCategory']))

# Apply the same cleaning to a copy of the test frame that the training frame
# received above (rounded FCVC/NCP, collapsed MTRANS). Both operations are
# idempotent, so this is safe even if ds_test was already cleaned.
ds_test_clean = ds_test.copy()
ds_test_clean[['FCVC', 'NCP']] = ds_test_clean[['FCVC', 'NCP']].round()
ds_test_clean['MTRANS'] = ds_test_clean['MTRANS'].map(
    lambda x: x if x in ('Public_Transportation', 'Automobile') else 'Others')

# transform, not fit_transform: the test set must be scaled and encoded with
# the statistics and category mappings learned from the training data, otherwise
# its columns do not mean the same thing as the ones the model is trained on.
# NOTE(review): this raises if the test set contains a category never seen in
# training; if that happens, set `handle_unknown` on the encoders - confirm
# against the actual data.
data_test = pop_everything.transform(ds_test_clean)
data_y = ds_source['WeightCategory'].map(labels_encoder).to_numpy()


In [142]:
model = skl.linear_model.LogisticRegression(max_iter=1000)
model.fit(data_x, data_y)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,10000


In [136]:
# Accuracy on data_x / data_y, i.e. the same rows the model was fitted on,
# so this is a training score rather than a held-out estimate.
skl.metrics.accuracy_score(data_y, model.predict(data_x))

0.8642245541749823

In [138]:
model.predict(data_test)

array([4., 5., 6., ..., 2., 6., 3.], shape=(5225,))

In [161]:
ds_res = pd.DataFrame()
ds_res['id'] = ds_test['id']

In [162]:
ds_res['WeightCategory'] = model.predict(data_test)

In [164]:
ds_res['WeightCategory'] = ds_res['WeightCategory'].map(labels_decoder)

In [166]:
ds_res.to_csv('submission.csv', index=False)