In [1]:
# Cell 1: Install required packages (run this once in Colab)
!pip install catboost optuna --quiet


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [28]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error

from catboost import CatBoostRegressor, Pool


In [29]:
general = pd.read_csv("SMHS_general-1.csv")
validation = pd.read_csv("SMHS_validation_students-1.csv")

general.shape, validation.shape


((1500, 29), (100, 28))

In [17]:
target = "metabolic_pressure"
drop_cols = ["id", "latent_cluster"]

X = general.drop(columns=[target] + drop_cols)
y = general[target]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [31]:
target = "metabolic_pressure"
drop_cols = ["id", "latent_cluster"]

X = general.drop(columns=[target] + drop_cols)
y = general[target]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [32]:
num_cols = X_train.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = X_train.select_dtypes(include=["object", "category"]).columns.tolist()

X_train[cat_cols] = X_train[cat_cols].astype("category")
X_test[cat_cols] = X_test[cat_cols].astype("category")


In [33]:
pt = PowerTransformer(method="yeo-johnson", standardize=True)
X_train[num_cols] = pt.fit_transform(X_train[num_cols])
X_test[num_cols] = pt.transform(X_test[num_cols])


In [34]:
train_pool = Pool(X_train, y_train, cat_features=cat_cols)
test_pool = Pool(X_test, y_test, cat_features=cat_cols)


In [35]:
cat = CatBoostRegressor(
    loss_function="RMSE",
    eval_metric="RMSE",
    iterations=5000,
    depth=10,
    learning_rate=0.01,
    random_strength=1,
    bagging_temperature=0.3,
    l2_leaf_reg=5,
    border_count=254,
    grow_policy="Lossguide",
    verbose=200,
    random_state=42
)

cat.fit(train_pool, eval_set=test_pool)


0:	learn: 14.6885748	test: 15.5069746	best: 15.5069746 (0)	total: 93.7ms	remaining: 7m 48s
200:	learn: 5.6836375	test: 6.2618749	best: 6.2618749 (200)	total: 9.06s	remaining: 3m 36s
400:	learn: 4.2670010	test: 5.0007409	best: 5.0007409 (400)	total: 13s	remaining: 2m 29s
600:	learn: 3.7051546	test: 4.6398772	best: 4.6398772 (600)	total: 16.5s	remaining: 2m
800:	learn: 3.2333639	test: 4.4961063	best: 4.4961063 (800)	total: 21.1s	remaining: 1m 50s
1000:	learn: 2.7912948	test: 4.4286194	best: 4.4286194 (1000)	total: 25.4s	remaining: 1m 41s
1200:	learn: 2.4079905	test: 4.4113874	best: 4.4113874 (1200)	total: 29s	remaining: 1m 31s
1400:	learn: 2.1023228	test: 4.4162339	best: 4.4101760 (1213)	total: 33.6s	remaining: 1m 26s
1600:	learn: 1.8420167	test: 4.4239017	best: 4.4101760 (1213)	total: 37.8s	remaining: 1m 20s
1800:	learn: 1.6197311	test: 4.4335319	best: 4.4101760 (1213)	total: 41.5s	remaining: 1m 13s
2000:	learn: 1.4277568	test: 4.4424743	best: 4.4101760 (1213)	total: 46.2s	remaining: 1m

<catboost.core.CatBoostRegressor at 0x7a4be59d2060>

In [36]:
pred_test = cat.predict(test_pool)

mae = mean_absolute_error(y_test, pred_test)
rmse = mean_squared_error(y_test, pred_test) ** 0.5

print("Local Test MAE:", mae)
print("Local Test RMSE:", rmse)


Local Test MAE: 3.461813610789359
Local Test RMSE: 4.410176180989675


In [37]:
val = validation.copy()

val[cat_cols] = val[cat_cols].astype("category")

val_num = val[num_cols]
val_num_transformed = pt.transform(val_num)
val[num_cols] = val_num_transformed


In [39]:
val_pool = Pool(val.drop(columns=["id"]), cat_features=cat_cols)

val_pred = cat.predict(val_pool)

submission = pd.DataFrame({
    "id": validation["id"],
    "prediction": val_pred
})

submission.to_csv("catboost_leaderboard_submission2.csv", index=False)

submission.head()


Unnamed: 0,id,prediction
0,New_1501,57.920916
1,New_1502,55.426027
2,New_1503,66.31339
3,New_1504,33.764703
4,New_1505,37.528418
