In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import r2_score, mean_squared_error

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/XinyuanD/Datathon2026_Final/refs/heads/main/cleaned_health_data.csv')
df.head()

Unnamed: 0,TOPIC,TAXONOMY,CLASSIFICATION,GROUP,GROUP_ORDER,SUBGROUP,SUBGROUP_ORDER,ESTIMATE_TYPE,TIME_PERIOD,ESTIMATE
0,Angina/angina pectoris,Cardiovascular diseases,Total,Total,1,18 years and older,1,"Percent of population, crude",2019,1.7
1,Angina/angina pectoris,Cardiovascular diseases,Total,Total,1,18 years and older,1,"Percent of population, crude",2020,1.5
2,Angina/angina pectoris,Cardiovascular diseases,Total,Total,1,18 years and older,1,"Percent of population, crude",2021,1.5
3,Angina/angina pectoris,Cardiovascular diseases,Total,Total,1,18 years and older,1,"Percent of population, crude",2022,1.6
4,Angina/angina pectoris,Cardiovascular diseases,Total,Total,1,18 years and older,1,"Percent of population, crude",2023,1.6


In [3]:
def calculate_z(group):
    std = group.std()
    if std == 0 or np.isnan(std):
        return group - group.mean()
    return (group - group.mean()) / std

df['z_score'] = df.groupby('TOPIC')['ESTIMATE'].transform(calculate_z).fillna(0)

In [4]:
df = df.sort_values(['TOPIC', 'SUBGROUP', 'TIME_PERIOD'])

# History features
df['prev_z_score'] = df.groupby(['TOPIC', 'SUBGROUP'])['z_score'].shift(1)
df['rolling_3yr_z'] = df.groupby(['TOPIC', 'SUBGROUP'])['z_score'].transform(
    lambda x: x.shift(1).rolling(window=3, min_periods=1).mean()
)
df['z_delta'] = df.groupby(['TOPIC', 'SUBGROUP'])['prev_z_score'].diff()

cat_features = ['GROUP', 'SUBGROUP', 'TAXONOMY', 'CLASSIFICATION']
for col in cat_features:
    df[col] = df[col].astype('category')

In [5]:
df_model = df.dropna(subset=['prev_z_score', 'rolling_3yr_z']).copy()

train = df_model[df_model['TIME_PERIOD'] < 2024].copy()
test = df_model[df_model['TIME_PERIOD'] == 2024].copy()

features = cat_features + ['prev_z_score', 'rolling_3yr_z', 'z_delta']

X_train, y_train = train[features], train['z_score']
X_test, y_test = test[features], test['z_score']

In [6]:
# XGBOOST
model = xgb.XGBRegressor(
    n_estimators=3000,
    learning_rate=0.005,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    early_stopping_rounds=100,
    enable_categorical=True,
    tree_method='hist'
)

model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    verbose=100
)

[0]	validation_0-rmse:1.01152
[100]	validation_0-rmse:0.73718
[200]	validation_0-rmse:0.58460
[300]	validation_0-rmse:0.50586
[400]	validation_0-rmse:0.46782
[500]	validation_0-rmse:0.44947
[600]	validation_0-rmse:0.44105
[700]	validation_0-rmse:0.43634
[800]	validation_0-rmse:0.43453
[900]	validation_0-rmse:0.43312
[1000]	validation_0-rmse:0.43178
[1100]	validation_0-rmse:0.43113
[1200]	validation_0-rmse:0.43011
[1300]	validation_0-rmse:0.42960
[1400]	validation_0-rmse:0.42930
[1500]	validation_0-rmse:0.42904
[1600]	validation_0-rmse:0.42868
[1691]	validation_0-rmse:0.42865


In [7]:
# Test
preds = model.predict(X_test)
final_r2 = r2_score(y_test, preds)
rmse = np.sqrt(mean_squared_error(y_test, preds))

print(f"\nRESULTS:")
print(f"Final R2 Score: {final_r2:.4f}")
print(f"RMSE: {rmse:.4f}")


RESULTS:
Final R2 Score: 0.8213
RMSE: 0.4286
