In [30]:
import pandas as pd
import numpy as np

from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline


In [31]:
# Load the raw competition CSVs from the working directory.
# COLS_TO_EXCLUDE lists the columns that are dropped when the model's feature
# matrix is assembled (identifier, target and raw date string).
COLS_TO_EXCLUDE = ["city", "total_cases", "week_start_date"]

features_train = pd.read_csv("dengue_features_train.csv")
labels_train = pd.read_csv("dengue_labels_train.csv")
features_test = pd.read_csv("dengue_features_test.csv")

# Attach the target to the training features; rows are matched on the
# (city, year, weekofyear) key using pandas' default inner join.
train = pd.merge(
    features_train,
    labels_train,
    on=["city", "year", "weekofyear"],
)

In [32]:
def add_week_cyclical_features(df, period=52):
    """Return a copy of ``df`` with sine/cosine encodings of ``weekofyear``.

    The week number is mapped onto a circle so that the last week of one year
    and the first week of the next end up close together in feature space.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a numeric ``weekofyear`` column. It is not modified.
    period : float, default 52
        Number of weeks that make up one full cycle. With the default of 52,
        a ``weekofyear`` value of 53 receives the same encoding as week 1;
        pass ``period=53`` if week 53 should be a distinct point on the circle.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with two extra columns, ``week_sin`` and ``week_cos``.

    Raises
    ------
    ValueError
        If ``period`` is not strictly positive.
    """
    if period <= 0:
        raise ValueError(f"period must be positive, got {period!r}")

    df = df.copy()
    # Angle (in radians) of each week on the yearly circle.
    angle = 2 * np.pi * df["weekofyear"] / period
    df["week_sin"] = np.sin(angle)
    df["week_cos"] = np.cos(angle)
    return df

In [33]:
def preprocess_city_df(df: pd.DataFrame) -> pd.DataFrame:
    """Prepare one city's rows for modelling without touching the input frame.

    Steps, in order:
      1. sort rows by ``year`` then ``weekofyear``;
      2. fill missing values forward, then backward (the backward pass covers
         gaps at the very start of the series that a forward fill cannot reach);
      3. append the cyclical week-of-year features.

    The original index labels are kept, only their order changes.
    """
    ordered = df.sort_values(["year", "weekofyear"])
    filled = ordered.ffill().bfill()
    return add_week_cyclical_features(filled)

In [34]:
def make_model(degree: int, alpha: float) -> Pipeline:
    """Build an unfitted scale -> polynomial expansion -> ridge pipeline.

    Parameters
    ----------
    degree : int
        Degree passed to ``PolynomialFeatures`` (no bias column is generated).
    alpha : float
        Regularisation strength passed to ``Ridge``; the penalty keeps the
        expanded polynomial feature set from blowing up the fit.

    Returns
    -------
    Pipeline
        Steps are named ``"scale"``, ``"poly"`` and ``"ridge"``.
    """
    scaler = StandardScaler()
    expander = PolynomialFeatures(degree=degree, include_bias=False)
    regressor = Ridge(alpha=alpha, random_state=0)
    return Pipeline(steps=[
        ("scale", scaler),
        ("poly", expander),
        ("ridge", regressor),
    ])

In [35]:
def select_hyperparams_timecv(X, y, degrees, alphas, n_splits=5):
    """Grid-search ``(degree, alpha)`` with ``TimeSeriesSplit``, scored by MAE.

    Every combination is fitted on each training fold and evaluated on the
    matching validation fold; the combination with the lowest mean MAE wins.
    Ties keep the combination encountered first.

    Parameters
    ----------
    X, y : objects supporting ``.iloc`` positional indexing
        Features and target, indexed row-for-row in the same order.
    degrees, alphas : iterables
        Candidate polynomial degrees and ridge penalties.
    n_splits : int, default 5
        Number of splits handed to ``TimeSeriesSplit``.

    Returns
    -------
    dict or None
        ``{"degree": ..., "alpha": ..., "mae": ...}`` for the best combination,
        or ``None`` when no combination was evaluated.
    """
    splitter = TimeSeriesSplit(n_splits=n_splits)
    winner = None

    for degree in degrees:
        for alpha in alphas:
            fold_errors = []
            for train_rows, valid_rows in splitter.split(X):
                estimator = make_model(degree, alpha)
                estimator.fit(X.iloc[train_rows], y.iloc[train_rows])
                fold_pred = estimator.predict(X.iloc[valid_rows])
                fold_errors.append(
                    mean_absolute_error(y.iloc[valid_rows], fold_pred)
                )

            score = float(np.mean(fold_errors))
            # Only a strictly better score replaces the current winner.
            if winner is not None and not (score < winner["mae"]):
                continue
            winner = {"degree": degree, "alpha": alpha, "mae": score}

    return winner

In [36]:
def fit_predict_city(train_city: pd.DataFrame, test_city: pd.DataFrame, degrees, alphas):
    """Tune, fit and predict for a single city.

    Parameters
    ----------
    train_city : pd.DataFrame
        Training rows for one city, including ``city`` and ``total_cases``.
    test_city : pd.DataFrame
        Test rows for the same city. It is not modified.
    degrees, alphas : iterables
        Hyperparameter grids forwarded to ``select_hyperparams_timecv``.

    Returns
    -------
    pred : np.ndarray
        Non-negative predictions, one per row of ``test_city`` and **in the
        same row order as ``test_city`` was passed in**, so the array can be
        assigned positionally onto the caller's rows.
    best : dict
        The selected ``degree`` / ``alpha`` and their cross-validated ``mae``.
    """
    train_proc = preprocess_city_df(train_city)

    # preprocess_city_df re-sorts rows by (year, weekofyear). Give the test rows
    # a 0..n-1 index first so that, after sorting, the index of each row tells
    # us its original position. Without this, any row moved by the sort (for
    # example a week numbered 53 that is not chronologically last in its year)
    # would hand its prediction to a different row of the caller's frame.
    test_proc = preprocess_city_df(test_city.reset_index(drop=True))

    feature_cols = [c for c in train_proc.columns if c not in COLS_TO_EXCLUDE]
    X = train_proc[feature_cols]
    y = train_proc["total_cases"]

    best = select_hyperparams_timecv(X, y, degrees=degrees, alphas=alphas, n_splits=5)
    print(f"[{train_proc['city'].iloc[0]}] best params -> degree={best['degree']} alpha={best['alpha']} cv_mae={best['mae']:.4f}")

    # Refit on the full city training set with the selected hyperparameters.
    model = make_model(best["degree"], best["alpha"])
    model.fit(X, y)

    # Predict in sorted order; case counts cannot be negative.
    pred_sorted = np.clip(model.predict(test_proc[feature_cols]), 0, None)

    # Scatter the predictions back to the caller's original row order.
    pred = np.empty_like(pred_sorted)
    pred[test_proc.index.to_numpy()] = pred_sorted

    return pred, best

In [37]:
# ----------------------------
# Split by city
# ----------------------------
# Each city gets its own independent copy of the train and test rows
# ("sj" and "iq" are the two values of the `city` column used below).
sj_train = train.loc[train["city"].eq("sj")].copy()
iq_train = train.loc[train["city"].eq("iq")].copy()

sj_test = features_test.loc[features_test["city"].eq("sj")].copy()
iq_test = features_test.loc[features_test["city"].eq("iq")].copy()

In [38]:
# ----------------------------
# Hyperparam grids
# ----------------------------
# Polynomial degrees 1 through 4.
degrees = list(range(1, 5))
# Ridge alpha grid (log-like)
alphas = [0.1, 1.0, 10.0, 100.0, 300.0, 1000.0]

# ----------------------------
# Train + Predict
# ----------------------------
# One independent tune/fit/predict run per city.
sj_pred, sj_best = fit_predict_city(
    train_city=sj_train, test_city=sj_test, degrees=degrees, alphas=alphas
)
iq_pred, iq_best = fit_predict_city(
    train_city=iq_train, test_city=iq_test, degrees=degrees, alphas=alphas
)

[sj] best params -> degree=1 alpha=1000.0 cv_mae=31.1558
[iq] best params -> degree=1 alpha=1000.0 cv_mae=6.5134


In [39]:
# ----------------------------
# Build submission
# ----------------------------
# Start from the key columns of the test set, in the test file's row order,
# with a float placeholder for the target.
submission = features_test.loc[:, ["city", "year", "weekofyear"]].copy()
submission = submission.assign(total_cases=0.0)

# Each prediction array is written positionally onto that city's rows, so it
# must contain one value per row, in the same order as the rows appear here.
submission.loc[submission["city"].eq("sj"), "total_cases"] = sj_pred
submission.loc[submission["city"].eq("iq"), "total_cases"] = iq_pred


In [41]:
# DengAI expects integer counts: round to the nearest whole number, then cast.
submission["total_cases"] = submission["total_cases"].round().astype(int)

# Write the submission file without the DataFrame index and show a preview.
submission.to_csv("submission_poly_ridge.csv", index=False)
print("Saved: submission_poly_ridge.csv")
print(submission.head())

Saved: submission_poly_ridge.csv
  city  year  weekofyear  total_cases
0   sj  2008          18           14
1   sj  2008          19           10
2   sj  2008          20           13
3   sj  2008          21           17
4   sj  2008          22           20


In [None]:
# SUBMISSION SCORE : 29.9976