In [82]:
import pandas as pd
import numpy as np

from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from collections import deque


In [83]:
# Load the raw competition files (train features, train labels, test features).
# COLS_TO_EXCLUDE lists the columns that are never used as predictors when the
# per-city feature list is built later in the notebook.
COLS_TO_EXCLUDE = ["city", "total_cases", "week_start_date"]

features_train, labels_train, features_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "dengue_features_train.csv",
        "dengue_labels_train.csv",
        "dengue_features_test.csv",
    )
)

# Attach the target to the training features; rows are matched on the
# (city, year, weekofyear) key with pandas' default inner join.
train = pd.merge(features_train, labels_train, on=["city", "year", "weekofyear"])

In [84]:
def add_week_cyclical_features(df):
    """
    Return a copy of `df` with `week_sin` / `week_cos` columns appended.

    The two columns encode `weekofyear` as an angle on a 52-week circle so
    that the last week of a year sits next to the first week of the next.
    The input frame is left untouched.
    """
    angle = 2 * np.pi * df["weekofyear"] / 52
    return df.assign(week_sin=np.sin(angle), week_cos=np.cos(angle))

In [85]:
def add_target_lag_features_train(df, lags=(1,2,3,4), roll_window=4):
    """
    Build lagged-target features for a training frame.

    The frame is ordered by (year, weekofyear) and, from `total_cases`, gets:
      * `cases_lag_{k}` for every k in `lags` - the target k rows earlier;
      * `cases_roll_mean_{roll_window}` - the mean of the previous
        `roll_window` targets (the current row is NOT included, because the
        series is shifted by one before the rolling mean).

    Rows without enough history contain NaN in the new columns.
    Returns a new, sorted frame; the input is not modified.
    """
    ordered = df.sort_values(["year", "weekofyear"])
    target = ordered["total_cases"]

    for lag in lags:
        ordered[f"cases_lag_{lag}"] = target.shift(lag)

    # shift(1) first so the window ends at the previous row, not the current one
    previous_targets = target.shift(1)
    ordered[f"cases_roll_mean_{roll_window}"] = previous_targets.rolling(roll_window).mean()

    return ordered

In [86]:
def recursive_predict_with_lags(model, train_city_df, test_city_df, base_feature_cols,
                                lags=(1,2,3,4), roll_window=4):
    """
    Predict the test period one row at a time, feeding predictions back as lags.

    The lag memory starts with the last real `total_cases` values of the
    training frame; after each test row is predicted, the (non-negative)
    prediction is appended and used as the "observed" value for later rows.

    Parameters
    ----------
    model : fitted estimator exposing `predict(DataFrame)`.
    train_city_df : frame with `year`, `weekofyear`, `total_cases`.
    test_city_df : frame with `year`, `weekofyear` and `base_feature_cols`.
    base_feature_cols : non-lag feature columns copied from each test row.
    lags : target lags to expose as `cases_lag_{k}`.
    roll_window : size of the trailing mean exposed as
        `cases_roll_mean_{roll_window}`.

    Returns
    -------
    np.ndarray of predictions in (year, weekofyear)-sorted order of
    `test_city_df` (NOT necessarily the input row order).

    Raises
    ------
    ValueError
        If the training frame has fewer rows than the largest requested lag.
    """
    train_sorted = train_city_df.sort_values(["year", "weekofyear"])
    test_sorted = test_city_df.sort_values(["year", "weekofyear"])

    lags = tuple(lags)
    max_lag = max(lags, default=0)

    # The memory must cover BOTH the largest lag and the rolling window;
    # sizing it by roll_window alone made hist_list[-k] fail for k > roll_window.
    history_len = max(max_lag, roll_window)
    history = deque(
        train_sorted["total_cases"].iloc[-history_len:].tolist(),
        maxlen=history_len,
    )
    if len(history) < max_lag:
        raise ValueError(
            f"need at least {max_lag} training rows to build lag features, "
            f"got {len(history)}"
        )

    preds = []
    for _, row in test_sorted.iterrows():
        feat = row[base_feature_cols].to_dict()

        hist_list = list(history)  # oldest -> newest, so [-1] is the latest value
        for k in lags:
            feat[f"cases_lag_{k}"] = hist_list[-k]

        # Average only the most recent roll_window values, mirroring the
        # shift(1).rolling(roll_window) feature used at training time.
        recent = hist_list[-roll_window:] if roll_window > 0 else []
        feat[f"cases_roll_mean_{roll_window}"] = float(np.mean(recent))

        # single-row design matrix for this step
        X_one = pd.DataFrame([feat])

        yhat = model.predict(X_one)[0]
        yhat = float(max(0.0, yhat))  # case counts cannot be negative
        preds.append(yhat)

        # from now on this prediction plays the role of the observed value
        history.append(yhat)

    return np.array(preds)

In [88]:
def preprocess_city_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Prepare one city's frame for modelling.

    Steps, in order:
      1. sort rows by (year, weekofyear);
      2. fill gaps forward, then backward (the backward pass covers NaNs at
         the very start of the series that a forward fill cannot reach);
      3. append the cyclical week features.

    Returns a new frame; the input is not modified.
    """
    filled = df.sort_values(["year", "weekofyear"]).ffill().bfill()
    return add_week_cyclical_features(filled)

In [89]:
def mi_score(X, y):
    """
    Score function for SelectKBest: mutual information between each column
    of `X` and `y`.

    The estimator is called with a fixed `random_state=0` so that repeated
    calls on the same data give the same scores.
    """
    scores = mutual_info_regression(X, y, random_state=0)
    return scores

In [90]:
def make_model(degree: int, alpha: float, base_cols, lag_cols, k_best: int):
    """
    Build the (unfitted) regression pipeline.

    base_cols: features subject to selection (weather, week_sin/cos, ...).
               The best `k_best` of them by mutual information are kept and
               then expanded with polynomial terms of the given `degree`.
    lag_cols : lag features that are always included and passed through
               untouched (no selection, no polynomial expansion).

    `k_best` is capped at `len(base_cols)`. Any column not listed in
    `base_cols` or `lag_cols` is dropped. The combined output is
    standardised and fed to a Ridge regressor with penalty `alpha`.
    """
    n_selected = min(k_best, len(base_cols))

    # selection + polynomial expansion live only on the base-feature branch
    selector = SelectKBest(score_func=mi_score, k=n_selected)
    expander = PolynomialFeatures(degree=degree, include_bias=False)
    base_branch = Pipeline(steps=[("select_k", selector), ("poly", expander)])

    router = ColumnTransformer(
        transformers=[
            ("base", base_branch, base_cols),
            ("lags", "passthrough", lag_cols),
        ],
        remainder="drop",
    )

    # polynomial terms were already created in the base branch, so the outer
    # pipeline only scales and regresses
    outer_steps = [
        ("pre", router),
        ("scale", StandardScaler()),
        ("ridge", Ridge(alpha=alpha, random_state=0)),
    ]
    return Pipeline(steps=outer_steps)

In [91]:
def select_hyperparams_timecv(X, y, degrees, alphas, n_splits=5,
                              base_cols=None, lag_cols=None, k_best=None):
    """
    Choose (degree, alpha) by TimeSeriesSplit CV using MAE.

    Parameters
    ----------
    X, y : feature frame and target series, already in time order.
    degrees, alphas : candidate polynomial degrees and Ridge penalties.
    n_splits : number of TimeSeriesSplit folds.
    base_cols : columns routed through selection + polynomial expansion.
        Defaults to every column of `X` that is not in `lag_cols`.
    lag_cols : columns passed through unchanged. Defaults to none.
    k_best : number of base columns kept by the selector. Defaults to all
        of `base_cols`.

    Returns
    -------
    dict with keys "degree", "alpha", "mae" for the best grid point, or
    None when `degrees` or `alphas` is empty.
    """
    # make_model requires the column routing and k_best; the previous call
    # make_model(d, a) omitted them and raised TypeError on every invocation.
    lag_cols = [] if lag_cols is None else list(lag_cols)
    if base_cols is None:
        base_cols = [c for c in X.columns if c not in lag_cols]
    else:
        base_cols = list(base_cols)
    if k_best is None:
        k_best = len(base_cols)

    # the folds do not depend on the grid point, so compute them once
    splits = list(TimeSeriesSplit(n_splits=n_splits).split(X))
    best = None

    for d in degrees:
        for a in alphas:
            maes = []
            for tr_idx, va_idx in splits:
                X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
                y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

                model = make_model(d, a, base_cols, lag_cols, k_best)
                model.fit(X_tr, y_tr)
                pred = model.predict(X_va)
                maes.append(mean_absolute_error(y_va, pred))

            avg_mae = float(np.mean(maes))
            if (best is None) or (avg_mae < best["mae"]):
                best = {"degree": d, "alpha": a, "mae": avg_mae}

    return best

In [92]:
def fit_predict_city(train_city, test_city, degrees, alphas, k_grid, n_splits=5,
                     lags=(1,2,3,4), roll_window=4):
    """
    Tune, fit and predict for a single city.

    Parameters
    ----------
    train_city : training rows of one city (features + `total_cases`).
    test_city : test rows of the same city (features only).
    degrees, alphas, k_grid : candidate polynomial degrees, Ridge penalties
        and numbers of selected base features.
    n_splits : number of TimeSeriesSplit folds.
    lags, roll_window : target lags and trailing-mean window used for the
        lag features, at training time and during recursive prediction.

    Returns
    -------
    (test_pred, best)
        test_pred : np.ndarray with one prediction per row of `test_city`,
            in the SAME row order as the `test_city` passed in.
        best : dict with keys "degree", "alpha", "k_best", "mae".

    Raises
    ------
    ValueError
        If `degrees` or `alphas` is empty, so no candidate could be scored.

    Note: the CV folds score one-step-ahead predictions built from true past
    targets, whereas the test period is predicted recursively from earlier
    predictions, so `cv_mae` is not an estimate of the recursive error.
    """
    lags = tuple(lags)
    lag_cols = [f"cases_lag_{k}" for k in lags] + [f"cases_roll_mean_{roll_window}"]

    # Give the test rows a 0..n-1 index: after the chronological sort done in
    # preprocessing, that index tells us each row's original position.
    test_city = test_city.reset_index(drop=True)

    # preprocess (sort + fill + sin/cos)
    train_city = preprocess_city_df(train_city)
    test_city  = preprocess_city_df(test_city)
    original_positions = test_city.index.to_numpy()

    base_feature_cols = [c for c in train_city.columns if c not in COLS_TO_EXCLUDE]

    train_city_lagged = add_target_lag_features_train(train_city, lags=lags, roll_window=roll_window)
    # the first rows have no complete lag history
    train_city_lagged = train_city_lagged.dropna(subset=lag_cols)

    # X carries exactly the columns the pipeline selects by name
    feature_cols_all = base_feature_cols + lag_cols
    X = train_city_lagged[feature_cols_all]
    y = train_city_lagged["total_cases"]

    # --- choose (degree, alpha, k_best) with TimeSeriesSplit CV
    best = None
    # the folds are identical for every grid point, so build them once
    splits = list(TimeSeriesSplit(n_splits=n_splits).split(X))

    # k_best cannot exceed the number of base features
    k_grid_eff = [k for k in k_grid if k <= len(base_feature_cols)]
    if len(k_grid_eff) == 0:
        k_grid_eff = [min(10, len(base_feature_cols))]  # fallback

    for d in degrees:
        for a in alphas:
            for k_best in k_grid_eff:
                maes = []
                for tr_idx, va_idx in splits:
                    X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
                    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

                    model = make_model(d, a, base_feature_cols, lag_cols, k_best)
                    model.fit(X_tr, y_tr)
                    pred = model.predict(X_va)
                    maes.append(mean_absolute_error(y_va, pred))

                avg = float(np.mean(maes))
                if best is None or avg < best["mae"]:
                    best = {"degree": d, "alpha": a, "k_best": k_best, "mae": avg}

    if best is None:
        raise ValueError("degrees and alphas must each contain at least one candidate")

    print(f"[{train_city['city'].iloc[0]}] best -> degree={best['degree']} alpha={best['alpha']} k={best['k_best']} cv_mae={best['mae']:.4f}")

    # --- refit on the full training data
    final_model = make_model(best["degree"], best["alpha"], base_feature_cols, lag_cols, best["k_best"])
    final_model.fit(X, y)

    # --- test side: recursive prediction generates the lag features
    test_pred_sorted = recursive_predict_with_lags(
        model=final_model,
        train_city_df=train_city,     # lag history starts from REAL train y
        test_city_df=test_city,
        base_feature_cols=base_feature_cols,
        lags=lags,
        roll_window=roll_window
    )

    # Predictions come back in (year, weekofyear) order; put them back into
    # the caller's row order so positional assignment downstream lines up.
    test_pred = np.empty_like(test_pred_sorted)
    test_pred[original_positions] = test_pred_sorted

    return test_pred, best

In [93]:
# ----------------------------
# Split by city
# ----------------------------
# Each city ("sj", "iq") is modelled separately, so take independent copies
# of its training and test rows.
sj_train = train.loc[train["city"].eq("sj")].copy()
iq_train = train.loc[train["city"].eq("iq")].copy()

sj_test = features_test.loc[features_test["city"].eq("sj")].copy()
iq_test = features_test.loc[features_test["city"].eq("iq")].copy()

In [94]:
# --- City-specific hyperparameter grids ---
# sj: tends to blow up as the polynomial degree grows, so keep the degree low
# and search over stronger regularization.
degrees_sj = [1, 2]
alphas_sj = [10, 30, 100, 300, 1000, 3000, 10000]

# iq: degree 1-2 is mostly enough here too; search over lighter regularization.
degrees_iq = [1, 2]
alphas_iq = [0.1, 1, 10, 30, 100, 300, 1000]

# candidate numbers of base features kept by the selector
k_grid_sj = [6, 8, 10, 12, 14]
k_grid_iq = [6, 8, 10, 12]

# ----------------------------
# Train + Predict
# ----------------------------
sj_pred, sj_best = fit_predict_city(
    train_city=sj_train,
    test_city=sj_test,
    degrees=degrees_sj,
    alphas=alphas_sj,
    k_grid=k_grid_sj,
    n_splits=5,
)

iq_pred, iq_best = fit_predict_city(
    train_city=iq_train,
    test_city=iq_test,
    degrees=degrees_iq,
    alphas=alphas_iq,
    k_grid=k_grid_iq,
    n_splits=5,
)

[sj] best -> degree=1 alpha=10 k=6 cv_mae=9.1901
[iq] best -> degree=1 alpha=100 k=6 cv_mae=4.7527


In [95]:
# Build submission
# ----------------------------
# Start from the test key columns with a float placeholder target, then drop
# each city's predictions into that city's rows. The assignment is positional:
# the i-th prediction goes to the i-th row of the city in features_test.
submission = features_test.loc[:, ["city", "year", "weekofyear"]].assign(total_cases=0.0)

submission.loc[submission["city"].eq("sj"), "total_cases"] = sj_pred
submission.loc[submission["city"].eq("iq"), "total_cases"] = iq_pred


In [96]:
# DengAI expects integer counts: round the float predictions, then cast.
rounded_cases = submission["total_cases"].round()
submission["total_cases"] = rounded_cases.astype(int)

# Write the submission file (without the index column) and show a preview.
submission.to_csv("submission_poly_ridge.csv", index=False)
print("Saved: submission_poly_ridge.csv")
print(submission.head())

Saved: submission_poly_ridge.csv
  city  year  weekofyear  total_cases
0   sj  2008          18            3
1   sj  2008          19            2
2   sj  2008          20            3
3   sj  2008          21            4
4   sj  2008          22            5
