In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline


In [12]:
# Read the raw training inputs: one CSV of features, one CSV of labels
features_train = pd.read_csv("dengue_features_train.csv")
labels_train = pd.read_csv("dengue_labels_train.csv")
# Join the labels onto the features; rows are matched on the
# (city, year, weekofyear) key, using pandas' default inner join
train = pd.merge(features_train, labels_train, on=["city", "year", "weekofyear"])

# Take an independent copy of each city's rows so the two cities can be
# cleaned and modelled separately without touching `train`
sj = train.loc[train["city"] == "sj"].copy()
iq = train.loc[train["city"] == "iq"].copy()

In [13]:
# Impute missing values before fitting the polynomial regression
# (working assumption noted by the author: degree k=2, i.e. a quadratic fit).
# Forward-fill carries the last seen value down; back-fill then covers any
# gaps left at the very start of each frame. Both frames are modified in place.
for df in (sj, iq):
    df.ffill(inplace=True)
    df.bfill(inplace=True)

In [16]:
# Columns kept out of the model inputs: the city label, the target itself,
# and the week_start_date column
cols_to_exclude = ["city", "total_cases", "week_start_date"]
# Everything else (column order taken from the San Juan frame) is a feature;
# the same list is applied to both cities
feature_cols = [col for col in sj.columns if col not in cols_to_exclude]

# San Juan: feature matrix and target series
X_sj_full, y_sj_full = sj[feature_cols], sj["total_cases"]

# Iquitos: same feature columns, same target column
X_iq_full, y_iq_full = iq[feature_cols], iq["total_cases"]

In [18]:
# Time series split function
def time_series_split(X, y, val_ratio=0.2):
    """Split X and y positionally: leading rows for training, trailing rows for validation.

    No shuffling or sorting is performed; the rows are used in the order
    given (assumes the caller passes them in chronological order -- TODO confirm).

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows (any object supporting ``len`` and ``.iloc`` slicing).
    y : pandas.Series
        Targets, one per row of ``X``.
    val_ratio : float, default 0.2
        Fraction of rows placed in the validation split. The validation row
        count is ``int(len(X) * val_ratio)``, i.e. rounded down.

    Returns
    -------
    tuple
        ``(X_train, X_val, y_train, y_val)``.

    Raises
    ------
    ValueError
        If ``val_ratio`` is outside ``[0, 1]`` or ``X`` and ``y`` differ in length.
    """
    if not 0 <= val_ratio <= 1:
        raise ValueError(f"val_ratio must be between 0 and 1, got {val_ratio!r}")

    n = len(X)
    if len(y) != n:
        raise ValueError(f"X and y must have the same length, got {n} and {len(y)}")

    n_val = int(n * val_ratio)
    # Slice on an explicit boundary instead of a negative offset: when n_val
    # is 0, `iloc[:-0]` is `iloc[:0]` (empty train) and `iloc[-0:]` is the
    # whole frame, which would silently swap the two splits.
    split = n - n_val

    X_train = X.iloc[:split]
    X_val   = X.iloc[split:]

    y_train = y.iloc[:split]
    y_val   = y.iloc[split:]

    return X_train, X_val, y_train, y_val

# Hold out the trailing 20% of each city's rows as the validation split
X_sj_train, X_sj_val, y_sj_train, y_sj_val = time_series_split(X_sj_full, y_sj_full, val_ratio=0.2)
X_iq_train, X_iq_val, y_iq_train, y_iq_val = time_series_split(X_iq_full, y_iq_full, val_ratio=0.2)

# Report the (rows, columns) shape of each split. The labels are Turkish
# ("boyutları" = "sizes"); the dotless "ı" had been mis-decoded into a CJK
# character and is restored here.
print("San Juan train/val boyutları:", X_sj_train.shape, X_sj_val.shape)
print("Iquitos train/val boyutları:", X_iq_train.shape, X_iq_val.shape)


San Juan train/val boyutları: (749, 22) (187, 22)
Iquitos train/val boyutları: (416, 22) (104, 22)


In [19]:
# Create polynomial regression model pipeline
def make_poly_model(degree=2):
    """Build an unfitted polynomial-regression pipeline.

    The pipeline runs three named steps in order:
    ``"scale"`` (StandardScaler), ``"poly"`` (PolynomialFeatures of the given
    degree, without the constant bias column) and ``"linreg"`` (LinearRegression).

    Parameters
    ----------
    degree : int, default 2
        Polynomial degree passed to ``PolynomialFeatures``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        A new, unfitted pipeline.
    """
    steps = [
        ("scale", StandardScaler()),
        ("poly", PolynomialFeatures(degree=degree, include_bias=False)),
        ("linreg", LinearRegression()),
    ]
    return Pipeline(steps)


In [20]:
# Train and validate function
def train_and_validate(degree, X_train, y_train, X_val, y_val):
    """Fit a polynomial model of the given degree and score it on the validation split.

    Parameters
    ----------
    degree : int
        Polynomial degree forwarded to ``make_poly_model``.
    X_train, y_train
        Data the pipeline is fitted on.
    X_val, y_val
        Held-out data used only for scoring.

    Returns
    -------
    tuple
        ``(model, mae)``: the fitted pipeline and its mean absolute error
        on the validation split.
    """
    # Pipeline.fit returns the pipeline itself, so build and fit in one step
    fitted = make_poly_model(degree).fit(X_train, y_train)
    val_error = mean_absolute_error(y_val, fitted.predict(X_val))
    return fitted, val_error


In [24]:
# Polynomial degrees to compare
degrees = [1, 2, 3, 4]

# One record per degree: validation MAE for each city
results = []

for d in degrees:
    # Each city gets its own model, fitted and scored on that city's split
    model_sj, mae_sj = train_and_validate(d, X_sj_train, y_sj_train, X_sj_val, y_sj_val)
    model_iq, mae_iq = train_and_validate(d, X_iq_train, y_iq_train, X_iq_val, y_iq_val)

    results.append(dict(degree=d, mae_sj=mae_sj, mae_iq=mae_iq))

# Build the summary table once from the collected records
results_df = pd.DataFrame(results)
print(results_df)


   degree      mae_sj     mae_iq
0       1   23.701808   8.723022
1       2   44.427306  16.435350
2       3  142.231408  27.841559
3       4  127.479903  20.061307
