# DengAI: Predicting Disease Spread
**XGBoost + LightGBM + CatBoost ensemble**

1. Run Cell 1 to install the required packages and upload your 3 CSV files
2. Run Cell 2 to train & predict
3. Run Cell 3 to download submission.csv
4. Upload submission.csv to DrivenData

In [None]:
# Setup cell: install the three gradient-boosting libraries imported by the
# training cell (-q keeps pip's output quiet).
!pip install -q xgboost lightgbm catboost
# Colab helper module that provides the upload/download calls used in this notebook.
from google.colab import files
print('Select your 3 CSV files: dengue_features_train.csv, dengue_labels_train.csv, dengue_features_test.csv')
# The training cell reads the three files by the exact names listed above,
# so they must be uploaded under those names.
uploaded = files.upload()

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import warnings
warnings.filterwarnings('ignore')

# Read the three competition files, in the same order as before:
# training features, training labels, test features.
train_features, train_labels, test_features = (
    pd.read_csv(csv_name)
    for csv_name in (
        'dengue_features_train.csv',
        'dengue_labels_train.csv',
        'dengue_features_test.csv',
    )
)

# Attach the weekly case counts to their feature rows; a row is identified
# by (city, year, weekofyear).
merge_keys = ['city', 'year', 'weekofyear']
train = pd.merge(train_features, train_labels, on=merge_keys)
print(f'Train: {train.shape}, Test: {test_features.shape}')

def _fill_within_city(df, cols):
    """Forward-fill then back-fill ``cols`` in place, one city at a time.

    Filling per city stops values from one city's series spilling into the
    other's when both cities are stacked in the same frame. Columns without
    missing values are left untouched.
    """
    cols = [c for c in cols if df[c].isna().any()]
    if not cols:
        return
    filled = df.groupby('city', sort=False)[cols].ffill()
    filled = filled.groupby(df['city'], sort=False).bfill()
    df[cols] = filled


def engineer(df, *, include_case_lags=False):
    """Return a copy of ``df`` with calendar, climate and lag features added.

    The frame must contain ``city``, ``weekofyear`` and ``week_start_date``;
    rows are assumed to be in chronological order within each city, because
    lags and rolling windows are taken in row order.

    Added features:
      * ``month``, ``is_wet`` (months 5-11), ``week_sin`` / ``week_cos``;
      * ``ndvi_avg``, ``precip_avg`` and, when their source columns exist,
        ``temp_range`` and ``humid_temp``;
      * for a fixed set of climate columns: lags 1-4 (``_l1``..``_l4``),
        rolling means over 4/8/12 rows (``_r4``/``_r8``/``_r12``) and a
        4-row rolling standard deviation (``_s4``), all computed per city.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw feature rows (optionally with ``total_cases``).
    include_case_lags : bool, default False
        When True and ``total_cases`` is present, also add ``cases_l1..l4``
        and ``cases_r4`` / ``cases_r8``. These use only *earlier* weeks
        (the rolling means are taken over the series shifted by one row), so
        the current target never appears in its own features. They are off
        by default because they cannot be computed for a frame without
        ``total_cases`` (e.g. the test set).

    Returns
    -------
    pandas.DataFrame
        New frame without ``week_start_date`` and without missing values:
        gaps are forward/back-filled within each city, and whatever is still
        missing is set to 0.
    """
    df = df.copy()

    # Calendar features.
    df['month'] = pd.to_datetime(df['week_start_date']).dt.month
    df = df.drop(columns='week_start_date')
    df['is_wet'] = df['month'].isin([5, 6, 7, 8, 9, 10, 11]).astype(int)
    df['week_sin'] = np.sin(2 * np.pi * df['weekofyear'] / 52)
    df['week_cos'] = np.cos(2 * np.pi * df['weekofyear'] / 52)

    # Everything that is not an identifier, the target or a calendar feature
    # is treated as a climate measurement and gap-filled per city.
    non_climate = {'city', 'year', 'weekofyear', 'total_cases',
                   'month', 'is_wet', 'week_sin', 'week_cos'}
    ndvi = [c for c in df.columns if 'ndvi' in c]
    climate = [c for c in df.columns if c not in non_climate]
    _fill_within_city(df, climate)

    # Aggregates and interactions of the raw measurements.
    df['ndvi_avg'] = df[ndvi].mean(axis=1)
    if 'reanalysis_max_air_temp_k' in df.columns:
        df['temp_range'] = (df['reanalysis_max_air_temp_k']
                            - df['reanalysis_min_air_temp_k'])
    if 'reanalysis_specific_humidity_g_per_kg' in df.columns:
        df['humid_temp'] = (df['reanalysis_specific_humidity_g_per_kg']
                            * df['reanalysis_avg_temp_k'])
    precip = [c for c in df.columns if 'precip' in c]
    if precip:
        df['precip_avg'] = df[precip].mean(axis=1)

    lag_candidates = [
        'reanalysis_specific_humidity_g_per_kg', 'reanalysis_dew_point_temp_k',
        'reanalysis_avg_temp_k', 'station_avg_temp_c', 'precipitation_amt_mm',
        'ndvi_avg', 'humid_temp',
    ]
    lag_cols = [c for c in lag_candidates if c in df.columns]

    # Build all lag/rolling columns first and attach them in one concat, so
    # the frame is not grown one column at a time.
    by_city = df.groupby('city', sort=False)
    derived = {}
    for col in lag_cols:
        series = by_city[col]
        for lag in (1, 2, 3, 4):
            derived[f'{col}_l{lag}'] = series.shift(lag)
        for window in (4, 8, 12):
            derived[f'{col}_r{window}'] = series.transform(
                lambda s, w=window: s.rolling(w, min_periods=1).mean())
        derived[f'{col}_s4'] = series.transform(
            lambda s: s.rolling(4, min_periods=1).std())

    case_cols = []
    if include_case_lags and 'total_cases' in df.columns:
        cases = by_city['total_cases']
        for lag in (1, 2, 3, 4):
            derived[f'cases_l{lag}'] = cases.shift(lag)
        for window in (4, 8):
            # shift(1) first: the window must end at the previous week, or
            # the value being predicted would be part of its own feature.
            derived[f'cases_r{window}'] = cases.transform(
                lambda s, w=window: s.shift(1).rolling(w, min_periods=1).mean())
        case_cols = [name for name in derived if name.startswith('cases_')]

    df = pd.concat([df, pd.DataFrame(derived, index=df.index)], axis=1)

    if case_cols:
        # No history exists before a city's first rows; use 0 rather than
        # back-filling, which would copy current/future targets backwards.
        df[case_cols] = df[case_cols].fillna(0)

    # Remaining gaps (mostly the leading rows of each lag column) are filled
    # inside each city so one city's values never seed the other's.
    _fill_within_city(df, df.columns.drop('city'))
    return df.fillna(0)

print('Engineering features...')
train_e = engineer(train)
test_e = engineer(test_features)
drop = ['city','total_cases']
# Train only on features that exist for BOTH frames. Creating a train-only
# column in the test frame as a constant 0 would make the models predict from
# values they never saw during fitting (e.g. features derived from
# total_cases, which the test frame does not have).
candidates = [c for c in train_e.columns if c not in drop]
fcols = [c for c in candidates if c in test_e.columns]
skipped = [c for c in candidates if c not in test_e.columns]
if skipped:
    print(f'Skipping {len(skipped)} train-only features: {skipped}')
# Same columns, same order as the training matrix.
test_e2 = test_e[fcols]
print(f'Features: {len(fcols)}')

# Fit one ensemble per city and store its rounded, non-negative integer
# predictions for that city's test rows in `results[city]`.
results = {}
for city in ['sj','iq']:
    ct = train_e[train_e.city==city]
    X, y = ct[fcols].values, ct['total_cases'].values
    # test_e2 shares test_e's index, so test_e's city column can mask it.
    Xt = test_e2[test_e.city==city].values
    print(f'\n{city.upper()}: {len(X)} train, {len(Xt)} test')

    # Average XGBoost + LightGBM + CatBoost, repeated over several seeds.
    all_preds = []
    for seed in [42,123,456,789]:
        models = [
            XGBRegressor(n_estimators=1000, max_depth=5, learning_rate=0.03,
                         subsample=0.8, colsample_bytree=0.7, reg_alpha=0.1,
                         reg_lambda=1, min_child_weight=5, random_state=seed,
                         verbosity=0),
            # LightGBM ignores `subsample` unless `subsample_freq` > 0;
            # subsample_freq=1 re-draws the 80% row sample every iteration.
            LGBMRegressor(n_estimators=1000, max_depth=5, learning_rate=0.03,
                          subsample=0.8, subsample_freq=1, colsample_bytree=0.7,
                          reg_alpha=0.1, reg_lambda=1, min_child_samples=10,
                          random_state=seed, verbose=-1),
            CatBoostRegressor(iterations=1000, depth=5, learning_rate=0.03,
                              l2_leaf_reg=3, random_seed=seed, verbose=0),
        ]
        member_preds = []
        for model in models:
            model.fit(X, y)
            member_preds.append(model.predict(Xt))
        # Equal-weight average over however many models are in the list.
        all_preds.append(np.mean(member_preds, axis=0))

    # Case counts are non-negative integers: round, clip at 0, cast to int.
    results[city] = np.clip(np.round(np.mean(all_preds,axis=0)),0,None).astype(int)

    # Diagnostic only: expanding-window CV of a single LightGBM model (not
    # the full ensemble) as a rough indication of out-of-sample MAE.
    tscv = TimeSeriesSplit(n_splits=5)
    scores = []
    for tr, vl in tscv.split(X):
        cv_model = LGBMRegressor(n_estimators=1000, max_depth=5,
                                 learning_rate=0.03, random_state=42, verbose=-1)
        cv_model.fit(X[tr], y[tr])
        p = np.clip(np.round(cv_model.predict(X[vl])),0,None)
        scores.append(mean_absolute_error(y[vl],p))
    print(f'  CV MAE: {np.mean(scores):.2f}')
    print(f'  Predictions: min={results[city].min()}, max={results[city].max()}, mean={results[city].mean():.1f}')

# Assemble the submission: identifier columns from the raw test file plus the
# predicted counts, San Juan rows first and Iquitos rows second.
id_cols = ['city', 'year', 'weekofyear']
sj_t = test_features.loc[test_features['city'] == 'sj', id_cols].assign(
    total_cases=results['sj'])
iq_t = test_features.loc[test_features['city'] == 'iq', id_cols].assign(
    total_cases=results['iq'])

sub = pd.concat([sj_t, iq_t], ignore_index=True)
sub.to_csv('submission.csv', index=False)
print(f'\nDone! submission.csv ready ({len(sub)} rows)')

In [None]:
# Imported again here so this cell does not depend on the setup cell's import.
from google.colab import files
# Download the submission.csv written by the training cell.
files.download('submission.csv')