# Project Title: Outsmarting Outbreaks

## AIM: To predict the total number of climate-sensitive waterborne disease outbreak in Tanzania

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from scipy.spatial import cKDTree

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomTreesEmbedding
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

import random
import joblib
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import Normalizer, StandardScaler, RobustScaler, OneHotEncoder

In [2]:
SEED = 2023
random.seed(SEED)
np.random.seed(SEED)

In [3]:
# Load datasets
train = pd.read_csv("Train.csv")
test = pd.read_csv("Test.csv")
toilets = pd.read_csv("toilets.csv")
waste_management = pd.read_csv("waste_management.csv")
water_sources = pd.read_csv("water_sources.csv")

In [4]:
# Combine train and test datasets for consistent preprocessing
hospital_data = pd.concat([train, test])

In [5]:
# Drop unnecessary columns from supplementary datasets
for df in [toilets, waste_management, water_sources]:
    df.drop(columns=['Year', 'Month'], inplace=True)

In [6]:
# Rename columns for clarity
def rename_columns(df, prefix):
    for col in df.columns:
        if col not in ['Month_Year_lat_lon', 'lat_lon']:
            df.rename(columns={col: f"{prefix}_{col}"}, inplace=True)

rename_columns(toilets, "toilet")
rename_columns(waste_management, "waste")
rename_columns(water_sources, "water")

In [7]:
# Fill missing values in the 'Total' column
hospital_data['Total'].fillna(0, inplace=True)

In [8]:
# Drop rows with missing latitude and longitude in water sources
water_sources.dropna(subset=['water_Transformed_Latitude'], inplace=True)

In [9]:
def find_nearest(hospital_df, location_df, lat_col, lon_col, id_col):
    # Create a cKDTree for efficient nearest neighbour search
    tree = cKDTree(location_df[[lat_col, lon_col]].values)
    nearest = {}
    # Loop through each hospital and find the nearest site in location_df
    for _, row in hospital_df.iterrows():
        _, idx = tree.query([row['Transformed_Latitude'], row['Transformed_Longitude']])
        nearest[row['ID']] = location_df.iloc[idx][id_col]
    return nearest

In [10]:
# Ensure unique identifier columns exist in all supplementary datasets
for df, prefix in [(toilets, 'toilet'), (waste_management, 'waste'), (water_sources, 'water')]:
    df[f"{prefix}_Month_Year_lat_lon"] = (
        df[f"{prefix}_Month_Year"] + '_' +
        df[f"{prefix}_Transformed_Latitude"].astype(str) + '_' +
        df[f"{prefix}_Transformed_Longitude"].astype(str)
    )

In [11]:
# Merge datasets with nearest locations
merged_data = hospital_data.copy()
datasets = [
    (toilets, 'toilet', 'toilet_Month_Year_lat_lon'),
    (waste_management, 'waste', 'waste_Month_Year_lat_lon'),
    (water_sources, 'water', 'water_Month_Year_lat_lon'),
]

In [12]:
for df, prefix, id_col in datasets:
    nearest = find_nearest(merged_data, df, f"{prefix}_Transformed_Latitude", f"{prefix}_Transformed_Longitude", id_col)
    nearest_df = pd.DataFrame(list(nearest.items()), columns=['ID', id_col])
    merged_data = merged_data.merge(nearest_df, on="ID").merge(df, on=id_col)

In [13]:
merged_data.shape

(29332, 135)

## Start modeling

In [14]:
# Split merged data into train and test sets
train_df = merged_data[merged_data['Year'] < 2023]
test_df = merged_data[merged_data['Year'] == 2023]

In [15]:
# Specify the target column
target_column = 'Total'

# Feature and target split
X = train_df.drop(columns=[target_column, 'ID', 'Location'])  # Exclude unnecessary columns
y = train_df[target_column]

In [16]:
print('mean of target per year :')
display(train_df.groupby(['ID', 'Year'])[target_column].max().reset_index().groupby('Year')[target_column].mean())

mean of target per year :


Year
2019    18.295832
2020    15.540424
2021    13.049686
2022    10.775961
Name: Total, dtype: float64

### Data Preprocessing

In [17]:
NBR_FOLDS = 10
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import QuantileTransformer


def preprocess(dt_train, dt_test, mode = 'max', one_hot = False, tar_red = 3, dis = None, training_year = 2022, extra_ftrs = False):

    if dis is not None:
        dt_train = dt_train[dt_train['Disease'] == dis]
    dt_train_old = dt_train[dt_train['Year']==training_year-1].reset_index(drop = True)
    dt_train = dt_train[dt_train['Year']==training_year].reset_index(drop = True)


    
    # make the model more conservative by reducing the target values to follow the same bias per year : (2019 : 18), (2020 : 15), ..., (2023 : 7)
    # 2019  :  18.295832
    # 2020  :  15.540424
    # 2021  :  13.049686
    # 2022  :  10.775961
    dt_train.loc[dt_train[target_column]>tar_red, target_column] -= tar_red
    dt_train_old.loc[dt_train_old[target_column]>2*tar_red, target_column] -= 2*tar_red



    # handling samples with the same id :
    # features are the same per id for each year,but the target is different sometimes, 
    # it is advisable in this case to not train a model with the same input and different outputs,
    # so we will also make the output unique and select the mean or max target per id.
    ll = len(dt_train)
    if mode == 'mean':
        tar = dt_train.groupby(['ID', 'Year'])[target_column].mean()
    elif mode == 'max':
        tar = dt_train.groupby(['ID', 'Year'])[target_column].max()
    else:
        print('mode is not defined')
        raise Exception
    dt_train = dt_train.groupby(['ID', 'Year']).last().reset_index()
    dt_train[target_column] = tar.values
    print('dt_train :', ll, '->', len(dt_train))
    
    
    
    ll = len(dt_train_old)
    if mode == 'mean':
        tar = dt_train_old.groupby(['ID', 'Year'])[target_column].mean()
    elif mode == 'max':
        tar = dt_train_old.groupby(['ID', 'Year'])[target_column].max()
    else:
        print('mode is not defined')
        raise Exception
    dt_train_old = dt_train_old.groupby(['ID', 'Year']).last().reset_index()
    dt_train_old[target_column] = tar.values
    print('dt_train :', ll, '->', len(dt_train))

    # lgbm can be trained directly with categorical columns : object -> category
    for c in dt_train.dtypes[dt_train.dtypes=='object'].index:
        if c in  [target_column, 'ID']:
            continue;
        dt_train_old[c] = dt_train_old[c].astype('category')
        dt_train[c] = dt_train[c].astype('category')
        dt_test[c] = dt_test[c].astype('category')

 
    dt_features = [i for i in dt_train.columns if i not in [target_column, 'ID', 'Location']]
    dt_features = [i for i in dt_features if 'Latitude' not in i and 'Longitude' not in i]

    dt_label = target_column
    
    num_cols = [i for i in dt_features if dt_train[i].dtype not in ['object', 'category']]
    cat_cols = [i for i in dt_features if dt_train[i].dtype in ['object', 'category']]
    
    
    if extra_ftrs:
        NBR_FTRS = 20
        
        NBR_KM = NBR_FTRS
        for ncl in range(2,NBR_KM):
            cls = KMeans(n_clusters=ncl, random_state = 0)
            cls.fit(pd.concat([dt_train_old[num_cols], dt_train[num_cols], dt_test[num_cols]]))
            dt_train['kmeans_cluster'+str(ncl)] = cls.predict(dt_train[num_cols])
            dt_test['kmeans_cluster'+str(ncl)] = cls.predict(dt_test[num_cols])
            dt_train_old['kmeans_cluster'+str(ncl)] = cls.predict(dt_train_old[num_cols])

        NBR_PCA = NBR_FTRS
        pca = PCA(n_components=NBR_PCA, random_state = 0)
        pca.fit(pd.concat([dt_train_old[num_cols], dt_train[num_cols], dt_test[num_cols]]))
        dt_train[['pca_cluster'+str(ncl) for ncl in range(NBR_PCA)]] = pca.transform(dt_train[num_cols])
        dt_test[['pca_cluster'+str(ncl) for ncl in range(NBR_PCA)]] = pca.transform(dt_test[num_cols])
        dt_train_old[['pca_cluster'+str(ncl) for ncl in range(NBR_PCA)]] = pca.transform(dt_train_old[num_cols])

        random_tree = RandomTreesEmbedding(n_estimators=NBR_FTRS, max_depth=1, random_state=0)
        random_tree.fit(pd.concat([dt_train_old[num_cols], dt_train[num_cols], dt_test[num_cols]]))
        trn_trans = random_tree.transform(dt_train[num_cols]).toarray()
        NBR_TREE = trn_trans.shape[1]
        dt_train[['random_tree_cluster'+str(ncl) for ncl in range(NBR_TREE)]] = trn_trans
        dt_test[['random_tree_cluster'+str(ncl) for ncl in range(NBR_TREE)]] = random_tree.transform(dt_test[num_cols]).toarray()
        dt_train_old[['random_tree_cluster'+str(ncl) for ncl in range(NBR_TREE)]] = random_tree.transform(dt_train_old[num_cols]).toarray()

        num_cols += ['kmeans_cluster'+str(ncl) for ncl in range(2,NBR_KM)] + ['pca_cluster'+str(ncl) for ncl in range(NBR_PCA)] + ['random_tree_cluster'+str(ncl) for ncl in range(NBR_TREE)]
    
    dt_features = cat_cols + num_cols

    
    if one_hot:
        cat_cols = [i for i in dt_features if i not in num_cols]

        enc = OneHotEncoder()
        enc_cat = enc.fit_transform(pd.concat([dt_train[cat_cols], dt_test[cat_cols]])).toarray()

        dt_train_old[[f'enc_cat{i}' for i in range(enc_cat.shape[1])]] = enc.transform(dt_train_old[cat_cols]).toarray()
        dt_train[[f'enc_cat{i}' for i in range(enc_cat.shape[1])]] = enc_cat[:len(dt_train)]
        dt_test[[f'enc_cat{i}' for i in range(enc_cat.shape[1])]] = enc_cat[len(dt_train):]

        dt_features = num_cols + [f'enc_cat{i}' for i in range(enc_cat.shape[1])]
        num_cols = dt_features
    
    # row-wise scaling
    scaler = Normalizer()
    scaler.fit(pd.concat([dt_train[num_cols], dt_test[num_cols]]))
    dt_train_old[num_cols] = scaler.transform(dt_train_old[num_cols])
    dt_train[num_cols] = scaler.transform(dt_train[num_cols])
    dt_test[num_cols] = scaler.transform(dt_test[num_cols])


    return dt_train, dt_test, dt_features, dt_label, dt_train_old

### Model training

In [18]:
#from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import TweedieRegressor, HuberRegressor, SGDRegressor
def train_and_get_sub(dt_train, dt_test, dt_features, dt_label, dt_train_old = None, model_name = 'lgb'):

    skf = StratifiedKFold(n_splits=NBR_FOLDS)
    for fold, (train_indicies, valid_indicies) in enumerate(skf.split(X=dt_train[dt_features],
                                                                      y = dt_train['Location'].astype(str) + dt_train['Disease'].astype(str))):
        dt_train.loc[valid_indicies, "kfold"] = fold
    
    oof = np.zeros(len(dt_train))
    preds = np.zeros(len(dt_test))

    for fold in range(NBR_FOLDS):
        print(fold)
        train     = dt_train[dt_train['kfold'] !=fold].reset_index(drop = True)
        val       = dt_train[dt_train['kfold'] ==fold].reset_index(drop=True)
        cat_features = [i for i in dt_features if train[i].dtype == 'category']
        
        if dt_train_old is not None:
            train = pd.concat([train, dt_train_old]).astype(train.dtypes)

        train_dataset = lgb.Dataset(train[dt_features], train[dt_label])
        eval_dataset  = lgb.Dataset(val[dt_features], val[dt_label])


        model = lgb.train(
                        params = lgb_params,
                        train_set = train_dataset,
                        num_boost_round = 100000,
                        valid_sets = [train_dataset, eval_dataset],
                        callbacks = [lgb.early_stopping(200), lgb.log_evaluation(500)],
                    )

        oof[dt_train['kfold'] ==fold] += model.predict(val[dt_features])
        preds += model.predict(dt_test[dt_features])

    preds /= NBR_FOLDS

    mae = mean_absolute_error(dt_train[dt_label], oof)
    print(f"Mean Absolute Error (MAE): {mae}")
    mae = mean_absolute_error(dt_train[dt_label], np.floor(oof.clip(0)))
    print(f"Mean Absolute Error (MAE) after postprocessing: {mae}")
    
    dt_train['oof'] = oof
    

    sub = dt_test[['ID']].copy()
    sub['Predicted_Total'] = preds
    return sub, dt_train

### Evaluation and Submission

In [19]:
lgb_params = {
    'objective': 'mae',
    'metric' : 'mae',
    'learning_rate': 0.03,
    'max_depth': 5,
    'seed': 42,
    'n_jobs': 14,
    'boosting':'goss',
    'top_rate':0.3,
    'verbose' : -1,
}

dt_train = train_df.copy()
dt_test = test_df.copy()
dt_train, dt_test, dt_features, dt_label, dt_train_old = preprocess(dt_train, dt_test, extra_ftrs = True)
sub1, dt_train1 = train_and_get_sub(dt_train, dt_test, dt_features, dt_label)

dt_train = train_df.copy()
dt_test = test_df.copy()
dt_train, dt_test, dt_features, dt_label, dt_train_old = preprocess(dt_train, dt_test, extra_ftrs = True)
sub1_old, dt_train1_old = train_and_get_sub(dt_train, dt_test, dt_features, dt_label, dt_train_old)

bst_w, bst_score = -1, np.inf
for w in np.linspace(0, 1):
    tmp_oof = w * dt_train1['oof'] + (1-w) * dt_train1_old['oof']
    score = mean_absolute_error(dt_train1['Total'], tmp_oof)
    if score < bst_score:
        bst_w, bst_score = w, score

dt_train1['oof'] = bst_w * dt_train1['oof'] + (1-bst_w) * dt_train1_old['oof']
sub1['Predicted_Total'] = bst_w * sub1['Predicted_Total'] + (1-bst_w) * sub1_old['Predicted_Total']

print('MAE score ens :', mean_absolute_error(dt_train1['Total'], dt_train1['oof']))
print('MAE score ens clipped :', mean_absolute_error(dt_train1['Total'], np.floor(dt_train1['oof'].clip(0))))


diss = ['Typhoid', 'Diarrhea', 'Malaria', 'Schistosomiasis', 'Intestinal Worms']
subs_local, local_trns = [], []
for dis in diss:
    
    dt_train = train_df.copy()
    dt_test = test_df.copy()
    dt_train, dt_test, dt_features, dt_label, dt_train_old = preprocess(dt_train, dt_test, dis = dis)
    _sub, _dt_train = train_and_get_sub(dt_train, dt_test, dt_features, dt_label)
    
    dt_train = train_df.copy()
    dt_test = test_df.copy()
    dt_train, dt_test, dt_features, dt_label, dt_train_old = preprocess(dt_train, dt_test, dis = dis)
    _sub_old, _dt_train_old = train_and_get_sub(dt_train, dt_test, dt_features, dt_label, dt_train_old)

    
    bst_w, bst_score = -1, np.inf
    for w in np.linspace(0, 1):
        tmp_oof = w * _dt_train['oof'] + (1-w) * _dt_train_old['oof']
        score = mean_absolute_error(dt_train['Total'], tmp_oof)
        if score < bst_score:
            bst_w, bst_score = w, score
    _dt_train['oof'] = bst_w * _dt_train['oof'] + (1-bst_w) * _dt_train_old['oof']
    _sub['Predicted_Total'] = bst_w * _sub['Predicted_Total'] + (1-bst_w) * _sub_old['Predicted_Total']
    subs_local.append(_sub)
    local_trns.append(_dt_train)
sub2 = _sub.copy()
for i, dis in enumerate(diss):
    sub2.loc[dt_test['Disease'] == dis, 'Predicted_Total'] = subs_local[i].loc[dt_test['Disease'] == dis, 'Predicted_Total']
    print(dis, 'mae score :', mean_absolute_error(local_trns[i]['Total'], local_trns[i]['oof']))


w = 0.3
sub = sub1.copy()
sub.loc[dt_test['Disease'].isin(diss), 'Predicted_Total'] = w*sub1.loc[dt_test['Disease'].isin(diss), 'Predicted_Total'] + (1-w)*sub2.loc[dt_test['Disease'].isin(diss), 'Predicted_Total']

sub['Predicted_Total'] = np.floor(sub['Predicted_Total'].values.clip(0))

dt_train : 7194 -> 3852
dt_train : 5973 -> 3852




0
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 6.35489	valid_1's l1: 4.82287
[1000]	training's l1: 5.95035	valid_1's l1: 4.56423
[1500]	training's l1: 5.75789	valid_1's l1: 4.39959
[2000]	training's l1: 5.60883	valid_1's l1: 4.30157
[2500]	training's l1: 5.29236	valid_1's l1: 4.04819
Early stopping, best iteration is:
[2504]	training's l1: 5.29189	valid_1's l1: 4.04791
1
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 6.23949	valid_1's l1: 5.13898
[1000]	training's l1: 5.79674	valid_1's l1: 4.87059
[1500]	training's l1: 5.53362	valid_1's l1: 4.68675
[2000]	training's l1: 5.43501	valid_1's l1: 4.64234
[2500]	training's l1: 5.38367	valid_1's l1: 4.61603
[3000]	training's l1: 5.32073	valid_1's l1: 4.56834
Early stopping, best iteration is:
[3046]	training's l1: 5.31975	valid_1's l1: 4.56647
2
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 6.0194	valid_1's l1: 5.12146
[1000]	trainin



0
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 6.93546	valid_1's l1: 4.76819
[1000]	training's l1: 6.65332	valid_1's l1: 4.57059
[1500]	training's l1: 6.36879	valid_1's l1: 4.38685
[2000]	training's l1: 6.24904	valid_1's l1: 4.35538
[2500]	training's l1: 6.07854	valid_1's l1: 4.27248
[3000]	training's l1: 5.98534	valid_1's l1: 4.19973
[3500]	training's l1: 5.91537	valid_1's l1: 4.16543
Early stopping, best iteration is:
[3688]	training's l1: 5.89096	valid_1's l1: 4.14916
1
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 6.90089	valid_1's l1: 5.1801
[1000]	training's l1: 6.66227	valid_1's l1: 5.0024
[1500]	training's l1: 6.47442	valid_1's l1: 4.93107
[2000]	training's l1: 6.24065	valid_1's l1: 4.77699
[2500]	training's l1: 6.07131	valid_1's l1: 4.68592
[3000]	training's l1: 5.97272	valid_1's l1: 4.64109
[3500]	training's l1: 5.92405	valid_1's l1: 4.61583
Early stopping, best iteration is:
[3535]	training's l1: 5.91



0
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 8.05835	valid_1's l1: 2.32404
[1000]	training's l1: 7.78212	valid_1's l1: 2.15538
[1500]	training's l1: 7.72171	valid_1's l1: 2.08565
[2000]	training's l1: 7.63993	valid_1's l1: 1.98189
[2500]	training's l1: 7.62914	valid_1's l1: 1.97509
Early stopping, best iteration is:
[2779]	training's l1: 7.61183	valid_1's l1: 1.96496
1
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 7.71455	valid_1's l1: 2.8583
Early stopping, best iteration is:
[383]	training's l1: 7.73229	valid_1's l1: 2.8281
2
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[151]	training's l1: 7.73304	valid_1's l1: 1.91866
3
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[33]	training's l1: 8.34067	valid_1's l1: 3.98288
4
Training until validation scores don't improve for 200 rounds
Early stopping, best iterati



0
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[8]	training's l1: 7.98383	valid_1's l1: 2.53395
1
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 7.26277	valid_1's l1: 3.31243
Early stopping, best iteration is:
[432]	training's l1: 7.29078	valid_1's l1: 3.28623
2
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[79]	training's l1: 7.3854	valid_1's l1: 2.05906
3
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[45]	training's l1: 7.53152	valid_1's l1: 4.15799
4
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[12]	training's l1: 7.6354	valid_1's l1: 7.88756
5
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 6.17417	valid_1's l1: 20.9683
[1000]	training's l1: 5.87591	valid_1's l1: 20.7034
Early stopping, best iteration is:
[841



0
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[153]	training's l1: 9.98844	valid_1's l1: 8.62152
1
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[166]	training's l1: 9.38159	valid_1's l1: 12.179
2
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[54]	training's l1: 12.5662	valid_1's l1: 12.0542
3
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[276]	training's l1: 9.20716	valid_1's l1: 16.7263
4
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 8.7729	valid_1's l1: 18.5092
[1000]	training's l1: 8.26149	valid_1's l1: 18.0876
Early stopping, best iteration is:
[1032]	training's l1: 8.237	valid_1's l1: 18.0459
5
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 9.53367	valid_1's l1: 8.26588
[1000]	training's l1: 9.09788	valid_



0
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 10.1159	valid_1's l1: 9.07801
[1000]	training's l1: 9.61561	valid_1's l1: 8.81991
[1500]	training's l1: 9.27888	valid_1's l1: 8.50978
Early stopping, best iteration is:
[1397]	training's l1: 9.30855	valid_1's l1: 8.46192
1
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 10.1997	valid_1's l1: 12.7218
Early stopping, best iteration is:
[460]	training's l1: 10.2772	valid_1's l1: 12.6603
2
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[104]	training's l1: 11.7171	valid_1's l1: 11.6048
3
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 9.79823	valid_1's l1: 16.3988
Early stopping, best iteration is:
[306]	training's l1: 10.2881	valid_1's l1: 16.3916
4
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[81]	training's l1: 11.768	valid_1's l1: 20



0
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[37]	training's l1: 5.92545	valid_1's l1: 3.15677
1
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[121]	training's l1: 4.62926	valid_1's l1: 4.04553
2
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[134]	training's l1: 4.45812	valid_1's l1: 5.38601
3
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[142]	training's l1: 4.11856	valid_1's l1: 7.67945
4
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 3.47941	valid_1's l1: 8.90127
Early stopping, best iteration is:
[414]	training's l1: 3.53836	valid_1's l1: 8.88251
5
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 3.65502	valid_1's l1: 8.07537
[1000]	training's l1: 3.37331	valid_1's l1: 7.99149
[1500]	training's l1: 3.20347	val



0
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[20]	training's l1: 8.63037	valid_1's l1: 3.34377
1
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[38]	training's l1: 7.5748	valid_1's l1: 3.81459
2
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[51]	training's l1: 7.11128	valid_1's l1: 5.11273
3
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[57]	training's l1: 6.62941	valid_1's l1: 8.50187
4
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[267]	training's l1: 5.61109	valid_1's l1: 8.18919
5
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 5.29677	valid_1's l1: 8.12651
[1000]	training's l1: 4.93773	valid_1's l1: 7.99679
[1500]	training's l1: 4.70999	valid_1's l1: 7.82195
[2000]	training's l1: 4.61621	valid_



0
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 1.46226	valid_1's l1: 0.631932
[1000]	training's l1: 1.445	valid_1's l1: 0.605647
[1500]	training's l1: 1.43558	valid_1's l1: 0.597534
[2000]	training's l1: 1.43059	valid_1's l1: 0.588405
[2500]	training's l1: 1.42057	valid_1's l1: 0.569006
[3000]	training's l1: 1.41021	valid_1's l1: 0.548664
[3500]	training's l1: 1.40229	valid_1's l1: 0.535229
[4000]	training's l1: 1.39567	valid_1's l1: 0.525957
[4500]	training's l1: 1.39254	valid_1's l1: 0.5205
[5000]	training's l1: 1.38976	valid_1's l1: 0.516952
[5500]	training's l1: 1.38696	valid_1's l1: 0.513698
[6000]	training's l1: 1.38197	valid_1's l1: 0.507708
Early stopping, best iteration is:
[6211]	training's l1: 1.38062	valid_1's l1: 0.503368
1
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 1.40866	valid_1's l1: 0.48456
[1000]	training's l1: 1.39712	valid_1's l1: 0.480419
[1500]	training's l1: 1.38987	valid_1's l1: 0.475



0
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 1.30574	valid_1's l1: 0.429419
[1000]	training's l1: 1.30302	valid_1's l1: 0.419032
Early stopping, best iteration is:
[1236]	training's l1: 1.30037	valid_1's l1: 0.417187
1
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 1.31594	valid_1's l1: 0.454304
[1000]	training's l1: 1.30987	valid_1's l1: 0.445276
Early stopping, best iteration is:
[982]	training's l1: 1.31002	valid_1's l1: 0.445203
2
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 1.25469	valid_1's l1: 1.33897
[1000]	training's l1: 1.25302	valid_1's l1: 1.33302
[1500]	training's l1: 1.25097	valid_1's l1: 1.32751
Early stopping, best iteration is:
[1685]	training's l1: 1.25019	valid_1's l1: 1.31731
3
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[55]	training's l1: 1.26533	valid_1's l1: 1.35792
4
Training until validation scor



0
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 7.08917	valid_1's l1: 8.72548
[1000]	training's l1: 6.7895	valid_1's l1: 8.6685
Early stopping, best iteration is:
[1079]	training's l1: 6.75016	valid_1's l1: 8.66252
1
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 7.22117	valid_1's l1: 7.6777
Early stopping, best iteration is:
[761]	training's l1: 6.99698	valid_1's l1: 7.63773
2
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[75]	training's l1: 8.15198	valid_1's l1: 11.5303
3
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 7.02839	valid_1's l1: 9.00263
Early stopping, best iteration is:
[451]	training's l1: 7.08529	valid_1's l1: 8.98514
4
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[120]	training's l1: 7.64997	valid_1's l1: 12.3699
5
Training until validation scores don't improve



0
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[139]	training's l1: 9.08238	valid_1's l1: 8.92617
1
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 8.49006	valid_1's l1: 7.83177
Early stopping, best iteration is:
[308]	training's l1: 8.79445	valid_1's l1: 7.78221
2
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[206]	training's l1: 8.9196	valid_1's l1: 10.834
3
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[49]	training's l1: 10.3769	valid_1's l1: 9.73401
4
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[229]	training's l1: 8.69968	valid_1's l1: 11.9297
5
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 8.38631	valid_1's l1: 11.3928
Early stopping, best iteration is:
[786]	training's l1: 8.0789	valid_1's l1: 11.2263
6


In [20]:
test_preds = sub['Predicted_Total'].values.copy()

In [21]:
lgb_params = {
    'objective': 'mae',
    'metric' : 'mae',
    'learning_rate': 0.03,
    'max_depth': 5,
    'seed': 42,
    'n_jobs': 14,
    'boosting':'goss',
    'top_rate':0.3,
    'verbose' : -1,
}

diss = ['Typhoid', 'Diarrhea', 'Malaria', 'Schistosomiasis', 'Intestinal Worms']


dt_train = train_df.copy()
dt_test = test_df.copy()
dt_train, dt_test, dt_features, dt_label, dt_train_old = preprocess(dt_train, dt_test, extra_ftrs = True)
pl = dt_test.reset_index(drop = True).copy()
pl['Total'] = test_preds.copy()
skf = StratifiedKFold(n_splits=NBR_FOLDS)
for fold, (train_indicies, valid_indicies) in enumerate(skf.split(X=pl[dt_features], y = pl['Location'].astype(str) + pl['Disease'].astype(str))):
    pl.loc[valid_indicies, "kfold"] = fold
sub1, dt_train1 = train_and_get_sub(pl, dt_test, dt_features, dt_label, dt_train)

dt_train = train_df.copy()
dt_test = test_df.copy()
dt_train, dt_test, dt_features, dt_label, dt_train_old = preprocess(dt_train, dt_test, extra_ftrs = True)
pl = dt_test.reset_index(drop = True).copy()
pl['Total'] = test_preds.copy()
skf = StratifiedKFold(n_splits=NBR_FOLDS)
for fold, (train_indicies, valid_indicies) in enumerate(skf.split(X=pl[dt_features], y = pl['Location'].astype(str) + pl['Disease'].astype(str))):
    pl.loc[valid_indicies, "kfold"] = fold
sub1_old, dt_train1_old = train_and_get_sub(pl, dt_test, dt_features, dt_label, 
                                           pd.concat([dt_train, dt_train_old]).reset_index(drop = True))


bst_w, bst_score = -1, np.inf
for w in np.linspace(0, 1):
    tmp_oof = w * dt_train1['oof'] + (1-w) * dt_train1_old['oof']
    score = mean_absolute_error(dt_train1['Total'], tmp_oof)
    if score < bst_score:
        bst_w, bst_score = w, score

dt_train1['oof'] = bst_w * dt_train1['oof'] + (1-bst_w) * dt_train1_old['oof']
sub1['Predicted_Total'] = bst_w * sub1['Predicted_Total'] + (1-bst_w) * sub1_old['Predicted_Total']

print('MAE score ens :', mean_absolute_error(dt_train1['Total'], dt_train1['oof']))
print('MAE score ens clipped :', mean_absolute_error(dt_train1['Total'], np.floor(dt_train1['oof'].clip(0))))

subs_local, local_trns = [], []
for dis in diss:
    
    dt_train = train_df.copy()
    dt_test = test_df.copy()
    dt_train, dt_test, dt_features, dt_label, dt_train_old = preprocess(dt_train, dt_test, dis = dis)
    pl = dt_test.reset_index(drop = True).copy()
    pl['Total'] = test_preds.copy()
    pl = pl[pl['Disease']==dis].reset_index(drop = True)
    skf = StratifiedKFold(n_splits=NBR_FOLDS)
    for fold, (train_indicies, valid_indicies) in enumerate(skf.split(X=pl[dt_features], y = pl['Location'].astype(str))):
        pl.loc[valid_indicies, "kfold"] = fold
    _sub, _dt_train = train_and_get_sub(pl, dt_test, dt_features, dt_label, dt_train)


    dt_train = train_df.copy()
    dt_test = test_df.copy()
    dt_train, dt_test, dt_features, dt_label, dt_train_old = preprocess(dt_train, dt_test, dis = dis)
    pl = dt_test.reset_index(drop = True).copy()
    pl['Total'] = test_preds.copy()
    pl = pl[pl['Disease']==dis].reset_index(drop = True)
    skf = StratifiedKFold(n_splits=NBR_FOLDS)
    for fold, (train_indicies, valid_indicies) in enumerate(skf.split(X=pl[dt_features], y = pl['Location'].astype(str))):
        pl.loc[valid_indicies, "kfold"] = fold
    _sub_old, _dt_train_old = train_and_get_sub(pl, dt_test, dt_features, dt_label, pd.concat([dt_train, dt_train_old]).reset_index(drop = True))

    
    bst_w, bst_score = -1, np.inf
    for w in np.linspace(0, 1):
        tmp_oof = w * _dt_train['oof'] + (1-w) * _dt_train_old['oof']
        score = mean_absolute_error(_dt_train['Total'], tmp_oof)
        if score < bst_score:
            bst_w, bst_score = w, score
    _dt_train['oof'] = bst_w * _dt_train['oof'] + (1-bst_w) * _dt_train_old['oof']
    _sub['Predicted_Total'] = bst_w * _sub['Predicted_Total'] + (1-bst_w) * _sub_old['Predicted_Total']
    subs_local.append(_sub)
    local_trns.append(_dt_train)
sub2 = _sub.copy()
for i, dis in enumerate(diss):
    sub2.loc[dt_test['Disease'] == dis, 'Predicted_Total'] = subs_local[i].loc[dt_test['Disease'] == dis, 'Predicted_Total']
    print(dis, 'mae score :', mean_absolute_error(local_trns[i]['Total'], local_trns[i]['oof']))

w = 0.3
sub = sub1.copy()
sub.loc[dt_test['Disease'].isin(diss), 'Predicted_Total'] = w*sub1.loc[dt_test['Disease'].isin(diss), 'Predicted_Total'] + (1-w)*sub2.loc[dt_test['Disease'].isin(diss), 'Predicted_Total']

sub['Predicted_Total'] = np.floor(sub['Predicted_Total'].clip(0))

sub.to_csv('submission1.csv', index = False)
sub.loc[dt_test['Disease'] == 'Cholera', 'Predicted_Total'] = 0
sub.loc[dt_test['Disease'] == 'Dysentery', 'Predicted_Total'] = 0
sub.to_csv('submission1.csv', index = False)

dt_train : 7194 -> 3852
dt_train : 5973 -> 3852
0
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 3.52151	valid_1's l1: 1.51662
[1000]	training's l1: 3.06187	valid_1's l1: 1.02338
[1500]	training's l1: 2.89641	valid_1's l1: 0.855807
[2000]	training's l1: 2.78733	valid_1's l1: 0.712489
[2500]	training's l1: 2.76761	valid_1's l1: 0.684099
[3000]	training's l1: 2.74124	valid_1's l1: 0.647743
[3500]	training's l1: 2.71524	valid_1's l1: 0.614549
[4000]	training's l1: 2.68466	valid_1's l1: 0.587934
[4500]	training's l1: 2.6652	valid_1's l1: 0.570981
[5000]	training's l1: 2.64537	valid_1's l1: 0.558615
[5500]	training's l1: 2.63312	valid_1's l1: 0.545854
[6000]	training's l1: 2.62153	valid_1's l1: 0.536758
[6500]	training's l1: 2.609	valid_1's l1: 0.526887
Early stopping, best iteration is:
[6571]	training's l1: 2.60799	valid_1's l1: 0.525612
1
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 3.43036	valid_1's l1: 1.26144
[1

[13500]	training's l1: 2.50599	valid_1's l1: 0.21452
Early stopping, best iteration is:
[13543]	training's l1: 2.50558	valid_1's l1: 0.214282
8
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 3.55321	valid_1's l1: 0.971562
[1000]	training's l1: 3.08819	valid_1's l1: 0.568167
[1500]	training's l1: 2.8822	valid_1's l1: 0.407504
[2000]	training's l1: 2.79723	valid_1's l1: 0.345969
[2500]	training's l1: 2.71643	valid_1's l1: 0.309353
[3000]	training's l1: 2.67846	valid_1's l1: 0.28564
[3500]	training's l1: 2.65685	valid_1's l1: 0.2736
[4000]	training's l1: 2.64315	valid_1's l1: 0.265046
Early stopping, best iteration is:
[3997]	training's l1: 2.64316	valid_1's l1: 0.264843
9
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 3.65824	valid_1's l1: 1.055
[1000]	training's l1: 2.97614	valid_1's l1: 0.456974
[1500]	training's l1: 2.84377	valid_1's l1: 0.373747
[2000]	training's l1: 2.78366	valid_1's l1: 0.33661
[2500]	training'

[6500]	training's l1: 3.7465	valid_1's l1: 0.299712
Early stopping, best iteration is:
[6416]	training's l1: 3.74839	valid_1's l1: 0.297138
7
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 4.99715	valid_1's l1: 1.63499
[1000]	training's l1: 4.36615	valid_1's l1: 0.993239
[1500]	training's l1: 4.17875	valid_1's l1: 0.758599
[2000]	training's l1: 4.04875	valid_1's l1: 0.620836
[2500]	training's l1: 3.96825	valid_1's l1: 0.529966
[3000]	training's l1: 3.9082	valid_1's l1: 0.482309
[3500]	training's l1: 3.86394	valid_1's l1: 0.44187
[4000]	training's l1: 3.81735	valid_1's l1: 0.405974
[4500]	training's l1: 3.77931	valid_1's l1: 0.358867
[5000]	training's l1: 3.74999	valid_1's l1: 0.32449
[5500]	training's l1: 3.72473	valid_1's l1: 0.303543
[6000]	training's l1: 3.71457	valid_1's l1: 0.299383
[6500]	training's l1: 3.6947	valid_1's l1: 0.28516
[7000]	training's l1: 3.67631	valid_1's l1: 0.270731
[7500]	training's l1: 3.6666	valid_1's l1: 0.266789
[8000]	tr

1
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 5.35055	valid_1's l1: 0.835345
[1000]	training's l1: 5.12153	valid_1's l1: 0.758471
Early stopping, best iteration is:
[1087]	training's l1: 5.09207	valid_1's l1: 0.752157
2
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 5.37827	valid_1's l1: 0.910794
[1000]	training's l1: 5.05116	valid_1's l1: 0.856249
[1500]	training's l1: 4.96036	valid_1's l1: 0.826767
[2000]	training's l1: 4.88922	valid_1's l1: 0.808748
[2500]	training's l1: 4.85087	valid_1's l1: 0.799886
Early stopping, best iteration is:
[2511]	training's l1: 4.84955	valid_1's l1: 0.798214
3
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 5.27715	valid_1's l1: 0.539784
[1000]	training's l1: 5.03363	valid_1's l1: 0.498508
[1500]	training's l1: 4.94005	valid_1's l1: 0.494377
Early stopping, best iteration is:
[1308]	training's l1: 4.98121	valid_1's l1: 0.488396
4
Training until 

Early stopping, best iteration is:
[583]	training's l1: 4.06225	valid_1's l1: 0.503704
2
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 4.03189	valid_1's l1: 0.565159
[1000]	training's l1: 3.8108	valid_1's l1: 0.511134
Early stopping, best iteration is:
[1115]	training's l1: 3.77887	valid_1's l1: 0.506775
3
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 4.02318	valid_1's l1: 0.584121
[1000]	training's l1: 3.79818	valid_1's l1: 0.541315
Early stopping, best iteration is:
[1199]	training's l1: 3.75075	valid_1's l1: 0.521151
4
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 4.0166	valid_1's l1: 0.412047
Early stopping, best iteration is:
[750]	training's l1: 3.90901	valid_1's l1: 0.379723
5
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 4.09099	valid_1's l1: 0.439274
[1000]	training's l1: 3.92501	valid_1's l1: 0.400225
[1500]	training's l1: 3.83461

[5500]	training's l1: 0.814896	valid_1's l1: 0.00237023
[6000]	training's l1: 0.814794	valid_1's l1: 0.0023207
Early stopping, best iteration is:
[5860]	training's l1: 0.81483	valid_1's l1: 0.00231828
9
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 0.824174	valid_1's l1: 0.00968018
[1000]	training's l1: 0.820521	valid_1's l1: 0.00848818
[1500]	training's l1: 0.820226	valid_1's l1: 0.00824945
Early stopping, best iteration is:
[1795]	training's l1: 0.819872	valid_1's l1: 0.00814154
Mean Absolute Error (MAE): 0.012224133779661337
Mean Absolute Error (MAE) after postprocessing: 0.026836158192090395
dt_train : 654 -> 642
dt_train : 543 -> 642
0
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 4.19384	valid_1's l1: 0.572307
Early stopping, best iteration is:
[658]	training's l1: 4.11592	valid_1's l1: 0.550614
1
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 4.16654	valid_1's l1: 0.6319