In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error,mean_absolute_error
import optuna

import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

#### Load training and test data

In [2]:
household_train = pd.read_csv('./train_hh_features.csv')
household_test = pd.read_csv('./test_hh_features.csv')

target_train_consumption = pd.read_csv('./train_hh_gt.csv')

household_train.head()

Unnamed: 0,hhid,com,weight,strata,utl_exp_ppp17,male,hsize,num_children5,num_children10,num_children18,...,consumed4200,consumed4300,consumed4400,consumed4500,consumed4600,consumed4700,consumed4800,consumed4900,consumed5000,survey_id
0,100001,1,75,4,594.80627,Female,1,0,0,0,...,Yes,No,No,No,Yes,Yes,Yes,Yes,No,100000
1,100002,1,150,4,1676.2723,Female,2,0,0,0,...,Yes,No,No,No,No,Yes,Yes,No,No,100000
2,100003,1,375,4,506.93719,Male,5,0,0,2,...,Yes,Yes,No,Yes,Yes,Yes,Yes,No,Yes,100000
3,100004,1,375,4,824.61786,Male,5,0,0,1,...,No,Yes,No,No,No,Yes,Yes,No,No,100000
4,100005,1,525,4,351.47644,Male,7,1,0,0,...,Yes,No,No,Yes,No,Yes,Yes,Yes,No,100000


### Drop unnecessary columns

In [3]:
x = household_train.drop(columns=['survey_id','hhid'])
x_test = household_test.drop(columns=['survey_id','hhid'])

y = target_train_consumption.drop(columns=['survey_id','hhid'])

x.shape, y.shape, x_test.shape

((104234, 86), (104234, 1), (103023, 86))

In [4]:
x.head()

Unnamed: 0,com,weight,strata,utl_exp_ppp17,male,hsize,num_children5,num_children10,num_children18,age,...,consumed4100,consumed4200,consumed4300,consumed4400,consumed4500,consumed4600,consumed4700,consumed4800,consumed4900,consumed5000
0,1,75,4,594.80627,Female,1,0,0,0,75,...,Yes,Yes,No,No,No,Yes,Yes,Yes,Yes,No
1,1,150,4,1676.2723,Female,2,0,0,0,61,...,Yes,Yes,No,No,No,No,Yes,Yes,No,No
2,1,375,4,506.93719,Male,5,0,0,2,49,...,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,No,Yes
3,1,375,4,824.61786,Male,5,0,0,1,58,...,No,No,Yes,No,No,No,Yes,Yes,No,No
4,1,525,4,351.47644,Male,7,1,0,0,57,...,Yes,Yes,No,No,Yes,No,Yes,Yes,Yes,No


### Map categorical values

In [5]:
mapping_df = pd.read_csv('./feature_value_descriptions.csv')

In [6]:
mapping_dict = {}
for _, row in mapping_df.iterrows():
    var_name = row['Variable name']
    value = row['Value']
    value_label = row['Value label']
    
    if var_name not in mapping_dict:
        mapping_dict[var_name] = {}
    mapping_dict[var_name][value_label] = value

mapping_dict

{'any_nonagric': {'No': 0, 'Yes': 1},
 'consumed100': {'No': 0, 'Yes': 1},
 'consumed1000': {'No': 0, 'Yes': 1},
 'consumed1100': {'No': 0, 'Yes': 1},
 'consumed1200': {'No': 0, 'Yes': 1},
 'consumed1300': {'No': 0, 'Yes': 1},
 'consumed1400': {'No': 0, 'Yes': 1},
 'consumed1500': {'No': 0, 'Yes': 1},
 'consumed1600': {'No': 0, 'Yes': 1},
 'consumed1700': {'No': 0, 'Yes': 1},
 'consumed1800': {'No': 0, 'Yes': 1},
 'consumed1900': {'No': 0, 'Yes': 1},
 'consumed200': {'No': 0, 'Yes': 1},
 'consumed2000': {'No': 0, 'Yes': 1},
 'consumed2100': {'No': 0, 'Yes': 1},
 'consumed2200': {'No': 0, 'Yes': 1},
 'consumed2300': {'No': 0, 'Yes': 1},
 'consumed2400': {'No': 0, 'Yes': 1},
 'consumed2500': {'No': 0, 'Yes': 1},
 'consumed2600': {'No': 0, 'Yes': 1},
 'consumed2700': {'No': 0, 'Yes': 1},
 'consumed2800': {'No': 0, 'Yes': 1},
 'consumed2900': {'No': 0, 'Yes': 1},
 'consumed300': {'No': 0, 'Yes': 1},
 'consumed3000': {'No': 0, 'Yes': 1},
 'consumed3100': {'No': 0, 'Yes': 1},
 'consumed3200'

In [7]:
# Apply label mapping according to mapping dictionary
for column in x.columns:
    if column in mapping_dict:
        x[column] = x[column].map(mapping_dict[column])
        x_test[column] = x_test[column].map(mapping_dict[column])

x.head()

Unnamed: 0,com,weight,strata,utl_exp_ppp17,male,hsize,num_children5,num_children10,num_children18,age,...,consumed4100,consumed4200,consumed4300,consumed4400,consumed4500,consumed4600,consumed4700,consumed4800,consumed4900,consumed5000
0,1,75,4,594.80627,0,1,0,0,0,75,...,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
1,1,150,4,1676.2723,0,2,0,0,0,61,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,1,375,4,506.93719,1,5,0,0,2,49,...,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0
3,1,375,4,824.61786,1,5,0,0,1,58,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,1,525,4,351.47644,1,7,1,0,0,57,...,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0


In [8]:
x.isnull().sum(axis=0)

com               0
weight            0
strata            0
utl_exp_ppp17    85
male              0
                 ..
consumed4600     56
consumed4700     52
consumed4800     56
consumed4900     56
consumed5000     66
Length: 86, dtype: int64

In [9]:
x = x.fillna(x.mean())
x.isnull().sum(axis=0)

com              0
weight           0
strata           0
utl_exp_ppp17    0
male             0
                ..
consumed4600     0
consumed4700     0
consumed4800     0
consumed4900     0
consumed5000     0
Length: 86, dtype: int64

In [10]:
x_test = x_test.fillna(x_test.mean())
x_test.isnull().sum(axis=0)

com              0
weight           0
strata           0
utl_exp_ppp17    0
male             0
                ..
consumed4600     0
consumed4700     0
consumed4800     0
consumed4900     0
consumed5000     0
Length: 86, dtype: int64

In [11]:
X = np.array(x)
x_test = np.array(x_test)
Y = np.array(y)

x.shape, y.shape

((104234, 86), (104234, 1))

### Define wS-wMAPE metric

In [12]:
def wsmape(y_true, y_pred, weights):
    y_true = y_true.ravel()
    y_pred = y_pred.ravel()
    weights = weights.ravel()

    return (weights * abs(y_true - y_pred)).sum() / (weights * abs(y_true)).sum()

### Split to train and validation set

In [13]:
test_size = 0.20
seed = 42

x_train, x_val, y_train, y_val = train_test_split(X, Y, random_state=seed, test_size=test_size)
x_train.shape, x_val.shape, y_train.shape, y_val.shape

((83387, 86), (20847, 86), (83387, 1), (20847, 1))

### Extract weights

In [16]:
train_weights = x_train[:,1]
test_weights = x_val[:,1]

train_weights.shape, test_weights.shape

((83387,), (20847,))

In [19]:
train_weights[:5]

array([ 204., 1530., 1611.,  446.,  920.])

In [18]:
fx_train = np.delete(x_train, 1, axis=1)
x_test = np.delete(x_test, 1, axis=1)

fx_train.shape, x_test.shape

((83387, 85), (103023, 85))

In [20]:
fx_train[1,:]

array([  1.     ,   7.     , 297.40314,       nan,   5.     ,   1.     ,
         2.     ,   0.     ,  27.     ,       nan,       nan,       nan,
             nan,       nan,       nan,       nan,       nan,   1.     ,
         1.     ,   0.     ,       nan,   1.     ,   0.     ,       nan,
         0.     ,       nan,       nan,   0.     ,   0.     ,   0.     ,
         0.     ,   1.     ,   0.     ,   0.     ,       nan,       nan,
             nan,       nan,       nan,       nan,       nan,       nan,
             nan,       nan,       nan,       nan,       nan,       nan,
             nan,       nan,       nan,       nan,       nan,       nan,
             nan,       nan,   0.     ,   1.     ,   0.     ,   0.     ,
         0.     ,   0.     ,   1.     ,   0.     ,   1.     ,   1.     ,
         1.     ,   0.     ,   0.     ,   0.     ,   0.     ,   1.     ,
         1.     ,   1.     ,   1.     ,   1.     ,   1.     ,   0.     ,
         0.     ,   0.     ,   0.     ,   0.     , 

In [21]:
fx_val = np.delete(x_val, 1, axis=1)
fx_val.shape

(20847, 85)

In [22]:
fx_val[1,:]

array([  1.        ,   8.        , 274.69101   ,          nan,
         5.        ,   1.        ,   0.        ,   2.        ,
        39.        ,          nan,          nan,          nan,
                nan,          nan,          nan,          nan,
                nan,   1.        ,   1.        ,   0.        ,
                nan,   1.5       ,   0.        ,          nan,
         0.66666669,          nan,          nan,   0.        ,
         0.        ,   0.        ,   0.        ,   1.        ,
         0.        ,   0.        ,          nan,          nan,
                nan,          nan,          nan,          nan,
                nan,          nan,          nan,          nan,
                nan,          nan,          nan,          nan,
                nan,          nan,          nan,          nan,
                nan,          nan,          nan,          nan,
         0.        ,   1.        ,   1.        ,   0.        ,
         0.        ,   0.        ,   1.        ,   1.  

### Train Linear Regressor

In [20]:
LinearReg = LinearRegression(n_jobs=-1)
LinearReg.fit(fx_train, y_train, sample_weight=train_weights)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,-1
,positive,False


#### Metrics in train data

In [23]:
ypred = LinearReg.predict(fx_train)

rmse = np.sqrt(mean_squared_error(y_train, ypred))
mae = mean_absolute_error(y_train, ypred)
mape = mean_absolute_percentage_error(y_train, ypred)
wsmape_score = wsmape(y_train, ypred, train_weights)

print(f'RMSE in training dataset: {rmse:.3f}')
print(f'MAE in training dataset: {mae:.3f}')
print(f'MAPE in training dataset: {mape:.3f}')
print(f'wMAPE in training dataset: {wsmape_score:.3f}')

RMSE in training dataset: 7.173
MAE in training dataset: 4.049
MAPE in training dataset: 0.434
wMAPE in training dataset: 0.340


#### Metrics in validation data

In [24]:
ypred_val = LinearReg.predict(fx_val)

rmse_test = np.sqrt(mean_squared_error(y_val, ypred_val))
mae_test = mean_absolute_error(y_val, ypred_val)
mape_test = mean_absolute_percentage_error(y_val, ypred_val)
wsmape_score_test = wsmape(y_val, ypred_val, test_weights)

print(f'RMSE in validation dataset: {rmse_test:.3f}')
print(f'MAE in validation dataset: {mae_test:.3f}')
print(f'MAPE in validation dataset: {mape_test:.3f}')
print(f'wMAPE in validation dataset: {wsmape_score_test:.3f}')

RMSE in validation dataset: 7.189
MAE in validation dataset: 4.039
MAPE in validation dataset: 0.433
wMAPE in validation dataset: 0.341


### Train RandomForest Regressor

In [25]:
randomForestReg = RandomForestRegressor(n_estimators=100, criterion='squared_error', max_depth=5)
randomForestReg.fit(fx_train, y_train, sample_weight=train_weights)

  return fit_method(estimator, *args, **kwargs)


0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,5
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


#### Metrics in train data

In [26]:
ypred = randomForestReg.predict(fx_train)

rmse = np.sqrt(mean_squared_error(y_train, ypred))
mae = mean_absolute_error(y_train, ypred)
mape = mean_absolute_percentage_error(y_train, ypred)
wsmape_score = wsmape(y_train, ypred, train_weights)

print(f'RMSE in training dataset: {rmse:.3f}')
print(f'MAE in training dataset: {mae:.3f}')
print(f'MAPE in training dataset: {mape:.3f}')
print(f'wMAPE in training dataset: {wsmape_score:.3f}')

RMSE in training dataset: 6.952
MAE in training dataset: 3.989
MAPE in training dataset: 0.408
wMAPE in training dataset: 0.326


#### Metrics in validation data

In [27]:
ypred_val = randomForestReg.predict(fx_val)

rmse_test = np.sqrt(mean_squared_error(y_val, ypred_val))
mae_test = mean_absolute_error(y_val, ypred_val)
mape_test = mean_absolute_percentage_error(y_val, ypred_val)
wsmape_score_test = wsmape(y_val, ypred_val, test_weights)

print(f'RMSE in validation dataset: {rmse_test:.3f}')
print(f'MAE in validation dataset: {mae_test:.3f}')
print(f'MAPE in validation dataset: {mape_test:.3f}')
print(f'wMAPE in validation dataset: {wsmape_score_test:.3f}')

RMSE in validation dataset: 7.008
MAE in validation dataset: 4.014
MAPE in validation dataset: 0.411
wMAPE in validation dataset: 0.329


### Train XGBoost Regressor

In [14]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [29]:
xgbReg = XGBRegressor(objective='reg:squarederror', n_estimators=100, enable_categorical='True', device=device)
xgbReg.fit(fx_train, y_train, sample_weight=train_weights)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,'cpu'
,early_stopping_rounds,
,enable_categorical,'True'


#### Metrics in training data

In [30]:
ypred = xgbReg.predict(fx_train)

rmse = np.sqrt(mean_squared_error(y_train, ypred))
mae = mean_absolute_error(y_train, ypred)
mape = mean_absolute_percentage_error(y_train, ypred)
wsmape_score = wsmape(y_train, ypred, train_weights)

print(f'RMSE in training dataset: {rmse:.3f}')
print(f'MAE in training dataset: {mae:.3f}')
print(f'MAPE in training dataset: {mape:.3f}')
print(f'wMAPE in training dataset: {wsmape_score:.3f}')

RMSE in training dataset: 5.139
MAE in training dataset: 2.980
MAPE in training dataset: 0.287
wMAPE in training dataset: 0.214


#### Metrics in validation data

In [31]:
ypred_val = xgbReg.predict(fx_val)

rmse_test = np.sqrt(mean_squared_error(y_val, ypred_val))
mae_test = mean_absolute_error(y_val, ypred_val)
mape_test = mean_absolute_percentage_error(y_val, ypred_val)
wsmape_score_test = wsmape(y_val, ypred_val, test_weights)

print(f'RMSE in validation dataset: {rmse_test:.3f}')
print(f'MAE in validation dataset: {mae_test:.3f}')
print(f'MAPE in validation dataset: {mape_test:.3f}')
print(f'wMAPE in validation dataset: {wsmape_score_test:.3f}')

RMSE in validation dataset: 6.050
MAE in validation dataset: 3.410
MAPE in validation dataset: 0.318
wMAPE in validation dataset: 0.284


### Fine-tuning of XGBoost Regressor

In [None]:
def objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 50, 500)
    learning_rate = trial.suggest_float("learning_rate", 1e-4, 0.3, log=True)
    gamma = trial.suggest_float("gamma", 1e-3, 5, log=True)
    booster = trial.suggest_categorical("booster", ['gbtree','gblinear','dart'])
    max_depth = trial.suggest_int("max_depth", 2, 20)

    xgbReg = XGBRegressor(objective='reg:squarederror', 
                          booster=booster, 
                          n_estimators=n_estimators, 
                          learning_rate=learning_rate, 
                          gamma=gamma, 
                          max_depth=max_depth,
                          device=device)

    xgbReg.fit(fx_train, y_train, sample_weight=train_weights)
    
    ypred = xgbReg.predict(fx_val)
    mae_score_test = mae(y_val, ypred, test_weights)

    return mae_score_test


study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

print(f"Best parameters: {study.best_params}")
print(f"Best MAE: {study.best_value}")

Best parameters:  {'n_estimators': 316, 'learning_rate': 0.023087045809130017, 'gamma': 2.4911737970405934, 'booster': 'dart', 'max_depth': 9}

#### Train Fine-tuned model

In [32]:
tuned_xgbReg = XGBRegressor(objective='reg:squarederror', 
                      booster='dart', 
                      n_estimators=316, 
                      learning_rate=0.023087045809130017, 
                      gamma=2.4911737970405934, 
                      max_depth=9,
                      device=device)

tuned_xgbReg.fit(fx_train, y_train, sample_weight=train_weights)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,'dart'
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,'cpu'
,early_stopping_rounds,
,enable_categorical,False


In [33]:
ypred = tuned_xgbReg.predict(fx_train)

rmse = np.sqrt(mean_squared_error(y_train, ypred))
mae = mean_absolute_error(y_train, ypred)
mape = mean_absolute_percentage_error(y_train, ypred)
wsmape_score = wsmape(y_train, ypred, train_weights)

print(f'RMSE in training dataset: {rmse:.3f}')
print(f'MAE in training dataset: {mae:.3f}')
print(f'MAPE in training dataset: {mape:.3f}')
print(f'wMAPE in training dataset: {wsmape_score:.3f}')

RMSE in training dataset: 4.319
MAE in training dataset: 2.584
MAPE in training dataset: 0.256
wMAPE in training dataset: 0.177


In [34]:
ypred_val = tuned_xgbReg.predict(fx_val)

rmse_test = np.sqrt(mean_squared_error(y_val, ypred_val))
mae_test = mean_absolute_error(y_val, ypred_val)
mape_test = mean_absolute_percentage_error(y_val, ypred_val)
wsmape_score_test = wsmape(y_val, ypred_val, test_weights)

print(f'RMSE in validation dataset: {rmse_test:.3f}')
print(f'MAE in validation dataset: {mae_test:.3f}')
print(f'MAPE in validation dataset: {mape_test:.3f}')
print(f'wMAPE in validation dataset: {wsmape_score_test:.3f}')

RMSE in validation dataset: 5.899
MAE in validation dataset: 3.306
MAPE in validation dataset: 0.309
wMAPE in validation dataset: 0.275


### Build Neural Network

In [None]:
class NeuralNet(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.1),

            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.1),

            nn.Linear(64, 32),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            
            nn.Linear(32, output_dim)
        )

    def forward(self, x):
        return self.net(x)


In [31]:
input_size = x_train.shape[1]
n_outputs = y_train.shape[1]

input_size, n_outputs

(86, 1)

In [44]:
model = NeuralNet(input_size, n_outputs).to(device)

criterion = nn.SmoothL1Loss(beta=1.0)
optimizer = optim.Adam(
    model.parameters(),
    lr=3e-4,
    weight_decay=1e-5
)

scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=50,
    min_lr=1e-5
)

### Scale Data

In [45]:
def scale_mixed(X, max_vals):
    X_scaled = X.astype(np.float32)
    for i, m in enumerate(max_vals):
        if m > 0:
            X_scaled[:, i] /= m
    return torch.tensor(X_scaled, dtype=torch.float32)

In [46]:
max_vals = np.max(x_train, axis=0)

Xtrain = scale_mixed(x_train, max_vals)
Xval = scale_mixed(x_val, max_vals)
Xtest = scale_mixed(x_test, max_vals)


y_mean = y_train.mean(axis=0)
y_std  = y_train.std(axis=0) + 1e-8

Ytrain = torch.tensor((y_train - y_mean) / y_std, dtype=torch.float32)
Yval   = torch.tensor((y_val   - y_mean) / y_std, dtype=torch.float32)

### Train Neural Network

In [47]:
train_loader = DataLoader(
    TensorDataset(Xtrain, Ytrain),
    batch_size=32,
    shuffle=True
)

val_loader = DataLoader(
    TensorDataset(Xval, Yval),
    batch_size=256,
    shuffle=False
)

In [49]:
baseline = np.mean(np.abs(y_val - np.mean(y_train)))
print(f"Baseline MAE (mean predictor): {baseline:.6f}")

epochs = 91
for epoch in range(epochs):
    model.train()
    train_loss = 0.0

    for X, y in train_loader:
        X, y = X.to(device), y.to(device)

        optimizer.zero_grad()
        pred = model(X)
        loss = criterion(pred, y)
        loss.backward()

        # torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        train_loss += loss.item()

    train_loss /= len(train_loader)

    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for X, y in val_loader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            val_loss += criterion(pred, y).item()

    val_loss /= len(val_loader)
    scheduler.step(val_loss)

    if epoch % 10 == 0:
        print(
            f"Epoch {epoch} | "
            f"Train: {train_loss:.6f} | "
            f"Val: {val_loss:.6f}"
        )


Baseline MAE (mean predictor): 6.538846
Epoch 0 | Train: 0.119920 | Val: 0.125766
Epoch 10 | Train: 0.119575 | Val: 0.124846
Epoch 20 | Train: 0.119006 | Val: 0.124168
Epoch 30 | Train: 0.116828 | Val: 0.125075
Epoch 40 | Train: 0.117366 | Val: 0.124879
Epoch 50 | Train: 0.116692 | Val: 0.124126
Epoch 60 | Train: 0.117752 | Val: 0.124905
Epoch 70 | Train: 0.117227 | Val: 0.124414
Epoch 80 | Train: 0.116067 | Val: 0.124011
Epoch 90 | Train: 0.116138 | Val: 0.124773


### Predict Test Values

In [35]:
predicted_household_consumption = tuned_xgbReg.predict(x_test)

household_test.shape, predicted_household_consumption.shape

((103023, 88), (103023,))

In [56]:
model.eval()
with torch.no_grad():
    Xtest = Xtest.to(device)
    y_test_pred_scaled = model(Xtest).cpu().numpy()

### Unscale Predicted Values

In [57]:
y_test_pred = y_test_pred_scaled * y_std + y_mean
y_test_pred = y_test_pred.squeeze()

### Save predicted results to CSV

In [58]:
pred_household_consumption = pd.DataFrame({
                            'survey_id':household_test['survey_id'].values,
                            'hhid': household_test['hhid'].values,
                            'cons_ppp17': y_test_pred
                        })

pred_household_consumption.head()

Unnamed: 0,survey_id,hhid,cons_ppp17
0,400000,400001,12.033419
1,400000,400002,7.848096
2,400000,400003,10.788861
3,400000,400004,11.920748
4,400000,400005,7.094519


In [59]:
pred_household_consumption.to_csv('./submitted_csvs/predicted_household_consumption.csv', index=False)

## Predict Rates

In [4]:
household_train = pd.read_csv('./train_hh_features.csv')
household_test = pd.read_csv('./test_hh_features.csv')

target_train_rates = pd.read_csv('./train_rates_gt.csv')

In [5]:
for column in household_train.columns:
    if column in mapping_dict:
        household_train[column] = household_train[column].map(mapping_dict[column])
        household_test[column] = household_test[column].map(mapping_dict[column])

household_train.head()

Unnamed: 0,hhid,com,weight,strata,utl_exp_ppp17,male,hsize,num_children5,num_children10,num_children18,...,consumed4200,consumed4300,consumed4400,consumed4500,consumed4600,consumed4700,consumed4800,consumed4900,consumed5000,survey_id
0,100001,1,75,4,594.80627,0,1,0,0,0,...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,100000
1,100002,1,150,4,1676.2723,0,2,0,0,0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,100000
2,100003,1,375,4,506.93719,1,5,0,0,2,...,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,100000
3,100004,1,375,4,824.61786,1,5,0,0,1,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,100000
4,100005,1,525,4,351.47644,1,7,1,0,0,...,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,100000


### Split train data based on survey_id

In [6]:
unique_traingroups = household_train['survey_id'].unique()
unique_traingroups

array([100000, 200000, 300000])

In [7]:
Xsurvey1 = household_train[household_train['survey_id'] ==  unique_traingroups[0]]
Xsurvey2 = household_train[household_train['survey_id'] ==  unique_traingroups[1]]
Xsurvey3 = household_train[household_train['survey_id'] ==  unique_traingroups[2]]

Ysurvey1 = target_train_rates[target_train_rates['survey_id'] == unique_traingroups[0]]
Ysurvey2 = target_train_rates[target_train_rates['survey_id'] == unique_traingroups[1]]
Ysurvey3 = target_train_rates[target_train_rates['survey_id'] == unique_traingroups[2]]

Xsurvey1.shape, Xsurvey2.shape, Xsurvey3.shape

((32188, 88), (34584, 88), (37462, 88))

In [8]:
unique_testgroups = household_test['survey_id'].unique()
unique_testgroups

array([400000, 500000, 600000])

In [9]:
Xtest_survey1 = household_test[household_test['survey_id'] ==  unique_testgroups[0]]
Xtest_survey2 = household_test[household_test['survey_id'] ==  unique_testgroups[1]]
Xtest_survey3 = household_test[household_test['survey_id'] ==  unique_testgroups[2]]

Xtest_survey1.shape, Xtest_survey2.shape, Xtest_survey3.shape

((34565, 88), (34245, 88), (34213, 88))

### Preprocess Data

In [10]:
Xsurvey1 = Xsurvey1.drop(columns=['survey_id','hhid'])
Xsurvey2 = Xsurvey2.drop(columns=['survey_id','hhid'])
Xsurvey3 = Xsurvey3.drop(columns=['survey_id','hhid'])

Xtest_survey1 = Xtest_survey1.drop(columns=['survey_id','hhid'])
Xtest_survey2 = Xtest_survey2.drop(columns=['survey_id','hhid'])
Xtest_survey3 = Xtest_survey3.drop(columns=['survey_id','hhid'])

Ysurvey1 = Ysurvey1.drop(columns=['survey_id'])
Ysurvey2 = Ysurvey2.drop(columns=['survey_id'])
Ysurvey3 = Ysurvey3.drop(columns=['survey_id'])


Xsurvey1.shape, Xsurvey2.shape, Xsurvey3.shape

((32188, 86), (34584, 86), (37462, 86))

In [11]:
Xsurvey1.head()

Unnamed: 0,com,weight,strata,utl_exp_ppp17,male,hsize,num_children5,num_children10,num_children18,age,...,consumed4100,consumed4200,consumed4300,consumed4400,consumed4500,consumed4600,consumed4700,consumed4800,consumed4900,consumed5000
0,1,75,4,594.80627,0,1,0,0,0,75,...,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
1,1,150,4,1676.2723,0,2,0,0,0,61,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,1,375,4,506.93719,1,5,0,0,2,49,...,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0
3,1,375,4,824.61786,1,5,0,0,1,58,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,1,525,4,351.47644,1,7,1,0,0,57,...,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0


In [12]:
Ysurvey1.shape, Ysurvey2.shape, Ysurvey3.shape

((1, 19), (1, 19), (1, 19))

In [13]:
Xsurvey1 = Xsurvey1.fillna(Xsurvey1.mean())
Xsurvey2 = Xsurvey2.fillna(Xsurvey2.mean())
Xsurvey3 = Xsurvey3.fillna(Xsurvey3.mean())

In [14]:
Xsurvey1.isnull().sum(axis=0)

com              0
weight           0
strata           0
utl_exp_ppp17    0
male             0
                ..
consumed4600     0
consumed4700     0
consumed4800     0
consumed4900     0
consumed5000     0
Length: 86, dtype: int64

In [15]:
Xtest_survey1 = Xtest_survey1.fillna(Xtest_survey1.mean())
Xtest_survey2 = Xtest_survey2.fillna(Xtest_survey2.mean())
Xtest_survey3 = Xtest_survey3.fillna(Xtest_survey3.mean())

In [16]:
Xtest_survey1.isnull().sum(axis=0)

com              0
weight           0
strata           0
utl_exp_ppp17    0
male             0
                ..
consumed4600     0
consumed4700     0
consumed4800     0
consumed4900     0
consumed5000     0
Length: 86, dtype: int64

In [17]:
x1_train = np.array(Xsurvey1)
x2_train = np.array(Xsurvey2)
x3_train = np.array(Xsurvey3)

x1_test = np.array(Xtest_survey1)
x2_test = np.array(Xtest_survey2)
x3_test= np.array(Xtest_survey3)

y1_train = np.array(Ysurvey1)
y2_train = np.array(Ysurvey2)
y3_train = np.array(Ysurvey3)

### Build Neural Network

In [None]:
class SetModel(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
        )
        self.head = nn.Linear(32*2, output_dim)
        
    def forward(self, x):
        h = self.encoder(x)                   
        h_mean = h.mean(dim=0)               
        h_max = h.max(dim=0).values       
        h_combined = torch.cat([h_mean, h_max]).unsqueeze(0)
        out = self.head(h_combined)
        return out.squeeze(0)    


In [20]:
input_size = x1_train.shape[1]
n_outputs = y1_train.shape[1]

input_size, n_outputs

(86, 19)

In [21]:
model = SetModel(input_size, n_outputs)
criterion = nn.MSELoss()
optimizer = optim.Adam(
    model.parameters(),
    lr=1e-5,
    weight_decay=1e-5
)

### Train Neural Network

In [22]:
def create_chunks(X, y, chunk_size):
    chunks = []
    N = X.shape[0]
    for i in range(0, N, chunk_size):
        chunks.append((X[i:i+chunk_size], y))
    return chunks

In [23]:
X1 = torch.tensor(x1_train, dtype=torch.float32)
X2 = torch.tensor(x2_train, dtype=torch.float32)
X3 = torch.tensor(x3_train, dtype=torch.float32)

y1 = torch.tensor(y1_train, dtype=torch.float32).view(-1)
y2 = torch.tensor(y2_train, dtype=torch.float32).view(-1)
y3 = torch.tensor(y3_train, dtype=torch.float32).view(-1)

#### Scale Data

In [24]:
def scale_mixed(X, max_vals):
    X_scaled = X.astype(np.float32)
    for i, m in enumerate(max_vals):
        if m > 0:
            X_scaled[:, i] /= m
    return torch.tensor(X_scaled, dtype=torch.float32)

In [25]:
max_vals = np.max(np.vstack([x1_train, x2_train, x3_train]), axis=0)

X1 = scale_mixed(x1_train, max_vals)
X2 = scale_mixed(x2_train, max_vals)
X3 = scale_mixed(x3_train, max_vals)

### Create Chunks

In [26]:
chunk_size = 128

train_chunks = create_chunks(X1, y1, chunk_size) + \
               create_chunks(X2, y2, chunk_size) + \
               create_chunks(X3, y3, chunk_size)

print(f"Total training samples (chunks): {len(train_chunks)}")

Total training samples (chunks): 816


In [None]:
epochs = 201

for epoch in range(epochs):
    total_loss = 0.0
    for X_chunk, y_chunk in train_chunks:
        optimizer.zero_grad()

        pred = model(X_chunk)
        loss = criterion(pred, y_chunk)
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        total_loss += loss.item()
    if epoch % 20 == 0:
        print(f"Epoch {epoch} | Loss: {total_loss:.6f}")

Epoch 0 | Loss: 256.963931
Epoch 20 | Loss: 0.290657
Epoch 40 | Loss: 0.133592
Epoch 60 | Loss: 0.081992
Epoch 80 | Loss: 0.056061
Epoch 100 | Loss: 0.040542
Epoch 120 | Loss: 0.030626
Epoch 140 | Loss: 0.023956
Epoch 160 | Loss: 0.019114
Epoch 180 | Loss: 0.015570
Epoch 200 | Loss: 0.012952


### Predict Test Rates

In [None]:
def predict_dataset(model, X_test, chunk_size=1024):
    model.eval()
    N = X_test.shape[0]
    preds = []
    with torch.no_grad():
        for i in range(0, N, chunk_size):
            X_chunk = X_test[i:i+chunk_size]
            pred = model(X_chunk)
            preds.append(pred)
    # Average predictions of chunks
    final_pred = torch.stack(preds).mean(dim=0)
    return final_pred

In [29]:
X1_test = torch.tensor(x1_test, dtype=torch.float32)
X2_test = torch.tensor(x2_test, dtype=torch.float32)
X3_test = torch.tensor(x3_test, dtype=torch.float32)

X1_test = scale_mixed(x1_test, max_vals)
X2_test = scale_mixed(x2_test, max_vals)
X3_test = scale_mixed(x3_test, max_vals)

y1_pred = predict_dataset(model, X1_test, chunk_size)
y2_pred = predict_dataset(model, X2_test, chunk_size)
y3_pred = predict_dataset(model, X3_test, chunk_size)

print("Prediction for test set 1:", y1_pred)
print("Prediction for test set 2:", y2_pred)
print("Prediction for test set 3:", y3_pred)

Prediction for test set 1: tensor([0.0501, 0.1017, 0.1510, 0.2007, 0.2502, 0.3025, 0.3515, 0.3989, 0.4509,
        0.4991, 0.5503, 0.6009, 0.6492, 0.7012, 0.7508, 0.8002, 0.8494, 0.8997,
        0.9522])
Prediction for test set 2: tensor([0.0508, 0.1000, 0.1502, 0.1991, 0.2497, 0.3016, 0.3502, 0.3993, 0.4495,
        0.4996, 0.5498, 0.5997, 0.6490, 0.6997, 0.7504, 0.7992, 0.8487, 0.8997,
        0.9519])
Prediction for test set 3: tensor([0.0503, 0.1008, 0.1499, 0.1991, 0.2506, 0.3009, 0.3499, 0.3987, 0.4492,
        0.4989, 0.5495, 0.5987, 0.6483, 0.6994, 0.7500, 0.7986, 0.8482, 0.8991,
        0.9512])


In [None]:
y1_pred = y1_pred.numpy()
y2_pred = y2_pred.numpy()
y3_pred = y3_pred.numpy()

In [43]:
all_preds = np.vstack([y1_pred, y2_pred, y3_pred])

In [47]:
all_preds.shape

(3, 19)

In [48]:
column_names = ["pct_hh_below_3.17","pct_hh_below_3.94","pct_hh_below_4.60","pct_hh_below_5.26","pct_hh_below_5.88","pct_hh_below_6.47","pct_hh_below_7.06","pct_hh_below_7.70","pct_hh_below_8.40","pct_hh_below_9.13","pct_hh_below_9.87","pct_hh_below_10.70","pct_hh_below_11.62","pct_hh_below_12.69","pct_hh_below_14.03","pct_hh_below_15.64","pct_hh_below_17.76","pct_hh_below_20.99","pct_hh_below_27.37"]
test_rates_df = pd.DataFrame(all_preds, columns=column_names)
test_rates_df.insert(0, 'survey_id', ['400000', '500000', '600000'])

test_rates_df.head()

Unnamed: 0,survey_id,pct_hh_below_3.17,pct_hh_below_3.94,pct_hh_below_4.60,pct_hh_below_5.26,pct_hh_below_5.88,pct_hh_below_6.47,pct_hh_below_7.06,pct_hh_below_7.70,pct_hh_below_8.40,pct_hh_below_9.13,pct_hh_below_9.87,pct_hh_below_10.70,pct_hh_below_11.62,pct_hh_below_12.69,pct_hh_below_14.03,pct_hh_below_15.64,pct_hh_below_17.76,pct_hh_below_20.99,pct_hh_below_27.37
0,400000,0.050072,0.101711,0.150964,0.200691,0.250171,0.302473,0.351494,0.398853,0.450873,0.499112,0.550349,0.600936,0.649184,0.701211,0.750831,0.800227,0.849411,0.899683,0.952211
1,500000,0.050776,0.10002,0.150217,0.199147,0.249662,0.30158,0.350227,0.399283,0.449542,0.499613,0.549774,0.599731,0.64899,0.69975,0.750354,0.799165,0.84874,0.899696,0.951879
2,600000,0.050335,0.10076,0.149877,0.19912,0.250617,0.300903,0.34989,0.398743,0.449227,0.498891,0.549536,0.59867,0.648292,0.699378,0.750033,0.798613,0.848167,0.899056,0.95119


In [50]:
test_rates_df.to_csv('./submitted_csvs/predicted_poverty_distribution.csv', index=False)