In [26]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

In [27]:
# Set random seed 
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7fcfa03e2970>

In [28]:
# Parameters
n_samples = 50000 # Number of samples in dataset
n_continuous = 30 # Number of continuous variables
n_categorical = 10 # Number of categorical variables
max_categories = 100 # Max number of categories per categorical variable
non_informative_prop = 0.3 # Proportion of variables that are non-informative

In [29]:
# Generate continuous features
X_continuous = np.random.randn(n_samples, n_continuous)

# Generate categorical features
X_categorical = []
categorical_effects = []
for i in range(n_categorical):
    n_categories = np.random.randint(2, max_categories)
    X_categorical.append(np.random.randint(0, n_categories, n_samples))
    if np.random.rand() > non_informative_prop:
        categorical_effects.append(np.random.randn(n_categories))
    else:
        categorical_effects.append(np.zeros(n_categories))
X_categorical = np.array(X_categorical).T

# Generate target variable
y = np.zeros(n_samples)
for i in range(n_continuous):
    effect = np.random.randn()
    y += effect * X_continuous[:, i] + np.random.randn(n_samples) * 0.1

for i in range(n_categorical):
    y += np.array([categorical_effects[i][cat] for cat in X_categorical[:, i]])

y += np.random.randn(n_samples) * 0.5  # Add some noise

# Create dataframe
df = pd.DataFrame(X_continuous, columns=[f'cont_{i}' for i in range(n_continuous)])
for i in range(n_categorical):
    df[f'cat_{i}'] = X_categorical[:, i]
df['target'] = y

In [30]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(df.drop('target', axis=1), df['target'], test_size=0.2, random_state=42)

# split test into validation and test
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42)


In [31]:

# split the data into training and testing sets

# create an xgboost matrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)

# set the parameters for the xgboost model
param = {
    'max_depth': 3,  # the maximum depth of each tree
    'eta': 0.05,  # the training step for each iteration
    'silent': 1,  # logging mode - quiet
    'objective': 'reg:linear',  # error evaluation for multiclass training # early stopping rounds
    'eval_metric': 'rmse',
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 0,
    'nthread': 8,
}

# specify validations set to watch performance
watchlist = [(dval, 'eval'), (dtrain, 'train')]

# train the model
num_round = 100000
xgb_model = xgb.train(param, dtrain, num_round, evals=[(dval, 'eval')], early_stopping_rounds=10, verbose_eval=10)

xgb_pred = xgb_model.predict(dtest)
xgb_mse = mean_squared_error(y_test, xgb_pred)
print(f"XGBoost MSE: {xgb_mse}")

Parameters: { "silent" } are not used.

[0]	eval-rmse:6.09420
[10]	eval-rmse:5.75561
[20]	eval-rmse:5.50345
[30]	eval-rmse:5.29281
[40]	eval-rmse:5.10784
[50]	eval-rmse:4.94518
[60]	eval-rmse:4.79726
[70]	eval-rmse:4.66008
[80]	eval-rmse:4.53761
[90]	eval-rmse:4.42476
[100]	eval-rmse:4.32184
[110]	eval-rmse:4.22671
[120]	eval-rmse:4.13877
[130]	eval-rmse:4.05566
[140]	eval-rmse:3.97783
[150]	eval-rmse:3.90740
[160]	eval-rmse:3.84162
[170]	eval-rmse:3.77844
[180]	eval-rmse:3.71859
[190]	eval-rmse:3.66355
[200]	eval-rmse:3.61191
[210]	eval-rmse:3.56438
[220]	eval-rmse:3.51808
[230]	eval-rmse:3.47068
[240]	eval-rmse:3.42917
[250]	eval-rmse:3.38972
[260]	eval-rmse:3.35296
[270]	eval-rmse:3.31353
[280]	eval-rmse:3.27751
[290]	eval-rmse:3.24167
[300]	eval-rmse:3.21088
[310]	eval-rmse:3.18154
[320]	eval-rmse:3.15095
[330]	eval-rmse:3.12167
[340]	eval-rmse:3.09520
[350]	eval-rmse:3.07196
[360]	eval-rmse:3.04579
[370]	eval-rmse:3.02278
[380]	eval-rmse:3.00125
[390]	eval-rmse:2.97941
[400]	eval-

In [32]:
# Stacked model (GBM + NN)

# First, train a GBM on continuous features
cont_cols = [col for col in X_train.columns if col.startswith('cont_')]
cat_cols = [col for col in X_train.columns if col.startswith('cat_')]



# Split the data
X_train_cat, X_val_cat, X_test_cat = X_train[cat_cols], X_val[cat_cols], X_test[cat_cols]
X_train, X_val, X_test = X_train[cont_cols], X_val[cont_cols], X_test[cont_cols]

dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)

# set the parameters for the xgboost model
param = {
    'max_depth': 3,  # the maximum depth of each tree
    'eta': 0.05,  # the training step for each iteration
    'silent': 1,  # logging mode - quiet
    'objective': 'reg:linear',  # error evaluation for multiclass training # early stopping rounds
    'eval_metric': 'rmse',
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 0,
    'nthread': 8,
}

# specify validations set to watch performance
watchlist = [(dval, 'eval'), (dtrain, 'train')]

# train the model
num_round = 100000
xgb_model = xgb.train(param, dtrain, num_round, evals=[(dval, 'eval')], early_stopping_rounds=10, verbose_eval=10)

gbm_train_pred = xgb_model.predict(dtrain)
gbm_test_pred = xgb_model.predict(dtest)

# Prepare data for neural network
label_encoders = {}
X_train_cat_encoded = np.zeros_like(X_train_cat)
X_test_cat_encoded = np.zeros_like(X_test_cat)

for i, col in enumerate(cat_cols):
    le = LabelEncoder()
    X_train_cat_encoded[:, i] = le.fit_transform(X_train_cat[col])
    X_test_cat_encoded[:, i] = le.transform(X_test_cat[col])
    label_encoders[col] = le

Parameters: { "silent" } are not used.

[0]	eval-rmse:6.09711
[10]	eval-rmse:5.75215
[20]	eval-rmse:5.50318
[30]	eval-rmse:5.29259
[40]	eval-rmse:5.11205
[50]	eval-rmse:4.95017
[60]	eval-rmse:4.79739
[70]	eval-rmse:4.66263
[80]	eval-rmse:4.54233
[90]	eval-rmse:4.43121
[100]	eval-rmse:4.32842
[110]	eval-rmse:4.23703
[120]	eval-rmse:4.15107
[130]	eval-rmse:4.07188
[140]	eval-rmse:4.00003
[150]	eval-rmse:3.93037
[160]	eval-rmse:3.86903
[170]	eval-rmse:3.80709
[180]	eval-rmse:3.75230
[190]	eval-rmse:3.69926
[200]	eval-rmse:3.64930
[210]	eval-rmse:3.60380
[220]	eval-rmse:3.56108
[230]	eval-rmse:3.52028
[240]	eval-rmse:3.48033
[250]	eval-rmse:3.44360
[260]	eval-rmse:3.40655
[270]	eval-rmse:3.37603
[280]	eval-rmse:3.34508
[290]	eval-rmse:3.31604
[300]	eval-rmse:3.28788
[310]	eval-rmse:3.26014
[320]	eval-rmse:3.23337
[330]	eval-rmse:3.21052
[340]	eval-rmse:3.18924
[350]	eval-rmse:3.16842
[360]	eval-rmse:3.14890
[370]	eval-rmse:3.13015
[380]	eval-rmse:3.11346
[390]	eval-rmse:3.09380
[400]	eval-

In [33]:
# PyTorch Dataset
class TabularDataset(Dataset):
    def __init__(self, cat_data, cont_data, gbm_pred, targets):
        self.cat_data = torch.tensor(cat_data, dtype=torch.long)
        self.cont_data = torch.tensor(cont_data, dtype=torch.float32)
        self.gbm_pred = torch.tensor(gbm_pred, dtype=torch.float32)
        self.targets = torch.tensor(targets, dtype=torch.float32)

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        return (self.cat_data[idx], self.cont_data[idx], self.gbm_pred[idx], self.targets[idx])


In [34]:
# Create datasets
train_dataset = TabularDataset(X_train_cat_encoded, X_train[cont_cols].values, gbm_train_pred, y_train.values)
test_dataset = TabularDataset(X_test_cat_encoded, X_test[cont_cols].values, gbm_test_pred, y_test.values)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Neural Network model
class StackedModel(nn.Module):
    def __init__(self, cat_dims, embed_dims, cont_dim):
        super().__init__()
        self.embeddings = nn.ModuleList([nn.Embedding(dim, embed_dim) 
                                         for dim, embed_dim in zip(cat_dims, embed_dims)])
        self.num_embeddings = sum(embed_dims)
        self.fc1 = nn.Linear(self.num_embeddings + cont_dim + 1, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        self.relu = nn.ReLU()

    def forward(self, cat_input, cont_input, gbm_pred):
        embedded = [emb(cat_input[:, i]) for i, emb in enumerate(self.embeddings)]
        embedded = torch.cat(embedded, dim=1)
        x = torch.cat([embedded, cont_input, gbm_pred.unsqueeze(1)], dim=1)
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        return self.fc3(x)

# Instantiate the model
cat_dims = [len(le.classes_) for le in label_encoders.values()]
embed_dims = [min(50, (dim + 1) // 2) for dim in cat_dims]
model = StackedModel(cat_dims, embed_dims, len(cont_cols))

# Training loop
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters())

num_epochs = 10
best_score = 100000
best_model = None
for epoch in range(num_epochs):
    model.eval()
    stacked_preds = []
    with torch.no_grad():
        for cat_data, cont_data, gbm_pred, _ in test_loader:
            outputs = model(cat_data, cont_data, gbm_pred)
            stacked_preds.append(outputs.numpy())

    stacked_preds = np.concatenate(stacked_preds).flatten()
    stacked_mse = mean_squared_error(y_test, stacked_preds)
    if stacked_mse < best_score:
        best_score = stacked_mse
        best_model = model # need to figure out how to copy
        print(stacked_mse)
    else:
        break
    print(epoch)
    model.train()
    for cat_data, cont_data, gbm_pred, targets in train_loader:
        optimizer.zero_grad()
        outputs = None
        try:
            outputs = model(cat_data, cont_data, gbm_pred)
        except:
            outputs = model(cat_data, cont_data, gbn_pred.unsqueeze(1))
        loss = criterion(outputs, targets.unsqueeze(1))
        loss.backward()
        optimizer.step()


36.294677855618545
0
1.0477133014698317
1
1.0171995198647494
2
0.9199809940965704
3
0.8523584147263394
4
0.7992217637240148
5
0.7840515023619308
6


In [35]:
# Evaluation
model.eval()
stacked_preds = []
with torch.no_grad():
    for cat_data, cont_data, gbm_pred, _ in test_loader:
        outputs = model(cat_data, cont_data, gbm_pred)
        stacked_preds.append(outputs.numpy())

stacked_preds = np.concatenate(stacked_preds).flatten()
stacked_mse = mean_squared_error(y_test, stacked_preds)

print("XBG MSE:", xgb_mse)
print(f"Stacked Model MSE: {stacked_mse}")

print(f"Improvement: {(xgb_mse - stacked_mse) / xgb_mse * 100:.2f}%")

XBG MSE: 1.2302821228747942
Stacked Model MSE: 0.8187297941387333
Improvement: 33.45%
