# Model Upload

In [1]:
!python --version

Python 3.9.16


In [1]:
# Install dependencies
%pip install -q numerapi pandas lightgbm cloudpickle pyarrow scikit-learn scipy==1.10.1

Note: you may need to restart the kernel to use updated packages.


In [1]:
from numerapi import NumerAPI
import pandas as pd
import json
import cloudpickle
napi = NumerAPI()

# use one of the latest data versions
DATA_VERSION = "v4.3"

In [2]:
napi.download_dataset("v4.3/train_int8.parquet", "v4.3/train_int8.parquet")
napi.download_dataset("v4.3/validation_int8.parquet", "v4.3/validation_int8.parquet")
napi.download_dataset("v4.3/live_int8.parquet", "v4.3/live_int8.parquet")
napi.download_dataset("v4.3/live_example_preds.parquet", "v4.3/live_example_preds.parquet")
napi.download_dataset("v4.3/validation_example_preds.parquet", "v4.3/validation_example_preds.parquet")
napi.download_dataset("v4.3/features.json", "v4.3/features.json")
napi.download_dataset("v4.3/meta_model.parquet", "v4.3/meta_model.parquet")
napi.download_dataset("v4.3/live_benchmark_models.parquet", "v4.3/live_benchmark_models.parquet")
napi.download_dataset("v4.3/validation_benchmark_models.parquet", "v4.3/validation_benchmark_models.parquet")
napi.download_dataset("v4.3/train_benchmark_models.parquet", "v4.3/train_benchmark_models.parquet")

2024-02-20 11:34:24,486 INFO numerapi.utils: target file already exists
2024-02-20 11:34:24,486 INFO numerapi.utils: download complete
2024-02-20 11:34:25,541 INFO numerapi.utils: target file already exists
2024-02-20 11:34:25,541 INFO numerapi.utils: download complete
2024-02-20 11:34:26,487 INFO numerapi.utils: target file already exists
2024-02-20 11:34:26,487 INFO numerapi.utils: download complete
2024-02-20 11:34:27,475 INFO numerapi.utils: target file already exists
2024-02-20 11:34:27,475 INFO numerapi.utils: download complete
2024-02-20 11:34:28,480 INFO numerapi.utils: target file already exists
2024-02-20 11:34:28,480 INFO numerapi.utils: download complete
2024-02-20 11:34:29,393 INFO numerapi.utils: target file already exists
2024-02-20 11:34:29,397 INFO numerapi.utils: download complete
2024-02-20 11:34:30,345 INFO numerapi.utils: target file already exists
2024-02-20 11:34:30,345 INFO numerapi.utils: download complete
2024-02-20 11:34:31,316 INFO numerapi.utils: target fil

'v4.3/train_benchmark_models.parquet'

In [3]:
# Data inspection
feature_metadata = json.load(open(f"{DATA_VERSION}/features.json"))
features = feature_metadata["feature_sets"]["all"] # use "all" for better performance. Requires more RAM.
train = pd.read_parquet(f"{DATA_VERSION}/train_int8.parquet", columns=["era"]+features+["target"])
validation = pd.read_parquet(f"{DATA_VERSION}/validation_int8.parquet", columns=["era"]+features+["target"])

train = pd.concat([train, validation])
del validation

In [4]:
train = train[train["era"].isin(train["era"].unique()[::4])]

In [5]:
train.shape

(1277940, 2378)

---

In [10]:
#  For better models, join train and validation data and train on all of it.
# This would cause diagnostics to be misleading though.
# napi.download_dataset(f"{DATA_VERSION}/validation_int8.parquet");
# validation = pd.read_parquet(f"{DATA_VERSION}/validation_int8.parquet", columns=["era"]+features+["target"])
# validation = validation[validation["data_type"] == "validation"] # drop rows which don't have targets yet
# train = pd.concat([train, validation])
# del validation

# Downsample for speed
# train = train[train["era"].isin(train["era"].unique()[::4])]  # skip this step for better performance

# Train model
import lightgbm as lgb
model = lgb.LGBMRegressor(
    n_estimators=20000,  # If you want to use a larger model we've found 20_000 trees to be better
    learning_rate=0.001, # and a learning rate of 0.001
    max_depth=6, # and max_depth=6
    num_leaves=2**6-1, # and num_leaves of 2**6-1
    colsample_bytree=0.1
)
model.fit(
    train[features],
    train["target"]
)

# Define predict function
def predict(
    live_features: pd.DataFrame,
    live_benchmark_models: pd.DataFrame
 ) -> pd.DataFrame:
    live_predictions = model.predict(live_features[features])
    submission = pd.Series(live_predictions, index=live_features.index)
    return submission.to_frame("prediction")

# Pickle predict function
p = cloudpickle.dumps(predict)
with open("models/full_basic.pkl", "wb") as f:
    f.write(p)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11880
[LightGBM] [Info] Number of data points in the train set: 1277940, number of used features: 2376
[LightGBM] [Info] Start training from score 0.498081


In [6]:
feature_sets = feature_metadata["feature_sets"]

sizes = ["small", "medium", "all"]
groups = [
  "intelligence",
  "wisdom",
  "charisma",
  "dexterity",
  "strength",
  "constitution",
  "agility",
  "serenity"
]

# compile the intersections of feature sets and feature groups
subgroups = {}
for size in sizes:
    subgroups[size] = {}
    for group in groups:
        subgroups[size][group] = (
            set(feature_sets[size])
            .intersection(set(feature_sets[group]))
        )

# convert to data frame and display the feature count of each intersection
pd.DataFrame(subgroups).applymap(len).sort_values(by="all", ascending=False)

Unnamed: 0,small,medium,all
constitution,2,134,335
charisma,3,116,290
agility,2,58,145
wisdom,3,56,140
strength,1,54,135
serenity,3,34,95
dexterity,4,21,51
intelligence,2,14,35


In [9]:
for group in groups:
    group_feats = list(subgroups["all"][group])

    model = lgb.LGBMRegressor(
    n_estimators=20000,  # If you want to use a larger model we've found 20_000 trees to be better
    learning_rate=0.001, # and a learning rate of 0.001
    max_depth=6, # and max_depth=6
    num_leaves=2**6-1, # and num_leaves of 2**6-1
    colsample_bytree=0.1
    )
    model.fit(
        train[group_feats],
        train["target"]
    )

    def predict(
        live_features: pd.DataFrame,
        live_benchmark_models: pd.DataFrame
    ) -> pd.DataFrame:
        live_predictions = model.predict(live_features[group_feats])
        submission = pd.Series(live_predictions, index=live_features.index)
        return submission.to_frame("prediction")

    # Pickle predict function
    import cloudpickle
    p = cloudpickle.dumps(predict)
    with open(f"models/{group}_full.pkl", "wb") as f:
        f.write(p)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 175
[LightGBM] [Info] Number of data points in the train set: 1277940, number of used features: 35
[LightGBM] [Info] Start training from score 0.498081
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 700
[LightGBM] [Info] Number of data points in the train set: 1277940, number of used features: 140
[LightGBM] [Info] Start training from score 0.498081
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1450
[LightGBM] [Info] Number of data points in the train set: 1277940, number of used features: 290
[LightGBM] [Info] Start training from score 0.498081
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, y

In [24]:
live = pd.read_parquet("v4.3/live_int8.parquet", columns=["era"]+features)
live_benchmark_models = pd.read_parquet("v4.3/live_benchmark_models.parquet")
live_benchmark_models.drop(["era"], axis=1, inplace=True)

In [27]:
# Correlations with live
models = [f"{group}_full.pkl" for group in groups] + ["full_basic.pkl"]
correlations = pd.DataFrame(columns=live_benchmark_models.columns, index=models)

for model in models:
    with open(f"models/{model}", "rb") as f:
        predict = cloudpickle.load(f)

    predictions = predict(live, None)
    corrs = live_benchmark_models.corrwith(predictions["prediction"])
    correlations.loc[model] = corrs

In [28]:
correlations

Unnamed: 0,v43_lgbm_teager60,v43_lgbm_teager20,v43_lgbm_cyrus60,v43_lgbm_cyrus20,v42_example_preds,v42_lgbm_teager60,v42_lgbm_teager20,v42_lgbm_agnes20,v42_lgbm_claudia20,v42_lgbm_rowan20,...,v4_lgbm_jerome20,v3_example_preds,v2_example_preds,v41_example_preds,v42_rain_ensemble,v42_rain_ensemble2,v42_teager_plus_cyrus,v42_teager_ensemble,v42_lgbm_ct_blend,v43_lgbm_ct_blend
intelligence_full.pkl,0.138229,0.125886,0.130638,0.128597,0.125041,0.141778,0.130507,0.072981,0.131087,0.138315,...,0.137047,0.130313,0.10545,0.141998,0.132275,0.123008,0.137773,0.144238,0.137773,0.136979
wisdom_full.pkl,0.103848,0.105779,0.214417,0.207875,0.205422,0.12524,0.114189,0.23892,0.10873,0.114612,...,0.139363,0.248614,0.25248,0.172458,0.212342,0.203957,0.170835,0.180937,0.170835,0.166875
charisma_full.pkl,0.120994,0.1605,0.252596,0.309989,0.295966,0.118304,0.15167,0.292546,0.140532,0.149786,...,0.242018,0.323493,0.355955,0.291828,0.276146,0.302106,0.241487,0.227923,0.241487,0.254062
dexterity_full.pkl,0.162419,0.215267,0.148502,0.182313,0.191671,0.170675,0.225537,0.149702,0.225229,0.222717,...,0.180123,0.194082,0.127914,0.17325,0.189851,0.197521,0.221363,0.212707,0.221363,0.211839
strength_full.pkl,0.0624,0.057915,0.167172,0.160538,0.153172,0.059336,0.050351,0.168358,0.050143,0.049251,...,0.133901,0.204858,0.192631,0.152599,0.15172,0.177103,0.1096,0.112646,0.1096,0.118265
constitution_full.pkl,0.075441,0.066472,0.177276,0.16237,0.179064,0.082045,0.078019,0.187387,0.085669,0.078279,...,0.09295,0.184623,0.228169,0.168897,0.171486,0.191207,0.139853,0.139085,0.139853,0.125291
agility_full.pkl,0.142255,0.182126,0.135051,0.173121,0.17904,0.143984,0.181912,0.115564,0.181077,0.185923,...,0.202843,0.100332,0.046031,0.160274,0.172532,0.16767,0.194575,0.184742,0.194575,0.191109
serenity_full.pkl,0.175282,0.187178,0.201332,0.20756,0.204659,0.170592,0.185512,0.194725,0.193051,0.188906,...,0.246345,0.224924,0.172531,0.266124,0.222577,0.203606,0.207356,0.219931,0.207356,0.209135
full_basic.pkl,0.56875,0.641007,0.749991,0.849343,0.816815,0.549122,0.611441,0.600543,0.605056,0.60566,...,0.589046,0.639663,0.565264,0.614744,0.79855,0.791996,0.765862,0.750277,0.765862,0.799921


# MLP for ensembling

In [7]:
models = [f"{group}_full.pkl" for group in groups] + ["full_basic.pkl"]

In [8]:
predicts = []

for model in models:
    with open(f"models/{model}", "rb") as f:
        predict = cloudpickle.load(f)
    
    predicts.append(predict)

In [12]:
train_data = train.sample(frac=0.6)
train_data.shape

(766764, 2378)

In [13]:
del train

In [14]:
X_train = []

for predict in predicts:
    print("predicting")
    X_train.append(predict(train_data[features], None))

predicting
predicting
predicting
predicting
predicting
predicting
predicting
predicting
predicting


In [None]:
MLP_train_df = pd.read_csv("MLP_train.csv")

In [46]:
MLP_train_df.drop(columns=["target"]).values

array([[0.49990373, 0.47025265, 0.48809268, ..., 0.49448219, 0.48746565,
        0.47819121],
       [0.50331176, 0.50262932, 0.50114788, ..., 0.49623947, 0.48983293,
        0.49783164],
       [0.50008626, 0.50601412, 0.49818204, ..., 0.50967552, 0.49225771,
        0.49660677],
       ...,
       [0.49635745, 0.51437336, 0.5065374 , ..., 0.49297389, 0.49847244,
        0.50639885],
       [0.50027506, 0.49360373, 0.52378326, ..., 0.50087996, 0.51380891,
        0.5095379 ],
       [0.50336977, 0.49647006, 0.49239544, ..., 0.4949743 , 0.50886535,
        0.49598697]])

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

In [66]:
class MLP(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(MLP, self).__init__()
        layers = []
        layers.append(nn.Linear(input_size, hidden_size))
        layers.append(nn.ReLU())
        for _ in range(num_layers - 1):
            layers.append(nn.Linear(hidden_size, hidden_size))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(hidden_size, output_size))
        layers.append(nn.Sigmoid())
        self.model = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.model(x)

In [91]:
X_train = torch.tensor(MLP_train_df.drop(columns=["target"]).values).float()
y_train = torch.tensor(MLP_train_df["target"].values).float()

# Normalise the data
mean = X_train.mean(dim=0)
std = X_train.std(dim=0)
X_train = (X_train - mean) / std

# Normalise the target
mean_target = y_train.mean()
std_target = y_train.std()
y_train = (y_train - mean_target) / std_target

# Create DataLoader for training data
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Initialize MLP
mlp = MLP(9, hidden_size=5, output_size=1, num_layers=1)

# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(mlp.parameters(), lr=1e-8)

# Training loop
num_epochs = 1000
for epoch in range(num_epochs):
    total_loss = 0
    for i, (inputs, targets) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = mlp(inputs).squeeze()
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        
        # Print loss every 10 iterations
        if (i + 1) % 10 == 0:
            print(f"Iteration [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}")

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Avg Loss: {avg_loss:.4f}")



Iteration [10/23962], Loss: nan
Iteration [20/23962], Loss: nan
Iteration [30/23962], Loss: nan
Iteration [40/23962], Loss: nan
Iteration [50/23962], Loss: nan
Iteration [60/23962], Loss: nan
Iteration [70/23962], Loss: nan
Iteration [80/23962], Loss: nan
Iteration [90/23962], Loss: nan
Iteration [100/23962], Loss: nan
Iteration [110/23962], Loss: nan
Iteration [120/23962], Loss: nan
Iteration [130/23962], Loss: nan
Iteration [140/23962], Loss: nan
Iteration [150/23962], Loss: nan
Iteration [160/23962], Loss: nan
Iteration [170/23962], Loss: nan
Iteration [180/23962], Loss: nan
Iteration [190/23962], Loss: nan
Iteration [200/23962], Loss: nan
Iteration [210/23962], Loss: nan
Iteration [220/23962], Loss: nan
Iteration [230/23962], Loss: nan
Iteration [240/23962], Loss: nan
Iteration [250/23962], Loss: nan
Iteration [260/23962], Loss: nan
Iteration [270/23962], Loss: nan
Iteration [280/23962], Loss: nan
Iteration [290/23962], Loss: nan
Iteration [300/23962], Loss: nan
Iteration [310/2396

KeyboardInterrupt: 

### TESTING

In [123]:
predicts[0](train, None)

Unnamed: 0_level_0,prediction
id,Unnamed: 1_level_1
n003bba8a98662e4,0.495021
n003bee128c2fcfc,0.497133
n0048ac83aff7194,0.489885
n00691bec80d3e02,0.496798
n00b8720a2fdc4f2,0.494003
...,...
nffc93463f094655,0.493693
nffdaf3cd2c727c9,0.492277
nffe0f7cdc53c812,0.501777
nfff0d7e8380b4ac,0.498616


In [44]:
import numpy as np

In [47]:
def get_model_predictions(models, features):
    predictions = []
    for model in models:
        predictions.append(model(features, None)["prediction"].values)

    return np.array(predictions).T

In [30]:
# Correlations with live
models = [f"{group}_full.pkl" for group in groups] + ["full_basic.pkl"]
correlations = pd.DataFrame(columns=live_benchmark_models.columns, index=models)

predicts = []

for model in models:
    with open(f"models/{model}", "rb") as f:
        predict = cloudpickle.load(f)
    
    predicts.append(predict)

In [32]:
inputs = train.iloc[0:10]

In [59]:
x = torch.tensor(get_model_predictions(predicts, inputs))

In [68]:
x_pad = F.pad(x.unsqueeze(-1), (0, 5), value=0)
x_pad

tensor([[[0.4950, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.5012, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4890, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4823, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4968, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4925, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4886, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4947, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.5006, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],

        [[0.4971, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.5009, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4992, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.5003, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4991, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.5049, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4943, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4991, 0.0000, 0.0000, 0.0000, 0.0000, 0.00