In [None]:
# Install dependencies
%pip install -q numerapi pandas lightgbm cloudpickle pyarrow scikit-learn scipy==1.10.1

In [1]:
from numerapi import NumerAPI
import pandas as pd
import json
napi = NumerAPI()

# use one of the latest data versions
DATA_VERSION = "v4.3"

In [None]:
napi.download_dataset("v4.3/train_int8.parquet", "v4.3/train_int8.parquet")
napi.download_dataset("v4.3/validation_int8.parquet", "v4.3/validation_int8.parquet")
napi.download_dataset("v4.3/live_int8.parquet", "v4.3/live_int8.parquet")
napi.download_dataset("v4.3/live_example_preds.parquet", "v4.3/live_example_preds.parquet")
napi.download_dataset("v4.3/validation_example_preds.parquet", "v4.3/validation_example_preds.parquet")
napi.download_dataset("v4.3/features.json", "v4.3/features.json")
napi.download_dataset("v4.3/meta_model.parquet", "v4.3/meta_model.parquet")
napi.download_dataset("v4.3/live_benchmark_models.parquet", "v4.3/live_benchmark_models.parquet")
napi.download_dataset("v4.3/validation_benchmark_models.parquet", "v4.3/validation_benchmark_models.parquet")
napi.download_dataset("v4.3/train_benchmark_models.parquet", "v4.3/train_benchmark_models.parquet")

In [2]:
# Load data
feature_metadata = json.load(open(f"{DATA_VERSION}/features.json"))
features = feature_metadata["feature_sets"]["medium"] # use "all" for better performance. Requires more RAM.
train = pd.read_parquet(f"{DATA_VERSION}/train_int8.parquet", columns=["era"]+features+["target"])

In [3]:
# Downsample for speed
train = train[train["era"].isin(train["era"].unique()[::4])]  # skip this step for better performance

In [4]:
train

Unnamed: 0_level_0,era,feature_abating_unadaptable_weakfish,feature_ablest_mauritanian_elding,feature_acclimatisable_unfeigned_maghreb,feature_accommodable_crinite_cleft,feature_accretive_sorrier_skedaddle,feature_acetose_periotic_coronation,feature_adam_incantational_winemaker,feature_additive_untrustworthy_hierologist,feature_adsorbed_blizzardy_burlesque,...,feature_wombed_liberatory_malva,feature_won_stalwart_eisenstein,feature_wrathful_prolix_colotomy,feature_wrinkliest_unmaintainable_usk,feature_wrought_muckier_temporality,feature_yauld_antediluvial_subprefecture,feature_yelled_hysteretic_eath,feature_yoruban_unapplied_tawse,feature_zygodactyl_exponible_lathi,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n003bba8a98662e4,0001,0,4,0,4,2,0,2,1,4,...,0,2,4,2,4,3,2,2,3,0.25
n003bee128c2fcfc,0001,4,2,2,2,2,3,2,1,3,...,3,2,0,2,2,1,3,2,1,0.75
n0048ac83aff7194,0001,4,4,2,0,2,0,2,4,1,...,1,2,0,2,3,2,1,2,2,0.25
n00691bec80d3e02,0001,1,4,1,1,2,0,2,2,1,...,1,2,2,2,3,2,2,2,2,0.75
n00b8720a2fdc4f2,0001,0,2,0,0,2,0,2,3,1,...,1,2,0,2,0,1,1,2,1,0.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nffc2d5e4b79a7ae,0573,4,2,4,4,3,1,0,0,4,...,1,2,1,3,2,2,2,1,1,0.25
nffc7d24176548a4,0573,0,3,3,4,2,0,2,3,1,...,0,2,4,2,0,2,1,2,3,0.50
nffc9844c1c7a6a9,0573,4,1,1,2,0,3,2,4,0,...,3,0,3,0,2,0,0,1,2,0.50
nffd79773f4109bb,0573,0,0,1,3,1,4,2,1,2,...,4,1,2,0,1,1,0,1,0,0.50


# Transformer

In [5]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [6]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [7]:
class LayerNorm1d: # (used to be BatchNorm1d)

  def __init__(self, dim, eps=1e-5, momentum=0.1):
    self.eps = eps
    self.gamma = torch.ones(dim).to(device)
    self.beta = torch.zeros(dim).to(device)

  def __call__(self, x):
    # calculate the forward pass
    xmean = x.mean(1, keepdim=True) # batch mean
    xvar = x.var(1, keepdim=True) # batch variance
    xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance
    self.out = self.gamma * xhat + self.beta
    return self.out

  def parameters(self):
    return [self.gamma, self.beta]

class Head(nn.Module):
    """ single self-attention head"""

    def __init__(self, block_size, n_embd, n_head, dropout, bias: bool = False):
        super().__init__()
        self.key = nn.Linear(n_embd, n_embd // n_head, bias)
        self.query = nn.Linear(n_embd, n_embd // n_head, bias)
        self.value = nn.Linear(n_embd, n_embd // n_head, bias)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        b, t, c = x.shape
        k = self.key(x)
        q = self.query(x)
        v = self.value(x)
        w = (q @ k.transpose(-2, -1)) * (int(c) ** -0.5)
        w = w.masked_fill(self.tril[:t, :t] == 0, float('-inf'))
        w = F.softmax(w, dim=-1)
        w = self.dropout(w)
        a = w @ v
        return a
    
class MultiHeadAttention(nn.Module):
    """ multi-head self-attention """

    def __init__(self, block_size, n_embd, n_head, dropout, bias: bool = False):
        super().__init__()
        self.heads = nn.ModuleList([Head(block_size, n_embd, n_head, dropout, bias) for _ in range(n_head)])
        self.linear = nn.Linear(n_embd, n_embd)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        a = torch.cat([h(x) for h in self.heads], dim=-1)
        a = self.dropout(self.linear(a))
        return a
    
class FeedForward(nn.Module):
    """ simple positional feed-forward """

    def __init__(self, n_embd, n_hidden, dropout):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, n_embd),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)
    
class Block(nn.Module):
    """ an unassuming Transformer block """

    def __init__(self, block_size, n_embd, n_head, n_hidden, dropout, bias: bool = False):
        super().__init__()
        self.attn = MultiHeadAttention(block_size, n_embd, n_head, dropout, bias)
        self.ff = FeedForward(n_embd, n_hidden, dropout)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.ff(self.ln2(x))
        return x
    

class MLP(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size) 
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)  
    
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

class Transformer(nn.Module):
    """ a simple Transformer """

    def __init__(self, block_size, block_size_out, n_embd, n_head, n_layer, n_vocab, n_hidden, dropout, bias: bool = False):
        super().__init__()
        self.block_size = block_size
        self.block_size_out = block_size_out
        self.token_emb = nn.Embedding(n_vocab, n_embd)
        self.blocks = nn.Sequential(*[Block(block_size, n_embd, n_head, n_hidden, dropout, bias) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.head = MLP(n_embd, n_hidden, block_size_out)

    def forward(self, x, targets=None):
        b, t = x.shape
        assert t <= self.block_size, "x has to be within block size"

        print(x)
        print(x.dtype)
        x = self.token_emb(x)
        x = self.blocks(x)
        x = self.ln_f(x)
        out = self.head(x)
        print(out.shape)
        if self.block_size_out is not None:
            logits = logits[:, -self.block_size_out:, :]

        if targets is None:
            loss = None
            per_token_loss = None
        else:
            b, t, c = logits.shape
            logits = logits.contiguous().view(b*t, c)
            targets = targets.view(b*t)
            loss = F.cross_entropy(logits, targets)
            per_token_loss = F.cross_entropy(logits, targets, reduction='none').view(b, t).mean(dim=0)

        return logits, loss, per_token_loss
    
    def generate(self, x, max_new_tokens=6):
        for _ in range(max_new_tokens):
            x_cond = x[:, :]
            logits, _, _ = self(x_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            x_next = torch.multinomial(probs, 1)
            x = torch.cat((x, x_next), dim=1)
        return x

In [8]:
# Hyperparameters
batch_size = 16
block_size = train[features].shape[1]
block_size_out = 1
n_embd = 256
n_hidden = 1024
n_head = 256
n_layer = 4
n_vocab = 5
lr = 1e-4
n_epochs = 100
dropout = 0

In [9]:
model = Transformer(block_size, block_size_out, n_embd, n_head, n_layer, n_vocab, n_hidden, dropout).to(device)

# print the number of parameters in the model
print(sum(p.numel() for p in model.parameters())/1e6, 'M parameters')

3.421953 M parameters


In [10]:
train[features].iloc[0:16].values

array([[0, 4, 0, ..., 2, 2, 3],
       [4, 2, 2, ..., 3, 2, 1],
       [4, 4, 2, ..., 1, 2, 2],
       ...,
       [4, 4, 2, ..., 1, 2, 1],
       [0, 2, 0, ..., 0, 2, 4],
       [3, 0, 4, ..., 3, 2, 1]], dtype=int8)

In [11]:
model(torch.tensor(train[features].iloc[0:16].values).to(device).long())

tensor([[0, 4, 0,  ..., 2, 2, 3],
        [4, 2, 2,  ..., 3, 2, 1],
        [4, 4, 2,  ..., 1, 2, 2],
        ...,
        [4, 4, 2,  ..., 1, 2, 1],
        [0, 2, 0,  ..., 0, 2, 4],
        [3, 0, 4,  ..., 3, 2, 1]], device='cuda:0')
torch.int64


OutOfMemoryError: CUDA out of memory. Tried to allocate 32.00 MiB. GPU 0 has a total capacty of 4.00 GiB of which 0 bytes is free. Of the allocated memory 3.27 GiB is allocated by PyTorch, and 192.84 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF