In [58]:
from matplotlib import pyplot as plt

In [59]:
import os
from tqdm import tqdm
import pathlib
import numpy as np
import pandas as pd
import pickle
from torch.utils.tensorboard import SummaryWriter

from sklearn.metrics import mean_absolute_percentage_error as mape
from sklearn.preprocessing import MinMaxScaler

import torch as t

from datetime import datetime as dt

# Prefer the GPU whenever torch can see one; otherwise everything runs on the CPU.
if t.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = 'cpu'

# Notebook-wide display setup: tqdm-aware pandas methods, and no column truncation
# when a DataFrame is rendered.
tqdm.pandas()
pd.set_option('display.max_columns', None)

In [60]:
# Directory that holds the input parquet files.
DATA_DIR = pathlib.Path("./data/")
# Random seed for the whole notebook.
RS = 3984765

# Actually apply the seed: weight initialisation (torch) and any numpy-based
# randomness would otherwise differ on every kernel restart.
np.random.seed(RS)
t.manual_seed(RS)

In [61]:
# Read the labelled data and hold out the trailing 20% of rows (file order, no
# shuffling) as the test split.
# NOTE(review): the name `train` is rebound to a function further down the
# notebook, so this cell must run before that definition is used.
train = pd.read_parquet(DATA_DIR / "train.parquet")
index_split = int(len(train) * 0.8)
train, test = train.iloc[:index_split], train.iloc[index_split:]
test.shape

(30684, 27)

In [62]:
# 99th percentile of feature0 on the training split (quick look at the upper tail).
train.loc[:, "feature0"].quantile(q=0.99)

63.37731798107359

In [63]:


class Pipe:
    """Preprocessing pipeline: raw feature frame -> torch ``TensorDataset`` + ``DataLoader``.

    Steps applied by :meth:`transform`:

    1. (training data only) drop rows whose value in any ``cols_to_short`` column
       is at or above that column's ``TRIM_QUANTILE`` quantile;
    2. encode ``feature4`` as 1 where it equals ``"gas1"`` and 0 otherwise;
    3. min-max scale every column with the scaler fitted in :meth:`fit`;
    4. replace the scaled ``feature4`` column by a trailing 0/1 ``cluster`` column.

    Parameters
    ----------
    cols_to_short : list of str
        Columns used for the upper-tail row trimming.
    batch_size : int
        Batch size of the returned ``DataLoader``.
    """

    # Rows at or above this per-column quantile are treated as outliers.
    TRIM_QUANTILE = 0.999

    def __init__(self, cols_to_short, batch_size=64):
        self.min_max = MinMaxScaler()
        self.cols_to_short = cols_to_short
        self.batch_size = batch_size

    @staticmethod
    def _encode_gas(X):
        """Return a 0/1 integer array that is 1 where ``feature4 == "gas1"``."""
        return (X["feature4"] == "gas1").to_numpy().astype(int)

    def _trim_mask(self, X):
        """Return a positional boolean mask of the rows that survive trimming.

        Columns are processed in order and each quantile is computed only on
        the rows still kept at that point, which reproduces filtering the
        frame column by column.
        """
        keep = np.ones(len(X), dtype=bool)
        for col in self.cols_to_short:
            values = X[col]
            threshold = values[keep].quantile(self.TRIM_QUANTILE)
            keep &= (values < threshold).to_numpy()
        return keep

    def fit(self, X: pd.DataFrame):
        """Fit the min-max scaler on the trimmed, numerically encoded features."""
        X_ = self.quntize(X)
        # `assign` builds a new frame, so no column is written into a filtered slice.
        X_ = X_.assign(feature4=self._encode_gas(X_))
        self.min_max.fit(X_)

    def quntize(self, X):
        """Return a copy of ``X`` without the upper-tail outlier rows."""
        return X.loc[self._trim_mask(X)].copy()

    def transform(self, X, y=None, train_df=False):
        """Convert features (and optional targets) into a dataset and a loader.

        Parameters
        ----------
        X : pd.DataFrame
            Raw features; must contain ``feature4`` and every trimming column.
        y : pd.DataFrame or pd.Series, optional
            Targets whose rows correspond positionally to the rows of ``X``.
        train_df : bool
            When true, outlier rows are removed from ``X`` (and ``y``).

        Returns
        -------
        (TensorDataset, DataLoader)
            The dataset holds float32 tensors; the loader is created with only
            ``batch_size`` set, so batches come in row order.
        """
        X_, y_ = X, y
        if train_df:
            # Select by position: the same boolean mask is applied to X and y, so
            # the result is correct whatever the frames' index labels are
            # (using index labels as positions only works for a 0..n-1 index).
            keep = self._trim_mask(X)
            X_ = X.loc[keep]
            if y is not None:
                y_ = y.loc[keep]

        # Compute the 0/1 flag from the raw column rather than recovering it from
        # the scaled values, which would depend on what the scaler saw in `fit`.
        cluster = self._encode_gas(X_)
        X_ = X_.assign(feature4=cluster)

        scaled = pd.DataFrame(self.min_max.transform(X_), columns=X_.columns)
        scaled = scaled.drop(columns="feature4")
        scaled["cluster"] = cluster

        X_t = t.tensor(scaled.to_numpy(), dtype=t.float32)

        if y_ is not None:
            y_t = t.tensor(y_.to_numpy(), dtype=t.float32)
            ds = t.utils.data.TensorDataset(X_t, y_t)
        else:
            ds = t.utils.data.TensorDataset(X_t)

        ldr = t.utils.data.DataLoader(ds, batch_size=self.batch_size)

        return ds, ldr

    def pack(self, path):
        """Pickle the scaler, trimming columns and batch size to ``path``."""
        to_pack = {
            "scaler": self.min_max,
            "cols": self.cols_to_short,
            "batch_size": self.batch_size,
        }
        with open(path, "wb") as f:
            pickle.dump(to_pack, f)

    def unpack(self, path):
        """Restore the state written by :meth:`pack`.

        Uses ``pickle.load``; only open files that come from a trusted source.
        """
        with open(path, "rb") as f:
            params = pickle.load(f)

        self.batch_size = params["batch_size"]
        self.min_max = params["scaler"]
        self.cols_to_short = params["cols"]

    
    

# Columns whose upper tail is trimmed from the training rows, listed by feature number.
pipe = Pipe(
    cols_to_short=[f"feature{number}" for number in (0, 1, 2, 3, 5, 7, 12, 15, 17)],
    batch_size=512,
)

# Fit on the features only; both target columns are excluded.
pipe.fit(train.drop(columns=["target0", "target1"]))


In [64]:
# Training loader: outlier rows are trimmed, so the row count printed after the
# transform can be lower than the one printed before it.
print(train.shape)
train_ds, train_ldr = pipe.transform(
    X=train.drop(columns=["target0", "target1"]),
    y=train[["target0", "target1"]],
    train_df=True,
)
print(train_ds.tensors[0].shape)

# Test loader: no trimming, every held-out row is kept.
print(test.shape)
test_ds, test_ldr = pipe.transform(
    X=test.drop(columns=["target0", "target1"]),
    y=test[["target0", "target1"]],
)
print(test_ds.tensors[0].shape)

(122733, 27)
torch.Size([121629, 25])
(30684, 27)
torch.Size([30684, 25])


In [65]:
class fc_model(t.nn.Module):
    """Two independent fully-connected regressors selected per row.

    The last input column is a 0/1 flag: rows with flag 1 are sent through the
    ``one`` branch, rows with flag 0 through the ``zero`` branch. Both branches
    receive the full input row (flag included) and share the same layout:
    ``Linear -> ReLU -> Linear -> ReLU -> Linear``.

    Parameters
    ----------
    input_dim : int
        Number of input columns, including the trailing flag column.
    layers : sequence of three ints
        Widths of the two hidden layers and of the output layer.
    device : str
        Stored on the instance as ``self.device`` for callers that read it;
        ``forward`` does not depend on it.
    """

    def __init__(self, input_dim, layers=(256, 16, 2), device='cpu'):
        super().__init__()
        self.device = device

        # Submodule names and positions (zero.0, zero.2, zero.4, one.0, ...) are
        # part of the state_dict layout and must stay stable for saved weights.
        self.zero = self._make_branch(input_dim, layers)
        self.one = self._make_branch(input_dim, layers)

    @staticmethod
    def _make_branch(input_dim, layers):
        """Build one ``Linear -> ReLU -> Linear -> ReLU -> Linear`` regressor."""
        return t.nn.Sequential(
            t.nn.Linear(input_dim, layers[0]),
            t.nn.ReLU(),
            t.nn.Linear(layers[0], layers[1]),
            t.nn.ReLU(),
            t.nn.Linear(layers[1], layers[2]),
        )

    def forward(self, X):
        """Return a ``(len(X), out_features)`` float32 tensor of predictions.

        Rows whose flag is neither 0 nor 1 are left as zeros.
        """
        # Allocate on the input's device so the module works wherever it (and
        # its input) currently live, and size the output from the last layer.
        out_features = self.zero[-1].out_features
        out = X.new_zeros((X.shape[0], out_features), dtype=t.float32)

        flag = X[:, -1]
        rows_one = flag == 1
        rows_zero = flag == 0

        out[rows_one] = self.one(X[rows_one])
        out[rows_zero] = self.zero(X[rows_zero])

        return out



In [66]:


def train(train_ldr, 
          test_ldr, 
          model, 
          optimizer, 
          loss_f, 
          epochs=10, 
          device='cpu',
          sm = None,
          save_each = 20):
    """Run the optimisation loop, reporting MAPE on both loaders after every epoch.

    Parameters
    ----------
    train_ldr, test_ldr : iterable of ``(X, y)`` batches
        Loaders used for optimisation / evaluation.
    model : torch.nn.Module
        Model to optimise in place.
    optimizer : torch optimizer
        Optimizer already bound to ``model``'s parameters.
    loss_f : callable
        ``loss_f(prediction, target)`` returning a scalar tensor.
    epochs : int
        Number of passes over ``train_ldr``.
    device : str
        Device each batch is moved to before the forward pass.
    sm : SummaryWriter, optional
        When given, receives ``Loss/train``, ``MAPE/train`` and ``MAPE/test``.
    save_each : int
        A checkpoint is written on every epoch that is a multiple of this
        value; a falsy value disables intermediate checkpoints.

    Notes
    -----
    Checkpoints are written with ``save_model`` and the notebook-level ``pipe``
    object, which is read from the global namespace. This function's name also
    replaces the ``train`` DataFrame created earlier in the notebook.
    """
    bar = tqdm(range(epochs))
    for epoch in bar:
        loss_cum = 0.0
        for X, y in train_ldr:
            optimizer.zero_grad()
            out = model(X.to(device=device))
            loss = loss_f(out, y.to(device=device))
            loss.backward()
            optimizer.step()
            # Accumulate a plain float: summing the tensors themselves would keep
            # every batch's autograd graph referenced until the epoch ends.
            loss_cum += loss.item()

        eval_test = eval(test_ldr, model, device)
        eval_train = eval(train_ldr, model, device)
        bar.set_description(f'''epoch {epoch}, loss {loss_cum:.3f}, mape_train {(eval_train * 100).round(2)}, mape_test {(eval_test * 100).round(2)}''')
        if sm is not None:
            sm.add_scalar('Loss/train', loss_cum, epoch)
            sm.add_scalar('MAPE/train', eval_train.sum() / 2, epoch)
            sm.add_scalar('MAPE/test', eval_test.sum() / 2, epoch)
        # Honour the `save_each` argument instead of a fixed interval.
        if save_each and epoch % save_each == 0:
            save_model(model, pipe, "./models/", "fc_")




def eval(test_ldr, model, device='cpu'):
    """Return the per-target MAPE of ``model`` over every sample in ``test_ldr``.

    Parameters
    ----------
    test_ldr : iterable of ``(X, y)`` batches
        ``y`` must hold two target columns.
    model : callable
        Maps a feature batch to predictions with the same shape as ``y``.
    device : str
        Device each feature batch is moved to before the forward pass.

    Returns
    -------
    np.ndarray
        float32 array of length 2 (one MAPE per target, as a fraction);
        NaN for both targets when the loader yields no samples.

    Notes
    -----
    The name shadows Python's built-in ``eval`` inside this notebook.
    """
    weighted_sum = np.zeros(2, dtype=np.float64)
    n_samples = 0

    with t.no_grad():
        for X, y in test_ldr:
            out = model(X.to(device=device))
            batch_mape = mape(
                y.cpu().numpy(),
                out.cpu().detach().numpy(),
                multioutput='raw_values',
            )
            # Weight each batch by its size so a short final batch does not
            # count as much as a full one.
            weighted_sum += batch_mape * len(X)
            n_samples += len(X)

    if n_samples == 0:
        # Nothing to score: report NaN explicitly instead of dividing by zero.
        return np.full(2, np.nan, dtype=np.float32)

    return (weighted_sum / n_samples).astype(np.float32)

def pred(ldr, model, device='cpu'):
    """Run ``model`` over every batch of ``ldr`` and collect the predictions.

    Parameters
    ----------
    ldr : iterable of batches
        Each batch is either ``(X, y)`` or ``(X,)``; only ``X`` is used, so
        loaders built without targets are accepted as well.
    model : callable
        Maps a feature batch to a two-column prediction tensor.
    device : str
        Device each feature batch is moved to before the forward pass.

    Returns
    -------
    pd.DataFrame
        Columns ``target0`` and ``target1``, one row per input sample, in
        loader order. Empty (zero rows) when the loader yields nothing.
    """
    chunks = []

    with t.no_grad():
        for batch in ldr:
            X = batch[0] if isinstance(batch, (list, tuple)) else batch
            chunks.append(model(X.to(device=device)).to('cpu'))

    if chunks:
        # Concatenate once at the end instead of re-copying the growing result
        # for every batch.
        values = t.cat(chunks, dim=0).detach().numpy()
    else:
        values = np.empty((0, 2), dtype=np.float32)

    return pd.DataFrame(values, columns=["target0", "target1"])



def save_model(model, pipe, path, prefix):
    """Write a timestamped checkpoint of the model weights and the pipeline.

    Two files are created inside ``path``:

    * ``<prefix>model_<timestamp>.t``   - ``model.state_dict()`` via ``torch.save``
    * ``<prefix>pipe_<timestamp>.pckl`` - ``[pipe]`` (a one-element list) pickled
      with the highest protocol

    Parameters
    ----------
    model : torch.nn.Module
        Model whose ``state_dict`` is saved.
    pipe : object
        Picklable preprocessing object stored next to the weights.
    path : str or os.PathLike
        Target directory; created if missing, trailing separator optional.
    prefix : str
        Prepended to both file names.
    """
    # NOTE: the timestamp contains a space and colons, which some file systems
    # do not accept in file names; kept as is so existing file names still match.
    timestamp = dt.now().strftime('%Y-%m-%d %H:%M:%S')
    filename_m = prefix + "model_" + timestamp + ".t"
    filename_p = prefix + "pipe_" + timestamp + ".pckl"

    directory = os.fspath(path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    t.save(model.state_dict(), os.path.join(directory, filename_m))
    with open(os.path.join(directory, filename_p), "wb") as f:
        pickle.dump([pipe], f, -1)

    

In [71]:
# Network sized to the number of columns in the prepared training tensor.
model = fc_model(train_ds.tensors[0].shape[1], layers=[512, 64, 2], device=DEVICE).to(device=DEVICE)
optimizer = t.optim.Adam(model.parameters())
loss_f = t.nn.MSELoss()

sm = SummaryWriter("./runs")

try:
    train(train_ldr, test_ldr, model, optimizer, loss_f, 500, DEVICE, sm)
finally:
    # Flush and release the TensorBoard event file even if training stops early.
    sm.close()

# Final checkpoint after the last epoch.
save_model(model, pipe, "./models/", "model_linear_")

epoch 355, loss 78.457, mape_train [0.45 1.88], mape_test [0.47 1.93]:  71%|███████   | 356/500 [09:42<03:52,  1.61s/it]   

In [68]:
# Predictions for the held-out rows, in loader order.
pred(ldr=test_ldr, model=model, device=DEVICE)

Unnamed: 0,target0,target1
0,75.400444,39.451164
1,75.434258,48.968521
2,18.958340,2.854556
3,32.652805,7.321374
4,78.701927,42.199982
...,...,...
30679,26.223734,4.336345
30680,19.790100,7.048435
30681,28.555611,5.647378
30682,71.569077,50.264404


In [None]:
# Rebuild the architecture on the CPU and load the submitted weights.
# `map_location` lets a checkpoint saved from a GPU run load on a CPU-only machine,
# and passing 'cpu' as the device keeps the model's `device` attribute consistent
# with where its weights actually are.
model_ = fc_model(25, [512, 64, 2], 'cpu')
model_.load_state_dict(t.load("./submission/model.t", map_location='cpu'))
model_ = model_.cpu()

In [None]:
# Keep the model's `device` attribute in line with its CPU weights, then score
# the reloaded model on the held-out loader.
setattr(model_, "device", 'cpu')
eval(test_ldr, model_, device='cpu')

array([0.00533095, 0.0228748 ], dtype=float32)

In [None]:
# Round-trip check: serialise the fitted pipeline, then restore it into a fresh,
# unconfigured instance.
pipe.pack(path="./src/pipe.pckl")
pipe = Pipe(cols_to_short=[])
pipe.unpack(path="./src/pipe.pckl")

In [None]:
# The restored pipeline must still transform features when no targets are supplied.
pipe.transform(X=test.drop(columns=["target0", "target1"]))

(<torch.utils.data.dataset.TensorDataset at 0x7fdd3472c1f0>,
 <torch.utils.data.dataloader.DataLoader at 0x7fdcc2f4bf10>)

In [None]:
# Load a previously pickled pipeline checkpoint.
# NOTE(review): if this file was written by `save_model`, the loaded object is a
# one-element list wrapping the pipeline — confirm before use.
# Unpickling executes code from the file; only load checkpoints you created.
pipe_ = pickle.loads(pathlib.Path("./src/pipe_2023-05-27 18:20:59.pckl").read_bytes())