In [1]:
import pandas as pd
import sklearn
import torch
import numpy as np
from  sklearn.model_selection import train_test_split
import torch.nn as nn
from torch.utils.data import Dataset
from torch import optim

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def preprocess_data(data_path, with_dummies=True):
    train_data = pd.read_csv(data_path)
    train_data.drop(columns=["id"], inplace=True)
    train_data[["święto", "dzień_roboczy"]] = train_data[["święto", "dzień_roboczy"]].astype(int)
    train_data[['year', 'month', 'day']] = train_data['data'].str.split('-', expand=True).astype(int)
    train_data.drop("data", axis=1, inplace=True)
    train_data.drop("year", axis=1, inplace=True)
    train_data.drop("month", axis=1, inplace=True)
    train_data.drop("day", axis=1, inplace=True)
    if with_dummies:
        return pd.get_dummies(train_data, columns=['pogoda'], dtype=int)
    return train_data

In [3]:
def get_train_and_validate_data():
    train_data_with_dummies = preprocess_data("bit-x-adata/train.csv")

    X = train_data_with_dummies.drop("studenty_ms", axis=1).values
    Y = train_data_with_dummies["studenty_ms"].values.reshape(-1,1)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_train, X_valid, y_train, y_valid = train_test_split(X_scaled, Y, test_size=0.2, random_state=42)

    return X_train, X_valid, y_train, y_valid

In [4]:
X_train, X_valid, y_train, y_valid = get_train_and_validate_data()

In [5]:
X_train

array([[-0.20739034,  0.66269838, -1.37635436, ..., -0.3352392 ,
        -0.50321961,  0.65947801],
       [-0.20739034, -1.50898212, -1.92648776, ...,  2.98294472,
        -0.50321961, -1.51635079],
       [-0.20739034,  0.66269838, -1.04627432, ..., -0.3352392 ,
         1.98720396, -1.51635079],
       ...,
       [-0.20739034,  0.66269838,  1.37431263, ..., -0.3352392 ,
        -0.50321961,  0.65947801],
       [-0.20739034, -1.50898212, -0.05603421, ..., -0.3352392 ,
        -0.50321961,  0.65947801],
       [-0.20739034,  0.66269838,  0.82417923, ..., -0.3352392 ,
        -0.50321961,  0.65947801]], shape=(388, 9))

In [6]:
X_train = torch.from_numpy(X_train).float()
y_train = torch.from_numpy(y_train).float()

In [7]:
X_train.shape[1]

9

In [8]:
class RMSLELoss(nn.Module):
    def __init__(self, eps: float = 1e-6):
        super().__init__()
        self.mse = nn.MSELoss()
        self.eps = eps

    def forward(self, pred, actual):
        # ensure non-negative and stable log
        pred_clamped = torch.clamp(pred, min=0.0)
        actual_clamped = torch.clamp(actual, min=0.0)
        # use log1p for stability
        return torch.sqrt(self.mse(torch.log1p(pred_clamped + self.eps), torch.log1p(actual_clamped + self.eps)))

In [9]:
class MyDataset(Dataset):
    def __init__(self, data, y):
        super().__init__()

        self.data = data
        self.y = y

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        return self.data[idx], self.y[idx]



In [10]:
# BEST KAGGLE
# model loss = 0.87
class GepardPred(nn.Module):
    def __init__(self, input_size: int, dropout_rate: float = 0.5):
        super().__init__()

        self.model = nn.Sequential(
            nn.Linear(input_size, 256),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(256, 1),
            nn.Softplus()
        )

    def forward(self, x):
        return self.model(x)

In [11]:
from torch.utils.data import DataLoader

learning_rate = 1e-3
dropout_p = 0.3
l2_reg = 1e-4
batch_size = 128
max_epochs = 300

early_stopping_patience = 4

In [12]:
from copy import deepcopy

In [13]:
def evaluate_model(model: GepardPred, X, y, loss_fn):
    model.eval()
    with torch.no_grad():
        inputs = torch.tensor(X, dtype=torch.float32)
        targets = torch.tensor(y, dtype=torch.float32)

        # outputs = model.predict_int(inputs)
        outputs = model(inputs)
        loss = loss_fn(outputs, targets)

    return {"loss": loss.item()}

In [14]:

model = GepardPred(
    input_size=X_train.shape[1],
    dropout_rate=dropout_p
)

optimizer = optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    weight_decay=l2_reg
)

train_dataset = MyDataset(
    X_train,
    y_train
)

data_loader = DataLoader(
    train_dataset,
    batch_size
)

loss_fn = RMSLELoss()

best_model = model
best_threshold = None
patience = 100
steps_without_improvement = 0
best_loss = np.inf

for epoch in range(max_epochs):
    model.train()

    for X_batch, y_batch in data_loader:
        y_pred = model(X_batch)

        loss = loss_fn(y_pred, y_batch)
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

    valid_metrics = evaluate_model(model, X_valid, y_valid, loss_fn)

    valid_loss = valid_metrics["loss"]

    print(f"Epoch {epoch} train loss: {loss.item():.4f}, eval loss {valid_loss:.4f}")

    if valid_loss < best_loss:
        best_loss = valid_loss
        best_model = deepcopy(model)

        steps_without_improvement = 0
    else:
        steps_without_improvement += 1
        if steps_without_improvement >= patience:

            print(f"Early stopping at epoch {epoch}")
            break


Epoch 0 train loss: 2.7396, eval loss 2.8022
Epoch 1 train loss: 2.3450, eval loss 2.4135
Epoch 2 train loss: 1.7159, eval loss 1.8883
Epoch 3 train loss: 1.0998, eval loss 1.4733
Epoch 4 train loss: 0.7005, eval loss 1.2548
Epoch 5 train loss: 0.3249, eval loss 1.1889
Epoch 6 train loss: 0.3398, eval loss 1.1779
Epoch 7 train loss: 0.3732, eval loss 1.1510
Epoch 8 train loss: 0.4293, eval loss 1.0964
Epoch 9 train loss: 0.2351, eval loss 1.0303
Epoch 10 train loss: 0.2746, eval loss 0.9739
Epoch 11 train loss: 0.2139, eval loss 0.9385
Epoch 12 train loss: 0.2424, eval loss 0.9267
Epoch 13 train loss: 0.1960, eval loss 0.9226
Epoch 14 train loss: 0.1747, eval loss 0.9139
Epoch 15 train loss: 0.2612, eval loss 0.8978
Epoch 16 train loss: 0.2082, eval loss 0.8856
Epoch 17 train loss: 0.2352, eval loss 0.8756
Epoch 18 train loss: 0.1496, eval loss 0.8715
Epoch 19 train loss: 0.2711, eval loss 0.8680
Epoch 20 train loss: 0.1552, eval loss 0.8660
Epoch 21 train loss: 0.2028, eval loss 0.866

In [15]:
final_test_data = preprocess_data("bit-x-adata/test.csv").values
scaler = StandardScaler()
X_test = scaler.fit_transform(final_test_data)
X_test = torch.from_numpy(X_test).float()

In [16]:
def pred_to_csv(model, X_test, path="bit-x-adata/test.csv"):
    model.eval()
    test_data = pd.read_csv(path)
    ids = test_data["id"].values

    y_pred = model(X_test)
    y_pred = y_pred.detach().numpy().flatten()

    submission = pd.DataFrame({
        "id": ids,
        "studenty_ms": y_pred.astype(np.int64)
    })

    submission.to_csv('comunicadores_submission.csv', index=False)

In [17]:
pred_to_csv(best_model, X_test)

In [18]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_log_error
import joblib
import numpy as np
import torch

def rmsle_np(y_true, y_pred):
    y_t = np.clip(y_true, 0, None)
    y_p = np.clip(y_pred, 0, None)
    return np.sqrt(np.mean((np.log1p(y_p) - np.log1p(y_t)) ** 2))

# get fresh numpy train/valid (same preprocessing as used for NN)
X_train_np, X_valid_np, y_train_np, y_valid_np = get_train_and_validate_data()
# ensure 1D targets for sklearn
y_train_1d = y_train_np.ravel()
y_valid_1d = y_valid_np.ravel()

# train RF
rf = RandomForestRegressor(n_estimators=300, max_depth=12, n_jobs=-1, random_state=42)
rf.fit(X_train_np, y_train_1d)

# evaluate RF
rf_pred_valid = rf.predict(X_valid_np)
print("RF valid RMSLE:", rmsle_np(y_valid_1d, rf_pred_valid))

# evaluate NN (best_model) on same validation set
best_model.eval()
with torch.no_grad():
    nn_inputs = torch.tensor(X_valid_np, dtype=torch.float32)
    nn_out = best_model(nn_inputs).cpu().numpy().flatten()
print("NN valid RMSLE:", rmsle_np(y_valid_1d, nn_out))

# simple ensemble (average)
ensemble_pred = (rf_pred_valid + nn_out) / 2.0
print("Ensemble valid RMSLE:", rmsle_np(y_valid_1d, ensemble_pred))

# helper to produce submission using RF or ensemble
def pred_to_csv_with_rf(rf_model, nn_model, X_test, use_ensemble=False, path="bit-x-adata/test.csv"):
    test_df = pd.read_csv(path)
    ids = test_df["id"].values

    rf_preds = rf_model.predict(X_test)
    if use_ensemble:
        nn_model.eval()
        with torch.no_grad():
            nn_preds = nn_model(torch.from_numpy(X_test).float()).cpu().numpy().flatten()
        preds = (rf_preds + nn_preds) / 2.0
    else:
        preds = rf_preds

    preds = np.round(np.clip(preds, 0, None)).astype(np.int64)

    submission = pd.DataFrame({
        "id": ids,
        "studenty_ms": preds
    })
    submission.to_csv("comunicadores_submission_rf.csv", index=False)

RF valid RMSLE: 1.4680854099876028
NN valid RMSLE: 0.8504811425012122
Ensemble valid RMSLE: 1.21458686333347


In [19]:
def pred_to_csv_rf(model, X_test, path="bit-x-adata/test.csv"):
    test_data = pd.read_csv(path)
    ids = test_data["id"].values

    y_pred = model.predict(X_test)

    submission = pd.DataFrame({
        "id": ids,
        "studenty_ms": y_pred.astype(np.int64)
    })

    submission.to_csv('comunicadores_submission.csv', index=False)

In [20]:
pred_to_csv_rf(rf, X_test)