# Tabular Playground Series - Nov 2021

## Dependencies

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

import torch
from torch.utils.data import TensorDataset, DataLoader
from torch.optim.lr_scheduler import StepLR
import torch.nn.functional as F

import os
import random

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

plt.style.use('seaborn-v0_8-darkgrid')
%matplotlib inline

SEED = 2502
DATA_PATH = "../data/tabular-playground-series-nov-2021/"

def set_seed(seed=2502):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)

set_seed(SEED)

In [2]:
print(f"PyTorch версия: {torch.__version__}")
print(f"CUDA доступна: {torch.cuda.is_available()}")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Используемое устройство: {device}")

PyTorch версия: 2.10.0+cu128
CUDA доступна: True
Используемое устройство: cuda


## Data

In [3]:
df_train = pd.read_csv(DATA_PATH + "train.csv")
df_test = pd.read_csv(DATA_PATH + "test.csv")

df_train.head()

Unnamed: 0,id,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f91,f92,f93,f94,f95,f96,f97,f98,f99,target
0,0,0.106643,3.59437,132.804,3.18428,0.081971,1.18859,3.73238,2.26627,2.09959,...,1.09862,0.013331,-0.011715,0.052759,0.0654,4.21125,1.97877,0.085974,0.240496,0
1,1,0.125021,1.67336,76.5336,3.37825,0.0994,5.09366,1.27562,-0.471318,4.54594,...,3.46017,0.017054,0.124863,0.154064,0.606848,-0.267928,2.57786,-0.020877,0.024719,0
2,2,0.03633,1.49747,233.546,2.19435,0.026914,3.12694,5.05687,3.84946,1.80187,...,4.883,0.085222,0.032396,0.116092,-0.001688,-0.520069,2.14112,0.124464,0.148209,0
3,3,-0.014077,0.246,779.967,1.89064,0.006948,1.53112,2.698,4.51733,4.50332,...,3.47439,-0.017103,-0.0081,0.062013,0.041193,0.511657,1.9686,0.040017,0.044873,0
4,4,-0.003259,3.71542,156.128,2.14772,0.018284,2.09859,4.15492,-0.038236,3.37145,...,1.91059,-0.042943,0.105616,0.125072,0.037509,1.04379,1.07481,-0.012819,0.072798,1


### EDA

In [4]:
df_train.shape

(600000, 102)

In [5]:
df_train.isna().sum().sum()

np.int64(0)

In [6]:
df_test.isna().sum().sum()

np.int64(0)

In [7]:
df_train['target'].value_counts()

target
1    303606
0    296394
Name: count, dtype: int64

### Dataset preparing

In [8]:
cols2drop = ['id']
target = 'target'
features = df_train.drop(columns=cols2drop + [target], errors='ignore').columns

X = df_train[features].to_numpy()
y = df_train[target].to_numpy()
X_test = df_test[features].to_numpy()

In [9]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.15, random_state=SEED, stratify=y
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

In [10]:
X_train_t = torch.FloatTensor(X_train)
X_val_t = torch.FloatTensor(X_val)
X_test_t = torch.FloatTensor(X_test)
y_train_t = torch.LongTensor(y_train)
y_val_t = torch.LongTensor(y_val)

train_dataset = TensorDataset(X_train_t, y_train_t)
val_dataset = TensorDataset(X_val_t, y_val_t)

train_loader = DataLoader(train_dataset, batch_size=2**14, shuffle=True, num_workers=8)
val_loader = DataLoader(val_dataset, batch_size=2**14, shuffle=False, num_workers=8)

## Model

In [11]:
class BinaryClassifier(torch.nn.Module):
    def __init__(self, input_dim: int, hidden_dims: list[int], dropout: float = 0.3):
        super().__init__()
        
        layers = []
        prev_dim = input_dim

        for hidden_dim in hidden_dims:
            layers.append(torch.nn.Linear(prev_dim, hidden_dim))
            layers.append(torch.nn.BatchNorm1d(hidden_dim))
            layers.append(torch.nn.ReLU())
            layers.append(torch.nn.Dropout(dropout))
            prev_dim = hidden_dim
        
        layers.append(torch.nn.Linear(prev_dim, 1))

        self.network = torch.nn.Sequential(*layers)
    
    def forward(self, x):
        return self.network(x)

model = BinaryClassifier(input_dim=X.shape[1], hidden_dims=[128, 64, 32]).to(device)
model

BinaryClassifier(
  (network): Sequential(
    (0): Linear(in_features=100, out_features=128, bias=True)
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=128, out_features=64, bias=True)
    (5): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.3, inplace=False)
    (8): Linear(in_features=64, out_features=32, bias=True)
    (9): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU()
    (11): Dropout(p=0.3, inplace=False)
    (12): Linear(in_features=32, out_features=1, bias=True)
  )
)

### Criterion & Optimizer & Scheduler

In [12]:
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
scheduler = StepLR(optimizer=optimizer, step_size=10, gamma=0.8)

## Train

In [13]:
# preds = []
# for X_batch, y_batch in val_loader:
#     pred = model(X_batch.to(device))[:, 0]
#     # print(roc_auc_score(y_batch.detach().cpu(), pred.detach().cpu()))
#     preds.append(pred.detach().cpu())
    
# torch.cat(preds).numpy()

In [14]:
num_epoches = 100
best_val_metric = 0.0
best_val_loss = np.inf
patience, counter = 30, 0

for epoch in tqdm(range(num_epoches)):
    model.train()

    train_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        y_pred = model(X_batch)
        
        loss = criterion(y_pred, y_batch.float().view(-1, 1))
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()

    train_loss /= len(train_loader)

    model.eval()

    val_loss = 0
    y_preds = []
    y_true = []

    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            y_pred = model(X_batch)
            
            y_preds.append(y_pred.detach().cpu())
            y_true.append(y_batch.detach().cpu())

            loss = criterion(y_pred, y_batch.float().view(-1, 1))
            val_loss += loss.item()

    y_preds = torch.cat(y_preds).numpy()
    y_true = torch.cat(y_true).numpy()

    val_loss /= len(val_loader)
    val_metric = roc_auc_score(y_true, y_preds)

    if val_metric > best_val_metric:
        best_val_metric = val_metric
        counter = 0
        torch.save(model.state_dict(), '../models/best_model.pth')
    else:
        counter += 1
        if counter >= patience:
            print(f"Early stopping на эпохе {epoch+1}")
            break

    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1:3d} | Train Loss: {train_loss:.4f} LR: {scheduler.get_last_lr()[0]:.4f} | "
              f"Val Loss (LogLoss): {val_loss:.4f} ROC_AUC: {val_metric:.4f}")
        
    scheduler.step()

print(f"\nBest val loss {best_val_loss:.4f}\nBest val ROC_AUC {best_val_metric:.4f}")




  0%|          | 0/100 [00:00<?, ?it/s]

Epoch  10 | Train Loss: 0.5703 LR: 0.0050 | Val Loss (LogLoss): 0.5692 ROC_AUC: 0.7540
Epoch  20 | Train Loss: 0.5681 LR: 0.0040 | Val Loss (LogLoss): 0.5689 ROC_AUC: 0.7540
Epoch  30 | Train Loss: 0.5656 LR: 0.0032 | Val Loss (LogLoss): 0.5687 ROC_AUC: 0.7540
Epoch  40 | Train Loss: 0.5647 LR: 0.0026 | Val Loss (LogLoss): 0.5690 ROC_AUC: 0.7540
Early stopping на эпохе 43

Best val loss inf
Best val ROC_AUC 0.7547


## Generating submission

In [23]:
model.load_state_dict(torch.load('../models/best_model.pth'))
model.eval()

with torch.no_grad():
    logits = torch.sigmoid(model(X_test_t.to(device)).squeeze()).detach().cpu()

logits

tensor([0.7412, 0.7547, 0.7484,  ..., 0.7467, 0.7381, 0.7420])

In [24]:
sample_submission = pd.read_csv(DATA_PATH + "sample_submission.csv")
sample_submission.head()

Unnamed: 0,id,target
0,600000,0.5
1,600001,0.5
2,600002,0.5
3,600003,0.5
4,600004,0.5


In [27]:
subm = sample_submission.copy()
subm[target] = logits
subm.to_csv("../submissions/subm.csv", index=False)
subm.head()

Unnamed: 0,id,target
0,600000,0.741154
1,600001,0.754696
2,600002,0.748444
3,600003,0.296581
4,600004,0.727921
