<a href="https://colab.research.google.com/github/SorokinMaksimArtemovich/MTS-ML-CUP/blob/main/models/DanetSexPrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import sys
import os
import warnings
# Restrict OpenBLAS to a single thread; this is set here, before the numerical
# libraries are imported in the following cell.
os.environ['OPENBLAS_NUM_THREADS'] = '1'
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import numpy as np
import time
import pyarrow as pa
import pyarrow.parquet as pq
import scipy
import gc
!pip install implicit
import implicit
import bisect
import pickle
import sklearn.metrics as m
!pip install catboost
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.calibration import calibration_curve, CalibratedClassifierCV

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting implicit
  Downloading implicit-0.6.2-cp39-cp39-manylinux2014_x86_64.whl (18.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.6/18.6 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: implicit
Successfully installed implicit-0.6.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp39-none-manylinux1_x86_64.whl (76.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.1.1


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

%matplotlib inline
sns.set_style('darkgrid')

In [None]:
#!pip install feather-format >> none
#!pip install faiss-cpu --no-cache

In [None]:
def age_bucket(x):
    """Map an age to the index of its age bucket.

    The bucket edges are 18, 25, 35, 45, 55 and 65. The result is the number
    of edges strictly below ``x``, so an age equal to an edge lands in the
    lower bucket (18 -> 0, 19 -> 1, 66 -> 6).
    """
    age_edges = [18, 25, 35, 45, 55, 65]
    bucket_index = bisect.bisect_left(age_edges, x)
    return bucket_index

### save and load functions

In [None]:
def save(obj, path, verbose=True):
    """Pickle ``obj`` to ``path`` with the highest available pickle protocol.

    When ``verbose`` is true, one message is printed before the file is
    written and one after. Returns ``None``.
    """
    def _report(template):
        # Progress messages are emitted only in verbose mode.
        if verbose:
            print(template.format(path))

    _report("Saving object to {}")
    with open(path, "wb") as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)
    _report("Object saved to {}")

In [None]:
def load(path, verbose=True):
    """Read and return the object pickled at ``path``.

    When ``verbose`` is true, one message is printed before the file is read
    and one after. Unpickling can execute arbitrary code, so only use this on
    files you created yourself.
    """
    def _report(template):
        # Progress messages are emitted only in verbose mode.
        if verbose:
            print(template.format(path))

    _report("Loading object from {}")
    with open(path, "rb") as source:
        restored = pickle.load(source)
    _report("Object loaded from {}")
    return restored

### importing data and reading train targets and test id

In [None]:
!pip install -q kaggle

In [None]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"sorokinmaksim","key":"<REDACTED>"}'}

In [None]:
!ls -lha kaggle.json

-rw-r--r-- 1 root root 69 Apr  7 11:23 kaggle.json


In [None]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

!chmod 600 ~/.kaggle/kaggle.json

In [None]:
!kaggle datasets download -d sorokinmaksim/mts-ml-cup-data-for-training-model

Downloading mts-ml-cup-data-for-training-model.zip to /content
100% 4.43G/4.43G [04:17<00:00, 18.9MB/s]
100% 4.43G/4.43G [04:17<00:00, 18.5MB/s]


In [None]:
! unzip /content/mts-ml-cup-data-for-training-model.zip

Archive:  /content/mts-ml-cup-data-for-training-model.zip
  inflating: df_age.csv              
  inflating: df_age_1.csv            
  inflating: df_danet.csv            
  inflating: df_sex.csv              
  inflating: df_sex_1.csv            
  inflating: id_to_submit.csv        
  inflating: none                    
  inflating: targets.csv             


In [None]:
# Submission ids; make_train_data left-joins features onto this frame and
# predict() reads its `user_id` column for the output file.
id_to_submit = pd.read_csv('id_to_submit.csv')

In [None]:
# Training targets; make_train_data joins them with the features on `user_id`
# and reads the `is_male` column.
targets = pd.read_csv('targets.csv')

### Standard scaling and a function that builds the train and test data

In [None]:
from sklearn.preprocessing import StandardScaler

def make_train_data(data):
  """Standardise the feature table and build the train and submission frames.

  Parameters
  ----------
  data : pandas.DataFrame
      Feature table with a ``user_id`` column. Every other column is scaled
      with a ``StandardScaler`` fitted on all rows of ``data``.

  Returns
  -------
  df_sex : pandas.DataFrame
      Inner join of the module-level ``targets`` frame with the scaled
      features, with rows containing missing values removed and ``is_male``
      converted to ``int``.
  df_sex_test : pandas.DataFrame
      Left join of the module-level ``id_to_submit`` frame with the scaled
      features.

  Notes
  -----
  Reads the module-level ``targets`` and ``id_to_submit`` frames and prints
  the shape of the submission frame and the class balance of ``is_male``.
  """
  feature_frame = data.drop(['user_id'], axis=1)
  scaler = StandardScaler()
  df = pd.DataFrame(scaler.fit_transform(feature_frame), columns=feature_frame.columns)
  # Assign the ids positionally. ``df`` carries a fresh RangeIndex, so assigning
  # the Series itself would align on index labels and scramble the ids (or
  # produce NaN) whenever ``data`` does not have a default index.
  df['user_id'] = data['user_id'].to_numpy()
  df_sex_test = id_to_submit.merge(df, how = 'left', on = ['user_id'])
  print(df_sex_test.shape)
  df_sex = targets.merge(df, how = 'inner', on = ['user_id'])
  df_sex = df_sex[df_sex['is_male'] != 'NA']
  df_sex = df_sex.dropna()
  df_sex['is_male'] = df_sex['is_male'].map(int)
  print(df_sex['is_male'].value_counts())
  # Release the large intermediates before returning; the caller's own
  # reference to ``data`` is not affected by anything done here.
  del df, feature_frame
  gc.collect()
  return df_sex, df_sex_test

# DaNet

In [None]:
from torch import nn
from torch.autograd import Function
import torch.nn.functional as F

import torch

"""
Other possible implementations:
https://github.com/KrisKorrel/sparsemax-pytorch/blob/master/sparsemax.py
https://github.com/msobroza/SparsemaxPytorch/blob/master/mnist/sparsemax.py
https://github.com/vene/sparse-structured-attention/blob/master/pytorch/torchsparseattn/sparsemax.py
"""


# credits to Yandex https://github.com/Qwicen/node/blob/master/lib/nn_utils.py
def _make_ix_like(input, dim=0):
    d = input.size(dim)
    rho = torch.arange(1, d + 1, device=input.device, dtype=input.dtype)
    view = [1] * input.dim()
    view[0] = -1
    return rho.view(view).transpose(0, dim)


class SparsemaxFunction(Function):
    """
    An implementation of sparsemax (Martins & Astudillo, 2016). See
    :cite:`DBLP:journals/corr/MartinsA16` for detailed description.
    By Ben Peters and Vlad Niculae
    """

    @staticmethod
    def forward(ctx, input, dim=-1):
        """sparsemax: normalizing sparse transform (a la softmax)
        Parameters
        ----------
        ctx : torch.autograd.function._ContextMethodMixin
        input : torch.Tensor
            any shape
        dim : int
            dimension along which to apply sparsemax
        Returns
        -------
        output : torch.Tensor
            same shape as input
        """
        ctx.dim = dim
        max_val, _ = input.max(dim=dim, keepdim=True)
        # Same numerical stability trick as for softmax. The shift is done
        # out of place: an in-place ``input -= max_val`` would silently
        # overwrite the tensor the caller passed in.
        input = input - max_val
        tau, supp_size = SparsemaxFunction._threshold_and_support(input, dim=dim)
        output = torch.clamp(input - tau, min=0)
        ctx.save_for_backward(supp_size, output)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        """Gradient of sparsemax with respect to its input.

        Returns a tuple ``(grad_input, None)``; the second entry corresponds
        to the non-differentiable ``dim`` argument of ``forward``.
        """
        supp_size, output = ctx.saved_tensors
        dim = ctx.dim
        grad_input = grad_output.clone()
        # Entries outside the support receive no gradient.
        grad_input[output == 0] = 0

        # Squeeze only the reduced axis. A bare ``squeeze()`` would also drop
        # any other size-1 axis (e.g. a batch of one) and make the division
        # broadcast to the wrong shape.
        v_hat = grad_input.sum(dim=dim) / supp_size.to(output.dtype).squeeze(dim)
        v_hat = v_hat.unsqueeze(dim)
        grad_input = torch.where(output != 0, grad_input - v_hat, grad_input)
        return grad_input, None

    @staticmethod
    def _threshold_and_support(input, dim=-1):
        """Sparsemax building block: compute the threshold
        Parameters
        ----------
        input: torch.Tensor
            any dimension
        dim : int
            dimension along which to apply the sparsemax
        Returns
        -------
        tau : torch.Tensor
            the threshold value
        support_size : torch.Tensor
            number of entries in the support, kept as a size-1 axis at ``dim``
        """

        input_srt, _ = torch.sort(input, descending=True, dim=dim)
        input_cumsum = input_srt.cumsum(dim) - 1
        rhos = _make_ix_like(input, dim)
        # Position r belongs to the support while r * z_(r) > cumsum_r - 1.
        support = rhos * input_srt > input_cumsum

        support_size = support.sum(dim=dim).unsqueeze(dim)
        tau = input_cumsum.gather(dim, support_size - 1)
        tau /= support_size.to(input.dtype)
        return tau, support_size


sparsemax = SparsemaxFunction.apply


class Sparsemax(nn.Module):
    """Module wrapper that applies ``sparsemax`` along a fixed dimension."""

    def __init__(self, dim=-1):
        super().__init__()
        # Dimension along which the sparse normalisation is applied.
        self.dim = dim

    def forward(self, input):
        reduce_dim = self.dim
        return sparsemax(input, reduce_dim)


class Entmax15Function(Function):
    """
    Exact Entmax with alpha=1.5 (B. Peters, V. Niculae, A. Martins).
    Described in https://arxiv.org/abs/1905.05702.
    Source: https://github.com/deep-spin/entmax
    """

    @staticmethod
    def forward(ctx, input, dim=-1):
        """Apply 1.5-entmax along ``dim``; the output has the shape of ``input``."""
        ctx.dim = dim

        peak, _ = input.max(dim=dim, keepdim=True)
        # Shift by the maximum (same numerical stability trick as for softmax),
        # then divide by 2 to solve the actual Entmax problem.
        shifted = (input - peak) / 2

        tau_star, _ = Entmax15Function._threshold_and_support(shifted, dim)
        output = torch.clamp(shifted - tau_star, min=0) ** 2
        ctx.save_for_backward(output)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        """Return ``(grad_input, None)``; ``None`` is for the ``dim`` argument."""
        (probs,) = ctx.saved_tensors
        inv_curvature = probs.sqrt()  # = 1 / g'' (Y)
        grad_in = grad_output * inv_curvature
        correction = grad_in.sum(ctx.dim) / inv_curvature.sum(ctx.dim)
        correction = correction.unsqueeze(ctx.dim)
        grad_in -= correction * inv_curvature
        return grad_in, None

    @staticmethod
    def _threshold_and_support(input, dim=-1):
        """Compute the threshold ``tau_star`` and the support size along ``dim``."""
        sorted_vals, _ = torch.sort(input, descending=True, dim=dim)

        ranks = _make_ix_like(input, dim)
        running_mean = sorted_vals.cumsum(dim) / ranks
        running_mean_sq = (sorted_vals ** 2).cumsum(dim) / ranks
        scatter = ranks * (running_mean_sq - running_mean ** 2)
        delta = (1 - scatter) / ranks

        # NOTE this is not exactly the same as in reference algo
        # Fortunately it seems the clamped values never wrongly
        # get selected by tau <= sorted_z. Prove this!
        delta_nz = torch.clamp(delta, 0)
        tau = running_mean - torch.sqrt(delta_nz)

        support_size = (tau <= sorted_vals).sum(dim).unsqueeze(dim)
        tau_star = tau.gather(dim, support_size - 1)
        return tau_star, support_size


class Entmoid15(Function):
    """Optimised equivalent of ``lambda x: Entmax15([x, 0])``."""

    @staticmethod
    def forward(ctx, input):
        """Compute the entmoid element-wise and keep the result for ``backward``."""
        output = Entmoid15._forward(input)
        ctx.save_for_backward(output)
        return output

    @staticmethod
    def _forward(input):
        # Work on |x| and restore the sign at the end via ``1 - y``.
        is_pos = input >= 0
        magnitude = abs(input)
        tau = (magnitude + torch.sqrt(F.relu(8 - magnitude ** 2))) / 2
        tau.masked_fill_(tau <= magnitude, 2.0)
        y_neg = 0.25 * F.relu(tau - magnitude, inplace=True) ** 2
        return torch.where(is_pos, 1 - y_neg, y_neg)

    @staticmethod
    def backward(ctx, grad_output):
        """Propagate ``grad_output`` through the saved forward output."""
        (saved_output,) = ctx.saved_tensors
        return Entmoid15._backward(saved_output, grad_output)

    @staticmethod
    def _backward(output, grad_output):
        root_out = output.sqrt()
        root_comp = (1 - output).sqrt()
        grad_input = grad_output * root_out
        shared = grad_input / (root_out + root_comp)
        grad_input -= shared * root_out
        return grad_input


entmax15 = Entmax15Function.apply
entmoid15 = Entmoid15.apply


class Entmax15(nn.Module):
    """Module wrapper that applies ``entmax15`` along a fixed dimension."""

    def __init__(self, dim=-1):
        super().__init__()
        # Dimension along which the sparse normalisation is applied.
        self.dim = dim

    def forward(self, input):
        reduce_dim = self.dim
        return entmax15(input, reduce_dim)

In [None]:
import torch
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
from scipy.special import softmax
from torch.utils.data import DataLoader
from torch.nn.functional import cross_entropy, mse_loss

def initialize_glu(module, input_dim, output_dim):
    """Re-initialise ``module.weight`` in place with Xavier-normal noise.

    The gain passed to ``xavier_normal_`` is
    ``sqrt((input_dim + output_dim) / sqrt(input_dim))``. Returns ``None``.
    """
    fan_total = input_dim + output_dim
    gain = np.sqrt(fan_total / np.sqrt(input_dim))
    torch.nn.init.xavier_normal_(module.weight, gain=gain)

class GBN(torch.nn.Module):
    """
    Ghost Batch Normalization (https://arxiv.org/abs/1705.08741).

    In training mode the batch is split along axis 0 into
    ``ceil(batch / virtual_batch_size)`` chunks and the shared ``BatchNorm1d``
    is applied to each chunk separately. In eval mode it is applied to the
    whole batch at once.
    """
    def __init__(self, input_dim, virtual_batch_size=512):
        super().__init__()
        self.input_dim = input_dim
        self.virtual_batch_size = virtual_batch_size
        self.bn = nn.BatchNorm1d(self.input_dim)

    def forward(self, x):
        if not self.training:
            return self.bn(x)
        n_chunks = int(np.ceil(x.shape[0] / self.virtual_batch_size))
        normalised = [self.bn(part) for part in x.chunk(n_chunks, 0)]
        return torch.cat(normalised, dim=0)

class LearnableLocality(nn.Module):
    """Learn ``k`` sparse feature masks and apply each of them to the input.

    The mask logits are a ``[k, input_dim]`` parameter initialised uniformly
    in ``[0, 1)``; the masks are obtained with ``Entmax15`` over the last axis.
    """

    def __init__(self, input_dim, k):
        super().__init__()
        self.weight = nn.Parameter(torch.rand(k, input_dim))
        self.smax = Entmax15(dim=-1)

    def forward(self, x):
        feature_mask = self.smax(self.weight)
        # Broadcast every mask over the batch: [k, D] x [B, D] -> [B, k, D]
        return torch.einsum('nd,bd->bnd', feature_mask, x)

class AbstractLayer(nn.Module):
    """Masked, grouped, gated projection used as the DANet building block.

    The input is masked ``k`` ways, projected by a grouped 1x1 convolution
    (one group per mask), ghost-batch-normalised, gated with a sigmoid and a
    ReLU, and the ``k`` group outputs are summed.
    """

    def __init__(self, base_input_dim, base_output_dim, k, virtual_batch_size, bias=True):
        super().__init__()
        grouped_in = base_input_dim * k
        grouped_out = 2 * k * base_output_dim
        self.masker = LearnableLocality(input_dim=base_input_dim, k=k)
        self.fc = nn.Conv1d(grouped_in, grouped_out, kernel_size=1, groups=k, bias=bias)
        initialize_glu(self.fc, input_dim=grouped_in, output_dim=grouped_out)
        self.bn = GBN(grouped_out, virtual_batch_size)
        self.k = k
        self.base_output_dim = base_output_dim

    def forward(self, x):
        batch = x.size(0)
        masked = self.masker(x)  # [B, D] -> [B, k, D]
        # [B, k, D] -> [B, k * D, 1] -> [B, k * (2 * D'), 1]
        projected = self.bn(self.fc(masked.view(batch, -1, 1)))
        half = self.base_output_dim
        # Per group: the first D' channels gate the remaining D' channels.
        gated = (
            F.relu(torch.sigmoid(group[:, :half, :]) * group[:, half:, :])
            for group in projected.chunk(self.k, 1)  # k * [B, 2 * D', 1]
        )
        return sum(gated).squeeze(-1)  # k * [B, D', 1] -> [B, D']


class BasicBlock(nn.Module):
    """Two stacked ``AbstractLayer`` projections plus a shortcut from the raw input.

    Parameters
    ----------
    input_dim : int
        Width of the tensor fed to the first projection.
    base_outdim : int
        Output width of the block (the first projection uses half of it).
    k : int
        Number of masks per ``AbstractLayer``.
    virtual_batch_size : int
        Chunk size forwarded to the ghost batch normalisation.
    fix_input_dim : int
        Width of the raw input ``x`` used by the shortcut branch.
    drop_rate : float
        Dropout probability applied on the shortcut branch.
    """

    def __init__(self, input_dim, base_outdim, k, virtual_batch_size, fix_input_dim, drop_rate):
        super(BasicBlock, self).__init__()
        self.conv1 = AbstractLayer(input_dim, base_outdim // 2, k, virtual_batch_size)
        self.conv2 = AbstractLayer(base_outdim // 2, base_outdim, k, virtual_batch_size)

        self.downsample = nn.Sequential(
            nn.Dropout(drop_rate),
            AbstractLayer(fix_input_dim, base_outdim, k, virtual_batch_size)
        )

    def forward(self, x, pre_out=None):
        """Run the block on ``pre_out`` (or on ``x`` when it is omitted) and add the shortcut from ``x``."""
        # Identity check, not ``==``: comparing a tensor with ``None`` through
        # ``==`` only works by falling back from the tensor's ``__eq__``.
        if pre_out is None:
            pre_out = x
        out = self.conv1(pre_out)
        out = self.conv2(out)
        identity = self.downsample(x)
        out += identity
        return F.leaky_relu(out, 0.01)


class DANet(nn.Module):
    """Stack of ``BasicBlock`` modules followed by a three-layer MLP head.

    One initial block maps the raw input to ``base_outdim`` features and
    ``layer_num // 2 - 1`` further blocks refine it; every block also receives
    the raw input. Dropout (p=0.1) is applied before the head, which outputs
    ``num_classes`` values per sample.
    """

    def __init__(self, input_dim, num_classes, layer_num=20, base_outdim=64, k=5, virtual_batch_size=256, drop_rate=0.1):
        super().__init__()
        block_kwargs = dict(
            base_outdim=base_outdim,
            k=k,
            virtual_batch_size=virtual_batch_size,
            fix_input_dim=input_dim,
            drop_rate=drop_rate,
        )
        self.init_layer = BasicBlock(input_dim, **block_kwargs)
        self.lay_num = layer_num
        n_extra_blocks = (layer_num // 2) - 1
        self.layer = nn.ModuleList(
            [BasicBlock(base_outdim, **block_kwargs) for _ in range(n_extra_blocks)]
        )
        self.drop = nn.Dropout(0.1)

        self.fc = nn.Sequential(
            nn.Linear(base_outdim, 256),
            nn.ReLU(inplace=True),
            nn.Linear(256, 512),
            nn.ReLU(inplace=True),
            nn.Linear(512, num_classes),
        )

    def forward(self, x):
        out = self.init_layer(x)
        # Each later block sees the raw input and the previous block's output.
        for block in self.layer:
            out = block(x, out)
        return self.fc(self.drop(out))

In [None]:
import numpy as np
from copy import deepcopy
from datetime import datetime

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import SGD, Adam, AdamW
!pip install qhoptim
from qhoptim.pyt import QHAdam


class CustomDataset(Dataset):
    """Feature/label dataset that serves float32 features and one-hot float32 targets.

    Parameters
    ----------
    x : numpy.ndarray
        Feature matrix; converted to ``float32``.
    y : array-like of int
        Class indices, used directly as column positions of the one-hot
        matrix, so they are expected to lie in ``range(class_count)``.
    class_count : int
        Number of columns of the one-hot target matrix.
    """

    def __init__(self, x, y, class_count):
        self.x = x.astype(np.float32)
        self.ds_len = len(y)
        self.class_count = class_count

        # Build the one-hot targets with one vectorised assignment instead of
        # a Python loop over every sample.
        one_hot = np.zeros((self.ds_len, self.class_count), dtype=np.float32)
        if self.ds_len:
            one_hot[np.arange(self.ds_len), np.asarray(y)] = 1.0
        self.y = one_hot

    def __getitem__(self, id):
        """Return the ``(features, one_hot_target)`` pair at position ``id``."""
        return self.x[id], self.y[id]

    def __len__(self):
        return self.ds_len
    
class PredictDataset(Dataset):
    """Label-free dataset that serves rows of ``x`` converted to float32."""

    def __init__(self, x):
        self.x = x.astype(np.float32)
        self.ds_len = len(x)

    def __getitem__(self, id):
        return self.x[id]

    def __len__(self):
        return self.ds_len
        
        

class DANetClassifier():
    """scikit-learn-style wrapper around ``DANet`` with mixup training.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    num_classes : int
        Number of output classes.
    layer_num, base_outdim, k, virtual_batch_size, drop_rate
        Forwarded unchanged to ``DANet``.
    device : str
        Torch device the model is moved to (default ``"cuda"``).

    Attributes
    ----------
    danet : the underlying ``DANet`` network.
    model : ``Sequential(danet, LogSoftmax(dim=1))`` on ``device``.
    class_names : sorted unique training labels, set by ``fit``.
    """

    def __init__(self, input_dim, num_classes,
                 #layer_num=48, base_outdim=96, k=8,
                 layer_num=32, base_outdim=64, k=5,
                 virtual_batch_size=256, drop_rate=0.1,
                 device="cuda"):

        self.device = device

        self.danet = DANet(input_dim = input_dim,
                           num_classes = num_classes,
                           layer_num = layer_num,
                           base_outdim = base_outdim,
                           k = k,
                           virtual_batch_size = virtual_batch_size,
                           drop_rate = drop_rate)
        self.model = torch.nn.Sequential( self.danet, nn.LogSoftmax(dim=1) )
        self.model = self.model.to( self.device )

        self.class_names = None

    def predict_proba(self, x, batch_size=1024):
        """Return class probabilities for ``x`` as an array of shape ``[len(x), num_classes]``."""
        self.model.eval()

        predict_dataloader = DataLoader( PredictDataset( x ), batch_size=batch_size, shuffle=False )

        probas = []
        # Inference only: without no_grad every batch would build (and keep
        # alive) an autograd graph for nothing.
        with torch.no_grad():
            for x_batch in predict_dataloader:
                x_batch = x_batch.to( self.device )
                y_pred = self.model( x_batch )
                # The model emits log-probabilities; softmax maps them back
                # to probabilities.
                y_pred = nn.Softmax(dim=1)(y_pred)
                probas.append( y_pred.cpu().numpy() )

        return np.vstack( probas )

    def predict(self, x, batch_size=1024):
        """Return the most probable training label for every row of ``x``.

        Raises
        ------
        RuntimeError
            If the classifier has not been fitted yet.
        """
        if self.class_names is None:
            raise RuntimeError("DANetClassifier.predict called before fit")

        probas = self.predict_proba(x, batch_size)
        best_columns = np.argmax( probas, axis=1 )
        return np.asarray( self.class_names )[ best_columns ]

    def get_embeddings(self, x, batch_size=1024):
        """Placeholder: not implemented, always returns ``None``."""
        pass

    def fit(self, x_train, y_train, x_val, y_val,
            start_lr=0.008, end_lr=0.0001, batch_size=8192, epochs=1000,
            mixup_alpha=0.2):
        """Train the network and keep the weights with the lowest validation loss.

        Parameters
        ----------
        x_train, y_train : numpy.ndarray
            Training features and integer class indices.
        x_val, y_val : numpy.ndarray
            Validation data used for model selection.
        start_lr, end_lr : float
            The learning rate moves linearly from ``start_lr`` towards
            ``end_lr`` in ``epochs`` equal steps (one step per epoch).
        batch_size : int
            Training batch size; incomplete final batches are dropped.
        epochs : int
            Number of training epochs.
        mixup_alpha : float or None
            Parameter of the ``Beta(alpha, alpha)`` distribution that draws
            the mixup coefficient for each batch. ``None`` or a non-positive
            value disables mixup and trains on plain batches.

        Returns
        -------
        self
        """
        use_mixup = mixup_alpha is not None and mixup_alpha > 0

        def mixed_batches(loader_a, loader_b):
            # Blend two independently shuffled batches with one coefficient
            # per batch; the one-hot targets are blended the same way.
            for (x_a, y_a), (x_b, y_b) in zip(loader_a, loader_b):
                lam = np.random.beta(mixup_alpha, mixup_alpha)
                yield lam * x_a + (1.0 - lam) * x_b, lam * y_a + (1.0 - lam) * y_b

        def train(loader_a, loader_b, loss_fn, optimizer):
            self.model.train()
            batches = mixed_batches(loader_a, loader_b) if loader_b is not None else loader_a
            for x, y in batches:
                x = x.to(self.device)
                y = y.to(self.device)
                loss = loss_fn(self.model(x), y)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        def test(dataloader, loss_fn):
            self.model.eval()

            num_batches = len(dataloader)
            test_loss = 0
            with torch.no_grad():
                for x, y in dataloader:
                    x = x.to( self.device )
                    y = y.to( self.device )
                    test_loss += loss_fn(self.model(x), y).item()
            test_loss /= num_batches
            return test_loss

        self.class_names = np.unique( y_train )
        class_count = len( self.class_names )
        # One dataset is enough: the two loaders only read from it and each
        # draws its own shuffle order.
        train_dataset = CustomDataset( x_train, y_train, class_count )
        train_data_loader_1 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
        train_data_loader_2 = None
        if use_mixup:
            train_data_loader_2 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
        val_dataset = CustomDataset( x_val, y_val, class_count )
        val_data_loader = DataLoader(val_dataset, batch_size=1024, shuffle=False)
        loss_function = torch.nn.CrossEntropyLoss()

        best_loss = np.inf
        best_state = None
        lr_step = (end_lr - start_lr) / epochs
        current_lr = start_lr
        for i in range(1, epochs+1):

            # NOTE(review): re-creating the optimiser every epoch discards its
            # accumulated moment estimates; kept as-is so the training
            # dynamics stay what they were.
            optimizer = QHAdam( self.model.parameters(), lr=current_lr, weight_decay=1.0e-5 )

            print("Epoch: {} | lr: {}".format(i, current_lr))
            train(train_data_loader_1, train_data_loader_2, loss_function, optimizer)

            val_loss = test(val_data_loader, loss_function)
            print("Validation loss: {}".format(val_loss))
            if val_loss < best_loss:
                print("Previous best loss: {}".format(best_loss))
                best_loss = val_loss
                # Snapshot the weights and buffers rather than the whole
                # module, so ``self.danet`` stays the network inside
                # ``self.model`` after the best checkpoint is restored.
                best_state = deepcopy( self.model.state_dict() )

            current_lr += lr_step

        if best_state is not None:
            self.model.load_state_dict( best_state )
        else:
            # E.g. every validation loss was NaN: there is no checkpoint to
            # restore, so the last-epoch weights are kept.
            print("Validation loss never improved; keeping the last-epoch weights")
        del best_state

        # Round-trip through the CPU so cached GPU memory can be released.
        self.model.eval()
        self.model = self.model.to("cpu")
        torch.cuda.empty_cache()
        self.model = self.model.to(self.device)
        self.model.eval()

        return self

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting qhoptim
  Downloading qhoptim-1.1.0-py3-none-any.whl (20 kB)
Installing collected packages: qhoptim
Successfully installed qhoptim-1.1.0


### Metric, cross-validation, and training functions

In [None]:
from sklearn.metrics import roc_auc_score, classification_report, f1_score

def gini_score(model, x, y):
    """Return the Gini coefficient ``2 * ROC AUC - 1`` of ``model`` on ``(x, y)``.

    Scores come from ``model.predict_proba(x)``; when that is two-dimensional,
    column 1 is used as the positive-class score.
    """
    scores = model.predict_proba(x)
    if scores.ndim == 2:
        scores = scores[:, 1]
    return 2.0 * roc_auc_score(y, scores) - 1.0

In [None]:
import gc
import numpy as np
import pandas as pd
import bisect
import torch

from pathlib import Path
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from imblearn.over_sampling import SMOTE, ADASYN

def DaNet_Kfold_sex(x_is_male, y_is_male, n_splits=10, epochs=50, batch_size=2048,
                    random_state=45, device="cuda"):
    """Train one ``DANetClassifier`` per stratified fold and report validation Gini scores.

    For fold ``i`` the fitted model is saved to ``danet_is_male_cv_{i}.pkl``,
    its validation probabilities to ``danet_pred_cv_{i}.pkl`` and the matching
    validation labels to ``danet_test_cv_{i}.pkl``.

    Parameters
    ----------
    x_is_male : numpy.ndarray
        Two-dimensional feature matrix.
    y_is_male : numpy.ndarray
        Class labels aligned with the rows of ``x_is_male``.
    n_splits : int
        Number of stratified folds (and therefore of saved models).
    epochs, batch_size : int
        Training settings forwarded to ``DANetClassifier.fit``.
    random_state : int
        Seed of the fold shuffling.
    device : str
        Torch device used for training.

    Returns
    -------
    list of float
        The validation Gini score of every fold.
    """
    val_scores = []
    num_classes = len( np.unique(y_is_male) )
    k_fold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    folds = tqdm(k_fold.split(x_is_male, y_is_male), total=n_splits, desc="Fitting cv classifiers")
    for i, (train_ids, val_ids) in enumerate(folds):
        # Materialise each fold once; every fancy-index expression copies data.
        x_train, y_train = x_is_male[ train_ids ], y_is_male[ train_ids ]
        x_val, y_val = x_is_male[ val_ids ], y_is_male[ val_ids ]

        model = DANetClassifier(input_dim = x_train.shape[1],
                                num_classes = num_classes,
                                layer_num=32, base_outdim=64, k=5,
                                virtual_batch_size=256, drop_rate=0.1,
                                device=device)
        model.fit( x_train, y_train, x_val, y_val, start_lr=0.008, end_lr=0.0001, batch_size=batch_size, epochs=epochs )
        save( model, "danet_is_male_cv_{}.pkl".format(i) )
        y_pred = model.predict_proba( x_val )[:, 1]
        save( y_pred, "danet_pred_cv_{}.pkl".format(i) )
        save( y_val, "danet_test_cv_{}.pkl".format(i) )

        val_score_i = gini_score(model, x_val, y_val)
        val_scores.append( val_score_i )
        print(val_score_i)
        # Free the fold's model before the next one is allocated.
        del model
        gc.collect()
        torch.cuda.empty_cache()
    print(val_scores)
    print("Mean val score: {}".format(np.mean(val_scores)))
    return val_scores

### Prediction and save function

In [None]:
def predict(name, n_models=10):
  """Average the saved fold models on the submission set and write a CSV.

  Loads ``danet_is_male_cv_{i}.pkl`` for ``i`` in ``range(n_models)``, takes
  the positive-class probability of each model on the module-level
  ``df_sex_test`` frame (without its ``user_id`` column), averages them per
  row and writes ``is_male_predicts_{name}.csv`` with the columns ``user_id``
  (taken from the module-level ``id_to_submit``) and ``is_male``.

  Parameters
  ----------
  name : str
      Suffix used in the output file name.
  n_models : int
      Number of saved fold models to average.
  """
  # The feature matrix is the same for every model, so build it once.
  test_features = df_sex_test.drop(['user_id'], axis=1).values

  probas = []
  for i in tqdm(range(n_models), desc="Predicting probas"):
      model = load("danet_is_male_cv_{}.pkl".format(i))
      probas_i = model.predict_proba(test_features)[:, 1]
      probas.append(probas_i.reshape((-1, 1)))
      del model
      gc.collect()
      torch.cuda.empty_cache()
  mean_probas = np.mean( np.hstack(probas), axis=1 )

  # Build the frame column by column so the ids are never converted to float;
  # the probabilities are widened to float64 to keep the CSV text as before.
  my_submission_df = pd.DataFrame( {
      "user_id": id_to_submit["user_id"].values,
      "is_male": mean_probas.astype(np.float64),
  } )
  my_submission_df["user_id"] = my_submission_df["user_id"].astype(int)
  my_submission_df.to_csv("is_male_predicts_{}.csv".format(name), index=False )
  print(my_submission_df.head())
# training and prediction for df_danet

In [None]:
# Feature table for this experiment; it is passed to make_train_data in the
# next cell, which expects a `user_id` column.
df_danet = pd.read_csv('df_danet.csv')

In [None]:
# df_sex: labelled training rows; df_sex_test: one row per id in id_to_submit.
# predict() reads df_sex_test as a global, so this cell must run before it.
df_sex, df_sex_test = make_train_data(df_danet)

(144724, 564)
1    135314
0    128982
Name: is_male, dtype: int64


In [None]:
%%time
# Train on the scaled features only: the id column and both target columns
# (`age`, `is_male`) are dropped from the model input.
score_sex = DaNet_Kfold_sex(df_sex.drop(['user_id', 'age', 'is_male'], axis=1).values, df_sex['is_male'].values)

In [None]:
score_sex

In [None]:
# Averages the saved fold models and writes is_male_predicts_df_danet.csv.
predict('df_danet')

# training and prediction for df_sex

In [None]:
# df_sex_0 = pd.read_csv('df_sex.csv')

In [None]:
# df_sex, df_sex_test = make_train_data(df_sex_0)

In [None]:
# %%time
# score_sex_0 = DaNet_Kfold_sex(df_sex.drop(['user_id', 'age', 'is_male'], axis=1).values, df_sex['is_male'].values)

In [None]:
# score_sex_0

In [None]:
# predict('df_sex')

# training and prediction for df_sex_1

In [None]:
# df_sex_1 = pd.read_csv('df_sex_1.csv')

In [None]:
# df_sex, df_sex_test = make_train_data(df_sex_1)

In [None]:
# %%time
# score_sex_1 = DaNet_Kfold_sex(df_sex.drop(['user_id', 'age', 'is_male'], axis=1).values, df_sex['is_male'].values)

In [None]:
# score_sex_1

In [None]:
# predict('df_sex_1')