imports

In [2]:
import pandas as pd
import torch
import plotly.graph_objects as go
import numpy as np
import matplotlib.pyplot as plt
import time
import copy
from torch.utils.data import Dataset
import shutil
from tqdm import tqdm
import os


def seed_everything(seed=24535):
    """Seed every random number generator this notebook can touch.

    Covers Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU and all CUDA devices), and switches cuDNN to its
    deterministic, non-autotuned mode.

    Args:
        seed: integer used for every generator.
    """
    # Local import: the notebook's import cell does not bring in `random`.
    import random

    # Only affects interpreters started after this point (e.g. worker
    # subprocesses); the hash seed of the running kernel is already fixed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # The CUDA seeding calls are silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


seed_everything()


In [4]:
# Paths of the twelve training shards: train_data_0.pq ... train_data_11.pq.
file_names = [
    f'data_for_competition/train_data/train_data_{shard}.pq'
    for shard in range(12)
]


In [5]:
def get_padded_tensor(df, orders_cnt=70):
    """Convert a long-format frame into one zero-padded tensor, one slice per id.

    Rows are grouped by ``id`` (groups are produced in ascending ``id`` order,
    the ``groupby`` default), ordered by ``rn`` inside each group, and the
    ``rn`` column is dropped. Every other column is kept as a feature.

    NOTE(review): the ``id`` column itself stays in the tensor as a feature;
    confirm that this is intended.

    Args:
        df: DataFrame with at least the columns ``id`` and ``rn``. The
            remaining columns must convert to a single numeric numpy dtype.
        orders_cnt: minimum length every sequence is zero-padded to.

    Returns:
        Tensor of shape ``(n_ids, max(orders_cnt, longest group), n_columns - 1)``.
        Groups longer than ``orders_cnt`` are kept whole instead of raising;
        the final ``pad_sequence`` brings all groups to a common length.
    """
    sequences = []
    for _, group in df.groupby("id"):
        rows = group.sort_values("rn").drop('rn', axis=1).values
        # np.pad rejects negative widths, so a group that already has more
        # than `orders_cnt` rows gets no extra padding here.
        missing = max(orders_cnt - rows.shape[0], 0)
        sequences.append(torch.tensor(
            np.pad(rows, [(0, missing), (0, 0)], mode='constant')))
    return torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True)


Loader

In [6]:
class CustomDataset(Dataset):
    """Dataset in which one item is one whole parquet file plus its target rows.

    Indexing returns ``(data, labels)``: the parquet file read into a
    DataFrame, and the rows of ``data_for_competition/{sample}_target.csv``
    whose ``id`` occurs in that file.
    """

    def __init__(self, filenames, sample):
        # `filenames` is a list of parquet paths; each path is one item (chunk).
        # `sample` selects the target file `data_for_competition/{sample}_target.csv`.
        self.filenames = filenames
        self.sample = sample
        # Target table, read from disk on first use and then reused for every
        # item instead of re-parsing the CSV on each __getitem__ call.
        self._target = None

    def __len__(self):
        # Number of chunks (files).
        return len(self.filenames)

    def _load_target(self):
        # Read the target CSV once and keep it for later items.
        if self._target is None:
            target_file_name = f'data_for_competition/{self.sample}_target.csv'
            self._target = pd.read_csv(target_file_name)
        return self._target

    def __getitem__(self, idx):  # idx is the index of the chunk.
        # List indexing raises IndexError for an out-of-range idx before any
        # file is read. That exception is what ends a plain
        # `for data, labels in dataset:` loop, so no extra bounds check is needed.
        file = self.filenames[idx]
        data = pd.read_parquet(file, engine='fastparquet')
        # Every column from the third one on is treated as categorical;
        # presumably the first two are `id` and `rn` -- confirm against the data.
        cat_columns = list(data.columns)[2:]
        data[cat_columns] = data[cat_columns].astype('category')

        target = self._load_target()
        labels = target[target.id.isin(set(data.id))]

        return data, labels
    
def get_batches(inputs, labels, batch_size=10_000):
    """Split one file's rows into consecutive batches without cutting an id in half.

    batch_size - number of product records (rows of `inputs`) a batch aims
    for. A batch always ends on an id boundary: it runs up to and including
    every row of the id found `batch_size` rows after the batch start, so it
    can be somewhat larger than `batch_size`.

    Args:
        inputs: DataFrame with an `id` column, sorted by `id` ascending.
        labels: DataFrame with an `id` column; its row order does not matter.
        batch_size: target number of `inputs` rows per batch.

    Returns:
        `(batches, batch_labels)`: two lists of equal length holding the row
        slices of `inputs` and the matching rows of `labels`. When `inputs`
        has at most `batch_size` rows, both arguments are returned unchanged
        as single-element lists.

    Raises:
        ValueError: if `batch_size` is negative or `inputs` is not sorted by id.
    """
    if len(inputs) <= batch_size:
        return [inputs], [labels]
    if batch_size < 0:
        raise ValueError("batch_size must be non-negative")

    ids = inputs.id.to_numpy()
    # Batch boundaries are found by position, which only works on sorted ids.
    if (ids[1:] < ids[:-1]).any():
        raise ValueError("get_batches expects `inputs` sorted by id")

    res_batches = []
    res_labels = []
    n_rows = len(ids)
    start = 0
    # Termination depends only on `inputs`: the loop ends once every row has
    # been assigned, regardless of how `labels` is ordered.
    while start < n_rows:
        probe = start + batch_size
        last_id = ids[-1] if probe >= n_rows else ids[probe]
        # One past the last row carrying `last_id`; always > start because
        # ids[start] <= last_id on sorted ids.
        stop = int(np.searchsorted(ids, last_id, side='right'))
        batch = inputs.iloc[start:stop]
        res_batches.append(batch)
        res_labels.append(labels[labels.id.isin(set(batch.id))])
        start = stop
    return res_batches, res_labels


In [7]:
# One CustomDataset per split. The second argument selects the target file
# `data_for_competition/<sample>_target.csv` that labels are read from.
# NOTE(review): the validation shard lives under train_data/ but, with
# sample='val', its labels are looked up in val_target.csv -- confirm that
# this file exists and contains the ids of shard 11.
train_dataloader = CustomDataset(
    filenames=[
        f'data_for_competition/train_data/train_data_{shard}.pq'
        for shard in range(11)
    ],
    sample='train',
)
val_dataloader = CustomDataset(
    filenames=['data_for_competition/train_data/train_data_11.pq'],
    sample='val',
)
test_dataloader = CustomDataset(
    filenames=[
        f'data_for_competition/test_data/test_data_{shard}.pq'
        for shard in range(2)
    ],
    sample='test',
)


In [9]:
# Run the pipeline once on the first training shard: read the file, cut it
# into batches, and turn the first batch into a padded tensor.
inputs, labels = train_dataloader[0]
b_inp, b_lab = get_batches(inputs, labels, 10_000)
batch, label = b_inp[0], b_lab[0]
tensor = get_padded_tensor(df=batch)


In [10]:

# mini_sample = inputs[:100_000]
# for c in inputs.columns[1:]:
#     fig = go.Figure()
#     fig.update_layout(title = c)
#     fig.add_trace(go.Histogram(x=mini_sample[c],
#                                 histnorm='probability'))
#     fig.show()

model

In [11]:
import torch.nn as nn
from torch.autograd import Variable


class LstmClassifier(nn.Module):
    """LSTM encoder followed by a two-layer fully connected classification head.

    The final hidden state of the last LSTM layer is passed through
    ReLU -> Linear(hidden_size, 128) -> ReLU -> Linear(128, num_classes).

    Args:
        num_classes: size of the output layer (number of logits per sample).
        input_size: number of features per time step.
        hidden_size: size of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        seq_length: stored on the instance only; not used by the model.
    """

    def __init__(self, num_classes, input_size, hidden_size, num_layers, seq_length):
        super().__init__()
        self.num_classes = num_classes  # number of classes
        self.num_layers = num_layers  # number of stacked LSTM layers
        self.input_size = input_size  # features per time step
        self.hidden_size = hidden_size  # hidden state size
        self.seq_length = seq_length  # sequence length (informational)

        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True)
        self.fc_1 = nn.Linear(hidden_size, 128)  # first dense layer
        self.fc = nn.Linear(128, num_classes)  # output layer

        self.relu = nn.ReLU()

    def forward(self, x):
        """Return logits of shape ``(batch, num_classes)`` for ``x`` of shape
        ``(batch, seq_len, input_size)``."""
        # Without an explicit (h_0, c_0) nn.LSTM starts from zero states that
        # live on the same device as `x`, so the model is no longer tied to
        # the CPU by hand-built state tensors.
        _, (hn, _) = self.lstm(x)
        # hn is (num_layers, batch, hidden_size). Take the last layer only:
        # flattening all layers with view(-1, hidden_size) would yield
        # num_layers * batch rows and break the pairing with the labels.
        last_hidden = hn[-1]
        out = self.relu(last_hidden)
        out = self.fc_1(out)  # first dense layer
        out = self.relu(out)
        out = self.fc(out)  # logits
        return out


In [14]:
tensor.shape  # (ids in the batch, padded rows per id, columns left after dropping `rn`)

torch.Size([1270, 70, 60])

In [16]:
# Two-class classifier whose input width is taken from the sample tensor
# (60 columns in the shape printed above).
model = LstmClassifier(
    num_classes=2,
    input_size=tensor.size(2),
    hidden_size=2,
    num_layers=1,
    seq_length=None,
)


In [17]:
# Training is pinned to the CPU for now. The automatic choice is kept for reference:
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")  # TODO delete
device = torch.device("cpu")
model = model.to(device=device)


train_model

In [19]:
def train_model(model, loss, optimizer, scheduler, num_epochs):
    """Run `num_epochs` epochs, each a training pass followed by a validation pass.

    Uses these notebook-level names: `train_dataloader`, `val_dataloader`,
    `device`, `get_batches` and `get_padded_tensor`.

    Args:
        model: module mapping a float tensor batch to class logits.
        loss: callable `(logits, labels) -> scalar tensor`.
        optimizer: optimizer over the model's parameters.
        scheduler: learning-rate scheduler; stepped once per epoch, after the
            training pass.
        num_epochs: number of epochs to run.

    Returns:
        The same `model` object, trained in place.
    """
    for epoch in range(num_epochs):
        print('Epoch {}/{}:'.format(epoch, num_epochs - 1), flush=True)

        # Each epoch has a training and a validation phase.
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            if is_train:
                dataloader = train_dataloader
                model.train()  # Set model to training mode
            else:
                dataloader = val_dataloader
                model.eval()   # Set model to evaluate mode

            running_loss = 0.
            running_acc = 0.
            steps = 0  # batches actually processed in this phase

            # Iterate over files; each file is cut into batches.
            for huge_inputs, huge_labels in tqdm(dataloader):
                batch_inputs, batch_labels = get_batches(
                    huge_inputs, huge_labels, batch_size=10_000)
                batch_cnt = 1  # TODO delete: only the first batch of each file is used
                for inputs, labels in zip(batch_inputs[:batch_cnt], batch_labels[:batch_cnt]):
                    inputs = get_padded_tensor(inputs).type(torch.float)
                    # TODO one hot encode enc_loans_credit_status
                    # get_padded_tensor emits one sequence per id in ascending
                    # id order; sort the labels the same way so that row i of
                    # the labels belongs to sequence i.
                    labels = torch.tensor(
                        labels.sort_values('id').flag.values).type(torch.LongTensor)
                    inputs = inputs.to(device)
                    labels = labels.to(device)

                    optimizer.zero_grad()

                    # Gradients are tracked only in the training phase.
                    with torch.set_grad_enabled(is_train):
                        preds = model(inputs)
                        loss_value = loss(preds, labels)
                        preds_class = preds.argmax(dim=1)

                        # backward + optimize only if in training phase
                        if is_train:
                            loss_value.backward()
                            optimizer.step()

                    # statistics, accumulated as plain Python floats
                    running_loss += loss_value.item()
                    running_acc += (preds_class == labels).float().mean().item()
                    steps += 1

            if is_train:
                # The scheduler must be stepped after the epoch's optimizer
                # updates; stepping it first skips the initial learning rate
                # of the schedule.
                scheduler.step()

            # Average over the batches actually seen. This equals the number
            # of files only while batch_cnt == 1.
            denom = max(steps, 1)
            epoch_loss = running_loss / denom
            epoch_acc = running_acc / denom

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc), flush=True)

    return model


In [20]:
# Adam with a step-wise learning-rate decay (x0.1 every 7 scheduler steps),
# trained for a single epoch with cross-entropy loss.
optimizer = torch.optim.Adam(model.parameters(), lr=1.0e-3)
train_model(
    model=model,
    loss=torch.nn.CrossEntropyLoss(),
    optimizer=optimizer,
    scheduler=torch.optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1),
    num_epochs=1,
)


Epoch 0/0:


 18%|█▊        | 2/11 [01:50<08:22, 55.82s/it]