1. Для датасета животных обучить MLP.
2. Использовать Custom Dataset, Sampler, collate_fn
3. Сделать различную предобработку фичей
4. Подключить для логирования tensorboard и/или mlflow
5. Не забыть разделить выборку на train и valid
6. Получить точность не ниже 65%.

In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F 
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, Sampler


import mlflow
from mlflow.exceptions import MlflowException
from mlflow.tracking import MlflowClient

In [2]:
# Fix all seeds

SEED = 42
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [3]:
def load_data(path_to_data='data/X_cat.csv', path_to_target='data/y_cat.csv'):
    X = pd.read_csv(path_to_data, sep = '\t', index_col=0)
    y = pd.read_csv(path_to_target, sep = '\t', index_col=0, names=['status'])
    return X, y

In [4]:
def preprocess_target(target):
    """
    Perform target preprocessing
    """
    target[target == 'Died'] = 'Euthanasia'
    name_to_class = {name:cls for cls, name in enumerate(pd.unique(target))}
    class_to_name = {cls:name for name, cls in name_to_class.items()}
    target = target.map(name_to_class)
    return name_to_class, class_to_name, target

In [5]:
X, y = load_data()
name_to_class_dict, class_to_name_dict, y  = preprocess_target(y['status'])
X.head()

Unnamed: 0,IsDog,Age,HasName,NameLength,NameFreq,MixColor,ColorFreqAsIs,ColorFreqBase,TabbyColor,MixBreed,...,SexStatus_Flawed,SexStatus_Intact,SexStatus_Unknown,Weekday_0,Weekday_1,Weekday_2,Weekday_3,Weekday_4,Weekday_5,Weekday_6
0,1,365.0,1,7,0.000157,1,0.032919,0.463624,0,1,...,1,0,0,0,0,1,0,0,0,0
1,0,365.0,1,5,0.000655,0,0.008092,0.015005,1,1,...,1,0,0,0,0,0,0,0,0,1
2,1,730.0,1,6,5.2e-05,1,0.026293,0.357521,0,1,...,1,0,0,0,0,0,0,0,1,0
3,0,21.0,0,7,0.285871,0,0.000471,0.058418,0,1,...,0,1,0,0,0,0,0,1,0,0
4,1,730.0,0,7,0.285871,0,0.023831,0.075353,0,0,...,1,0,0,0,0,0,0,1,0,0


In [6]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)

In [7]:
cyclic_meta = {'cyclic_columns': ['Month', 'Day', 'Hour', 'Weekday'],
              'cyclic_period' : [12, 31, 24, 7]}

In [8]:
def get_sex_columns(X:pd.DataFrame)->pd.Series:
    sex_columns = X.columns[X.columns.str.startswith('Sex_')]
    return sex_columns

In [9]:
def get_sex_status_columns(X:pd.DataFrame)->pd.Series:
    sex_status_columns = X.columns[X.columns.str.startswith('SexStatus')]
    return sex_status_columns

In [10]:
def get_weekday_columns(X: pd.DataFrame) -> pd.Series:
    """
    Return columns contains weekdays 
    """
    weekday_columns = X.columns[X.columns.str.startswith('Week')]
    return weekday_columns

In [11]:
def columns_aggregate_preprocessing(X, col):
    """
    Perform aggregarion over several columns
    """
    aggregation_result = np.argmax(X[col].values, axis=1)
    return aggregation_result

In [12]:
def cyclic_preprocessing(x: np.array, number_unique_values:list):
    """
    Apply sin and cos transformation over cyclic variable
    """
    sin = np.sin(2 * np.pi * x / number_unique_values)
    cos = np.cos(2 * np.pi * x / number_unique_values)
    return sin, cos

In [13]:
def cyclic_column_preprocessing(X, columns, periods):
    for col, period in zip(columns, periods):
        X[col+'_sin'], X[col+'_cos'] = cyclic_preprocessing(X[col], number_unique_values=period)
    return

In [78]:
def get_binary_columns(X):
    binary_columns = []
    for col in X.columns:
        if len(X[col].unique())<=2:
            binary_columns.append(col)
    return binary_columns

In [79]:
class CastomDataset(Dataset):
    def __init__(self, X, y, cyclic_meta):

        self.X, self.y = X.copy(), y.copy()
        self.X.reset_index(drop=True, inplace=True)
        self.y = self.y.reset_index(drop=True)
        
        weekday_columns = get_weekday_columns(self.X)
        sex_columns = get_sex_columns(self.X)
        sex_status = get_sex_status_columns(self.X)
        columns_to_drop = np.concatenate([weekday_columns.values,
                                         sex_columns.values,
                                         sex_status.values,
                                         cyclic_meta['cyclic_columns']])
        
        self.X['Weekday'] = columns_aggregate_preprocessing(self.X, weekday_columns)
        self.X['Gender'] = columns_aggregate_preprocessing(self.X, sex_columns)
        self.X['Sex_status'] = columns_aggregate_preprocessing(self.X, sex_status)
        cyclic_column_preprocessing(self.X, cyclic_meta['cyclic_columns'], cyclic_meta['cyclic_period'])
        self.X.drop(columns=columns_to_drop, inplace=True)
        
        self.binary_columns = get_binary_columns(self.X)
        self.numerical_columns = np.setdiff1d(self.X.columns.values, self.binary_columns)
    
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, indx):
        row = self.X.loc[indx,:]
        category, numerical = row[self.binary_columns].values, row[self.numerical_columns].values
        target = self.y[indx]
        return category, numerical, target

In [116]:
def collate_fn(batch):
    category_data = []
    numerical_data = []
    target_data = []
    for category, numerical, target in batch:
        category_data.append(category)
        numerical_data.append(numerical)
        target_data.append(target)
        
    category_data = torch.LongTensor(category_data).T
    numerical_data = torch.Tensor(numerical_data)
    target_data = torch.LongTensor(target_data)
    
    return category_data, numerical_data, target_data

In [117]:
train_dataset = CastomDataset(X=X_train, y=y_train,
                        cyclic_meta=cyclic_meta)

valid_dataset = CastomDataset(X=X_valid, y=y_valid,
                        cyclic_meta=cyclic_meta)

num_embedding_layers = len(train_dataset.binary_columns)
num_numerical_features = len(train_dataset.numerical_columns)

In [118]:
EMBEDDING_DIM = 2
NUM_CLASSES = 4
INPUT_SIZE = EMBEDDING_DIM*num_embedding_layers+num_numerical_features
HIDDEN_SIZE = 256

BATCH_SIZE=512
LEARNING_RATE = 1e-3

In [119]:
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn, shuffle=True)
loaders = {'train' : train_loader, 'valid': valid_loader}

In [120]:
class SimpleNN(nn.Module):
    def __init__(self, input_size=INPUT_SIZE, num_classes=NUM_CLASSES,
                 embedding_dim=EMBEDDING_DIM, hidden_size=HIDDEN_SIZE,
                num_embedding_layers = num_embedding_layers):
        
        super(SimpleNN, self).__init__()
        
        self.embedding_layer_list = nn.ModuleList([nn.Embedding(2, embedding_dim) for _ in range(num_embedding_layers)])
            
        self.batch_norm_1 = nn.BatchNorm1d(input_size)
        self.fc_1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        
        self.batch_norm_2 = nn.BatchNorm1d(hidden_size)
        self.fc_2 = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        
        self.batch_norm_3 = nn.BatchNorm1d(hidden_size)
        self.fc_3 = nn.Linear(in_features=hidden_size, out_features=num_classes)
    
    def forward(self, cat, num):
        embeddings = torch.cat([layer(col) for col, layer in zip(cat, self.embedding_layer_list)], dim=-1)
        features = torch.cat([num, embeddings], dim=-1)
        
        x = self.batch_norm_1(features)
        x = self.fc_1(x)
        x = F.relu(x)
        
        x = self.batch_norm_2(x)
        x = self.fc_2(x)
        x = F.relu(x)
        
        x = self.batch_norm_3(x)
        x = self.fc_3(x)
        
        return x

In [85]:
model = SimpleNN().to(DEVICE)

In [86]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [87]:
def experiment_mlflow(model, loaders, criterion=criterion, optimizer=optimizer, epoches=10):
    for epoch in tqdm(range(epoches)):
        train_loss = 0
        valid_loss = 0
        correct_train = 0
        correct_valid = 0

        for mode, data_loader in loaders.items():
            if mode == 'train':
                model.train()
            else:
                model.eval()

            for X_cat, X_num, y_data in tqdm(data_loader):

                X_cat, X_num, y_data = X_cat.to(DEVICE), X_num.to(DEVICE), y_data.to(DEVICE)

                if mode == 'train':
                    optimizer.zero_grad()
                    output = model(X_cat, X_num)
                    loss = criterion(output, y_data)
                    loss.backward()
                    optimizer.step()
                    train_loss += loss.item()
                    output = torch.softmax(output, dim=-1)
                    predict = torch.argmax(output.cpu(), dim=-1)
                    correct_train += (y_data.cpu()==predict).sum()

                else:
                    with torch.no_grad():
                        output = model(X_cat, X_num)
                        loss = criterion(output, y_data)
                        valid_loss += loss.item()
                        output = torch.softmax(output, dim=-1)
                        predict = torch.argmax(output.cpu(), dim=-1)
                        correct_valid += (y_data.cpu()==predict).sum()

        train_loss = train_loss/len(train_dataset)
        valid_loss = valid_loss/len(valid_dataset)
        train_accuracy = correct_train/len(train_dataset)
        valid_accuracy = correct_valid/len(valid_dataset)
            
        mlflow.log_metric("train/loss", float(train_loss), step=epoch)
        mlflow.log_metric("valid/loss", float(valid_loss), step=epoch)
        mlflow.log_metric("train/accuracy", float(train_accuracy), step=epoch)
        mlflow.log_metric("valid/accuracy", float(valid_accuracy), step=epoch)

        print(f'Epoch: {epoch}, Train loss: {round(train_loss, 4)},  Valid loss: {round(valid_loss, 4)}')
        print(f'Epoch: {epoch}, Train Accuracy: {train_accuracy},  Valid Accuracy: {valid_accuracy}')

#### MLflow

In [88]:
client = MlflowClient()
try:
    experiment_id = client.create_experiment("MLflow_experiment_mlp")
except MlflowException:  # If such experiment already exist
    experiment_id = client.get_experiment_by_name("MLflow_experiment_mlp").experiment_id
        
with mlflow.start_run(run_name='MLFlow_run', experiment_id=experiment_id):
    
    EPOCHES = 10
    mlflow.log_param("optimizer", 'Adam')
    mlflow.log_param("learning_rate", LEARNING_RATE)
    mlflow.log_param("batch_size", BATCH_SIZE)
    mlflow.log_param("embedding_dim", EMBEDDING_DIM)
    mlflow.log_param("epoches", EPOCHES)

    experiment_mlflow(model, loaders, epoches=EPOCHES)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

  category_data = torch.LongTensor(category_data).T


  0%|          | 0/11 [00:00<?, ?it/s]

Epoch: 0, Train loss: 0.0019,  Valid loss: 0.0049
Epoch: 0, Train Accuracy: 0.6107655763626099,  Valid Accuracy: 0.39169472455978394


  0%|          | 0/42 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

Epoch: 1, Train loss: 0.0016,  Valid loss: 0.0017
Epoch: 1, Train Accuracy: 0.6652480959892273,  Valid Accuracy: 0.6569397449493408


  0%|          | 0/42 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

Epoch: 2, Train loss: 0.0015,  Valid loss: 0.0017
Epoch: 2, Train Accuracy: 0.6753028035163879,  Valid Accuracy: 0.6580621004104614


  0%|          | 0/42 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

Epoch: 3, Train loss: 0.0015,  Valid loss: 0.0017
Epoch: 3, Train Accuracy: 0.6850301623344421,  Valid Accuracy: 0.6593714952468872


  0%|          | 0/42 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

Epoch: 4, Train loss: 0.0014,  Valid loss: 0.0017
Epoch: 4, Train Accuracy: 0.6926062703132629,  Valid Accuracy: 0.6595585346221924


  0%|          | 0/42 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

Epoch: 5, Train loss: 0.0014,  Valid loss: 0.0017
Epoch: 5, Train Accuracy: 0.699106752872467,  Valid Accuracy: 0.6582491397857666


  0%|          | 0/42 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

Epoch: 6, Train loss: 0.0014,  Valid loss: 0.0017
Epoch: 6, Train Accuracy: 0.7071505188941956,  Valid Accuracy: 0.6560044884681702


  0%|          | 0/42 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

Epoch: 7, Train loss: 0.0013,  Valid loss: 0.0017
Epoch: 7, Train Accuracy: 0.7149137258529663,  Valid Accuracy: 0.655817449092865


  0%|          | 0/42 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

Epoch: 8, Train loss: 0.0013,  Valid loss: 0.0017
Epoch: 8, Train Accuracy: 0.7206192016601562,  Valid Accuracy: 0.650954008102417


  0%|          | 0/42 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

Epoch: 9, Train loss: 0.0013,  Valid loss: 0.0017
Epoch: 9, Train Accuracy: 0.729832112789154,  Valid Accuracy: 0.6498316526412964


#### TensorBoard
(Just to try)

In [121]:
from torch.utils.tensorboard import SummaryWriter
train_writer = SummaryWriter('./logs/train')
valid_writer = SummaryWriter('./logs/valid')

In [133]:
def experiment_tensorboard(model, loaders, criterion=criterion, optimizer=optimizer, epoches=10):
    for epoch in tqdm(range(epoches)):
        train_loss = 0
        valid_loss = 0
        correct_train = 0
        correct_valid = 0

        for mode, data_loader in loaders.items():
            if mode == 'valid':
                model.eval()
            else:
                model.train()

            for X_cat, X_num, y_data in tqdm(data_loader):

                X_cat, X_num, y_data = X_cat.to(DEVICE), X_num.to(DEVICE), y_data.to(DEVICE)

                if mode == 'train':
                    optimizer.zero_grad()
                    output = model(X_cat, X_num)
                    loss = criterion(output, y_data)
                    loss.backward()
                    optimizer.step()
                    train_loss += loss.item()
                    output = torch.softmax(output, dim=-1)
                    predict = torch.argmax(output.cpu(), dim=-1)
                    correct_train += (y_data.cpu()==predict).sum()

                else:
                    with torch.no_grad():
                        output = model(X_cat, X_num)
                        loss = criterion(output, y_data)
                        valid_loss += loss.item()
                        output = torch.softmax(output, dim=-1)
                        predict = torch.argmax(output.cpu(), dim=-1)
                        correct_valid += (y_data.cpu()==predict).sum()

        train_loss = train_loss/len(train_dataset)
        valid_loss = valid_loss/len(valid_dataset)
        train_accuracy = correct_train/len(train_dataset)
        valid_accuracy = correct_valid/len(valid_dataset)
            
        train_writer.add_scalar("loss", float(train_loss), epoch)
        valid_writer.add_scalar("loss", float(valid_loss), epoch)
        train_writer.add_scalar("accuracy", float(train_accuracy), epoch)
        valid_writer.add_scalar("accuracy", float(valid_accuracy), epoch)

        print(f'Epoch: {epoch}, Train loss: {round(train_loss, 4)},  Valid loss: {round(valid_loss, 4)}')
        print(f'Epoch: {epoch}, Train Accuracy: {train_accuracy},  Valid Accuracy: {valid_accuracy}')

In [134]:
example_batch = next(iter(valid_loader))

In [135]:
model_board = SimpleNN().to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_board.parameters(), lr=LEARNING_RATE)

In [136]:
EPOCHES = 10
train_writer.add_text("optimizer", 'Adam')
train_writer.add_text('learning_rate', str(LEARNING_RATE))
train_writer.add_text('embedding_dim', str(EMBEDDING_DIM))
train_writer.add_text('batch_size', str(BATCH_SIZE))
train_writer.add_text('epoches', str(EPOCHES))

train_writer.add_graph(model, example_batch[:2], verbose=False)

experiment_tensorboard(model_board, loaders, criterion=criterion, optimizer=optimizer, epoches=EPOCHES)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

  category_data = torch.LongTensor(category_data).T


  0%|          | 0/11 [00:00<?, ?it/s]

Epoch: 0, Train loss: 0.0019,  Valid loss: 0.0023
Epoch: 0, Train Accuracy: 0.6030958890914917,  Valid Accuracy: 0.5725776553153992


  0%|          | 0/42 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

Epoch: 1, Train loss: 0.0017,  Valid loss: 0.0018
Epoch: 1, Train Accuracy: 0.6619744896888733,  Valid Accuracy: 0.654321014881134


  0%|          | 0/42 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

Epoch: 2, Train loss: 0.0016,  Valid loss: 0.0017
Epoch: 2, Train Accuracy: 0.6731048226356506,  Valid Accuracy: 0.6597456336021423


  0%|          | 0/42 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

Epoch: 3, Train loss: 0.0015,  Valid loss: 0.0017
Epoch: 3, Train Accuracy: 0.6837674975395203,  Valid Accuracy: 0.6582491397857666


  0%|          | 0/42 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

Epoch: 4, Train loss: 0.0015,  Valid loss: 0.0017
Epoch: 4, Train Accuracy: 0.690782368183136,  Valid Accuracy: 0.6589973568916321


  0%|          | 0/42 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

Epoch: 5, Train loss: 0.0014,  Valid loss: 0.0017
Epoch: 5, Train Accuracy: 0.698077917098999,  Valid Accuracy: 0.6563786268234253


  0%|          | 0/42 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

Epoch: 6, Train loss: 0.0014,  Valid loss: 0.0017
Epoch: 6, Train Accuracy: 0.7063555121421814,  Valid Accuracy: 0.6560044884681702


  0%|          | 0/42 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

Epoch: 7, Train loss: 0.0014,  Valid loss: 0.0017
Epoch: 7, Train Accuracy: 0.7143992781639099,  Valid Accuracy: 0.6541339159011841


  0%|          | 0/42 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

Epoch: 8, Train loss: 0.0013,  Valid loss: 0.0017
Epoch: 8, Train Accuracy: 0.7211803793907166,  Valid Accuracy: 0.6515151262283325


  0%|          | 0/42 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

Epoch: 9, Train loss: 0.0013,  Valid loss: 0.0017
Epoch: 9, Train Accuracy: 0.7302062511444092,  Valid Accuracy: 0.6513280868530273
