In [57]:
import pandas as pd
import numpy as np
import os
import shutil

from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
#from sklearn.model_selection import train_test_split

from packaging import version
import pytorch_lightning as pl
from pytorch_lightning import Callback

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
import torch.utils.data
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import random_split
import torch.optim as optimizers

from torchvision import datasets
from torchvision import transforms

import optuna
from optuna.integration import PyTorchLightningPruningCallback

In [58]:
# Fail fast when the installed PyTorch Lightning is older than the minimum
# release this notebook was written against.
if version.parse("1.0.2") > version.parse(pl.__version__):
    raise RuntimeError("PyTorch Lightning>=1.0.2 is required for this example.")

In [59]:
# Load the preprocessed table and keep only the rows flagged as training data.
# The filtered frame keeps the row labels of `all_data` (the index is not reset).
all_data = pd.read_csv("data/all_df_nlp_preprocessed.csv")
train_data = all_data.loc[all_data["data_type"].eq("train")]

In [60]:
# Column holding the label the model is trained to predict.
target_cols = ["state"]

# Columns fed to the model as input features, in input order.
feature_cols = [
    "country",
    "category1",
    "duration",
    "goal_min",
    "number_of_figure",
    "number_of_paragraph",
    "length_of_text",
]

In [61]:
class CFDataset(Dataset):
    """Map-style dataset that serves rows of a pandas DataFrame as tensors.

    Each item is a ``(features, target)`` pair:

    * ``features`` is a float32 tensor built from ``feature_cols``.
    * ``target`` is a float32 tensor built from ``target_cols`` when all of
      those columns exist in the frame; otherwise the row's ``"id"`` value is
      returned instead (unlabelled data).

    Args:
        csv_file: DataFrame holding the samples (despite the name, this is the
            already-loaded frame, not a path).
        transform: Optional callable applied to the feature tensor.
        feature_cols: Optional list of feature column names. Defaults to the
            seven columns this notebook trains on.
        target_cols: Optional list of target column names. Defaults to
            ``["state"]``.
    """

    def __init__(self, csv_file, transform=None, feature_cols=None, target_cols=None):
        self.csv_file = csv_file
        self.transform = transform
        if feature_cols is None:
            feature_cols = ["country", "category1", "duration", "goal_min",
                            "number_of_figure", "number_of_paragraph", "length_of_text"]
        if target_cols is None:
            target_cols = ["state"]
        self.feature_cols = list(feature_cols)
        self.target_cols = list(target_cols)

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.csv_file)

    def _has_targets(self):
        """Return True when every target column is present in the frame."""
        return all(col in self.csv_file.columns for col in self.target_cols)

    def __getitem__(self, idx):
        """Return ``(features, target)`` for the row(s) at position ``idx``.

        ``idx`` is positional (``.iloc``), so the frame's index labels do not
        need to be contiguous.

        Raises:
            KeyError: if a feature column is missing, or if the target columns
                are missing and the frame has no ``"id"`` column either.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Convert through an explicit float32 NumPy array instead of relying on
        # torch treating a pandas Series as a generic Python sequence.
        feature_values = self.csv_file[self.feature_cols].iloc[idx]
        features = torch.tensor(feature_values.to_numpy(dtype=np.float32))

        if self._has_targets():
            # Labelled data (training / validation).
            target_values = self.csv_file[self.target_cols].iloc[idx]
            target = torch.tensor(target_values.to_numpy(dtype=np.float32))
        else:
            # Unlabelled data: hand back the row id so predictions can be
            # matched to rows. Only a missing target column triggers this
            # path; any other error now propagates instead of being swallowed.
            target = self.csv_file["id"].iloc[idx]

        if self.transform:
            features = self.transform(features)

        return features, target

In [62]:
class CFDataModule(pl.LightningDataModule):
    """LightningDataModule wrapping pre-split training and validation frames.

    Args:
        train: DataFrame with the training rows.
        validation: DataFrame with the validation rows.
        transform: Optional callable forwarded to ``CFDataset``.
        split_rate: Stored for reference only; the split is done by the caller
            and this value is not used inside the module.
        batch_size: Batch size for both loaders.
        num_workers: Number of DataLoader worker processes.
    """

    def __init__(self, train, validation, transform, split_rate, batch_size, num_workers):
        super().__init__()
        self.csv_file = train
        self.transform = transform
        self.split_rate = split_rate
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.validation = validation

    def setup(self, stage=None):
        """Build the training and validation datasets from the stored frames."""
        self.train_dataset = CFDataset(csv_file=self.csv_file, transform=self.transform)
        self.val_dataset = CFDataset(csv_file=self.validation, transform=self.transform)

    def train_dataloader(self):
        """Return the training loader.

        Batches are shuffled every epoch. The final partial batch is dropped,
        as in the original, which also keeps a one-sample batch away from the
        model's BatchNorm layers in training mode.
        """
        return DataLoader(self.train_dataset,
                          batch_size=self.batch_size,
                          shuffle=True,
                          drop_last=True,
                          num_workers=self.num_workers,
                          # Pinning host memory only helps host-to-GPU copies.
                          pin_memory=torch.cuda.is_available())

    def val_dataloader(self):
        """Return the validation loader.

        Every validation sample is kept (``drop_last=False``): dropping the
        tail would skew the metric and produce zero batches whenever the
        validation set is smaller than ``batch_size``.
        """
        return DataLoader(self.val_dataset,
                          batch_size=self.batch_size,
                          shuffle=False,
                          drop_last=False,
                          num_workers=self.num_workers,
                          pin_memory=torch.cuda.is_available())

In [63]:
class CFModule(pl.LightningModule):
    """Feed-forward binary classifier whose architecture is sampled by Optuna.

    The network is a stack of ``n_layers`` hidden blocks, each
    ``BatchNorm1d -> weight-normalised Linear -> ReLU -> Dropout``, followed by
    a final ``Linear`` head and a sigmoid.

    Args:
        num_features: Width of the input feature vector.
        num_classes: Width of the output layer.
        trial: Optuna trial used to sample ``n_layers``, ``dropout`` and the
            per-layer ``n_units_l{i}`` values.
    """

    def __init__(self, num_features, num_classes, trial):
        super(CFModule, self).__init__()
        # Plain lists keep the layers in order; the setattr loops below are
        # what actually registers them as sub-modules (fc0.., drop0..).
        # Layout of self.layers: [BN_0, Linear_0, BN_1, Linear_1, ..., head].
        self.layers = []
        self.dropouts = []

        # We optimize the number of layers, hidden units in each layer and dropouts.
        n_layers = trial.suggest_int("n_layers", 1, 4)
        dropout = trial.suggest_float("dropout", 0.2, 0.5)
        input_dim = num_features
        for i in range(n_layers):
            output_dim = trial.suggest_int("n_units_l{}".format(i), 4, 128, log=True)
            self.layers.append(nn.BatchNorm1d(input_dim))
            self.dropouts.append(nn.Dropout(dropout))
            self.layers.append(nn.utils.weight_norm(nn.Linear(input_dim, output_dim)))
            input_dim = output_dim

        self.layers.append(nn.Linear(input_dim, num_classes))

        # Assigning the layers as class variables (PyTorch requirement).
        # Parameters of a layer are returned when calling model.parameters(),
        # only if the layer is a class variable. Thus, assigning as class
        # variable is necessary to make the layer parameters trainable.
        for idx, layer in enumerate(self.layers):
            setattr(self, "fc{}".format(idx), layer)

        # Assigning the dropouts as class variables (PyTorch requirement), for
        # the same reason as above.
        for idx, dropout in enumerate(self.dropouts):
            setattr(self, "drop{}".format(idx), dropout)

    def forward(self, x):
        """Return sigmoid probabilities for a batch of feature vectors.

        ``self.layers`` interleaves BatchNorm and Linear modules, so hidden
        block ``k`` is ``layers[2k]`` (norm) and ``layers[2k + 1]`` (linear),
        paired with ``dropouts[k]``. Zipping the two lists directly would only
        run the first ``n_layers`` entries and then feed a wrongly sized
        activation into the head.
        """
        for block_idx, dropout in enumerate(self.dropouts):
            norm = self.layers[2 * block_idx]
            linear = self.layers[2 * block_idx + 1]
            x = dropout(F.relu(linear(norm(x))))
        x = self.layers[-1](x)

        # torch.sigmoid replaces the deprecated F.sigmoid.
        return torch.sigmoid(x)

    def training_step(self, batch, batch_idx):
        """Compute the loss and F1 for one training batch and log both."""
        x, t = batch
        pred = self.forward(x)
        loss = self.criterion(pred, t)
        acc = self.metric(pred, t)
        # Log through self.log; ad-hoc "logs"/"progress_bar" keys in the
        # returned dict are not consumed by Lightning >= 1.0.
        self.log("train/train_loss", loss, prog_bar=True)
        self.log("train/train_acc", torch.tensor(acc, dtype=torch.float32), prog_bar=True)
        return {"loss": loss, "acc": acc}

    def validation_step(self, batch, batch_idx):
        """Compute the loss and F1 for one validation batch.

        The per-batch values are aggregated in ``validation_epoch_end``.
        """
        x, t = batch
        pred = self.forward(x)
        loss = self.criterion(pred, t)
        acc = self.metric(pred, t)
        return {"val_loss": loss, "val_acc": acc}

    def validation_epoch_end(self, outputs):
        """Average the per-batch validation metrics and publish them.

        ``val_loss`` / ``val_acc`` are the names monitored by the checkpoint,
        pruning and metrics callbacks, so they are logged under exactly those
        keys; the ``val/...`` aliases keep the original TensorBoard tag names.
        Nothing is returned: since Lightning 1.0 this hook is expected to log
        rather than return a dict.
        """
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        avg_acc = torch.stack([torch.tensor(x['val_acc']) for x in outputs]).mean()
        self.log("val_loss", avg_loss, prog_bar=True)
        self.log("val_acc", avg_acc, prog_bar=True)
        self.log("val/avg_loss", avg_loss)
        self.log("val/avg_acc", avg_acc)

    def configure_optimizers(self):
        """Return Adam with a cosine-annealing schedule stepped once per epoch."""
        optimizer = optimizers.Adam(self.parameters(),
                                    lr=0.001, betas=(0.9, 0.999),
                                    eps=1e-08, weight_decay=0, amsgrad=False)
        scheduler = {"scheduler":
                     optimizers.lr_scheduler.CosineAnnealingLR(
                         optimizer, T_max=10),
                     "interval": "epoch",
                     "monitor": "val_loss"}
        return [optimizer], [scheduler]

    def criterion(self, pred, t):
        """Binary cross-entropy between predicted probabilities and targets.

        ``pred`` and ``t`` are cast to float and must have matching shapes.
        """
        pred = pred.float()
        t = t.float()
        return F.binary_cross_entropy(input=pred, target=t)

    def metric(self, pred, t):
        """Return the binary F1 score of ``pred`` thresholded at 0.5.

        Both tensors are detached and moved to CPU before being handed to
        scikit-learn, so this works identically on CPU and GPU and never
        needs a catch-all ``except``.
        """
        y_true = t.detach().cpu().numpy().ravel()
        scores = pred.detach().cpu().numpy().ravel()
        y_pred = np.where(scores < 0.5, 0, 1)
        return f1_score(y_true=y_true, y_pred=y_pred, average='binary', sample_weight=None, zero_division='warn')

In [64]:
class MetricsCallback(Callback):
    """PyTorch Lightning callback that records metrics after each validation run.

    ``self.metrics`` is a list with one dict per validation run, in order.
    """

    def __init__(self):
        super().__init__()
        self.metrics = []

    def on_validation_end(self, trainer, pl_module):
        """Append a snapshot of the trainer's current callback metrics.

        A shallow copy is stored rather than the dict itself, so that later
        in-place updates by the trainer cannot rewrite earlier history entries.
        """
        self.metrics.append(dict(trainer.callback_metrics))

In [65]:
def objective(trial):
    """Optuna objective: train one sampled ``CFModule`` and return its ``val_acc``.

    Reads the module-level ``train_data``, ``feature_cols`` and ``MODEL_DIR``.

    Args:
        trial: Optuna trial used to sample the model architecture.

    Returns:
        float: the ``val_acc`` recorded after the last validation run.
    """

    # data module config
    seed = 123456
    cv = KFold(n_splits=5, shuffle=True, random_state=seed)
    # Only one hold-out split is trained per trial. The original loop
    # overwrote train_set/val_set on every iteration, so it always ended up
    # with the last of the five splits; that choice is kept but made explicit.
    t_idx, v_idx = list(cv.split(train_data))[-1]
    # KFold yields *positional* indices. train_data is a filtered frame whose
    # index labels are not 0..n-1, so .iloc (not .loc) is required.
    train_set = train_data.iloc[t_idx]
    val_set = train_data.iloc[v_idx]

    transform = None
    split_rate = 0.8
    batch_size = 256*4
    num_workers = 4
    PERCENT_VALID_EXAMPLES = 0.5
    EPOCHS = 15

    # model config
    num_input = len(feature_cols)
    num_classes = 1

    cf = CFDataModule(train_set, val_set, transform, split_rate, batch_size, num_workers)
    model = CFModule(num_features=num_input, num_classes=num_classes, trial=trial)

    # Filenames for each trial must be made unique in order to access each checkpoint.
    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        os.path.join(MODEL_DIR, "trial_{}".format(trial.number), "{epoch}"), monitor="val_acc"
    )

    # The default logger in PyTorch Lightning writes to event files to be consumed by
    # TensorBoard. We don't use any logger here as it requires us to implement several abstract
    # methods. Instead we setup a simple callback, that saves metrics from each validation step.
    metrics_callback = MetricsCallback()
    trainer = pl.Trainer(
        logger=False,
        limit_val_batches=PERCENT_VALID_EXAMPLES,
        checkpoint_callback=checkpoint_callback,
        max_epochs=EPOCHS,
        gpus=1 if torch.cuda.is_available() else None,
        callbacks=[metrics_callback, PyTorchLightningPruningCallback(trial, monitor="val_acc")],
    )

    trainer.fit(model, cf)

    return metrics_callback.metrics[-1]["val_acc"].item()

In [66]:
if __name__ == "__main__":

    # Checkpoints for every trial are written below MODEL_DIR; `objective`
    # reads this module-level name.
    DIR = os.getcwd()
    MODEL_DIR = os.path.join(DIR, "result")

    pruner = optuna.pruners.NopPruner()

    study = optuna.create_study(direction="maximize", pruner=pruner)
    try:
        study.optimize(objective, n_trials=100, timeout=600)

        print("Number of finished trials: {}".format(len(study.trials)))

        try:
            trial = study.best_trial
        except ValueError:
            # Optuna raises ValueError when no trial has completed
            # (e.g. the timeout expired during the first trial).
            print("No trial completed; nothing to report.")
        else:
            print("Best trial:")

            print("  Value: {}".format(trial.value))

            print("  Params: ")
            for key, value in trial.params.items():
                print("    {}: {}".format(key, value))
    finally:
        # Always remove the checkpoint directory, including after an interrupt
        # or a failed trial, and do not fail if it was never created.
        shutil.rmtree(MODEL_DIR, ignore_errors=True)

[I 2020-12-28 15:55:54,219] A new study created in memory with name: no-name-e31b81e2-3945-4e3b-87fa-b40a9e78925c
GPU available: False, used: False
TPU available: None, using: 0 TPU cores

   | Name  | Type        | Params
---------------------------------------
0  | fc0   | BatchNorm1d | 14    
1  | fc1   | Linear      | 36    
2  | fc2   | BatchNorm1d | 8     
3  | fc3   | Linear      | 42    
4  | fc4   | BatchNorm1d | 14    
5  | fc5   | Linear      | 423   
6  | fc6   | BatchNorm1d | 94    
7  | fc7   | Linear      | 980   
8  | fc8   | Linear      | 21    
9  | drop0 | Dropout     | 0     
10 | drop1 | Dropout     | 0     
11 | drop2 | Dropout     | 0     
12 | drop3 | Dropout     | 0     
---------------------------------------
1.6 K     Trainable params
0         Non-trainable params
1.6 K     Total params


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

KeyboardInterrupt: 