In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

#from sklearn import preprocessing as pp
#from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import re
import time
from tqdm import tqdm
import random

import torch
import torch.nn as nn
import torch.nn.functional as F

#import torchvision.transforms as transforms
import torch.optim as optimizers
from torch.utils.data import random_split
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import WeightedRandomSampler, BatchSampler

import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping

In [2]:
# Silence every Python warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation warnings raised by the
# libraries imported above — consider narrowing the filter to specific categories.
import warnings
warnings.filterwarnings('ignore')

In [3]:
def set_seed(seed: int = 123):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), exports ``PYTHONHASHSEED`` and puts cuDNN
    into deterministic mode.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    # The cuDNN auto-tuner may pick a different kernel on each run, which
    # breaks reproducibility; it must be off when determinism is requested.
    torch.backends.cudnn.benchmark = False  # type: ignore


set_seed(123)

In [4]:
# Encoded training table (contains the `state` label shown in the preview below).
train_new = pd.read_csv(filepath_or_buffer='data/train_added_encoded.csv')
# Encoded test table.
test_new = pd.read_csv(filepath_or_buffer='data/test_added_encoded.csv')
# Encoded table named "all" — presumably train and test combined; verify.
all_data = pd.read_csv(filepath_or_buffer='data/all_added_encoded.csv')
# Sample submission file.
sub = pd.read_csv(filepath_or_buffer='data/sample_submit.csv')

In [5]:
# Preview the first rows of the training table to check the loaded columns.
train_new.head()

Unnamed: 0,id,goal,country,duration,category1,category2,html_content,state,goal_min,goal_max
0,0,4001-5000,4,28,12,142,"<div class=""contents""><div><span class=""bold"">...",0,4001.0,5000.0
1,1,3001-4000,16,33,5,108,"<div class=""contents""><div><h1 class=""page-anc...",0,3001.0,4000.0
2,2,19001-20000,21,29,7,122,"<div class=""contents""><div><p> As our society ...",0,19001.0,20000.0
3,3,2001-3000,21,40,13,0,"<div class=""contents""><div><p>My name is Donal...",0,2001.0,3000.0
4,4,2001-3000,9,28,13,33,"<div class=""contents""><div><div class=""templat...",1,2001.0,3000.0


In [6]:
# Name of the label column.
target_cols = ["state"]
# Wider feature set kept for reference; currently disabled.
#feature_cols = ["country", "duration", "category1", "category2", "goal_min", "goal_max"]
#categorical_cols = ["country", "category1", "category2"]
# Active feature set; main() sizes the network input from len(feature_cols).
feature_cols = ["duration", "category1", "category2", "goal_min"]
# Features treated as categorical (not referenced by the cells visible here).
categorical_cols = ["category1", "category2"]

In [21]:
class CFDataset(Dataset):
    """Row-wise dataset over a pandas DataFrame of tabular features.

    Each item is a ``(features, target)`` pair:

    * ``features`` is a float32 tensor holding the ``feature_cols`` values
      of the row (after ``transform`` when one is given).
    * ``target`` is a float32 tensor with the ``target_cols`` values when
      the frame contains them (train / validation data); otherwise it is
      the row's ``"id"`` value (inference data without labels).

    Args:
        csv_file: DataFrame holding the feature columns and either the
            target column(s) or an ``"id"`` column.
        transform: Optional callable applied to the feature tensor.
        feature_cols: Columns used as model inputs. Defaults to
            ``["duration", "category1", "category2", "goal_min"]``.
        target_cols: Label columns. Defaults to ``["state"]``.

    Raises:
        KeyError: If a feature column is missing, or if the frame has
            neither the target column(s) nor an ``"id"`` column.
    """

    def __init__(self, csv_file, transform=None, feature_cols=None, target_cols=None):
        self.csv_file = csv_file
        self.transform = transform
        if feature_cols is None:
            feature_cols = ["duration", "category1", "category2", "goal_min"]
        if target_cols is None:
            target_cols = ["state"]
        self.feature_cols = list(feature_cols)
        self.target_cols = list(target_cols)

        # Convert the needed columns to NumPy once instead of re-slicing the
        # whole DataFrame on every __getitem__ call. Positional indexing into
        # these arrays is also independent of the frame's index labels, which
        # are shuffled after a train/validation split.
        self._features = csv_file[self.feature_cols].to_numpy(dtype=np.float32)

        # Decide explicitly whether labels are available rather than relying
        # on a bare ``except`` that would also hide unrelated errors.
        self._has_target = all(col in csv_file.columns for col in self.target_cols)
        if self._has_target:
            # float32 so the labels can be fed to a float loss directly.
            self._targets = csv_file[self.target_cols].to_numpy(dtype=np.float32)
            self._ids = None
        else:
            self._targets = None
            self._ids = csv_file["id"].to_numpy()

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.csv_file)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair at positional index ``idx``."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        features = torch.tensor(self._features[idx])
        if self._has_target:
            target = torch.tensor(self._targets[idx])
        else:
            target = self._ids[idx]

        if self.transform is not None:
            features = self.transform(features)

        return features, target

In [8]:
class CFDataModule(pl.LightningDataModule):
    """Lightning data module that splits one DataFrame into train/validation sets.

    Args:
        csv_file: DataFrame containing features and labels.
        transform: Optional callable forwarded to ``CFDataset``.
        split_rate: Fraction of the rows used for training; the remainder
            is used for validation.
        batch_size: Batch size of both data loaders.
        num_workers: Number of worker processes of both data loaders.
    """

    def __init__(self, csv_file, transform, split_rate, batch_size, num_workers):
        super().__init__()
        self.csv_file = csv_file
        self.transform = transform
        self.split_rate = split_rate
        self.batch_size = batch_size
        self.num_workers = num_workers

    def setup(self, stage=None):
        """Split the frame and wrap both parts in ``CFDataset`` objects."""
        n_samples = len(self.csv_file)
        # Honour the configured split ratio (it used to be ignored in favour
        # of a hard-coded 0.8).
        n_train = int(n_samples * self.split_rate)
        n_val = n_samples - n_train
        train_frame, val_frame = train_test_split(
            self.csv_file, train_size=n_train, test_size=n_val)

        self.train_dataset = CFDataset(csv_file=train_frame, transform=self.transform)
        self.val_dataset = CFDataset(csv_file=val_frame, transform=self.transform)

    def train_dataloader(self):
        """Return the training loader (reshuffled every epoch)."""
        return DataLoader(self.train_dataset,
                          batch_size=self.batch_size,
                          # Reshuffle each epoch so that batches differ and
                          # drop_last does not always discard the same rows.
                          shuffle=True,
                          # Avoids a trailing size-1 batch, which BatchNorm1d
                          # cannot handle in training mode.
                          drop_last=True,
                          num_workers=self.num_workers,
                          # Pinned memory only helps host-to-GPU copies.
                          pin_memory=torch.cuda.is_available())

    def val_dataloader(self):
        """Return the validation loader covering every validation row."""
        return DataLoader(self.val_dataset,
                          batch_size=self.batch_size,
                          # Keep the last partial batch: dropping it skews the
                          # metric and yields no batch at all when the
                          # validation set is smaller than batch_size.
                          drop_last=False,
                          num_workers=self.num_workers,
                          pin_memory=torch.cuda.is_available())

In [11]:
class CFModule(pl.LightningModule):
    """Feed-forward binary classifier trained with PyTorch Lightning.

    The network consists of three weight-normalised hidden ``Linear`` layers,
    each preceded by ``BatchNorm1d`` and ``Dropout``, followed by a final
    ``Linear`` layer and a sigmoid.

    Args:
        num_features: Number of input features.
        hidden_size: Width of the last two hidden layers (the first hidden
            layer is twice as wide).
        dropout: Dropout probability of the hidden blocks; the input block
            uses ``dropout * 2 / 5``.
        num_classes: Number of output units.
    """

    def __init__(self, num_features, hidden_size, dropout, num_classes):
        super(CFModule, self).__init__()
        self.batch_norm1 = nn.BatchNorm1d(num_features)
        self.dropout1 = nn.Dropout(dropout*(2/5))
        self.dense1 = nn.utils.weight_norm(nn.Linear(num_features, hidden_size*2))

        self.batch_norm2 = nn.BatchNorm1d(hidden_size*2)
        self.dropout2 = nn.Dropout(dropout)
        self.dense2 = nn.utils.weight_norm(nn.Linear(hidden_size*2, hidden_size))

        self.batch_norm3 = nn.BatchNorm1d(hidden_size)
        self.dropout3 = nn.Dropout(dropout)
        self.dense3 = nn.utils.weight_norm(nn.Linear(hidden_size, hidden_size))

        self.batch_norm4 = nn.BatchNorm1d(hidden_size)
        self.dropout4 = nn.Dropout(dropout)
        self.layer4 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Map a ``(batch, num_features)`` tensor to sigmoid outputs in [0, 1]."""
        x = self.batch_norm1(x)
        x = self.dropout1(x)
        x = F.relu(self.dense1(x))

        x = self.batch_norm2(x)
        x = self.dropout2(x)
        x = F.relu(self.dense2(x))

        x = self.batch_norm3(x)
        x = self.dropout3(x)
        x = F.relu(self.dense3(x))

        x = self.batch_norm4(x)
        x = self.dropout4(x)
        x = self.layer4(x)

        x = torch.sigmoid(x)

        return x

    def _shared_step(self, batch):
        """Run the model on one batch and return ``(loss, score)`` tensors."""
        x, t = batch
        pred = self.forward(x)
        loss = self.criterion(pred, t)
        # Wrap the float returned by metric() so it can be logged and
        # averaged like any other tensor.
        score = torch.tensor(self.metric(pred, t), dtype=torch.float32, device=pred.device)
        return loss, score

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss and F1 score of one batch."""
        loss, acc = self._shared_step(batch)
        # Tags follow the "tag_name/log_name" convention. The "acc" tags
        # actually carry the F1 score computed by metric().
        self.log('train/train_loss', loss, prog_bar=True)
        self.log('train/train_acc', acc, prog_bar=True)
        return {"loss": loss, "acc": acc}

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss and F1 score of one batch."""
        loss, acc = self._shared_step(batch)
        # Logged under the plain name "val_loss" because EarlyStopping
        # monitors that key; in validation, self.log averages over the epoch.
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc', acc, prog_bar=True)
        return {"val_loss": loss, "val_acc": acc}

    def validation_end(self, outputs):
        """Average ``val_loss`` over a list of ``validation_step`` outputs.

        NOTE(review): ``validation_end`` is a legacy hook name; recent
        Lightning releases do not appear to call it, so epoch-level logging
        is done through ``self.log`` in ``validation_step``. Kept for
        callers that invoke it directly — confirm before removing.
        """
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        #avg_acc = torch.stack([x['val_acc'] for x in outputs]).mean()
        tensorboard_logs = {'val/avg_loss': avg_loss}
        return {'val_loss': avg_loss, 'log': tensorboard_logs}

    def configure_optimizers(self):
        """Return an Adam optimizer with an epoch-wise cosine-annealing schedule."""
        optimizer = optimizers.Adam(self.parameters(),
                                    lr=0.001, betas=(0.9,0.999),
                                    eps=1e-08, weight_decay=0, amsgrad=False)
        scheduler = {"scheduler":
                     optimizers.lr_scheduler.CosineAnnealingLR(
                        optimizer, T_max=10),
                    "interval": "epoch",
                    "monitor": "val_loss"}
        return [optimizer], [scheduler]

    def criterion(self, pred, t):
        """Mean squared error between predictions and targets.

        The target is cast to the prediction's dtype and shape first, so
        integer labels or a flat ``(batch,)`` label vector neither fail in
        the backward pass nor broadcast silently.
        """
        t = t.to(dtype=pred.dtype).reshape(pred.shape)
        return F.mse_loss(input=pred, target=t)

    def metric(self, pred, t):
        """Binary F1 score of predictions thresholded at 0.5.

        Returns:
            The score as returned by ``sklearn.metrics.f1_score``.
        """
        # detach() is required: predictions produced during training carry
        # gradient history and cannot be converted to NumPy directly.
        t = t.detach().cpu().numpy().reshape(-1)
        pred = pred.detach().cpu().numpy().reshape(-1)
        pred = np.where(pred<0.5, 0, 1)
        return f1_score(y_true=t, y_pred=pred, average='binary', sample_weight=None, zero_division='warn')

In [12]:
def main():
    """Assemble the data module, model and trainer from inline settings and fit.

    Reads the module-level ``train_new`` frame and ``feature_cols`` list.
    """
    # --- data module settings ---
    source_frame = train_new
    feature_transform = None
    train_fraction = 0.8
    samples_per_batch = 256*4
    loader_workers = 2

    # --- network settings (sizes derive from the number of input features) ---
    n_inputs = len(feature_cols)
    n_hidden = n_inputs*2
    drop_prob = 0.3
    n_outputs = 1

    data_module = CFDataModule(
        source_frame,
        feature_transform,
        train_fraction,
        samples_per_batch,
        loader_workers,
    )
    network = CFModule(
        num_features=n_inputs,
        hidden_size=n_hidden,
        dropout=drop_prob,
        num_classes=n_outputs,
    )

    # Stop when the monitored validation loss has not improved for 3 checks.
    stopper = EarlyStopping('val_loss', patience=3, verbose=True)

    # Use a single GPU when CUDA is present, otherwise fall back to the CPU.
    gpu_setting = None
    if torch.cuda.is_available():
        gpu_setting = 1

    trainer_options = {
        "max_epochs": 1,
        "weights_save_path": './',
        "gpus": gpu_setting,
        "callbacks": [stopper],
    }
    trainer = Trainer(**trainer_options)

    trainer.fit(network, data_module)
    # TODO (original author): use model.apply(weights_init) instead of
    # torch.cuda.empty_cache()

In [13]:
# Start training when this cell (or the exported script) is executed directly.
if __name__ == "__main__":
    main()

EarlyStopping mode auto is unknown, fallback to auto mode.
EarlyStopping mode set to min for monitoring val_loss.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores

   | Name        | Type        | Params
---------------------------------------------
0  | batch_norm1 | BatchNorm1d | 8     
1  | dropout1    | Dropout     | 0     
2  | dense1      | Linear      | 96    
3  | batch_norm2 | BatchNorm1d | 32    
4  | dropout2    | Dropout     | 0     
5  | dense2      | Linear      | 144   
6  | batch_norm3 | BatchNorm1d | 16    
7  | dropout3    | Dropout     | 0     
8  | dense3      | Linear      | 80    
9  | batch_norm4 | BatchNorm1d | 16    
10 | dropout4    | Dropout     | 0     
11 | layer4      | Linear      | 9     


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

KeyboardInterrupt: 