In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 500)

In [2]:
import os,sys,inspect
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir)

In [3]:
from utils import UtilsKy
from analyzer import AnalyzerPrediction

In [4]:
# for autoreload modules
%load_ext autoreload
%autoreload 2

In [5]:
from functools import partial
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import random_split
import torchvision
import torchvision.transforms as transforms
import ray
from ray import tune
from ray.tune.schedulers import ASHAScheduler

In [6]:
# pytorch mlp for binary classification
from numpy import vstack
from pandas import read_csv
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset 
from torch.utils.data import DataLoader
from torch.utils.data import random_split
from torch import Tensor
from torch.nn import Linear
from torch.nn import ReLU
from torch.nn import Sigmoid
from torch.nn import Module
from torch.optim import SGD
from torch.nn import BCELoss
from torch.nn.init import kaiming_uniform_
from torch.nn.init import xavier_uniform_
import joblib

In [7]:
db_teach = pd.read_csv( UtilsKy.DB_TEACH_KYW3, dtype=str, encoding='cp1251')

In [8]:
db_test = pd.read_csv(UtilsKy.DB_TEST_KYW3, dtype=str, encoding='cp1251')

In [9]:
COL_FACTORS = ['amount', 'bank_currency', 'bin', 'count_months_to_end_card', 'day_of_week', 'is_city_resolved', 'hour',
                                                             'is_gender_undefined', 'latitude', 'longitude', 'phone_2_norm']
COL_FACTORS = sorted(COL_FACTORS)

In [10]:
from helper import DataHelper

In [14]:
from helper import DataHelper
datahelper = DataHelper(db_teach, db_test, COL_FACTORS)
datahelper.create_train_test()
datahelper.show_columns_with_na()
mean_values = datahelper.get_mean_value()
replaced_values = { col: mean_values[col] for col in ('latitude', 'longitude')}
replaced_values['default'] =  -999
datahelper.replaced_na_values(replaced_values) 
scaler_params = datahelper.get_scaler_params()
datahelper.minMaxScaler_own()
datahelper.add_status_in_train_test()
train , test = datahelper.get_train_test()

train na columns : Index(['latitude', 'longitude'], dtype='object')
test na columns : Index(['latitude', 'longitude'], dtype='object')
-999
-999
-999
-999
-999
-999
-999
-999
36.90237577890762
-92.53325861542274
-999


In [15]:
train.head()

Unnamed: 0,amount,bank_currency,bin,count_months_to_end_card,day_of_week,hour,is_city_resolved,is_gender_undefined,latitude,longitude,phone_2_norm,status
0,0.0,0.125874,0.741749,0.032955,0.333333,0.391304,1.0,1.0,0.803879,0.523963,0.555556,0
1,0.016011,0.974359,0.142897,0.046591,0.333333,0.521739,1.0,0.0,0.571486,0.033105,0.080808,0
2,0.013462,0.974359,0.730808,0.05,0.333333,0.652174,0.0,0.0,0.710107,0.214711,0.40404,0
3,0.132755,0.974359,0.016022,0.045455,0.333333,0.652174,1.0,0.0,0.708466,0.141195,0.10101,0
4,0.000921,0.083916,0.694623,0.090909,0.333333,0.652174,1.0,1.0,0.179704,0.371303,0.191919,0


In [16]:
test.head()

Unnamed: 0,amount,bank_currency,bin,count_months_to_end_card,day_of_week,hour,is_city_resolved,is_gender_undefined,latitude,longitude,phone_2_norm,status
0,0.105304,0.974359,0.018562,0.021591,0.166667,0.0,1.0,1.0,0.763969,0.239378,0.20202,0
1,0.016998,0.974359,0.018502,0.053409,0.166667,0.0,1.0,0.0,0.730848,0.256943,0.232323,0
2,0.013462,0.974359,0.648358,0.0,0.166667,0.0,1.0,1.0,0.753619,0.247821,0.161616,0
3,0.036531,0.974359,0.648638,0.039773,0.166667,0.0,1.0,0.0,0.745077,0.240952,0.474747,0
4,0.034797,0.974359,0.156039,0.052273,0.166667,0.0,0.0,1.0,0.710107,0.214711,0.10101,0


In [17]:
class CSVDataset(Dataset):
    # load the dataset
    def __init__(self, df): # path
        # store the inputs and outputs
        self.X = df.values[:, :-1]
        self.y = df.values[:, -1]
        # ensure input data is floats
        self.X = self.X.astype('float32')
        # label encode target and ensure the values are floats
        self.y = LabelEncoder().fit_transform(self.y)
        self.y = self.y.astype('float32')
        self.y = self.y.reshape((len(self.y), 1))
 
    # number of rows in the dataset
    def __len__(self):
        return len(self.X)
 
    # get a row at an index
    def __getitem__(self, idx):
        return [self.X[idx], self.y[idx]]
 
    # get indexes for train and test rows
    def get_splits(self, n_test=0):
        # determine sizes
        test_size = round(n_test * len(self.X))
        train_size = len(self.X) - test_size
        # calculate the split
        return random_split(self, [train_size, test_size])

In [18]:
class Net(nn.Module):
    # define model elements
    def __init__(self, l1=10, l2=6, n_inputs=11 ):
        super(Net, self).__init__()
        # input to first hidden layer
        self.hidden1 = Linear(n_inputs, l1)
        kaiming_uniform_(self.hidden1.weight, nonlinearity='relu')
        self.act1 = ReLU()
        # second hidden layer
        self.hidden2 = Linear(l1, l2)
        kaiming_uniform_(self.hidden2.weight, nonlinearity='relu')
        self.act2 = ReLU()
        # third hidden layer and output
        self.hidden3 = Linear(l2, 1)
        xavier_uniform_(self.hidden3.weight)
        self.act3 = Sigmoid()
 
    # forward propagate input
    def forward(self, X):
        # input to first hidden layer
        X = self.hidden1(X)
        X = self.act1(X)
         # second hidden layer
        X = self.hidden2(X)
        X = self.act2(X)
        # third hidden layer and output
        X = self.hidden3(X)
        X = self.act3(X)
        return X

In [19]:
def get_train():
    return train

def get_validation():
    return test

In [20]:
data_dir = currentdir
checkpoint_dir = currentdir

def train_nn(config, checkpoint_dir=None, data_dir=None, percent=0.05):
    net = Net(config["l1"], config["l2"])        
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)
    net.to(device)
    
    # criterion for binary classification
    criterion = BCELoss() 
    optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=config['momentum'])

    # The `checkpoint_dir` parameter gets passed by Ray Tune when a checkpoint
    # should be restored.
    if checkpoint_dir:
        checkpoint = os.path.join(checkpoint_dir, "checkpoint")
#         model_state, optimizer_state = torch.load(checkpoint)
#         net.load_state_dict(model_state)
#         optimizer.load_state_dict(optimizer_state)
    
    
    # split train on train and validate
    if config.get('validation'):
        train = get_train()
        csv_dataset = CSVDataset(train)
        train_subset, val_subset = csv_dataset.get_splits(n_test=0.2)
    else:
        train = get_train()
        train_subset = CSVDataset(train)    
        validation = get_validation()
        val_subset = CSVDataset(validation)    
    
    
    trainloader = DataLoader( 
        train_subset,
        batch_size=int(config["batch_size"]),
        shuffle=True)
    
    valloader = DataLoader(
        val_subset,
        batch_size=int(config["batch_size"]),
        shuffle=True)
    
    for epoch in range(10):  # loop over the dataset multiple times
        running_loss = 0.0
        epoch_steps = 0
        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            epoch_steps += 1
            if i % 2000 == 1999:  # print every 10000 mini-batches
                print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1,
                                                running_loss / epoch_steps))
                running_loss = 0.0

        # Validation loss
        val_loss = 0.0
        val_steps = 0
        total = 0
        correct = 0
        l_labels = []
        l_predicted = []
        for i, data in enumerate(valloader, 0):
            with torch.no_grad():
                inputs, labels = data
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = net(inputs)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

                loss = criterion(outputs, labels)
                val_loss += loss.cpu().numpy()
                val_steps += 1
                #==============================       
                l_predicted += outputs[:,0].tolist()
                l_labels += labels[:,0].tolist()
                    
        df = pd.DataFrame({'label': l_labels, 'predict': l_predicted})        
        df.sort_values(by=['predict'], inplace=True, ascending=False)
        n = int(df.shape[0] * percent)
        n_total= sum (df.label.values)
        p = 100*sum(df.label.values[:n])/n_total

        # Here we save a checkpoint. It is automatically registered with
        # Ray Tune and will potentially be passed as the `checkpoint_dir`
        # parameter in future iterations.
        with tune.checkpoint_dir(step=epoch) as checkpoint_dir:
            path = os.path.join(checkpoint_dir, "checkpoint")
            torch.save(
                (net.state_dict(), optimizer.state_dict()), path)

        tune.report(loss=(val_loss / val_steps), accuracy=correct / total, p_5=p)
    print("Finished Training")

In [21]:
def test_accuracy(net, device="cpu"):
    train_subset = get_train()
    trainset = CSVDataset(train_subset)    
    
    validation = get_validation()
    testset = CSVDataset(validation) 

    testloader = torch.utils.data.DataLoader(
        testset, batch_size=4, shuffle=False, num_workers=2)

    correct = 0
    total = 0
    with torch.no_grad():
        for data in testloader:
            images, labels = data
            images, labels = images.to(device), labels.to(device)
            outputs = net(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return correct / total

In [22]:
def main(num_samples=10, max_num_epochs=10):
    
    config = {
        "l1": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        "l2": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([2, 4, 8, 16, 32, 64, 128]),
        "momentum": tune.choice([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
        'validation': False
    }
    scheduler = ASHAScheduler(
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2)
    result = tune.run(
        tune.with_parameters(train_nn, data_dir=data_dir),
        resources_per_trial={"cpu": 2},
        config=config,
        metric="loss",
        mode="min",
        num_samples=num_samples,
        scheduler=scheduler
    )

    best_trial = result.get_best_trial("loss", "min", "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))
    print("Best trial final validation accuracy: {}".format(
        best_trial.last_result["accuracy"]))

    best_trained_model = Net(best_trial.config["l1"], best_trial.config["l2"])
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if gpus_per_trial > 1:
            best_trained_model = nn.DataParallel(best_trained_model)
    best_trained_model.to(device)

    checkpoint_path = os.path.join(best_trial.checkpoint.value, "checkpoint")

    model_state, optimizer_state = torch.load(checkpoint_path)
    best_trained_model.load_state_dict(model_state)

    test_acc = test_accuracy(best_trained_model, device)
    print("Best trial test set accuracy: {}".format(test_acc))

In [None]:
# main()

In [78]:
# == Status ==
# Memory usage on this node: 10.8/31.2 GiB
# Using AsyncHyperBand: num_stopped=10 Bracket: Iter 8.000: -0.06649076655486842 | Iter 4.000: -0.06681835556443294 | Iter 2.000: -0.06733460283291567 | 
#                     Iter 1.000: -0.06842794980297343
# Resources requested: 0/8 CPUs, 0/0 GPUs, 0.0/14.84 GiB heap, 0.0/5.08 GiB objects
# Current best trial: ac8d2_00001 with loss=0.06682538182706561 and parameters={'l1': 32, 'l2': 4, 'lr': 0.026630467385660272, 'batch_size': 2, 'momentum': 0.1, 
#                                                                               'validation': False}
# Result logdir: /home/sergey/ray_results/inner_2021-02-02_11-26-05
# Number of trials: 10/10 (10 TERMINATED)
# Trial name	status	loc	batch_size	l1	l2	lr	momentum	iter	total time (s)	loss	accuracy	p_5
# inner_ac8d2_00000	TERMINATED		32	16	128	0.0176577	0.7	2	18.6769	0.0680945	31.5853	13.7748
# inner_ac8d2_00001	TERMINATED		2	32	4	0.0266305	0.1	10	872.449	0.0668254	1.97435	18.6755
# inner_ac8d2_00002	TERMINATED		16	128	16	0.000216925	0.8	1	15.3179	0.0703961	15.7943	4.76821
# inner_ac8d2_00003	TERMINATED		128	256	16	0.0128569	0.7	10	52.676	0.067173	126.325	18.6755
# inner_ac8d2_00004	TERMINATED		16	8	32	0.00180111	0.3	1	15.2749	0.0696522	15.7943	6.75497
# inner_ac8d2_00005	TERMINATED		32	32	128	0.0715262	0.9	1	9.41819	0.0678254	31.5853	13.1126
# inner_ac8d2_00006	TERMINATED		128	32	16	0.0135899	0.8	1	4.89001	0.0686044	126.325	7.41722
# inner_ac8d2_00007	TERMINATED		8	128	32	0.0808823	0.9	1	28.6921	0.0686833	7.89718	11.9205
# inner_ac8d2_00008	TERMINATED		128	16	4	0.000461718	0.1	1	4.76229	0.107292	126.325	4.23841
# inner_ac8d2_00009	TERMINATED		64	256	16	0.00907162	0.8	10	60.0316	0.0668777	63.1653	20.7947


# 2021-02-02 11:40:39,295	INFO tune.py:448 -- Total run time: 874.15 seconds (874.04 seconds for the tuning loop).
# Best trial config: {'l1': 32, 'l2': 4, 'lr': 0.026630467385660272, 'batch_size': 2, 'momentum': 0.1, 'validation': False}
# Best trial final validation loss: 0.06682538182706561
# Best trial final validation accuracy: 1.9743467772077061
# Best trial test set accuracy: 3.9486935544154123