In [1]:
import numpy as np
import pandas as pd
import utils
from experiment.experiment_utils import set_random_seed, load_data, build_data, grid_search, makedir, save_result, load_data_multitask
from model import LogisticRegression
from pipeline.diffprep_flex_pipeline import DiffPrepFlexPipeline
from pipeline.diffprep_fix_pipeline import DiffPrepFixPipeline
import torch
import torch.nn as nn
from trainer.diffprep_trainer import DiffPrepSGD
from utils import SummaryWriter
from experiment.experiment_utils import min_max_normalize
from copy import deepcopy

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
def load_prep_pipeline(path, prep_space, params, data_dir, dataset):
    """Rebuild a ``DiffPrepFixPipeline`` and restore its saved parameters.

    The data set is reloaded and split with ``params["split_seed"]`` so that
    the pipeline can be initialised and fitted on the training features
    before it is returned.

    Args:
        path: Location of the saved pipeline state dict.
        prep_space: Preprocessing search space given to the pipeline constructor.
        params: Hyper-parameter dict. Reads ``"split_seed"``, ``"temperature"``,
            ``"diff_method"``, ``"init_method"`` and, if present, ``"device"``.
        data_dir: Root directory passed to ``load_data_multitask``.
        dataset: Data set name passed to ``load_data_multitask``.

    Returns:
        The pipeline with the loaded state dict, fitted on the training split.
    """
    X, y = load_data_multitask(data_dir, dataset)
    # Only the feature splits are needed here; the label splits are discarded.
    X_train, _, X_val, _, X_test, _ = build_data(X, y, random_state=params["split_seed"])

    prep_pipeline = DiffPrepFixPipeline(prep_space, temperature=params["temperature"],
                                use_sample=False,
                                diff_method=params["diff_method"],
                                init_method=params["init_method"])
    prep_pipeline.init_parameters(X_train, X_val, X_test)

    # SECURITY: torch.load unpickles the file - only load checkpoints you trust.
    # map_location allows a checkpoint written on another device (e.g. a GPU)
    # to be restored on the machine running this notebook.
    state_dict = torch.load(path, map_location=params.get("device", "cpu"))
    prep_pipeline.load_state_dict(state_dict)

    # NOTE(review): the flag is raised *before* fit() here, while
    # DiffPrepExperiment.run raises it after fit(). Order kept as-is because
    # fit()'s handling of the flag is not visible from this notebook - confirm.
    prep_pipeline.is_fitted = True
    prep_pipeline.fit(X_train)

    return prep_pipeline

In [17]:
class DiffPrepExperiment(object):
    """Run auto prep with one set of hyper parameters.

    The experiment loads a data set, builds the preprocessing pipeline,
    creates the downstream model and trains it with ``DiffPrepSGD``.

    Args:
        data_dir: Root directory passed to ``load_data_multitask``.
        dataset: Data set name passed to ``load_data_multitask``.
        prep_space: Preprocessing search space given to the pipeline constructor.
        model_name: Downstream model identifier; only ``"log"`` is accepted.
        method: ``"diffprep_fix"`` or ``"diffprep_flex"``.
        fixed_pipeline_path: Optional path to a saved pipeline state dict. When
            set, a ``DiffPrepFixPipeline`` is restored from it and ``method``
            no longer selects the pipeline class.
    """

    def __init__(self, data_dir, dataset, prep_space, model_name, method, fixed_pipeline_path=None):
        self.data_dir = data_dir
        self.dataset = dataset
        self.prep_space = prep_space
        self.model_name = model_name
        self.method = method
        self.fixed_pipeline_path = fixed_pipeline_path

    def _build_prep_pipeline(self, params, X_train, X_val, X_test):
        """Create (and, for a fixed pipeline, restore and fit) the prep pipeline."""
        if self.fixed_pipeline_path:
            # A restored pipeline is always the "fix" variant without sampling.
            pipeline_cls, use_sample = DiffPrepFixPipeline, False
        elif self.method == "diffprep_fix":
            pipeline_cls, use_sample = DiffPrepFixPipeline, params["sample"]
        elif self.method == "diffprep_flex":
            pipeline_cls, use_sample = DiffPrepFlexPipeline, params["sample"]
        else:
            raise ValueError("Wrong auto prep method: {!r}".format(self.method))

        prep_pipeline = pipeline_cls(self.prep_space, temperature=params["temperature"],
                                     use_sample=use_sample,
                                     diff_method=params["diff_method"],
                                     init_method=params["init_method"])
        prep_pipeline.init_parameters(X_train, X_val, X_test)

        if self.fixed_pipeline_path:
            # SECURITY: torch.load unpickles the file - only load trusted checkpoints.
            # map_location lets a checkpoint saved on another device be restored
            # on the device this experiment is configured for.
            state_dict = torch.load(self.fixed_pipeline_path, map_location=params["device"])
            prep_pipeline.load_state_dict(state_dict)
            prep_pipeline.fit(X_train)
            prep_pipeline.is_fitted = True

        return prep_pipeline

    def run(self, params, verbose=True):
        """Train the downstream model once with the given hyper-parameters.

        Args:
            params: Hyper-parameter dict. NOTE: when ``method`` is
                ``"diffprep_flex"`` the ``"patience"`` and ``"num_epochs"``
                entries are overwritten in place, so the caller's dict changes.
            verbose: Currently not consulted by this method.

        Returns:
            ``(result, best_model, logger)``. ``result`` and ``best_model`` are
            what ``DiffPrepSGD.fit`` returns; ``logger`` is the
            ``SummaryWriter`` or ``None`` when ``params["logging"]`` is falsy.

        Raises:
            ValueError: If ``method`` or ``model_name`` is not supported.
        """
        X, y = load_data_multitask(self.data_dir, self.dataset)
        X_train, y_train, X_val, y_val, X_test, y_test = build_data(X, y, random_state=params["split_seed"])

        print("Dataset shapes: ", X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape)

        # pre norm for diffprep flex
        if self.method == "diffprep_flex":
            X_train, X_val, X_test = min_max_normalize(X_train, X_val, X_test)
            params["patience"] = 10
            params["num_epochs"] = 3000

        # Seed before the pipeline is created ...
        set_random_seed(params)
        prep_pipeline = self._build_prep_pipeline(params, X_train, X_val, X_test)
        print("Train size: ({}, {})".format(X_train.shape[0], prep_pipeline.out_features))

        # model: one output per distinct label value in the full label set
        input_dim = prep_pipeline.out_features
        output_dim = len(set(y.values.ravel()))

        # ... and again right before the model is created.
        set_random_seed(params)
        if self.model_name == "log":
            model = LogisticRegression(input_dim, output_dim)
        else:
            raise ValueError("Wrong model: {!r}".format(self.model_name))

        model = model.to(params["device"])

        # loss
        loss_fn = nn.CrossEntropyLoss()

        # optimizer
        model_optimizer = torch.optim.SGD(
            model.parameters(),
            lr=params["model_lr"],
            weight_decay=params["weight_decay"],
            momentum=params["momentum"]
        )

        # The trainer is created with train_pipeline=False, so no pipeline
        # optimizer is built; no learning-rate schedulers are used either.
        prep_pipeline_optimizer = None
        prep_pipeline_scheduler = None
        model_scheduler = None

        logger = SummaryWriter() if params["logging"] else None

        diff_prep = DiffPrepSGD(prep_pipeline, model, loss_fn, model_optimizer, prep_pipeline_optimizer,
                    model_scheduler, prep_pipeline_scheduler, params, writer=logger, train_pipeline=False)

        result, best_model = diff_prep.fit(X_train, y_train, X_val, y_val, X_test, y_test)
        return result, best_model, logger

In [18]:
def run_diffprep(data_dir, dataset, result_dir, prep_space, params, model_name, method, prep_pipeline_price_path):
    """Grid-search a ``DiffPrepExperiment`` and save the best outcome.

    Args:
        data_dir: Root directory of the data sets.
        dataset: Name of the data set to run on.
        result_dir: Directory handed to ``save_result``.
        prep_space: Preprocessing search space for the experiment.
        params: Hyper-parameter dict; ``"diff_method"`` is read for the banner.
        model_name: Downstream model identifier for the experiment.
        method: Auto-prep method name for the experiment.
        prep_pipeline_price_path: Path forwarded to the experiment as
            ``fixed_pipeline_path``.

    Returns:
        ``(best_result, best_model, best_logger, best_params)`` as returned by
        ``grid_search``.
    """
    print("Dataset:", dataset, "Diff Method:", params["diff_method"], method)

    diff_prep_exp = DiffPrepExperiment(data_dir, dataset, prep_space, model_name, method,
                                       fixed_pipeline_path=prep_pipeline_price_path)
    # The search works on a deep copy, so changes made to the dict during the
    # search do not reach the caller's ``params``.
    best_result, best_model, best_logger, best_params = grid_search(diff_prep_exp, deepcopy(params))
    save_result(best_result, best_model, best_logger, best_params, result_dir, save_model=True)
    print("DiffPrep Finished. val acc:", best_result["best_val_acc"], "test acc", best_result["best_test_acc"])
    return best_result, best_model, best_logger, best_params

In [27]:
# Configuration cell: builds the hyper-parameter dict, picks the data set and
# creates the result directory used by the cells below.
import utils
from prep_space import space
from experiment.baseline_experiment import run_baseline
import os

# define hyper parameters
params = {
    # "num_epochs" and "patience" are overwritten by DiffPrepExperiment.run
    # when the method is "diffprep_flex".
    "num_epochs": 2000,
    "batch_size": 512,
    # Device the downstream model is moved to.
    "device": "cpu",
    # Alternative kept for reference: a list of candidate learning rates
    # (presumably swept by grid_search - TODO confirm).
    #"model_lr": [0.1, 0.01, 0.001],
    # "model_lr", "weight_decay" and "momentum" configure the SGD optimizer.
    "model_lr": 0.01,
    "weight_decay": 0,
    # Passed on as the model name; DiffPrepExperiment only accepts 'log'.
    "model": 'log',
    "train_seed": 1,
    # random_state used by build_data for the train/val/test split.
    "split_seed": 1,
    "method": "diffprep_fix",
    "save_model": True,
    # When truthy, DiffPrepExperiment.run creates a SummaryWriter.
    "logging": False,
    "no_crash": False,
    "patience": 3,
    "momentum": 0.9
}

# Settings specific to the auto-prep pipeline; "temperature", "init_method" and
# "diff_method" are forwarded to the pipeline constructor.
auto_prep_params = {
    "prep_lr": None,
    "temperature": 0.1,
    "grad_clip": None,
    "pipeline_update_sample_size": 512,
    "init_method": "default",
    "diff_method": "num_diff",
    "sample": False
}

DATADIR = "data"

# Merge the auto-prep settings into the main dict (in place).
params.update(auto_prep_params)

# Sorted directory listing of DATADIR; not referenced by the cells visible here.
datasets = sorted(os.listdir(DATADIR))
dataset = "Airbnb"

print("Run {} on dataset {}".format(params["method"], dataset))

# Result location built from the parts result/<method>/<dataset>.
result_dir = utils.makedir(["result", params["method"], dataset])

Run diffprep_fix on dataset Airbnb


In [28]:
# Saved preprocessing-pipeline state dict that is handed to the runner below.
prep_pipeline_price_path = './result/diffprep_fix/Airbnb/Price/prep_pipeline.pth'

# Anything that is not a DiffPrep variant is dispatched to the baseline runner;
# both runners receive the same positional arguments.
if params["method"] not in ("diffprep_fix", "diffprep_flex"):
    best_result, best_model, best_logger, best_params = run_baseline(
        DATADIR, dataset, result_dir, space, params,
        params["model"], params["method"], prep_pipeline_price_path,
    )
else:
    best_result, best_model, best_logger, best_params = run_diffprep(
        DATADIR, dataset, result_dir, space, params,
        params["model"], params["method"], prep_pipeline_price_path,
    )

Dataset: Airbnb Diff Method: num_diff diffprep_fix
Model lr 0.01
Dataset shapes:  (600, 38) torch.Size([600]) (200, 38) torch.Size([200]) (200, 38) torch.Size([200])
Train size: (600, 94)


 20%|██        | 400/2000 [00:13<00:54, 29.51it/s, next_eval_time=3s, tr_loss=0.545, val_loss=0.764]

DiffPrep Finished. val acc: 0.69 test acc 0.63





In [31]:
# Display the preprocessing-pipeline state dict stored under the
# 'prep_pipeline' key of the best model returned by the run above.
best_model['prep_pipeline']

OrderedDict([('pipeline.0.num_tf_prob_logits',
              tensor([[-0.5294, -2.2761, -2.2764, -1.9794, -2.2728],
                      [-0.7023, -2.0379, -2.0969, -2.1204, -2.1318],
                      [-0.7024, -2.0996, -2.0831, -2.0956, -2.0196],
                      [-0.7606, -2.1473, -2.1223, -2.1562, -2.1031],
                      [-0.6643, -2.0510, -2.0510, -2.0510, -2.1340],
                      [-0.8341, -2.2191, -2.2191, -2.2191, -2.1100],
                      [-0.6931, -2.0794, -2.0794, -2.0794, -2.0794],
                      [-0.6931, -2.0794, -2.0794, -2.0794, -2.0794],
                      [-0.6931, -2.0794, -2.0794, -2.0794, -2.0794],
                      [-0.6931, -2.0794, -2.0794, -2.0794, -2.0794],
                      [-0.6931, -2.0794, -2.0794, -2.0794, -2.0794],
                      [-0.6931, -2.0794, -2.0794, -2.0794, -2.0794],
                      [-0.6982, -2.0845, -2.0845, -2.0845, -1.8638],
                      [-0.4386, -1.8255, -1.8255, -1.825