In [1]:
import numpy as np
import pandas as pd
import utils
from experiment.experiment_utils import set_random_seed, load_data, build_data, grid_search, makedir, save_result, load_data_multitask, load_data_multitask_synthetic_label, save_task_label
from model import LogisticRegression
from pipeline.diffprep_flex_pipeline import DiffPrepFlexPipeline
from pipeline.diffprep_fix_pipeline import DiffPrepFixPipeline
import torch
import torch.nn as nn
from trainer.diffprep_trainer import DiffPrepSGD
from utils import SummaryWriter
from experiment.experiment_utils import min_max_normalize
from copy import deepcopy


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# import seaborn as sns
# old_df = pd.read_csv('./data/ada_prior/data_old.csv')
# #sns.distplot(old_df['hoursPerWeek'])
# #old_df['hoursPerWeek'] = (old_df['hoursPerWeek'] <= 40).map({True: 'Y', False: 'N'})
# old_df['label'] = old_df['label'].map({-1: 'N', 1: 'Y'})
# old_df.to_csv('./data/ada_prior/data.csv', index=False)

In [3]:
class DiffPrepExperiment(object):
    """Run auto prep with one set of hyper parameters.

    An instance bundles the fixed experiment configuration (data location,
    preprocessing search space, model, DiffPrep variant and the similarity
    threshold forwarded to the synthetic-label loader). Each call to ``run``
    trains one pipeline + model pair for one hyper-parameter dictionary.

    Attributes:
        data_dir: Directory passed to the data loader.
        dataset: Dataset name passed to the data loader.
        prep_space: Preprocessing search space handed to the pipeline class.
        model_name: Downstream model identifier; only ``"log"`` is supported.
        method: ``"diffprep_fix"`` or ``"diffprep_flex"``.
        similarity_threshold: Forwarded as ``similarity_threshold`` to
            ``load_data_multitask_synthetic_label``.
        generated_task: ``None`` until ``run`` is called, afterwards a dict
            ``{"X": X, "y": y}`` holding the data returned by the loader in
            the most recent ``run``.
    """

    # Configuration values that ``run`` knows how to handle.
    SUPPORTED_METHODS = ("diffprep_fix", "diffprep_flex")
    SUPPORTED_MODELS = ("log",)

    def __init__(self, data_dir, dataset, prep_space, model_name, method, similarity_threshold):
        self.data_dir = data_dir
        self.dataset = dataset
        self.prep_space = prep_space
        self.model_name = model_name
        self.method = method
        self.similarity_threshold = similarity_threshold
        # Filled in by ``run``; defined here so that reading it before the
        # first run yields None instead of raising AttributeError.
        self.generated_task = None

    def run(self, params, verbose=True):
        """Train one DiffPrep pipeline and model with the given parameters.

        Args:
            params: Hyper-parameter dict. Keys read directly here:
                ``split_seed``, ``temperature``, ``sample``, ``diff_method``,
                ``init_method``, ``device``, ``model_lr``, ``weight_decay``,
                ``momentum``, ``prep_lr`` and ``logging``. The dict is also
                passed on to ``set_random_seed`` and ``DiffPrepSGD``.
                NOTE: when ``method`` is ``"diffprep_flex"`` the dict is
                modified in place (``patience`` = 10, ``num_epochs`` = 3000).
            verbose: Currently unused; kept for interface compatibility.

        Returns:
            Tuple ``(result, best_model, logger)`` where ``result`` and
            ``best_model`` come from ``DiffPrepSGD.fit`` and ``logger`` is a
            ``SummaryWriter`` or ``None`` when ``params["logging"]`` is falsy.

        Raises:
            Exception: ``"Wrong auto prep method"`` or ``"Wrong model"`` when
                the configured method / model name is not supported.
        """
        # Fail fast on configuration errors, before any data is loaded or
        # any pipeline state is initialised.
        if self.method not in self.SUPPORTED_METHODS:
            raise Exception("Wrong auto prep method")
        if self.model_name not in self.SUPPORTED_MODELS:
            raise Exception("Wrong model")

        X, y = load_data_multitask_synthetic_label(
            self.data_dir, self.dataset, similarity_threshold=self.similarity_threshold
        )
        # Keep the loaded task around so callers can persist the labels.
        self.generated_task = {
            "X": X,
            "y": y
        }
        #X, y = load_data_multitask(self.data_dir, self.dataset)
        X_train, y_train, X_val, y_val, X_test, y_test = build_data(X, y, random_state=params["split_seed"])

        print("Dataset shapes: ", X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape)

        # pre norm for diffprep flex
        if self.method == "diffprep_flex":
            X_train, X_val, X_test = min_max_normalize(X_train, X_val, X_test)
            # Overrides whatever the caller supplied (in-place on ``params``).
            params["patience"] = 10
            params["num_epochs"] = 3000

        # set random seed
        set_random_seed(params)

        ## transform pipeline
        # define and fit first step
        if self.method == "diffprep_fix":
            prep_pipeline = DiffPrepFixPipeline(self.prep_space, temperature=params["temperature"],
                                                use_sample=params["sample"],
                                                diff_method=params["diff_method"],
                                                init_method=params["init_method"])
        elif self.method == "diffprep_flex":
            prep_pipeline = DiffPrepFlexPipeline(self.prep_space, temperature=params["temperature"],
                                                 use_sample=params["sample"],
                                                 diff_method=params["diff_method"],
                                                 init_method=params["init_method"])
        else:
            # Defensive: unreachable after the up-front validation.
            raise Exception("Wrong auto prep method")

        prep_pipeline.init_parameters(X_train, X_val, X_test)
        print("Train size: ({}, {})".format(X_train.shape[0], prep_pipeline.out_features))

        # model
        input_dim = prep_pipeline.out_features
        # One output unit per distinct label value in the full label set.
        output_dim = len(set(y.values.ravel()))

        # model = TwoLayerNet(input_dim, output_dim)
        # Seed again right before the model is built (presumably so that model
        # initialisation does not depend on the pipeline set-up above).
        set_random_seed(params)
        if self.model_name == "log":
            model = LogisticRegression(input_dim, output_dim)
        else:
            # Defensive: unreachable after the up-front validation.
            raise Exception("Wrong model")

        model = model.to(params["device"])

        # loss
        loss_fn = nn.CrossEntropyLoss()

        # optimizer
        model_optimizer = torch.optim.SGD(
            model.parameters(),
            lr=params["model_lr"],
            weight_decay=params["weight_decay"],
            momentum=params["momentum"]
        )

        # The pipeline falls back to the model learning rate when no
        # dedicated one is configured.
        if params["prep_lr"] is None:
            prep_lr = params["model_lr"]
        else:
            prep_lr = params["prep_lr"]

        prep_pipeline_optimizer = torch.optim.Adam(
            prep_pipeline.parameters(),
            lr=prep_lr,
            betas=(0.5, 0.999),
            weight_decay=params["weight_decay"]
        )

        # scheduler
        # model_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(model_optimizer, patience=patience, factor=0.1, threshold=0.001)
        prep_pipeline_scheduler = None
        model_scheduler = None

        if params["logging"]:
            logger = SummaryWriter()
        else:
            logger = None

        diff_prep = DiffPrepSGD(prep_pipeline, model, loss_fn, model_optimizer, prep_pipeline_optimizer,
                    model_scheduler, prep_pipeline_scheduler, params, writer=logger)

        result, best_model = diff_prep.fit(X_train, y_train, X_val, y_val, X_test, y_test)
        return result, best_model, logger

In [4]:
# from scipy.stats import pearsonr
# ada_df = pd.read_csv('./data/ada_prior/data.csv')
# ada_df
# y_rating = ada_df['label']

# X, y = load_data_multitask_synthetic_label("data", "ada_prior", 0.7)
# #(y == y_rating).mean()
# corrs = []
# for i in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
#     X, y = load_data_multitask_synthetic_label("data", "ada_prior", i)
#     corrs.append(pearsonr(y == 'Y', y_rating == 'Y').statistic)

In [5]:
def run_diffprep(data_dir, dataset, result_dir, prep_space, params, model_name, method):
    """Grid-search a DiffPrep experiment and persist its best outcome.

    Builds a ``DiffPrepExperiment`` using ``params["similarity_threshold"]``,
    runs ``grid_search`` on a deep copy of ``params``, saves the best result
    and the task labels generated by the experiment into ``result_dir``, and
    prints the best validation / test accuracy.

    Returns:
        Tuple ``(best_result, best_model, best_logger, best_params)`` as
        produced by ``grid_search``.
    """
    print("Dataset:", dataset, "Diff Method:", params["diff_method"], method)

    experiment = DiffPrepExperiment(
        data_dir,
        dataset,
        prep_space,
        model_name,
        method,
        similarity_threshold=params["similarity_threshold"],
    )

    # The search receives its own copy of the hyper parameters.
    search_params = deepcopy(params)
    best_result, best_model, best_logger, best_params = grid_search(experiment, search_params)

    save_result(best_result, best_model, best_logger, best_params, result_dir, save_model=True)
    # Store the labels the experiment loaded next to the results.
    task_labels = experiment.generated_task['y']
    save_task_label(task_labels, result_dir)

    val_acc = best_result["best_val_acc"]
    test_acc = best_result["best_test_acc"]
    print("DiffPrep Finished. val acc:", val_acc, "test acc", test_acc)
    return best_result, best_model, best_logger, best_params

In [6]:
import utils
from prep_space import space
from experiment.baseline_experiment import run_baseline
import os

# define hyper parameters
params = {
    "num_epochs": 2000,
    "batch_size": 512,
    "device": "cpu",
    #"model_lr": [0.1, 0.01, 0.001],
    "model_lr": 0.01,
    "weight_decay": 0,
    "model": 'log',
    "train_seed": 1,
    "split_seed": 1,
    "method": "diffprep_fix",
    "save_model": True,
    "logging": False,
    "no_crash": False,
    "patience": 3,
    "momentum": 0.9,
    "similarity_threshold": 0.1,
}

# DiffPrep-specific settings, merged into ``params`` below.
auto_prep_params = {
    "prep_lr": None,  # None -> DiffPrepExperiment.run falls back to model_lr
    "temperature": 0.1,
    "grad_clip": None,
    "pipeline_update_sample_size": 512,
    "init_method": "default",
    "diff_method": "num_diff",
    "sample": False
}

DATADIR = "data"

params.update(auto_prep_params)

datasets = sorted(os.listdir(DATADIR))
# NOTE(review): the output recorded for this cell reports dataset "ada_prior",
# not "house_prices" -- confirm which dataset this sweep is meant to use.
dataset = "house_prices"

print("Run {} on dataset {}".format(params["method"], dataset))

# Thresholds 0.0, 0.1, ..., 0.9. Built from integers because
# np.arange(0, 1, 0.1) accumulates floating-point error and yields values
# such as 0.30000000000000004.
sims = [step / 10 for step in range(10)]

# Outcome of every sweep point keyed by threshold, so earlier runs are not
# lost when ``best_*`` is rebound on the next iteration.
sweep_results = {}

for sim in sims:
    print(sim)
    params["similarity_threshold"] = sim
    #result_dir = utils.makedir(["result", params["method"], dataset, f'Rating_ground_truth'])
    result_dir = utils.makedir(["result", params["method"], dataset, f'label_{round(params["similarity_threshold"], 2)}'])

    if params["method"] in ["diffprep_fix", "diffprep_flex"]:
        best_result, best_model, best_logger, best_params = run_diffprep(DATADIR, dataset, result_dir, space, params, params["model"], params["method"])
    else:
        best_result, best_model, best_logger, best_params = run_baseline(DATADIR, dataset, result_dir, space, params, params["model"], params["method"])

    sweep_results[sim] = (best_result, best_model, best_logger, best_params)

Run diffprep_fix on dataset ada_prior
0.0
Dataset: ada_prior Diff Method: num_diff diffprep_fix
Model lr 0.01
Dataset shapes:  (2738, 13) torch.Size([2738]) (912, 13) torch.Size([912]) (912, 13) torch.Size([912])
Train size: (2738, 99)


 40%|████      | 800/2000 [02:45<04:08,  4.82it/s, next_eval_time=20s, tr_loss=0.758, val_loss=0.933]


DiffPrep Finished. val acc: 0.5230263157894737 test acc 0.5076754385964912
0.1
Dataset: ada_prior Diff Method: num_diff diffprep_fix
Model lr 0.01
Dataset shapes:  (2738, 13) torch.Size([2738]) (912, 13) torch.Size([912]) (912, 13) torch.Size([912])
Train size: (2738, 99)


 15%|█▌        | 300/2000 [01:01<05:50,  4.85it/s, next_eval_time=20s, tr_loss=3.89, val_loss=6.28] 


DiffPrep Finished. val acc: 0.4967105263157895 test acc 0.5296052631578947
0.2
Dataset: ada_prior Diff Method: num_diff diffprep_fix
Model lr 0.01
Dataset shapes:  (2738, 13) torch.Size([2738]) (912, 13) torch.Size([912]) (912, 13) torch.Size([912])
Train size: (2738, 99)


 20%|██        | 400/2000 [01:22<05:30,  4.84it/s, next_eval_time=20s, tr_loss=1.68, val_loss=1.03]  


DiffPrep Finished. val acc: 0.5153508771929824 test acc 0.5296052631578947
0.30000000000000004
Dataset: ada_prior Diff Method: num_diff diffprep_fix
Model lr 0.01
Dataset shapes:  (2738, 13) torch.Size([2738]) (912, 13) torch.Size([912]) (912, 13) torch.Size([912])
Train size: (2738, 99)


 20%|██        | 400/2000 [01:22<05:31,  4.82it/s, next_eval_time=20s, tr_loss=2.75, val_loss=0.891]


DiffPrep Finished. val acc: 0.5592105263157895 test acc 0.5668859649122807
0.4
Dataset: ada_prior Diff Method: num_diff diffprep_fix
Model lr 0.01
Dataset shapes:  (2738, 13) torch.Size([2738]) (912, 13) torch.Size([912]) (912, 13) torch.Size([912])
Train size: (2738, 99)


 65%|██████▌   | 1300/2000 [04:28<02:24,  4.84it/s, next_eval_time=21s, tr_loss=0.69, val_loss=0.653] 


DiffPrep Finished. val acc: 0.6557017543859649 test acc 0.6337719298245614
0.5
Dataset: ada_prior Diff Method: num_diff diffprep_fix
Model lr 0.01
Dataset shapes:  (2738, 13) torch.Size([2738]) (912, 13) torch.Size([912]) (912, 13) torch.Size([912])
Train size: (2738, 99)


 40%|████      | 800/2000 [02:39<03:58,  5.02it/s, next_eval_time=19s, tr_loss=0.803, val_loss=0.792]


DiffPrep Finished. val acc: 0.6754385964912281 test acc 0.6578947368421053
0.6000000000000001
Dataset: ada_prior Diff Method: num_diff diffprep_fix
Model lr 0.01
Dataset shapes:  (2738, 13) torch.Size([2738]) (912, 13) torch.Size([912]) (912, 13) torch.Size([912])
Train size: (2738, 99)


 20%|██        | 400/2000 [01:19<05:16,  5.05it/s, next_eval_time=19s, tr_loss=2.62, val_loss=1.81]  


DiffPrep Finished. val acc: 0.6732456140350878 test acc 0.6754385964912281
0.7000000000000001
Dataset: ada_prior Diff Method: num_diff diffprep_fix
Model lr 0.01
Dataset shapes:  (2738, 13) torch.Size([2738]) (912, 13) torch.Size([912]) (912, 13) torch.Size([912])
Train size: (2738, 99)


 25%|██▌       | 500/2000 [01:38<04:55,  5.08it/s, next_eval_time=19s, tr_loss=2.14, val_loss=3.68]  


DiffPrep Finished. val acc: 0.7291666666666666 test acc 0.7138157894736842
0.8
Dataset: ada_prior Diff Method: num_diff diffprep_fix
Model lr 0.01
Dataset shapes:  (2738, 13) torch.Size([2738]) (912, 13) torch.Size([912]) (912, 13) torch.Size([912])
Train size: (2738, 99)


 65%|██████▌   | 1300/2000 [04:16<02:18,  5.07it/s, next_eval_time=19s, tr_loss=0.483, val_loss=0.502]


DiffPrep Finished. val acc: 0.7730263157894737 test acc 0.7576754385964912
0.9
Dataset: ada_prior Diff Method: num_diff diffprep_fix
Model lr 0.01
Dataset shapes:  (2738, 13) torch.Size([2738]) (912, 13) torch.Size([912]) (912, 13) torch.Size([912])
Train size: (2738, 99)


 30%|███       | 600/2000 [02:00<04:40,  4.98it/s, next_eval_time=19s, tr_loss=0.685, val_loss=0.615]


DiffPrep Finished. val acc: 0.7916666666666666 test acc 0.8146929824561403


In [31]:
# Save the most recent best_* objects under result/<method>/<dataset>/hoursPerWeek.
result_dir = utils.makedir(
    ["result", params["method"], dataset, "hoursPerWeek"]
)
save_result(
    best_result, best_model, best_logger, best_params,
    result_dir, save_model=True,
)

In [29]:
# label
# Print each entry name in best_model['prep_pipeline'] together with its shape.
for md, entry in best_model['prep_pipeline'].items():
    print(md, entry.shape)

pipeline.0.num_tf_prob_logits torch.Size([5, 5])
pipeline.0.cat_tf_prob_logits torch.Size([94, 2])
pipeline.1.tf_prob_logits torch.Size([99, 4])
pipeline.2.tf_prob_logits torch.Size([99, 10])
pipeline.3.tf_prob_logits torch.Size([99, 7])


In [32]:
# hoursPerWeek
# Print each entry name in best_model['prep_pipeline'] together with its shape.
for md, entry in best_model['prep_pipeline'].items():
    print(md, entry.shape)

pipeline.0.num_tf_prob_logits torch.Size([5, 5])
pipeline.0.cat_tf_prob_logits torch.Size([94, 2])
pipeline.1.tf_prob_logits torch.Size([99, 4])
pipeline.2.tf_prob_logits torch.Size([99, 10])
pipeline.3.tf_prob_logits torch.Size([99, 7])


## Next steps:

1. Load the transfer datasets
2. Train using diffprep fix on one task
3. Create a new diffprep fix pipeline where the pipeline is set to the one we got above, and is frozen
4. Train the same model arch on the new task, and see accuracy etc.
5. Train the same model arch on the new task with a fresh diffprep pipeline, and see accuracy etc.
6. Compare the tau matrices and the operations