In [1]:
import numpy as np
import pandas as pd
import utils
from experiment.experiment_utils import set_random_seed, load_data, build_data, grid_search, makedir, save_result, load_data_multitask, load_data_multitask_synthetic_label, save_task_label
from model import LogisticRegression
from pipeline.diffprep_flex_pipeline import DiffPrepFlexPipeline
from pipeline.diffprep_fix_pipeline import DiffPrepFixPipeline
import torch
import torch.nn as nn
from trainer.diffprep_trainer import DiffPrepSGD
from utils import SummaryWriter
from experiment.experiment_utils import min_max_normalize
from copy import deepcopy


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# import seaborn as sns
# old_df = pd.read_csv('./data/ada_prior/data_old.csv')
# #sns.distplot(old_df['hoursPerWeek'])
# #old_df['hoursPerWeek'] = (old_df['hoursPerWeek'] <= 40).map({True: 'Y', False: 'N'})
# old_df['label'] = old_df['label'].map({-1: 'N', 1: 'Y'})
# old_df.to_csv('./data/ada_prior/data.csv', index=False)

In [3]:
class DiffPrepExperiment(object):
    """Run DiffPrep auto-preprocessing with one set of hyper parameters.

    The constructor only stores the experiment description; all work happens in
    :meth:`run`, which loads the dataset, builds the preprocessing pipeline and
    the downstream model, and trains both with ``DiffPrepSGD``.

    Attributes:
        data_dir: Forwarded to ``load_data_multitask_synthetic_label``.
        dataset: Dataset name, forwarded to the same loader.
        prep_space: Preprocessing search space handed to the pipeline class.
        model_name: Downstream model identifier; only ``"log"`` is supported.
        method: ``"diffprep_fix"`` or ``"diffprep_flex"``.
        similarity_threshold: Forwarded to the synthetic-label loader.
        generated_task: Set by :meth:`run` to ``{"X": X, "y": y}`` as returned
            by the loader; not defined before the first call to :meth:`run`.
    """

    # Values `run` knows how to handle. They are checked before any work starts
    # so that a typo in the configuration fails immediately instead of after the
    # dataset has been loaded and the pipeline parameters initialised.
    _SUPPORTED_METHODS = ("diffprep_fix", "diffprep_flex")
    _SUPPORTED_MODELS = ("log",)

    def __init__(self, data_dir, dataset, prep_space, model_name, method, similarity_threshold):
        self.data_dir = data_dir
        self.dataset = dataset
        self.prep_space = prep_space
        self.model_name = model_name
        self.method = method
        self.similarity_threshold = similarity_threshold

    def _check_config(self):
        """Raise ``ValueError`` if ``method`` or ``model_name`` is unsupported."""
        if self.method not in self._SUPPORTED_METHODS:
            raise ValueError("Wrong auto prep method: {!r}".format(self.method))
        if self.model_name not in self._SUPPORTED_MODELS:
            raise ValueError("Wrong model: {!r}".format(self.model_name))

    def run(self, params, verbose=True):
        """Train the preprocessing pipeline and the model once.

        Args:
            params: Hyper-parameter dict. Keys read here: ``split_seed``,
                ``temperature``, ``sample``, ``diff_method``, ``init_method``,
                ``device``, ``model_lr``, ``weight_decay``, ``momentum``,
                ``prep_lr`` (``None`` means "use ``model_lr``") and ``logging``.
                The dict is also passed whole to ``set_random_seed`` and
                ``DiffPrepSGD``. NOTE: for ``method == "diffprep_flex"`` the
                keys ``patience`` and ``num_epochs`` are overwritten in the
                caller's dict.
            verbose: Accepted for interface compatibility; currently unused
                (the progress prints below are unconditional).

        Returns:
            ``(result, best_model, logger)`` where ``result`` and ``best_model``
            come from ``DiffPrepSGD.fit`` and ``logger`` is a ``SummaryWriter``
            or ``None`` when ``params["logging"]`` is falsy.

        Raises:
            ValueError: If ``method`` or ``model_name`` is not supported.
        """
        # Fail fast: nothing below is worth doing with an invalid configuration.
        self._check_config()

        X, y = load_data_multitask_synthetic_label(
            self.data_dir, self.dataset, similarity_threshold=self.similarity_threshold
        )
        # Keep the generated task around so callers can persist the labels.
        self.generated_task = {
            "X": X,
            "y": y
        }
        X_train, y_train, X_val, y_val, X_test, y_test = build_data(X, y, random_state=params["split_seed"])

        print("Dataset shapes: ", X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape)

        # pre norm for diffprep flex
        if self.method == "diffprep_flex":
            X_train, X_val, X_test = min_max_normalize(X_train, X_val, X_test)
            # These overrides are written into the caller's dict on purpose-preserving
            # terms: the original code did the same, so whatever the caller reports
            # afterwards reflects the values actually used for training.
            params["patience"] = 10
            params["num_epochs"] = 3000

        # set random seed
        set_random_seed(params)

        ## transform pipeline
        # Both pipeline classes take the same constructor arguments, so only the
        # class differs between the two methods (validated above).
        if self.method == "diffprep_fix":
            pipeline_cls = DiffPrepFixPipeline
        else:
            pipeline_cls = DiffPrepFlexPipeline
        prep_pipeline = pipeline_cls(
            self.prep_space,
            temperature=params["temperature"],
            use_sample=params["sample"],
            diff_method=params["diff_method"],
            init_method=params["init_method"],
        )

        prep_pipeline.init_parameters(X_train, X_val, X_test)
        print("Train size: ({}, {})".format(X_train.shape[0], prep_pipeline.out_features))

        # model
        input_dim = prep_pipeline.out_features
        # One output unit per distinct label value in the full label set.
        output_dim = len(set(y.values.ravel()))

        # Re-seed right before the model is created (kept from the original;
        # presumably so model initialisation does not depend on how much
        # randomness the pipeline setup consumed -- TODO confirm).
        set_random_seed(params)
        # `model_name` was validated above; "log" is the only supported model.
        model = LogisticRegression(input_dim, output_dim)

        model = model.to(params["device"])

        # loss
        loss_fn = nn.CrossEntropyLoss()

        # optimizer
        model_optimizer = torch.optim.SGD(
            model.parameters(),
            lr=params["model_lr"],
            weight_decay=params["weight_decay"],
            momentum=params["momentum"]
        )

        # The pipeline shares the model learning rate unless one is given.
        if params["prep_lr"] is None:
            prep_lr = params["model_lr"]
        else:
            prep_lr = params["prep_lr"]

        prep_pipeline_optimizer = torch.optim.Adam(
            prep_pipeline.parameters(),
            lr=prep_lr,
            betas=(0.5, 0.999),
            weight_decay=params["weight_decay"]
        )

        # scheduler: none is used for either optimizer.
        prep_pipeline_scheduler = None
        model_scheduler = None

        if params["logging"]:
            logger = SummaryWriter()
        else:
            logger = None

        diff_prep = DiffPrepSGD(prep_pipeline, model, loss_fn, model_optimizer, prep_pipeline_optimizer,
                    model_scheduler, prep_pipeline_scheduler, params, writer=logger)

        result, best_model = diff_prep.fit(X_train, y_train, X_val, y_val, X_test, y_test)
        return result, best_model, logger

In [4]:
# from scipy.stats import pearsonr
# ada_df = pd.read_csv('./data/ada_prior/data.csv')
# ada_df
# y_rating = ada_df['label']

# X, y = load_data_multitask_synthetic_label("data", "ada_prior", 0.7)
# #(y == y_rating).mean()
# corrs = []
# for i in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
#     X, y = load_data_multitask_synthetic_label("data", "ada_prior", i)
#     corrs.append(pearsonr(y == 'Y', y_rating == 'Y').statistic)

In [5]:
def run_diffprep(data_dir, dataset, result_dir, prep_space, params, model_name, method):
    """Grid-search one DiffPrep experiment and persist its best outcome.

    Builds a ``DiffPrepExperiment`` (the similarity threshold is read from
    ``params["similarity_threshold"]``), hands it to ``grid_search`` together
    with a deep copy of ``params``, then writes the best result and the
    experiment's generated task labels into ``result_dir``.

    Returns:
        The tuple ``(best_result, best_model, best_logger, best_params)`` as
        unpacked from ``grid_search``.
    """
    print("Dataset:", dataset, "Diff Method:", params["diff_method"], method)

    experiment = DiffPrepExperiment(
        data_dir,
        dataset,
        prep_space,
        model_name,
        method,
        similarity_threshold=params["similarity_threshold"],
    )

    # The search gets its own copy so the caller's dict is not handed over.
    search_params = deepcopy(params)
    search_outcome = grid_search(experiment, search_params)
    best_result, best_model, best_logger, best_params = search_outcome

    save_result(best_result, best_model, best_logger, best_params, result_dir, save_model=True)
    # `generated_task` is populated by the experiment's `run` method.
    task_labels = experiment.generated_task['y']
    save_task_label(task_labels, result_dir)

    print("DiffPrep Finished. val acc:", best_result["best_val_acc"], "test acc", best_result["best_test_acc"])
    return best_result, best_model, best_logger, best_params

In [6]:
import utils
from prep_space import space
from experiment.baseline_experiment import run_baseline
import os

# Hyper parameters shared by every run of the sweep below.
params = {
    "num_epochs": 2000,
    "batch_size": 512,
    "device": "cpu",
    # Alternative kept for reference: "model_lr": [0.1, 0.01, 0.001]
    # (presumably a list makes grid_search sweep the values -- TODO confirm).
    "model_lr": 0.01,
    "weight_decay": 0,
    "model": 'log',
    "train_seed": 1,
    "split_seed": 1,
    "method": "diffprep_fix",
    "save_model": True,
    "logging": False,
    "no_crash": False,
    "patience": 3,
    "momentum": 0.9,
    "similarity_threshold": 0.1,  # overwritten on every iteration of the sweep
}

# Settings specific to the DiffPrep pipeline; merged into `params` below.
auto_prep_params = {
    "prep_lr": None,  # None -> the pipeline optimizer reuses "model_lr"
    "temperature": 0.1,
    "grad_clip": None,
    "pipeline_update_sample_size": 512,
    "init_method": "default",
    "diff_method": "num_diff",
    "sample": False
}

DATADIR = "data"

params.update(auto_prep_params)

datasets = sorted(os.listdir(DATADIR))
dataset = "house_prices"

print("Run {} on dataset {}".format(params["method"], dataset))

# Similarity thresholds 0.0, 0.1, ..., 0.9.
# Built from integer steps rather than np.arange(0, 1, 0.1): arange with a
# fractional step accumulates rounding error, which previously fed thresholds
# such as 0.30000000000000004 and 0.7000000000000001 into the label generator.
sims = [step / 10 for step in range(10)]

for sim in sims:
    print(sim)
    params["similarity_threshold"] = sim
    # One result folder per threshold, e.g. result/<method>/<dataset>/label_0.3
    result_dir = utils.makedir(["result", params["method"], dataset, f'label_{round(params["similarity_threshold"], 2)}'])

    # NOTE: best_* are overwritten on every iteration; after the loop they hold
    # the outcome of the last threshold only.
    if params["method"] in ["diffprep_fix", "diffprep_flex"]:
        best_result, best_model, best_logger, best_params = run_diffprep(DATADIR, dataset, result_dir, space, params, params["model"], params["method"])
    else:
        best_result, best_model, best_logger, best_params = run_baseline(DATADIR, dataset, result_dir, space, params, params["model"], params["method"])

Run diffprep_fix on dataset house_prices
0.0
Dataset: house_prices Diff Method: num_diff diffprep_fix
Model lr 0.01
Dataset shapes:  (876, 80) torch.Size([876]) (292, 80) torch.Size([292]) (292, 80) torch.Size([292])
Train size: (876, 295)


 15%|█▌        | 300/2000 [01:49<10:19,  2.74it/s, next_eval_time=35s, tr_loss=8.22, val_loss=12.1]


DiffPrep Finished. val acc: 0.4486301369863014 test acc 0.523972602739726
0.1
Dataset: house_prices Diff Method: num_diff diffprep_fix
Model lr 0.01
Dataset shapes:  (876, 80) torch.Size([876]) (292, 80) torch.Size([292]) (292, 80) torch.Size([292])
Train size: (876, 295)


 15%|█▌        | 300/2000 [01:47<10:10,  2.78it/s, next_eval_time=35s, tr_loss=4.92, val_loss=9]   


DiffPrep Finished. val acc: 0.4965753424657534 test acc 0.5513698630136986
0.2
Dataset: house_prices Diff Method: num_diff diffprep_fix
Model lr 0.01
Dataset shapes:  (876, 80) torch.Size([876]) (292, 80) torch.Size([292]) (292, 80) torch.Size([292])
Train size: (876, 295)


 20%|██        | 400/2000 [02:26<09:44,  2.74it/s, next_eval_time=35s, tr_loss=11.6, val_loss=18.6] 


DiffPrep Finished. val acc: 0.5753424657534246 test acc 0.5034246575342466
0.30000000000000004
Dataset: house_prices Diff Method: num_diff diffprep_fix
Model lr 0.01
Dataset shapes:  (876, 80) torch.Size([876]) (292, 80) torch.Size([292]) (292, 80) torch.Size([292])
Train size: (876, 295)


 20%|██        | 400/2000 [02:26<09:44,  2.74it/s, next_eval_time=36s, tr_loss=7, val_loss=11.7]    


DiffPrep Finished. val acc: 0.6061643835616438 test acc 0.5547945205479452
0.4
Dataset: house_prices Diff Method: num_diff diffprep_fix
Model lr 0.01
Dataset shapes:  (876, 80) torch.Size([876]) (292, 80) torch.Size([292]) (292, 80) torch.Size([292])
Train size: (876, 295)


 25%|██▌       | 500/2000 [03:04<09:13,  2.71it/s, next_eval_time=37s, tr_loss=2.1, val_loss=2.07]   


DiffPrep Finished. val acc: 0.6027397260273972 test acc 0.6267123287671232
0.5
Dataset: house_prices Diff Method: num_diff diffprep_fix
Model lr 0.01
Dataset shapes:  (876, 80) torch.Size([876]) (292, 80) torch.Size([292]) (292, 80) torch.Size([292])
Train size: (876, 295)


 20%|██        | 400/2000 [02:27<09:50,  2.71it/s, next_eval_time=36s, tr_loss=1.92, val_loss=2.4]   


DiffPrep Finished. val acc: 0.6438356164383562 test acc 0.5993150684931506
0.6000000000000001
Dataset: house_prices Diff Method: num_diff diffprep_fix
Model lr 0.01
Dataset shapes:  (876, 80) torch.Size([876]) (292, 80) torch.Size([292]) (292, 80) torch.Size([292])
Train size: (876, 295)


 20%|██        | 400/2000 [02:26<09:46,  2.73it/s, next_eval_time=36s, tr_loss=5.63, val_loss=11.8] 


DiffPrep Finished. val acc: 0.6301369863013698 test acc 0.678082191780822
0.7000000000000001
Dataset: house_prices Diff Method: num_diff diffprep_fix
Model lr 0.01
Dataset shapes:  (876, 80) torch.Size([876]) (292, 80) torch.Size([292]) (292, 80) torch.Size([292])
Train size: (876, 295)


 20%|██        | 400/2000 [02:26<09:46,  2.73it/s, next_eval_time=37s, tr_loss=3.85, val_loss=5.78]  


DiffPrep Finished. val acc: 0.75 test acc 0.7773972602739726
0.8
Dataset: house_prices Diff Method: num_diff diffprep_fix
Model lr 0.01
Dataset shapes:  (876, 80) torch.Size([876]) (292, 80) torch.Size([292]) (292, 80) torch.Size([292])
Train size: (876, 295)


 20%|██        | 400/2000 [02:26<09:44,  2.74it/s, next_eval_time=35s, tr_loss=1.45, val_loss=2.39]  


DiffPrep Finished. val acc: 0.8082191780821918 test acc 0.815068493150685
0.9
Dataset: house_prices Diff Method: num_diff diffprep_fix
Model lr 0.01
Dataset shapes:  (876, 80) torch.Size([876]) (292, 80) torch.Size([292]) (292, 80) torch.Size([292])
Train size: (876, 295)


 25%|██▌       | 500/2000 [03:02<09:08,  2.74it/s, next_eval_time=36s, tr_loss=0.684, val_loss=0.671]

DiffPrep Finished. val acc: 0.886986301369863 test acc 0.8698630136986302





In [31]:
# Save whatever `best_result` / `best_model` / `best_logger` / `best_params`
# currently hold in the kernel into a "hoursPerWeek" result folder.
# NOTE(review): this cell's execution count (31) is out of sequence with the
# cells above, so it depends on kernel state. If it is run after the threshold
# sweep, `best_*` are the outputs of the sweep's last iteration -- confirm that
# is the run that should be stored under "hoursPerWeek".
result_dir = utils.makedir(["result", params["method"], dataset, "hoursPerWeek"])
save_result(best_result, best_model, best_logger, best_params, result_dir, save_model=True)

In [29]:
# label
# List each entry stored under best_model['prep_pipeline'] with its shape.
for md, logits in best_model['prep_pipeline'].items():
    print(md, logits.shape)

pipeline.0.num_tf_prob_logits torch.Size([5, 5])
pipeline.0.cat_tf_prob_logits torch.Size([94, 2])
pipeline.1.tf_prob_logits torch.Size([99, 4])
pipeline.2.tf_prob_logits torch.Size([99, 10])
pipeline.3.tf_prob_logits torch.Size([99, 7])


In [32]:
# hoursPerWeek
# List each entry stored under best_model['prep_pipeline'] with its shape.
for md, logits in best_model['prep_pipeline'].items():
    print(md, logits.shape)

pipeline.0.num_tf_prob_logits torch.Size([5, 5])
pipeline.0.cat_tf_prob_logits torch.Size([94, 2])
pipeline.1.tf_prob_logits torch.Size([99, 4])
pipeline.2.tf_prob_logits torch.Size([99, 10])
pipeline.3.tf_prob_logits torch.Size([99, 7])


## Next steps:

1. Load the transfer datasets
2. Train using diffprep fix on one task
3. Create a new diffprep fix pipeline where the pipeline is set to the one we got above, and is frozen
4. Train the same model arch on the new task, and see accuracy etc.
5. Train the same model arch on the new task with a fresh diffprep pipeline, and see accuracy etc.
6. Compare the tau matrices and the operations