In [1]:
import numpy as np
import pandas as pd
import utils
from experiment.experiment_utils import set_random_seed, load_data, build_data, grid_search, makedir, save_result, load_data_multitask, load_data_multitask_synthetic_label
from model import LogisticRegression
from pipeline.diffprep_flex_pipeline import DiffPrepFlexPipeline
from pipeline.diffprep_fix_pipeline import DiffPrepFixPipeline
import torch
import torch.nn as nn
from trainer.diffprep_trainer import DiffPrepSGD
from utils import SummaryWriter
from experiment.experiment_utils import min_max_normalize
from copy import deepcopy


  from .autonotebook import tqdm as notebook_tqdm


In [16]:
class DiffPrepExperiment(object):
    """Run auto prep with one set of hyper parameters.

    An instance stores the data location, the preprocessing search space, the
    downstream model name and the DiffPrep variant to use. ``run`` then
    performs one training run for one concrete hyper-parameter dictionary.
    """

    # Values of ``method`` / ``model_name`` that ``run`` knows how to handle.
    SUPPORTED_METHODS = ("diffprep_fix", "diffprep_flex")
    SUPPORTED_MODELS = ("log",)

    def __init__(self, data_dir, dataset, prep_space, model_name, method, similarity_threshold):
        """Store the experiment configuration.

        Args:
            data_dir: forwarded to the data loader as its first argument.
            dataset: forwarded to the data loader as its second argument.
            prep_space: forwarded to the DiffPrep pipeline constructor.
            model_name: downstream model selector; only "log" is handled.
            method: "diffprep_fix" or "diffprep_flex".
            similarity_threshold: forwarded to
                ``load_data_multitask_synthetic_label``.
        """
        self.data_dir = data_dir
        self.dataset = dataset
        self.prep_space = prep_space
        self.model_name = model_name
        self.method = method
        self.similarity_threshold = similarity_threshold

    def run(self, params, verbose=True):
        """Train one pipeline + model pair and return the outcome.

        Args:
            params: dict of hyper parameters. Keys read here: "split_seed",
                "temperature", "sample", "diff_method", "init_method",
                "device", "model_lr", "weight_decay", "momentum", "prep_lr"
                and "logging". The whole dict is also handed to
                ``set_random_seed`` and ``DiffPrepSGD``.
                NOTE: when ``method`` is "diffprep_flex" the entries
                "patience" and "num_epochs" are overwritten IN PLACE
                (to 10 and 3000), so the caller's dict is modified.
            verbose: accepted for interface compatibility; currently unused.

        Returns:
            Tuple ``(result, best_model, logger)``. ``result`` and
            ``best_model`` are whatever ``DiffPrepSGD.fit`` returns;
            ``logger`` is a ``SummaryWriter`` when ``params["logging"]`` is
            truthy, otherwise ``None``.

        Raises:
            ValueError: if ``method`` or ``model_name`` is not supported.
        """
        # Fail fast: reject a bad configuration before any data is loaded or
        # any pipeline state is initialised. ValueError is still caught by
        # callers that handle the generic Exception raised previously.
        if self.method not in self.SUPPORTED_METHODS:
            raise ValueError("Wrong auto prep method")
        if self.model_name not in self.SUPPORTED_MODELS:
            raise ValueError("Wrong model")

        X, y = load_data_multitask_synthetic_label(
            self.data_dir, self.dataset, similarity_threshold=self.similarity_threshold
        )
        # Alternative loader kept for reference:
        #X, y = load_data_multitask(self.data_dir, self.dataset)
        X_train, y_train, X_val, y_val, X_test, y_test = build_data(X, y, random_state=params["split_seed"])

        print("Dataset shapes: ", X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape)

        # pre norm for diffprep flex (also overrides the training budget)
        if self.method == "diffprep_flex":
            X_train, X_val, X_test = min_max_normalize(X_train, X_val, X_test)
            params["patience"] = 10
            params["num_epochs"] = 3000

        # seed before the pipeline parameters are initialised
        set_random_seed(params)

        # Both pipeline variants take the same constructor arguments.
        if self.method == "diffprep_fix":
            pipeline_cls = DiffPrepFixPipeline
        else:
            pipeline_cls = DiffPrepFlexPipeline
        prep_pipeline = pipeline_cls(
            self.prep_space,
            temperature=params["temperature"],
            use_sample=params["sample"],
            diff_method=params["diff_method"],
            init_method=params["init_method"],
        )

        prep_pipeline.init_parameters(X_train, X_val, X_test)
        print("Train size: ({}, {})".format(X_train.shape[0], prep_pipeline.out_features))

        # model: input width comes from the pipeline, output width is the
        # number of distinct label values in the full label set
        input_dim = prep_pipeline.out_features
        output_dim = len(set(y.values.ravel()))

        # re-seed so the model initialisation starts from the same seed state
        set_random_seed(params)
        # "log" is the only supported model (validated above)
        model = LogisticRegression(input_dim, output_dim)
        model = model.to(params["device"])

        # loss
        loss_fn = nn.CrossEntropyLoss()

        # optimizers: SGD for the model, Adam for the pipeline parameters
        model_optimizer = torch.optim.SGD(
            model.parameters(),
            lr=params["model_lr"],
            weight_decay=params["weight_decay"],
            momentum=params["momentum"]
        )

        # the pipeline falls back to the model learning rate when none is given
        prep_lr = params["model_lr"] if params["prep_lr"] is None else params["prep_lr"]

        prep_pipeline_optimizer = torch.optim.Adam(
            prep_pipeline.parameters(),
            lr=prep_lr,
            betas=(0.5, 0.999),
            weight_decay=params["weight_decay"]
        )

        # no learning-rate schedulers are used
        prep_pipeline_scheduler = None
        model_scheduler = None

        logger = SummaryWriter() if params["logging"] else None

        diff_prep = DiffPrepSGD(prep_pipeline, model, loss_fn, model_optimizer, prep_pipeline_optimizer,
                    model_scheduler, prep_pipeline_scheduler, params, writer=logger)

        result, best_model = diff_prep.fit(X_train, y_train, X_val, y_val, X_test, y_test)
        return result, best_model, logger

In [17]:
# Reference 'Rating' column read directly from the raw Airbnb CSV.
airbnb_df = pd.read_csv('./data/Airbnb/data.csv')
y_rating = airbnb_df['Rating']

# Labels produced by the synthetic-label loader at threshold 0.8.
X, y = load_data_multitask_synthetic_label("data", "Airbnb", similarity_threshold=0.8)

# Fraction of positions where the loaded labels equal the 'Rating' column.
(y == y_rating).mean()

0.8013333333333333

In [18]:
def run_diffprep(data_dir, dataset, result_dir, prep_space, params, model_name, method):
    """Grid-search a DiffPrep experiment, save the best run and return it.

    Builds a ``DiffPrepExperiment`` (its similarity threshold is read from
    ``params["similarity_threshold"]``), passes it with a deep copy of
    ``params`` to ``grid_search``, then saves the winning run to
    ``result_dir`` with ``save_model=True``.

    Returns:
        Tuple ``(best_result, best_model, best_logger, best_params)`` as
        produced by ``grid_search``.
    """
    print("Dataset:", dataset, "Diff Method:", params["diff_method"], method)

    experiment = DiffPrepExperiment(
        data_dir,
        dataset,
        prep_space,
        model_name,
        method,
        similarity_threshold=params["similarity_threshold"],
    )

    # grid_search receives a deep copy, not the caller's params dict
    search_params = deepcopy(params)
    result, model_state, logger, chosen_params = grid_search(experiment, search_params)

    save_result(result, model_state, logger, chosen_params, result_dir, save_model=True)
    print("DiffPrep Finished. val acc:", result["best_val_acc"], "test acc", result["best_test_acc"])
    return result, model_state, logger, chosen_params

In [22]:
import utils
from prep_space import space
from experiment.baseline_experiment import run_baseline
import os

# ---- hyper parameters -------------------------------------------------------
# General training / bookkeeping settings.
params = {
    "num_epochs": 2000,
    "batch_size": 512,
    "device": "cpu",
    # earlier setting, kept for reference:
    #"model_lr": [0.1, 0.01, 0.001],
    "model_lr": 0.01,
    "weight_decay": 0,
    "model": 'log',
    "train_seed": 1,
    "split_seed": 1,
    "method": "diffprep_fix",
    "save_model": True,
    "logging": False,
    "no_crash": False,
    "patience": 3,
    "momentum": 0.9,
    "similarity_threshold": 0.1,
}

# Settings specific to the DiffPrep pipeline; merged into `params` below.
auto_prep_params = {
    "prep_lr": None,  # None -> the experiment reuses "model_lr"
    "temperature": 0.1,
    "grad_clip": None,
    "pipeline_update_sample_size": 512,
    "init_method": "default",
    "diff_method": "num_diff",
    "sample": False
}
params.update(auto_prep_params)

# ---- data selection ---------------------------------------------------------
DATADIR = "data"
datasets = sorted(os.listdir(DATADIR))
dataset = "Airbnb"

print("Run {} on dataset {}".format(params["method"], dataset))

# ---- output location --------------------------------------------------------
#result_dir = utils.makedir(["result", params["method"], dataset, f'Rating_ground_truth'])
result_dir = utils.makedir(
    ["result", params["method"], dataset, f'Rating_{params["similarity_threshold"]}']
)

# ---- run --------------------------------------------------------------------
# Both entry points take the same arguments, so only the callable differs.
runner = run_diffprep if params["method"] in ("diffprep_fix", "diffprep_flex") else run_baseline
best_result, best_model, best_logger, best_params = runner(
    DATADIR, dataset, result_dir, space, params, params["model"], params["method"]
)

Run diffprep_fix on dataset Airbnb
Dataset: Airbnb Diff Method: num_diff diffprep_fix
Model lr 0.01
Dataset shapes:  (1800, 38) torch.Size([1800]) (600, 38) torch.Size([600]) (600, 38) torch.Size([600])
Train size: (1800, 125)


 50%|█████     | 1000/2000 [03:38<03:38,  4.58it/s, next_eval_time=22s, tr_loss=0.638, val_loss=0.644]

DiffPrep Finished. val acc: 0.6816666666666666 test acc 0.6533333333333333





In [27]:
# Save the current best_* objects under the "Price" result directory.
# NOTE(review): the run cell above writes to a "Rating_<threshold>" directory;
# confirm that best_* really come from a Price run before executing this.
price_path_parts = ["result", params["method"], dataset, "Price"]
result_dir = utils.makedir(price_path_parts)
save_result(
    best_result,
    best_model,
    best_logger,
    best_params,
    result_dir,
    save_model=True,
)

In [28]:
# Turn the logits of pipeline step 0 (numerical features) into per-row
# probabilities. `dim=1` is passed explicitly: calling softmax without `dim`
# is deprecated and emits a warning, and for this 2-D tensor the implicit
# choice was already dim=1, so the displayed values are unchanged.
torch.nn.functional.softmax(best_model['prep_pipeline']['pipeline.0.num_tf_prob_logits'], dim=1)

tensor([[0.8374, 0.0096, 0.0085, 0.1348, 0.0096],
        [0.8715, 0.0371, 0.0289, 0.0254, 0.0371],
        [0.7386, 0.0354, 0.1491, 0.0415, 0.0354],
        [0.7738, 0.0436, 0.0948, 0.0447, 0.0432],
        [0.0299, 0.0043, 0.0085, 0.9518, 0.0054],
        [0.5153, 0.1237, 0.1237, 0.1237, 0.1136],
        [0.4854, 0.1222, 0.1222, 0.1222, 0.1481],
        [0.5003, 0.1226, 0.1226, 0.1226, 0.1318],
        [0.7513, 0.1847, 0.0214, 0.0214, 0.0212],
        [0.4296, 0.1049, 0.1561, 0.1550, 0.1544],
        [0.5054, 0.1255, 0.1255, 0.1255, 0.1181],
        [0.6839, 0.0314, 0.0360, 0.2135, 0.0353],
        [0.4776, 0.1208, 0.1208, 0.1208, 0.1599],
        [0.4930, 0.1267, 0.1267, 0.1267, 0.1267],
        [0.5261, 0.1272, 0.1272, 0.1272, 0.0924],
        [0.5077, 0.1230, 0.1230, 0.1230, 0.1232],
        [0.5151, 0.1274, 0.1274, 0.1274, 0.1027],
        [0.6530, 0.0548, 0.0548, 0.1910, 0.0465],
        [0.5609, 0.1281, 0.0205, 0.1114, 0.1791],
        [0.2549, 0.0697, 0.1932, 0.2905, 0.1918],


In [19]:
# Index of the highest-probability column per row for pipeline step 3.
# `dim=1` is passed explicitly: calling softmax without `dim` is deprecated
# and emits a warning; for this 2-D tensor the implicit choice was already
# dim=1, so the resulting indices are unchanged.
torch.nn.functional.softmax(best_model['prep_pipeline']['pipeline.3.tf_prob_logits'], dim=1).numpy().argmax(axis=1)

array([6, 5, 6, 6, 6, 6, 5, 2, 6, 5, 5, 6, 2, 6, 5, 5, 6, 2, 6, 1, 6, 6,
       6, 6, 6, 1, 6, 6, 5, 6, 6, 6, 2, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 2, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 0, 2, 6, 6, 6, 6, 0, 6, 6, 2, 6, 6, 2, 6, 6, 2, 6,
       6, 6, 6, 6, 6, 6])

In [17]:
# rating: name and shape of every entry in best_model['prep_pipeline']
for param_name, param_value in best_model['prep_pipeline'].items():
    print(param_name, param_value.shape)

alpha torch.Size([94, 3, 3])
pipeline.0.num_tf_prob_logits torch.Size([37, 5])
pipeline.0.cat_tf_prob_logits torch.Size([57, 2])
pipeline.1.tf_prob_logits torch.Size([94, 4])
pipeline.2.tf_prob_logits torch.Size([94, 10])
pipeline.3.tf_prob_logits torch.Size([94, 7])
pipeline.4.tf_prob_logits torch.Size([94, 4])
pipeline.5.tf_prob_logits torch.Size([94, 10])
pipeline.6.tf_prob_logits torch.Size([94, 7])
pipeline.7.tf_prob_logits torch.Size([94, 4])
pipeline.8.tf_prob_logits torch.Size([94, 10])
pipeline.9.tf_prob_logits torch.Size([94, 7])


In [7]:
# price: name and shape of every entry in best_model['prep_pipeline']
for param_name, param_value in best_model['prep_pipeline'].items():
    print(param_name, param_value.shape)

alpha torch.Size([94, 3, 3])
pipeline.0.num_tf_prob_logits torch.Size([37, 5])
pipeline.0.cat_tf_prob_logits torch.Size([57, 2])
pipeline.1.tf_prob_logits torch.Size([94, 4])
pipeline.2.tf_prob_logits torch.Size([94, 10])
pipeline.3.tf_prob_logits torch.Size([94, 7])
pipeline.4.tf_prob_logits torch.Size([94, 4])
pipeline.5.tf_prob_logits torch.Size([94, 10])
pipeline.6.tf_prob_logits torch.Size([94, 7])
pipeline.7.tf_prob_logits torch.Size([94, 4])
pipeline.8.tf_prob_logits torch.Size([94, 10])
pipeline.9.tf_prob_logits torch.Size([94, 7])


## Next steps:

1. Load the transfer datasets
2. Train using diffprep fix on one task
3. Create a new diffprep fix pipeline where the pipeline is set to the one we got above, and is frozen
4. Train the same model arch on the new task, and see accuracy etc.
5. Train the same model arch on the new task with a fresh diffprep pipeline, and see accuracy etc.
6. Compare the tau matrices and the operations