In [1]:
import torch
import pandas
import numpy as np
from pipeline.diffprep_fix_pipeline import DiffPrepFixPipeline
from prep_space import space
from experiment.experiment_utils import set_random_seed, load_data, build_data, grid_search, makedir, save_result, load_data_multitask

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_prep_pipeline(path, prep_space, params, data_dir, dataset):
    """Rebuild a DiffPrepFixPipeline and restore its saved weights.

    The dataset is loaded and split with ``params["split_seed"]``, a fresh
    pipeline is constructed and initialised on the feature splits, and the
    checkpoint at ``path`` is then loaded into it.

    Args:
        path: Location of the ``state_dict`` checkpoint to restore.
        prep_space: Preprocessing space passed to ``DiffPrepFixPipeline``.
        params: Mapping that must provide ``"split_seed"``, ``"temperature"``,
            ``"sample"``, ``"diff_method"`` and ``"init_method"``.
        data_dir: Data directory handed to ``load_data_multitask``.
        dataset: Dataset name handed to ``load_data_multitask``.

    Returns:
        The ``DiffPrepFixPipeline`` with the checkpoint's weights loaded.
    """
    X, y = load_data_multitask(data_dir, dataset)
    # Only the feature splits are passed on; the label splits are unused here.
    X_train, _, X_val, _, X_test, _ = build_data(
        X, y, random_state=params["split_seed"]
    )

    prep_pipeline = DiffPrepFixPipeline(
        prep_space,
        temperature=params["temperature"],
        use_sample=params["sample"],
        diff_method=params["diff_method"],
        init_method=params["init_method"],
    )
    prep_pipeline.init_parameters(X_train, X_val, X_test)

    # map_location="cpu" lets a checkpoint that was written on a GPU load on a
    # CPU-only machine; load_state_dict then copies the values into the
    # module's own parameters.
    # NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
    state_dict = torch.load(path, map_location="cpu")
    prep_pipeline.load_state_dict(state_dict)

    # NOTE(review): an eval() call was disabled in the original, so the module
    # is returned in its default mode - confirm whether eval mode is wanted.
    return prep_pipeline

In [3]:
# Experiment hyper-parameters.
params = dict(
    num_epochs=2000,
    batch_size=512,
    device="cpu",
    model_lr=0.01,  # disabled alternative: a list of rates, [0.1, 0.01, 0.001]
    weight_decay=0,
    model="log",
    train_seed=1,
    split_seed=1,
    method="diffprep_fix",
    save_model=True,
    logging=False,
    no_crash=False,
    patience=3,
    momentum=0.9,
)

# Auto-prep settings; load_prep_pipeline reads temperature, sample,
# diff_method and init_method from the merged dict.
auto_prep_params = dict(
    prep_lr=None,
    temperature=0.1,
    grad_clip=None,
    pipeline_update_sample_size=512,
    init_method="default",
    diff_method="num_diff",
    sample=False,
)

# Fold the auto-prep settings into the single params dict used downstream.
params.update(**auto_prep_params)

# Data directory and dataset name.
DATADIR, dataset = "data", "ada_prior"


In [4]:
# Restore the checkpoint stored under the hoursPerWeek result directory.
prep_pipeline_hpw = load_prep_pipeline(
    path='./result/diffprep_fix/ada_prior/hoursPerWeek/prep_pipeline.pth',
    prep_space=space,
    params=params,
    data_dir=DATADIR,
    dataset=dataset,
)

In [5]:
# Restore the checkpoint stored under the label result directory.
prep_pipeline_label = load_prep_pipeline(
    path='./result/diffprep_fix/ada_prior/label/prep_pipeline.pth',
    prep_space=space,
    params=params,
    data_dir=DATADIR,
    dataset=dataset,
)

In [6]:
# Show the module structure of both restored pipelines side by side.
(prep_pipeline_hpw, prep_pipeline_label)

(DiffPrepFixPipeline(
   (pipeline): ModuleList(
     (0): FirstTransformer()
     (1): Transformer()
     (2): Transformer()
     (3): Transformer()
   )
 ),
 DiffPrepFixPipeline(
   (pipeline): ModuleList(
     (0): FirstTransformer()
     (1): Transformer()
     (2): Transformer()
     (3): Transformer()
   )
 ))

In [7]:
import torch.nn.functional as F

In [8]:
# Snapshot the state_dict of each restored pipeline (hoursPerWeek first, then label).
hpw_pipeline_params, label_pipeline_params = (
    pipeline.state_dict()
    for pipeline in (prep_pipeline_hpw, prep_pipeline_label)
)

In [9]:
def get_pipeline_ops(prep_pipeline_params):
    """Convert every logits tensor of a pipeline state dict into probabilities.

    Args:
        prep_pipeline_params: Mapping from parameter name to a logits tensor,
            e.g. the ``state_dict()`` of a restored pipeline.

    Returns:
        dict with the same keys in the same order; each value is the softmax
        of the corresponding tensor over its last dimension.
    """
    # dim is passed explicitly: calling F.softmax without it is deprecated and
    # emits a warning. For the 2-D tensors used in this notebook the last
    # dimension is the same one the implicit choice picked.
    return {
        step_name: F.softmax(logits, dim=-1)
        for step_name, logits in prep_pipeline_params.items()
    }

In [10]:
# Turn both state dicts into per-step probability tensors.
hpw_pipeline_ops, label_pipeline_ops = map(
    get_pipeline_ops, (hpw_pipeline_params, label_pipeline_params)
)

In [11]:
# Look at a single step of the label pipeline.
pipeline_step_ = "pipeline.0.cat_tf_prob_logits"
label_pipeline_ops[pipeline_step_]

tensor([[0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5

In [12]:
# Compare, step by step, the probabilities of the two restored pipelines.
for pipeline_step in hpw_pipeline_params.keys():
    hpw_pipeline_op = hpw_pipeline_ops[pipeline_step]
    label_pipeline_op = label_pipeline_ops[pipeline_step]

    # Cross-entropy in bits of the label probabilities relative to the
    # hoursPerWeek probabilities: summed along dim 1, then averaged over rows.
    # torch.log2 keeps the computation in torch; np.log2 on a Tensor relied on
    # an implicit tensor <-> NumPy round trip.
    ce = -(hpw_pipeline_op * torch.log2(label_pipeline_op)).sum(dim=1).mean()

    print(pipeline_step)
    print("CE", ce)
    # The printed value is a fraction in [0, 1] (despite the "%" in the label):
    # the share of rows where both pipelines have the same argmax along dim 1.
    print("Agreement %", pipeline_step, (hpw_pipeline_op.argmax(dim=1) == label_pipeline_op.argmax(dim=1)).numpy().mean())


pipeline.0.num_tf_prob_logits
CE tensor(2.0003)
Agreement % pipeline.0.num_tf_prob_logits 1.0
pipeline.0.cat_tf_prob_logits
CE tensor(1.0000)
Agreement % pipeline.0.cat_tf_prob_logits 0.9893617021276596
pipeline.1.tf_prob_logits
CE tensor(1.7923)
Agreement % pipeline.1.tf_prob_logits 1.0
pipeline.2.tf_prob_logits
CE tensor(2.5860)
Agreement % pipeline.2.tf_prob_logits 1.0
pipeline.3.tf_prob_logits
CE tensor(2.2922)
Agreement % pipeline.3.tf_prob_logits 1.0


In [25]:
# Leftover loop variable: after the comparison loop this still holds the
# hoursPerWeek probabilities of the last step iterated. This cell's execution
# count (25) is out of sequence with its neighbours.
hpw_pipeline_op

tensor([[0.0828, 0.0844, 0.0844, 0.0828, 0.0844, 0.0844, 0.4967],
        [0.0840, 0.0824, 0.0824, 0.0824, 0.0824, 0.0824, 0.5042],
        [0.0839, 0.0822, 0.0822, 0.0839, 0.0822, 0.0822, 0.5033],
        [0.0842, 0.0842, 0.0842, 0.0842, 0.0842, 0.0842, 0.4950],
        [0.0842, 0.0842, 0.0842, 0.0842, 0.0842, 0.0842, 0.4950],
        [0.0839, 0.0822, 0.0822, 0.0839, 0.0822, 0.0822, 0.5033],
        [0.0842, 0.0842, 0.0842, 0.0842, 0.0842, 0.0842, 0.4950],
        [0.0822, 0.0822, 0.0822, 0.0839, 0.0839, 0.0822, 0.5033],
        [0.0828, 0.0844, 0.0844, 0.0828, 0.0844, 0.0844, 0.4967],
        [0.0839, 0.0822, 0.0822, 0.0839, 0.0822, 0.0822, 0.5033],
        [0.0828, 0.0844, 0.0844, 0.0828, 0.0844, 0.0844, 0.4967],
        [0.0833, 0.0833, 0.0833, 0.0833, 0.0833, 0.0833, 0.5000],
        [0.0828, 0.0844, 0.0844, 0.0828, 0.0844, 0.0844, 0.4967],
        [0.0839, 0.0822, 0.0822, 0.0839, 0.0822, 0.0822, 0.5033],
        [0.0844, 0.0844, 0.0828, 0.0844, 0.0844, 0.0828, 0.4967],
        [0

In [13]:
import pandas as pd

# Raw dataset CSV. The path is built from DATADIR and dataset so it follows
# the configuration cell instead of repeating those values as a literal.
ada_prior_df = pd.read_csv(f'./{DATADIR}/{dataset}/data.csv')

In [14]:
# Display the loaded DataFrame.
ada_prior_df

Unnamed: 0,age,workclass,fnlwgt,education,educationNum,maritalStatus,occupation,relationship,race,sex,capitalGain,capitalLoss,hoursPerWeek,nativeCountry,label
0,59,Local-gov,53612,Masters,14,Separated,Prof-specialty,Own-child,Black,Female,0,0,Y,United-States,N
1,51,Private,136913,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,Y,United-States,N
2,58,Self-emp-not-inc,331474,HS-grad,9,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,Y,United-States,Y
3,36,Private,52327,Masters,14,Married-civ-spouse,Prof-specialty,Husband,Other,Male,0,0,Y,Iran,Y
4,61,Private,136109,11th,7,Widowed,Adm-clerical,Not-in-family,White,Female,0,0,N,United-States,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4557,57,Local-gov,212303,Masters,14,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,Y,United-States,Y
4558,28,Private,209205,Masters,14,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,N,United-States,Y
4559,24,Private,259510,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,5013,0,Y,United-States,N
4560,44,Self-emp-not-inc,163985,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,Y,United-States,Y


In [15]:
# Features only: drop both target columns and keep the first 1000 rows.
X = ada_prior_df.drop(columns=['hoursPerWeek', 'label']).head(1000)

# Run the label pipeline on those rows and convert the result to a NumPy array.
X_clean = (
    prep_pipeline_label(X, is_fit=True, X_type="train")
    .detach()
    .numpy()
)

In [19]:
# Binary-encode both targets over the full dataset: 'Y' -> 1, anything else -> 0.
y_hpw = (ada_prior_df['hoursPerWeek'].values == 'Y').astype(int)
y_label = (ada_prior_df['label'].values == 'Y').astype(int)

In [24]:
# Fraction of rows on which the two encoded targets differ.
np.mean(y_hpw != y_label)

0.6992547128452433

In [20]:
# y_label holds only 0/1 at this point, so log2 yields -inf wherever it is 0:
# rows with y_hpw == 1 then come out as inf and rows with y_hpw == 0 as nan.
-y_hpw * np.log2(y_label)

array([inf, inf, -0., ..., inf, -0., nan])

In [185]:
# Re-encode the targets ('Y' -> 1, otherwise 0) for the first 1000 rows only,
# the same slice X was taken from; this overwrites the full-length arrays.
y_hpw = (ada_prior_df['hoursPerWeek'].iloc[:1000].values == 'Y').astype(int)
y_label = (ada_prior_df['label'].iloc[:1000].values == 'Y').astype(int)

In [186]:
# Shapes of the transformed features and of both target vectors.
tuple(arr.shape for arr in (X_clean, y_hpw, y_label))

((1000, 99), (1000,), (1000,))

In [187]:
# Wrap the transformed features in a DataFrame and append both encoded targets.
preprocessed_df = pd.DataFrame(X_clean).assign(
    hoursPerWeek=y_hpw,
    label=y_label,
)

In [188]:
# Write the transformed features plus targets to disk, without the index column.
preprocessed_df.to_csv(
    path_or_buf="ada_prior_cleaned_with_label.csv",
    index=False,
)