In [1]:
import torch
import pandas
import numpy as np
from pipeline.diffprep_fix_pipeline import DiffPrepFixPipeline
from pipeline.diffprep_flex_pipeline import DiffPrepFlexPipeline
from prep_space import space
from experiment.experiment_utils import set_random_seed, load_data, build_data, grid_search, makedir, save_result, load_data_multitask

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_prep_pipeline(path, prep_space, params, data_dir, dataset):
    """Rebuild a ``DiffPrepFixPipeline`` and restore its saved parameters.

    The dataset is loaded and split first because the pipeline's parameters
    are initialised from the train/validation/test feature splits before the
    checkpoint is loaded into them.

    Args:
        path: Location of the checkpoint handed to ``torch.load``.
        prep_space: Preprocessing search space passed to the pipeline constructor.
        params: Dict of settings. Reads ``"split_seed"``, ``"temperature"``,
            ``"sample"``, ``"diff_method"`` and ``"init_method"``; ``"device"``
            is used as the checkpoint ``map_location`` when present
            (falls back to ``"cpu"``).
        data_dir: Data directory forwarded to ``load_data_multitask``.
        dataset: Dataset name forwarded to ``load_data_multitask``.

    Returns:
        The pipeline with the checkpoint's state dict loaded.
    """
    X, y = load_data_multitask(data_dir, dataset)
    # Only the feature splits are needed to initialise the pipeline; the label
    # splits returned by build_data are discarded.
    X_train, _, X_val, _, X_test, _ = build_data(X, y, random_state=params["split_seed"])

    prep_pipeline = DiffPrepFixPipeline(
        prep_space,
        temperature=params["temperature"],
        use_sample=params["sample"],
        diff_method=params["diff_method"],
        init_method=params["init_method"],
    )
    prep_pipeline.init_parameters(X_train, X_val, X_test)

    # map_location lets a checkpoint that was saved from another device (e.g.
    # CUDA) be restored on this host; without it torch.load tries to recreate
    # the tensors on their original device.
    # NOTE: torch.load unpickles the file -- only load trusted checkpoints.
    state_dict = torch.load(path, map_location=params.get("device", "cpu"))
    prep_pipeline.load_state_dict(state_dict)
    # NOTE(review): eval() was left disabled in the original; confirm whether
    # callers rely on the pipeline's default mode.
    #prep_pipeline.eval()

    return prep_pipeline

In [3]:
# Hyper-parameters shared by the cells below.
params = dict(
    num_epochs=2000,
    batch_size=512,
    device="cpu",
    # model_lr=[0.1, 0.01, 0.001],  # list form kept for reference
    model_lr=0.01,
    weight_decay=0,
    model='log',
    train_seed=1,
    split_seed=1,
    method="diffprep_fix",
    save_model=True,
    logging=False,
    no_crash=False,
    patience=3,
    momentum=0.9,
)

# Settings specific to the preprocessing pipeline; merged into `params` below.
auto_prep_params = dict(
    prep_lr=None,
    temperature=0.1,
    grad_clip=None,
    pipeline_update_sample_size=512,
    init_method="default",
    diff_method="num_diff",
    sample=False,
)

params.update(auto_prep_params)

# Data location and dataset name used when loading the pipelines.
DATADIR = "data"
dataset = "Airbnb"


In [4]:
# Restore the pipeline saved under the Price result directory.
prep_pipeline_price = load_prep_pipeline(
    path='./result/diffprep_fix/Airbnb/Price/prep_pipeline.pth',
    prep_space=space,
    params=params,
    data_dir=DATADIR,
    dataset=dataset,
)

In [5]:
# Restore the pipeline saved under the Rating result directory.
prep_pipeline_rating = load_prep_pipeline(
    path='./result/diffprep_fix/Airbnb/Rating/prep_pipeline.pth',
    prep_space=space,
    params=params,
    data_dir=DATADIR,
    dataset=dataset,
)

In [6]:
# Display both restored pipelines together.
(prep_pipeline_price, prep_pipeline_rating)

(DiffPrepFixPipeline(
   (pipeline): ModuleList(
     (0): FirstTransformer()
     (1): Transformer()
     (2): Transformer()
     (3): Transformer()
   )
 ),
 DiffPrepFixPipeline(
   (pipeline): ModuleList(
     (0): FirstTransformer()
     (1): Transformer()
     (2): Transformer()
     (3): Transformer()
   )
 ))

In [7]:
import torch.nn.functional as F

In [8]:
# Grab the state dict of each pipeline (Price first, then Rating).
price_pipeline_params, rating_pipeline_params = (
    restored.state_dict()
    for restored in (prep_pipeline_price, prep_pipeline_rating)
)

In [9]:
def get_pipeline_ops(prep_pipeline_params):
    """Turn every logits tensor of a pipeline state dict into probabilities.

    Args:
        prep_pipeline_params: Mapping from parameter name to a logits tensor,
            e.g. the result of ``state_dict()``.

    Returns:
        A dict with the same keys, in the same order, whose values are the
        softmax of the corresponding tensor taken over its last dimension
        (each row of a 2-D tensor sums to 1).
    """
    # `dim` is passed explicitly: calling softmax without it is deprecated and
    # emits a UserWarning. For the 2-D logits handled here the last dimension
    # is the same axis the implicit choice picked (dim=1).
    return {
        pipeline_step: F.softmax(logits, dim=-1)
        for pipeline_step, logits in prep_pipeline_params.items()
    }

In [10]:
# Softmax-normalised operator probabilities for both pipelines.
price_pipeline_ops, rating_pipeline_ops = map(
    get_pipeline_ops,
    (price_pipeline_params, rating_pipeline_params),
)

In [11]:
# Compare, step by step, the operator distributions of the Price and Rating
# pipelines.
ces = []         # cross-entropy (in bits) per pipeline step
agreements = []  # fraction of rows whose most likely operator matches, per step
for pipeline_step in price_pipeline_params.keys():
    price_pipeline_op = price_pipeline_ops[pipeline_step]
    rating_pipeline_op = rating_pipeline_ops[pipeline_step]

    # Cross-entropy of the Rating distribution relative to the Price one,
    # summed over operators (axis 1) and averaged over rows. torch.log2 keeps
    # the computation in torch instead of round-tripping through numpy.
    ce = -(price_pipeline_op * torch.log2(rating_pipeline_op)).sum(axis=1).mean()

    # Computed once and reused for both the printout and the stored value.
    agreement = (price_pipeline_op.argmax(axis=1) == rating_pipeline_op.argmax(axis=1)).numpy().mean()

    print(pipeline_step)
    print("CE", ce)
    print("Agreement %", pipeline_step, agreement)

    # `ces` previously received the agreement rate; it now holds the
    # cross-entropy and the agreement rate is kept in its own list.
    ces.append(ce.item())
    agreements.append(agreement)

pipeline.0.num_tf_prob_logits
CE tensor(2.0029)
Agreement % pipeline.0.num_tf_prob_logits 1.0
pipeline.0.cat_tf_prob_logits
CE tensor(1.1179)
Agreement % pipeline.0.cat_tf_prob_logits 0.9824561403508771
pipeline.1.tf_prob_logits
CE tensor(3.1138)
Agreement % pipeline.1.tf_prob_logits 0.5
pipeline.2.tf_prob_logits
CE tensor(3.0981)
Agreement % pipeline.2.tf_prob_logits 0.776595744680851
pipeline.3.tf_prob_logits
CE tensor(3.3170)
Agreement % pipeline.3.tf_prob_logits 0.7659574468085106


In [12]:
import pandas as pd

# Load the Airbnb table from disk.
airbnb_df = pd.read_csv(filepath_or_buffer='./data/Airbnb/data.csv')

In [13]:
# Print the column names, non-null counts and dtypes of the loaded table.
airbnb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 40 columns):
 #   Column                                              Non-Null Count  Dtype  
---  ------                                              --------------  -----  
 0   Bathrooms                                           990 non-null    float64
 1   Bedrooms                                            895 non-null    float64
 2   Beds                                                991 non-null    float64
 3   LocationName                                        991 non-null    object 
 4   NumGuests                                           991 non-null    float64
 5   NumReviews                                          991 non-null    float64
 6   Price                                               1000 non-null   object 
 7   Rating                                              1000 non-null   object 
 8   latitude                                            1000 non-null   float64
 9 

In [14]:
# Features: the first 600 rows with both target columns removed.
X = airbnb_df.iloc[:600, :].drop(columns=['Rating', 'Price'])
# Run them through the Rating pipeline (fitting enabled) and convert the
# result to a numpy array.
X_clean = (
    prep_pipeline_rating(X, is_fit=True, X_type="train")
    .detach()
    .numpy()
)

In [15]:
# Binary Rating label for the first 600 rows: 1 where the value is 'Y', else 0.
y_rating = (airbnb_df['Rating'].iloc[:600].values == 'Y').astype(int)

In [None]:
# Binary Price label for the first 600 rows: 1 where the value is 'Y', else 0.
# Note: `y_price` is reassigned further down from the thresholded
# `airbnb_dirty['Price']` column.
y_price = (airbnb_df['Price'].iloc[:600].values == 'Y').astype(int)

In [16]:
# Load the raw training split and measure how often "Price > 40" coincides
# with the Rating label over the first 600 rows.
airbnb_dirty = pd.read_csv(filepath_or_buffer='./data/Airbnb/raw/dirty_train.csv')
airbnb_dirty['Price'].gt(40).iloc[:600].eq(y_rating).mean()

In [24]:
# Binary Price label: 1 where the raw price exceeds 40, for the first 600 rows.
y_price = airbnb_dirty['Price'].gt(40).iloc[:600].astype(int)

In [25]:
# Fraction of rows on which the two binary labels coincide.
(y_price == y_rating).mean()

0.6733333333333333

In [37]:
# Shapes of the feature matrix and both label vectors.
tuple(arr.shape for arr in (X_clean, y_rating, y_price))

((600, 94), (600,), (600,))

In [38]:
# Shape of the preprocessed feature matrix.
np.shape(X_clean)

(600, 94)

In [26]:
# Preprocessed features plus the binary Rating label and the raw Price column
# (the binary y_price is deliberately not used here).
preprocessed_df = pd.DataFrame(X_clean).assign(
    Rating=y_rating,
    Price=airbnb_dirty['Price'][:600],  # y_price
)

In [27]:
# Write the assembled table to disk without the index column.
preprocessed_df.to_csv(
    path_or_buf="airbnb_cleaned_with_rating_controlled.csv",
    index=False,
)