# Imports

In [1]:
import os
# Set before NumPy is imported further down in this cell.
# NOTE(review): presumably this switches off NumPy's experimental
# __array_function__ dispatch — confirm it is still needed for the NumPy version in use.
os.environ['NUMPY_EXPERIMENTAL_ARRAY_FUNCTION'] = '0'
import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")
import sys, pathlib
# Put the PILOT checkout in the user's home directory on the import path
# (once only) so the `pilot` imports later in this cell can be resolved.
path = (pathlib.Path.home() / 'PILOT').as_posix()
if path not in sys.path:
    sys.path.append(path)
    
import pandas as pd
import numpy as np
import time

from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor
from tqdm.notebook import tqdm
    
from pilot.pypilot import PILOT
from pilot import Pilot

%load_ext line_profiler
%load_ext autoreload
%autoreload 2

# Load data

In [2]:
# Abalone data set stored inside the PILOT checkout in the user's home directory.
df = pd.read_csv(pathlib.Path.home().joinpath('PILOT', 'Data', 'abalone.csv'))

# Response vector first, then the design matrix built from every remaining column.
y = df['target'].values
X = df.drop(columns='target').values

# Presumably the positional indices of the categorical columns — verify against the data.
categorical = [0]

In [2]:
def generate_data(n_sample, n_feature) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Simulate a regression data set whose features have randomly chosen effects.

    All randomness comes from the global NumPy RNG, so call ``np.random.seed``
    beforehand to get reproducible data. The response starts as standard normal
    noise and each feature may add one effect to it:

    * With probability 0.1 a feature is categorical with levels 0, 1, 2. With
      probability 0.5 it shifts the response by +1 (level 0), -1 (level 1) or
      0 (level 2); otherwise it has no effect.
    * Otherwise the feature is numerical: normal noise scaled and shifted by
      random integers in [1, 9]. One of five effects is drawn uniformly:
      ``'pcon'`` (+1 below a split point, -1 from it upwards), ``'lin'``
      (``0.1 + 0.1 * x``), ``'plin'`` (slope -0.1 below the split and +0.1 from
      it upwards, both with intercept 0.1), ``'blin'`` (same slopes, but the
      second intercept is chosen so the two pieces meet at the split) or
      ``'nothing'``. The split point is a randomly chosen observed value.

    Parameters
    ----------
    n_sample : int
        Number of observations (rows).
    n_feature : int
        Number of features (columns).

    Returns
    -------
    X : np.ndarray
        Feature matrix of shape ``(n_sample, n_feature)``.
    y : np.ndarray
        Response vector of length ``n_sample``.
    cat_feat : np.ndarray
        ``int64`` array with the column indices of the categorical features.
    """
    # NOTE: the order of the RNG calls below is part of the contract; changing
    # it would change the data generated for a given seed.
    y = np.random.randn(n_sample)
    columns = []
    cat_feat = []
    for feature in range(n_feature):
        if np.random.choice(['cat', 'num'], p=[0.1, 0.9]) == 'cat':
            x = np.random.choice([0, 1, 2], size=n_sample)
            if np.random.choice(['p_conc', 'nothing']) == 'p_conc':
                y = y + np.where(x == 0, 1, np.where(x == 1, -1, 0))
            cat_feat.append(feature)
        else:
            kind = np.random.choice(['pcon', 'plin', 'blin', 'lin', 'nothing'])
            x = np.random.randn(n_sample) * np.random.randint(1, 10) + np.random.randint(1, 10)
            # The split is drawn for every numerical feature, even when the
            # selected effect does not use it.
            split = np.random.choice(x)
            if kind == 'pcon':
                y = y + np.where(x < split, 1, -1)
            elif kind == 'lin':
                y = y + 0.1 + x * 0.1
            elif kind == 'plin':
                y = y + np.where(x < split, 0.1 - 0.1 * x, 0.1 + 0.1 * x)
            elif kind == 'blin':
                # Pick the second intercept so that both lines take the same
                # value at x == split.
                crossing_y = 0.1 - 0.1 * split
                second_intercept = crossing_y - 0.1 * split
                y = y + np.where(x < split, 0.1 - 0.1 * x, second_intercept + 0.1 * x)
        columns.append(x)

    if columns:
        X = np.array(columns).T
    else:
        # Without any feature np.array([]).T would be 1-D with shape (0,);
        # return a proper 2-D matrix with zero columns instead.
        X = np.empty((n_sample, 0))
    return X, y, np.array(cat_feat, dtype=np.int64)

# Experiment

## Full

In [None]:
# Benchmark grid: 20 log-spaced sample sizes (roughly 10 to 10 000) and
# 20 log-spaced feature counts (roughly 2 to 631).
n_samples = np.logspace(1, 4, num=20, dtype=np.int64)
n_features = np.logspace(0, 2.8, num=20, dtype=np.int64) + 1

# Resolve the output file relative to the PILOT checkout in the user's home
# directory (the same root used for the package and the data) instead of a
# user-specific absolute path.
output_path = pathlib.Path.home() / 'PILOT' / 'Output' / 'pilot_computation_time_simulation.csv'

results = []
for n_sample in tqdm(n_samples, desc='n_samples'):
    # leave=False removes each finished inner bar so the output does not
    # accumulate one progress bar per sample size.
    for n_feature in tqdm(n_features, desc='n_features', leave=False):
        for random_seed in range(5):
            # Five replicates per grid point, each on a reproducible data set.
            np.random.seed(random_seed)
            X, y, cat_feat = generate_data(n_sample, n_feature)

            pilot = PILOT(
                max_depth=10,
                min_sample_split=2,
                min_sample_leaf=1,
                truncation_factor=1,
                rel_tolerance=0.01,
                min_unique_values_regression=2,
            )
            # perf_counter is a monotonic, high-resolution clock meant for
            # measuring short durations; time.time() can jump with the system clock.
            start = time.perf_counter()
            pilot.fit(X, y, cat_feat)
            elapsed = time.perf_counter() - start
            results.append(dict(n_samples=n_sample, n_features=n_feature, time_elapsed=elapsed, **pilot.model_tree.nodes_selected(), kind='pilot'))

            # CART baseline with the matching depth / split / leaf limits.
            tree = DecisionTreeRegressor(max_depth=10, min_samples_split=2, min_samples_leaf=1)
            start = time.perf_counter()
            tree.fit(X, y)
            elapsed = time.perf_counter() - start
            results.append(dict(n_samples=n_sample, n_features=n_feature, time_elapsed=elapsed, kind='CART'))

pd.DataFrame(results).to_csv(output_path, index=False)

In [6]:
# Resolve the results file relative to the PILOT checkout in the user's home
# directory instead of a user-specific absolute path.
# NOTE(review): this reads 'pilot_plain_python_...' whereas the benchmark cell
# writes 'pilot_computation_time_simulation.csv' — confirm this is the intended file.
results_path = pathlib.Path.home() / 'PILOT' / 'Output' / 'pilot_plain_python_computation_time_simulation.csv'

# Add log-transformed columns: log time, log n, log p, log(n log n) and log(p log p).
results = pd.read_csv(results_path).assign(
    log_t=lambda d: np.log(d['time_elapsed']),
    log_n=lambda d: np.log(d['n_samples']),
    log_p=lambda d: np.log(d['n_features']),
    log__nlog_n=lambda d: np.log(np.log(d['n_samples']) * d['n_samples']),
    log__plog_p=lambda d: np.log(np.log(d['n_features']) * d['n_features']),
)

# Mean fit time per (n_samples, n_features) grid point, one column per model kind.
summary = results.groupby(['n_samples', 'n_features', 'kind'])['time_elapsed'].mean().unstack()
summary

Unnamed: 0_level_0,kind,CART,pilot
n_samples,n_features,Unnamed: 2_level_1,Unnamed: 3_level_1
10,2,0.000184,0.003707
10,3,0.000172,0.008291
10,4,0.000178,0.014538
10,6,0.000189,0.014877
10,8,0.000246,0.025492
...,...,...,...
88,116,0.002249,7.473681
88,163,0.003259,17.716648
88,228,0.004628,31.770261
88,321,0.005860,73.325343


In [7]:
# Mean fit times for the grid point n_samples=61, n_features=450. The row key
# is given as an explicit tuple and the column axis is spelled out so that 450
# cannot be misread as a column label of the MultiIndex-ed frame.
summary.loc[(61, 450), :]

kind
CART      0.005446
pilot    45.345239
Name: (61, 450), dtype: float64

## Pcon only

In [7]:
# Smaller benchmark grid for the pcon-only run: 10 log-spaced sample sizes and
# 10 log-spaced feature counts.
n_samples = np.logspace(1, 3, num=10, dtype=np.int64)
n_features = np.logspace(0, 1.6, num=10, dtype=np.int64) + 1

# Resolve the output file relative to the PILOT checkout in the user's home
# directory (the same root used for the package and the data) instead of a
# user-specific absolute path.
output_path = pathlib.Path.home() / 'PILOT' / 'Output' / 'pilot_pcon_computation_time_simulation.csv'

results = []
for n_sample in tqdm(n_samples, desc='n_samples'):
    # leave=False removes each finished inner bar so the output does not
    # accumulate one progress bar per sample size.
    for n_feature in tqdm(n_features, desc='n_features', leave=False):
        for random_seed in range(5):
            # Five replicates per grid point, each on a reproducible data set.
            np.random.seed(random_seed)
            X, y, cat_feat = generate_data(n_sample, n_feature)

            # Same settings as the full benchmark, but built from the `Pilot`
            # module and restricted to 'pcon' regression nodes.
            pilot = Pilot.PILOT(
                max_depth=10,
                min_sample_split=2,
                min_sample_leaf=1,
                truncation_factor=1,
                rel_tolerance=0.01,
                min_unique_values_regression=2,
                regression_nodes=['pcon'],
            )
            # perf_counter is a monotonic, high-resolution clock meant for
            # measuring short durations; time.time() can jump with the system clock.
            start = time.perf_counter()
            pilot.fit(X, y, cat_feat)
            elapsed = time.perf_counter() - start
            results.append(dict(n_samples=n_sample, n_features=n_feature, time_elapsed=elapsed, **pilot.model_tree.nodes_selected(), kind='pilot'))

            # CART baseline with the matching depth / split / leaf limits.
            tree = DecisionTreeRegressor(max_depth=10, min_samples_split=2, min_samples_leaf=1)
            start = time.perf_counter()
            tree.fit(X, y)
            elapsed = time.perf_counter() - start
            results.append(dict(n_samples=n_sample, n_features=n_feature, time_elapsed=elapsed, kind='CART'))

pd.DataFrame(results).to_csv(output_path, index=False)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

In [9]:
# Resolve the results file relative to the PILOT checkout in the user's home
# directory instead of a user-specific absolute path.
results_path = pathlib.Path.home() / 'PILOT' / 'Output' / 'pilot_pcon_computation_time_simulation.csv'

# Add log-transformed columns: log time, log n, log p, log(n log n) and log(p log p).
results = pd.read_csv(results_path).assign(
    log_t=lambda d: np.log(d['time_elapsed']),
    log_n=lambda d: np.log(d['n_samples']),
    log_p=lambda d: np.log(d['n_features']),
    log__nlog_n=lambda d: np.log(np.log(d['n_samples']) * d['n_samples']),
    log__plog_p=lambda d: np.log(np.log(d['n_features']) * d['n_features']),
)

# Mean fit time per (n_samples, n_features) grid point, one column per model
# kind, plus how many times slower PILOT is than CART.
summary = results.groupby(['n_samples', 'n_features', 'kind'])['time_elapsed'].mean().unstack()
summary.assign(ratio=summary['pilot'] / summary['CART'])

Unnamed: 0_level_0,kind,CART,pilot,ratio
n_samples,n_features,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10,2,0.000163,0.035932,219.887803
10,3,0.000142,0.002127,14.956084
10,4,0.000169,0.002603,15.424979
10,6,0.000167,0.002596,15.505554
10,8,0.000191,0.003514,18.398002
...,...,...,...,...
1000,8,0.004232,0.347446,82.107648
1000,12,0.006290,0.551456,87.672421
1000,18,0.008939,0.713557,79.827482
1000,27,0.012820,1.089844,85.011177


# Profiling

In [3]:
# Seed the global NumPy RNG so the profiled data set is the same on every run
# instead of depending on whatever state earlier cells left behind.
np.random.seed(0)
X, y, cat = generate_data(1000, 20)

In [10]:
# Model to profile: the PILOT class imported from pilot.pypilot, constructed
# with only 'pcon' in regression_nodes and every other argument at its default.
p = PILOT(regression_nodes=['pcon'])

In [14]:
import cProfile
import pstats

# Profile a single fit of `p` and dump the raw statistics to the file
# 'profile_results' in the current working directory.
cProfile.run('p.fit(X, y, cat)', 'profile_results')
stats = pstats.Stats('profile_results')
# Print only the 25 entries with the largest cumulative time rather than every
# profiled function; the trailing semicolon keeps the notebook from echoing
# the returned Stats object's repr underneath the table.
stats.strip_dirs().sort_stats('cumtime').print_stats(25);

Fri Jun 21 16:59:43 2024    profile_results

         27019194 function calls (27019170 primitive calls) in 37.601 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000   37.601   37.601 {built-in method builtins.exec}
        1    0.000    0.000   37.601   37.601 <string>:1(<module>)
        1    0.000    0.000   37.601   37.601 pypilot.py:814(fit)
     25/1    0.002    0.000   37.599   37.599 pypilot.py:657(build_tree)
       25   10.818    0.433   37.593    1.504 pypilot.py:162(best_split)
   466180    3.659    0.000   15.837    0.000 pypilot.py:145(update_moments)
  2387673    2.945    0.000   11.156    0.000 fromnumeric.py:2123(sum)
  2851527    4.273    0.000    9.044    0.000 fromnumeric.py:69(_wrapreduction)
  4760892    4.451    0.000    4.451    0.000 {method 'reduce' of 'numpy.ufunc' objects}
   953654    0.743    0.000    2.236    0.000 {method 'sum' of 'numpy.ndarray' objects}
   927680 

<pstats.Stats at 0x7f3dc03b8ac0>