In [271]:
import sys
sys.path.append("..")

In [272]:
import gurobipy
import pandas as pd
from IORFA import *
import numpy as np
import time
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_text
from sklearn.linear_model import LinearRegression

In [273]:
train_ratio = 0.5
val_ratio = 0.25
test_ratio = 0.25
seed = 42

In [274]:

## Create artificial data set with
n = 500
x1 = np.random.normal(loc = 0,scale=1, size=n)
x2 = np.random.normal(loc=0, scale=1, size=n)
x3 = np.random.normal(loc=0, scale=1, size=n)
x4 = np.random.normal(loc=0, scale=1, size=n)
x5 = np.random.normal(loc =0,scale=1, size=n)
x6 = np.random.normal(loc=0, scale=1, size=n)
x7 = np.random.normal(loc=0, scale=1, size=n)
x8 = np.random.normal(loc=0, scale=1, size=n)

simulated_data = pd.DataFrame({'x1': x1, 'x2': x2, 'x3': x3, 'x4': x4, 
                                'x5': x5, 'x6': x6, 'x7': x7, 'x8': x8})

x = simulated_data[[f'x{i}' for i in range(1, 9)]]

orig_cols = simulated_data.columns

# Create an instance of the MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))

# Assuming you have a feature matrix X
# Apply the min-max scaling to the data
x = scaler.fit_transform(x)

y = 0.2*(x[:, 0] > 0.2)*(x[:, 5] < 0.5) + 0.3*x[:, 3]

y_bar = y

y = y_bar

x = np.array(x)
y = np.array(y)


x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=1-train_ratio, random_state=seed)
x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, 
                                                test_size=test_ratio/(test_ratio+val_ratio), random_state=seed)

In [275]:
x_train = pd.DataFrame(x_train, columns = orig_cols)
x_test = pd.DataFrame(x_test, columns = orig_cols)

In [276]:
T = DecisionTreeRegressor(max_depth=2)
T.fit(x_train, y_train)

In [277]:
# Extract the rules from the decision tree
tree_rules = export_text(T, feature_names=list(orig_cols))

print("Tree Rules:")
print(tree_rules)

Tree Rules:
|--- x6 <= 0.50
|   |--- x4 <= 0.59
|   |   |--- value: [0.32]
|   |--- x4 >  0.59
|   |   |--- value: [0.41]
|--- x6 >  0.50
|   |--- x4 <= 0.52
|   |   |--- value: [0.11]
|   |--- x4 >  0.52
|   |   |--- value: [0.20]



In [278]:
import re

def parse_tree_rules(tree_rules_str):
    lines = tree_rules_str.split("\n")
    rules = []
    conditions = []

    for line in lines:
        if '---' in line:
            # Determine the level of the current condition by counting the leading '|'
            level = line.count("|", 0, line.index("---"))
            # Extract variable, operator, and value
            match = re.search(r"(x\d+)\s*([<>=]+)\s*([\d\.]+)", line)
            if match:
                var, op, value = match.groups()

                print(var, op, value)

                # Adjust the size of conditions list to match the current level
                conditions = conditions[:level]
                print()
                conditions.append(f"({var} {op} {value})")

        elif 'value:' in line:
            # Extract value
            value = re.findall(r"\[([\d\.]+)\]", line)[0]
            # Combine conditions to a single string and append to rules
            rule = " & ".join(conditions)
            rules.append((rule, float(value)))

    return conditions

In [279]:
parse_tree_rules(tree_rules)

x6 <= 0.50

x4 <= 0.59

x4 > 0.59

x6 > 0.50

x4 <= 0.52

x4 > 0.52



['(x6 <= 0.50)', '(x6 > 0.50)', '(x4 > 0.52)']

In [280]:
from sklearn.tree import _tree

def get_rules(tree, feature_names, class_names):
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]

    paths = []
    path = []
    
    def recurse(node, path, paths):
        
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            p1, p2 = list(path), list(path)
            p1 += [f"({name} <= {np.round(threshold, 3)})"]
            recurse(tree_.children_left[node], p1, paths)
            p2 += [f"({name} > {np.round(threshold, 3)})"]
            recurse(tree_.children_right[node], p2, paths)
        else:
            path += [(tree_.value[node], tree_.n_node_samples[node])]
            paths += [path]
            
    recurse(0, path, paths)

    # sort by samples count
    samples_count = [p[-1][1] for p in paths]
    ii = list(np.argsort(samples_count))
    paths = [paths[i] for i in reversed(ii)]
    
    rules = []
    for path in paths:
        rule = ""
        
        for p in path[:-1]:
            if rule != "":
                rule += " & "
            rule += str(p)

        rules += [rule]
        
    return rules


In [281]:
rules

['(x6 > 0.5) & (x4 > 0.481)',
 '(x6 > 0.5) & (x4 <= 0.481)',
 '(x6 <= 0.5) & (x4 <= 0.571)',
 '(x6 <= 0.5) & (x4 > 0.571)']

In [282]:
for i, rule in enumerate(rules):
    x_test[f'rule_{i}'] = x_test.eval(rule).astype(int)

In [283]:
x_train

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8
0,0.380217,0.318813,0.314206,0.490491,0.678176,0.351433,0.605666,0.649317
1,0.648094,0.377432,0.303682,0.873934,0.593315,0.568287,0.893026,0.714408
2,0.663560,0.574788,0.230656,0.582927,0.478830,0.196578,0.674897,0.755313
3,0.721451,0.463873,0.275977,0.457763,0.727842,0.400903,0.773866,0.671936
4,0.491168,0.336360,0.284268,0.642601,0.510424,0.304149,0.885499,0.517305
...,...,...,...,...,...,...,...,...
245,0.472280,0.658372,0.429751,0.372000,0.408421,0.416801,0.672361,0.261801
246,0.292107,0.313208,0.266700,0.579649,0.520324,0.560504,0.708351,0.472778
247,0.469423,0.641307,0.302103,0.606805,0.597027,0.650178,0.689605,0.660183
248,0.566199,0.648834,0.451907,0.432633,0.655285,0.062487,0.902941,0.632012


In [284]:
# def linear_regression_pipeline(X_train, X_test, y_train, y_test):

#     model = LinearRegression()
#     model.fit(X_train, y_train)

#     return round(model.score(X_train, y_train), 3), round(model.score(X_test, y_test), 3), model.coef_

In [285]:
# train_R2, test_R2 = linear_regression_pipeline(x_train, x_test, y_train, y_test)

In [290]:
# train_R2, test_R2

Objective function

In [294]:
from sklearn.metrics import mean_squared_error
# Algo 8.1 LocalSearch

def InitialSteps(X, y, max_depth, alpha):

    X = X.copy()

    T = DecisionTreeRegressor(max_depth=max_depth)
    T.fit(x_train, y_train)

    rules = get_rules(T, list(orig_cols), None)
    for i, rule in enumerate(rules):
        X[f'rule_{i}'] = X.eval(rule).astype(int)
    
    rulefit = LinearRegression()
    rulefit.fit(X, y) # rulefit is LinReg because we added rules already :D
    SSE = mean_squared_error(rulefit.predict(X), y)

    ####### Needed to define Loss(T, X, y) ######
    is_leaf = np.logical_and(T.tree_.children_left == -1, T.tree_.children_right == -1)
    d = np.count_nonzero(~is_leaf) # recall complexity d is |T| = number of branching nodes (not 2^max_depth)
    ######

    n = y.shape[0]
    L_init = SSE*n + alpha*d

    print(f"SSE contributes {SSE*n*100/L_init}% to the initial loss")
    print(f"Complexity contributes {alpha*d*100/L_init}%")

    return L_init

In [299]:
InitialSteps(x_train, y_train, 3, alpha = 0.00001)

SSE contributes 1.2360291662818856e-23% to the initial loss
Complexity contributes 100.0%


7.000000000000001e-05

In [319]:
def nodes(T):

    lc = list(T.tree_.children_left)
    rc = list(T.tree_.children_right)

    nodes = [0] + lc + rc # nodes are root + left children + right_children
    nodes = list(set(nodes)) # but some children can be duplicates
    nodes = np.setdiff1d(nodes, np.array(-1)) # also -1 is not a node but indicates a node doesn't exist

    return nodes

In [324]:
def shuffle(arr):
    # need a fn for this because np.random.shuffle is not inplace
    arr_copy = arr.copy()
    np.random.shuffle(arr_copy)
    return arr_copy 

In [328]:
def find_leaf_nodes(T, t):
    # List to store the leaf nodes
    leaf_nodes = []

    # Recursive helper function
    def explore(node):
        # If this is a leaf node, add it to the list
        if T.tree_.children_left[node] == T.tree_.children_right[node] == -1:
            leaf_nodes.append(node)
        else:
            # If this is a branch node, explore its children
            if T.tree_.children_left[node] != -1:
                explore(T.tree_.children_left[node])
            if T.tree_.children_right[node] != -1:
                explore(T.tree_.children_right[node])

    # Start exploring from node t
    explore(t)

    return leaf_nodes

In [346]:
def construct_I(T, X, nodes_in_T_t):

    
    belongs_to = T.apply(X) # leaf node obs belong to

    I = []

    for i in range(X.shape[0]):

        if belongs_to[i] in nodes_in_T_t:

            I.append(i) 
    
    return I

In [None]:
def OptimizeNodeParallel(T, X, y):

    # ... to finish

In [349]:
def LocalSearch(T, X, y, max_depth):
    
    # Extracting initial loss
    error_prev = InitialSteps(X, y, max_depth, alpha = 0.00001)
    
    X = np.array(X)
    y = np.array(y)
    
    for t in shuffle(nodes(T)):

        nodes_in_T_t = find_leaf_nodes(T, t)

        I = construct_I(T, X, nodes_in_T_t)

        T_t = OptimizeNodeParallel(T_t, X[I], y[I])

        # Replace tth node in T with T_t


