Реализация дерева решений для Uplift-моделирования с использованием numpy. За основу взят следующий код:


https://www.kaggle.com/code/pratikgarai/decision-tree-regression-from-scratch-using-numpy

В качестве метрики используется **DeltaDeltaP**.

In [163]:
from dataclasses import dataclass, make_dataclass
import numpy as np


class Node: 
    def __init__(self) :
        self.left = None
        self.right = None
        self.feature_id = None
        self.feature_thresh = None
        self.is_leaf = False
        self.result = None
        self.metric = 0
        self.depth = -1
    
    def set_leaf(self, result, level) :
        self.is_leaf = True
        self.result = result
        self.level = level
    
    def set_branch(self, best_feature, best_feature_thresh, best_metric, left, right, depth) :
        self.feature_id = best_feature
        self.feature_thresh = best_feature_thresh
        self.metric = best_metric
        self.left = left
        self.right = right
        self.depth = depth
    
    def traverse(self, inp):
        if self.is_leaf: 
            return self.result, True
        if inp[self.feature_id] <= self.feature_thresh : 
            return self.left, False
        else: 
            return self.right, False
    
    def __repr__(self) : 
        s = ""
        s += f"Level   : {self.depth}\n"
        s += f"Leaf    : {self.is_leaf}\n"
        if self.is_leaf : 
            s += f"Result  : {self.result}\n"
        else : 
            s += f"Feature : {self.feature_id}\n"
            s += f"Thresh  : {self.feature_thresh}\n"
        return s


@dataclass
class UpliftTreeRegressor:
    """

    Parameters
    ----------
    max_depth : np.int :
        maximum depth of tree    
    min_samples_leaf : int :
        minimum count of samples in leaf    
    min_samples_leaf_treated : int :
        minimum count of treated samples in leaf
    min_samples_leaf_control : int :    
        minimum count of control samples in leaf
    Returns
    -------
    
    """
    max_depth: int = 3  
    min_samples_leaf: int = 1000  
    min_samples_leaf_treated: int = 300  
    min_samples_leaf_control: int = 300  
    

    def fit(self, X: np.ndarray, treatment: np.ndarray, y: np.ndarray):
        """Fit model."""
        data = np.concatenate((X, treatment.reshape(-1, 1), y.reshape(-1, 1)), axis=1)
        self.root = self.build(0, data)

        
    def split_data(self, data, feature_idx, thresh) : 
        return  data[data[:, feature_idx] <= thresh], data[data[:, feature_idx] > thresh]
    
    def build(self, depth, data):
        n = Node()       
        if depth==self.max_depth:            
            n.set_leaf(np.mean(data[:, -1]), depth)
            return n
        elif len(data[data[:, -2]==1])<self.min_samples_leaf_treated:
            n.set_leaf(np.mean(data[:, -1]), depth)
            return n
        elif len(data[data[:, -2]==0])<self.min_samples_leaf_control:
            n.set_leaf(np.mean(data[:, -1]), depth)
            return n
        else:
            n_features = data[:, :-2].shape[1]
            best_metric = 0
            left_partition = None
            right_partition = None
            best_feature = None
            best_feature_thresh = None 
            
            for feature_idx in range(n_features):
                column_values = data[:, feature_idx]
                unique_values = np.unique(column_values)
                if len(unique_values) > 10:
                    percentiles = np.percentile(
                        column_values, [3, 5, 10, 20, 30, 50, 70, 80, 90, 95, 97]
                    )
                else:
                    percentiles = np.percentile(unique_values, [10, 50, 90])
                threshold_options = np.unique(percentiles)   
                
                for thresh in threshold_options:
                    left, right = self.split_data(data, feature_idx, thresh)
                    if len(left)<self.min_samples_leaf or len(right)<self.min_samples_leaf:
                        continue
                    delta_left = np.absolute(np.mean(left[left[:, -2]==1][:, -1])-np.mean(left[left[:, -2]==0][:, -1]))
                    delta_right = np.absolute(np.mean(right[right[:, -2]==1][:, -1])-np.mean(right[right[:, -2]==0][:, -1]))
                    delta_delta_p = np.absolute(delta_left-delta_right)
                    if delta_delta_p > best_metric:
                        best_metric=delta_delta_p
                        left_partition = left
                        right_partition = right
                        best_feature = feature_idx
                        best_feature_thresh = thresh
                        
            if best_metric:           # костыль, чтобы обходить исчерпание цикла
                n.set_branch(
                    best_feature, 
                    best_feature_thresh,
                    best_metric,
                    self.build(depth+1, left_partition),
                    self.build(depth+1, right_partition),
                    depth
                )
            else:
                n.set_leaf(np.mean(data[:, -1]), depth)
            return n

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return predicts for X."""
        if not self.root : 
            raise Exception("Tree Not Fit Yet!")
        predictions = []        
        for i in X : 
            res = self.root
            fin = False
            while not fin : 
                res, fin = res.traverse(i)
            predictions.append(res)
        return np.array(predictions)       

In [56]:
import pandas as pd
df = pd.read_csv('uplift_experiment.csv')
df.head()

Unnamed: 0,id,feat0,feat1,feat2,feat3,feat4,treatment,target
0,0,0.400729,0.808413,0.943385,0.19877,-2.896355,1,906
1,1,1.115889,-2.107072,-0.512322,-0.2068,0.705603,1,558
2,2,-0.153704,-0.010682,-0.38602,0.274708,-0.277327,1,386
3,3,-1.598411,0.084529,-1.271734,-1.552409,0.819762,0,391
4,4,2.328404,-1.731663,0.069395,0.339403,0.629478,0,434


In [58]:
X = df[['feat0', 'feat1', 'feat2', 'feat3', 'feat4']].to_numpy()
treatment = df['treatment'].to_numpy()
y = df['target'].to_numpy()

In [103]:
from sklearn.model_selection import train_test_split
X_train, X_test, treatment_train, treatment_test, y_train, y_test = train_test_split(
    X, treatment, y, test_size=0.33, random_state=42)

In [164]:
uplift = UpliftTreeRegressor()
uplift.fit(X_train, treatment_train, y_train)
uplift_preds = uplift.predict(X_test)

In [202]:
def uplift_at_k(y, uplift, treatment):
    data = np.concatenate((y.reshape(-1, 1), uplift.reshape(-1, 1), treatment.reshape(-1, 1)),  axis=1)
    data = data[data[:, 1].argsort()[::-1]] # сортируем по убыванию uplift
    top = len(data)//5
    data = data[:top, :] # отбираем 20% топа
    treatment = np.absolute(np.mean(data[data[:, -1]==1][:, 0]))
    control = np.absolute(np.mean(data[data[:, -1]==0][:, 0]))
    return treatment-control

In [203]:
uplift_at_k(y_test, uplift_preds, treatment_test)

426.21284216147535