In [111]:
import xgboost as xgb
import sklearn.datasets  
import pandas as pd
from sklearn.model_selection import train_test_split
import re
# Load a sample dataset
X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
data= sklearn.datasets.load_breast_cancer(as_frame=True)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train an XGBoost classifier
model = xgb.XGBClassifier(max_depth= 20 , random_state=42)
model.fit(X_train, y_train)


In [None]:
data

In [87]:
import shap
import numpy as np
explainer = shap.Explainer(model)

# Calculate SHAP values
shap_values = explainer(X_train)

In [88]:
feature_importances = pd.DataFrame({
    'Feature': data.feature_names,
    'Importance': np.abs(shap_values.values).mean(axis=0)
})

# Sort features by importance
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)

In [90]:
# Convert the model trees to a DataFrame
booster = model.get_booster()
trees_df = booster.trees_to_dataframe()


In [91]:
def switch(feature_names, series):
    # Create a mapping dictionary
    feature_map = {f'f{i+1}': name for i, name in enumerate(feature_names)}
    
    
    # Map the series using the dictionary
    return series.map(feature_map)

In [92]:
trees_df['Feature1'] = switch(data['feature_names'], trees_df['Feature'])

In [93]:
trees_df['Feature1'] = trees_df['Feature1'].str.replace(" ", "_", regex=False)
trees_df['Feature1'] = trees_df['Feature1'].fillna('Leaf')


In [94]:
trees_df

Unnamed: 0,Tree,Node,ID,Feature,Split,Yes,No,Missing,Gain,Cover,Category,Feature1
0,0,0,0-0,f27,0.142400,0-1,0-2,0-2,305.283295,106.548126,,worst_concavity
1,0,1,0-1,f23,967.000000,0-3,0-4,0-4,48.968056,71.188194,,worst_perimeter
2,0,2,0-2,f20,14.850000,0-5,0-6,0-6,14.253098,35.359928,,fractal_dimension_error
3,0,3,0-3,f13,43.139999,0-7,0-8,0-8,6.420822,66.504761,,perimeter_error
4,0,4,0-4,f26,0.193500,0-9,0-10,0-10,3.601319,4.683434,,worst_compactness
...,...,...,...,...,...,...,...,...,...,...,...,...
597,98,1,98-1,Leaf,,,,,0.023873,1.305181,,Leaf
598,98,2,98-2,Leaf,,,,,-0.027404,1.007463,,Leaf
599,99,0,99-0,f21,25.730000,99-1,99-2,99-2,0.033745,2.304444,,worst_radius
600,99,1,99-1,Leaf,,,,,0.027880,1.029626,,Leaf


In [95]:
def get_rules(tree_no, index, leaf_id):
    # Initialize the list to store the rule path
    rules = []
    
    # Start with the leaf node
    current_node = trees_df[trees_df['ID'] == leaf_id]
    
    while not current_node.empty:
        feature = current_node['Feature1'].values[0]
        split = current_node['Split'].values[0]
        parent_id = current_node['Node'].values[0]
        # Determine if the current node is a 'Yes' or 'No' branch from its parent
        parent_node = trees_df[trees_df['Yes'] == leaf_id] if not trees_df[trees_df['Yes'] == leaf_id].empty else trees_df[trees_df['No'] == leaf_id]
        if parent_node.empty:
            break
        
        parent_feature = parent_node['Feature1'].values[0]
        parent_split = parent_node['Split'].values[0]
        
        if leaf_id in parent_node['Yes'].values:
            rule = f"{parent_feature} <= {parent_split}"
        else:
            rule = f"{parent_feature} > {parent_split}"
        
        rules.append(rule)
        
        # Move to the parent node
        leaf_id = parent_node['ID'].values[0]
        current_node = parent_node
    
    # Reverse the rules to get the correct order from root to leaf
    rules.reverse()
    
    # Print the rule path and the class prediction at the leaf
    #predicted_class = 1 if current_node['Gain'].values[0] > 0 else 0
    rule_path = " and ".join(rules)
    return(f"Tree {tree_no} - Rule {index}: {rule_path} -> {current_node['Gain'].values[0]}")


In [96]:
ruleset=[]
for tree_no in range(101):

    current_tree = trees_df[trees_df['Tree']==tree_no]

    leaf_nodes = current_tree[current_tree['Feature1'] == 'Leaf']

    # Get IDs of leaf nodes
    leaf_ids = leaf_nodes['ID'].tolist()
    for index,id in enumerate(leaf_ids):
        rule = get_rules(tree_no, index, id)
        ruleset.append(rule)
        #print(rule)


In [100]:
import re

In [109]:
pattern = r'(?<=:\s)(.*?)(?=\s->)'

# Find all matches
matches = re.findall(pattern, ruleset[0])
matches[0]


'worst_concavity > 0.142399997 and fractal_dimension_error <= 14.8500004'

In [143]:
BC = sklearn.datasets.load_breast_cancer()
df = pd.DataFrame(data=BC.data, columns=BC.feature_names)
df['target'] = BC.target
pd.set_option('display.max_columns', None)
df.iloc[0].squeeze()

  df.iloc[0].squeeze()[0]


17.99

In [98]:
def evaluate_condition(condition, features):
    """
    Evaluates a condition string using the provided features.
    
    :param condition: A condition string with feature names and values
    :param features: A dictionary containing feature names and their values
    :return: True if the condition is met, otherwise False
    """
    # Replace feature names in condition string with their values
    for feature, value in features.items():
        condition = condition.replace(feature, str(value))
    
    # Evaluate the condition
    return eval(condition)

def predict(features, rules):
    """
    Predicts the output based on the rules.
    
    :param features: A dictionary containing feature names and their values
    :param rules: A list of dictionaries containing rule conditions and outputs
    :return: The predicted output
    """
    for rule in rules:
        if evaluate_condition(rule['condition'], features):
            return rule['output']
    
    # Default output if no conditions are met
    return None

# Example usage
features = {
    'worst_concavity': 0.15,
    'fractal_dimension_error': 14.0,
    'worst_perimeter': 960.0,
    'perimeter_error': 40.0,
    'worst_compactness': 0.2,
    'mean_radius': 23.0,
    'worst_texture': 101.0
}

prediction = predict(features, rules)
print(f"Prediction: {prediction}")


NameError: name 'rules' is not defined