In [65]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import export_text
import random

# Tạo dữ liệu ngẫu nhiên
data = {
    'Feature1': [random.choice(['A', 'B']) for _ in range(100)],
    'Feature2': [random.choice(['X', 'Y', 'Z']) for _ in range(100)],
    'Feature3': [random.choice(['One', 'Two', 'Three', 'Four']) for _ in range(100)],
    'Feature4': [random.choice(['Red', 'Green', 'Blue', 'Yellow' ]) for _ in range(100)],
    'Feature5': [random.uniform(0.0, 10.0) for _ in range(100)],
    'Label': [random.choice([0, 1]) for _ in range(100)]
}

df = pd.DataFrame(data)

# Chia dữ liệu thành tập huấn luyện và tập kiểm tra
X = df.drop('Label', axis=1)
y = df['Label']
X_train=X
y_train=y
data_test = {
    'Feature1': [random.choice(['A', 'B']) for _ in range(100)],
    'Feature2': [random.choice(['X', 'Y', 'Z']) for _ in range(100)],
    'Feature3': [random.choice(['One', 'Two', 'Three', 'Four']) for _ in range(100)],
    'Feature4': [random.choice(['Red', 'Green', 'Blue', 'Yellow' ]) for _ in range(100)],
    'Feature5': [random.uniform(0.0, 10.0) for _ in range(100)],
    'Label': [random.choice([0, 1]) for _ in range(100)]
}
df_test = pd.DataFrame(data_test)
# Chia dữ liệu thành tập huấn luyện và tập kiểm tra
X_test = df_test.drop('Label', axis=1)
y_test = df_test['Label']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

from sklearn.preprocessing import LabelEncoder, StandardScaler
label_encoders = {}
for column in X_train.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X_train[column] = le.fit_transform(X_train[column])
    X_test[column] = le.transform(X_test[column])
    label_encoders[column] = le

scaler = StandardScaler()
numerical_features = X_train.select_dtypes(exclude=['object']).columns
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

# Sử dụng một mô hình học máy, ví dụ Decision Tree Classifier
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Đo lường hiệu suất của mô hình
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Trực quan hóa cây quyết định
tree_rules = export_text(clf, feature_names=list(X.columns))
print(tree_rules)

Accuracy: 0.49
|--- Feature5 <= -1.53
|   |--- class: 1
|--- Feature5 >  -1.53
|   |--- Feature5 <= -1.28
|   |   |--- class: 0
|   |--- Feature5 >  -1.28
|   |   |--- Feature5 <= -0.99
|   |   |   |--- Feature2 <= -0.77
|   |   |   |   |--- class: 0
|   |   |   |--- Feature2 >  -0.77
|   |   |   |   |--- Feature3 <= -0.83
|   |   |   |   |   |--- Feature1 <= -0.02
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- Feature1 >  -0.02
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |--- Feature3 >  -0.83
|   |   |   |   |   |--- class: 1
|   |   |--- Feature5 >  -0.99
|   |   |   |--- Feature3 <= 0.06
|   |   |   |   |--- Feature5 <= -0.94
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- Feature5 >  -0.94
|   |   |   |   |   |--- Feature5 <= -0.66
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- Feature5 >  -0.66
|   |   |   |   |   |   |--- Feature5 <= -0.31
|   |   |   |   |   |   |   |--- Feature4 <= -0.76
|   |   |   |   |   |   |   |   |--- clas

In [74]:
from collections import Counter

#get the leaf for each training sample
leaves_index = clf.apply(X_train)
#use Counter to find the number of elements on each leaf
cnt = Counter( leaves_index )
print(cnt)
top_5_rules = cnt.most_common(5)
print(top_5_rules)


Counter({76: 8, 11: 7, 20: 6, 74: 6, 3: 5, 1: 5, 63: 4, 16: 4, 71: 4, 66: 3, 41: 3, 59: 3, 36: 3, 45: 3, 10: 2, 19: 2, 69: 2, 75: 2, 51: 2, 38: 2, 14: 2, 50: 2, 35: 2, 22: 2, 9: 2, 39: 1, 33: 1, 30: 1, 44: 1, 49: 1, 48: 1, 34: 1, 57: 1, 32: 1, 70: 1, 62: 1, 68: 1, 61: 1, 6: 1})
[(76, 8), (11, 7), (20, 6), (74, 6), (3, 5)]


In [77]:
# Assuming clf is a trained DecisionTreeClassifier
# Get the leaf index for each training sample
leaves_index = clf.apply(X_train)

# Define a function to extract rules based on leaf indices
def extract_rules_from_leaves(tree, leaf_indices, feature_names, class_names):
    leaf_rules = {}

    def traverse_tree(node, current_rule):
        if tree.feature[node] != -2:
            feature = feature_names[tree.feature[node]]
            threshold = tree.threshold[node]

            # Extend the current rule with the feature and threshold
            rule_extension = f"if {feature} <= {threshold:.4f}"
            new_rule = current_rule + ("" if current_rule == "" else " and ") + rule_extension

            # Traverse the left and right children
            traverse_tree(tree.children_left[node], new_rule)
            traverse_tree(tree.children_right[node], new_rule)
        else:
            # Leaf node, add the class prediction to the rule
            class_prediction = class_names[np.argmax(tree.value[node])]
            leaf_rules[leaf_indices[node]] = current_rule + f" then class: {class_prediction}"

    # Start traversing from the root
    traverse_tree(0, "")

    return leaf_rules

# Get the rules for each leaf index
leaf_rules = extract_rules_from_leaves(clf.tree_, leaves_index, X_train.columns, clf.classes_)

# Print the rules for each leaf
for leaf_index, rule in leaf_rules.items():
    print(f"Leaf {leaf_index}: {rule}")

Leaf 19: if Feature5 <= -1.5320 then class: 1
Leaf 75: if Feature5 <= -1.5320 and if Feature5 <= -1.2755 then class: 0
Leaf 66: if Feature5 <= -1.5320 and if Feature5 <= -1.2755 and if Feature5 <= -0.9922 and if Feature3 <= 0.0622 and if Feature1 <= -0.0200 and if Feature4 <= 1.2078 and if Feature5 <= 0.8356 and if Feature5 <= 0.3585 and if Feature4 <= -0.7561 and if Feature5 <= 0.1807 and if Feature5 <= 0.2706 then class: 1
Leaf 20: if Feature5 <= -1.5320 and if Feature5 <= -1.2755 and if Feature5 <= -0.9922 and if Feature3 <= 0.0622 and if Feature1 <= -0.0200 and if Feature5 <= 0.0303 then class: 0
Leaf 51: if Feature5 <= -1.5320 and if Feature5 <= -1.2755 and if Feature5 <= -0.9922 and if Feature2 <= -0.7716 and if Feature3 <= -0.8268 and if Feature1 <= -0.0200 then class: 0
Leaf 16: if Feature5 <= -1.5320 and if Feature5 <= -1.2755 and if Feature5 <= -0.9922 and if Feature3 <= 0.0622 and if Feature1 <= -0.0200 and if Feature4 <= 1.2078 and if Feature5 <= 0.8356 and if Feature5 <= 0