In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
import dtreeviz
import matplotlib.pyplot as plt
from pypinyin import pinyin, lazy_pinyin, Style

In [None]:
def word_to_py(word):
    """Return the upper-cased first-letter pinyin of ``word`` as a single string.

    ``pinyin`` is called with ``Style.FIRST_LETTER``; entry 0 of every element
    it returns is converted to ``str``, upper-cased, and concatenated in order.
    """
    readings = pinyin(word, style=Style.FIRST_LETTER)
    return "".join(str(reading[0]).upper() for reading in readings)

In [None]:
# Load the training table from a CSV file on the local disk.
data = pd.read_csv(r"F:\cache_data\zone_ana\sb\train_data\train_20240905.csv")

In [None]:
# Overwrite NEW_TL element-wise with the value returned by word_to_py
# (the upper-cased pinyin first letters of the original text).
data["NEW_TL"] =data['NEW_TL'].apply(word_to_py)

In [None]:
# Integer code of each NEW_TL value, taken from the pandas category codes.
data["TL_label"] = data.NEW_TL.astype("category").cat.codes

In [None]:
# Store the label code and the two text attributes as pandas categoricals.
for column_name in ('TL_label', 'DLMC', 'MZMC'):
    data[column_name] = data[column_name].astype('category')

In [None]:
# For every label code, collect the distinct NEW_TL strings that carry it,
# then print the resulting {code: [names]} dictionary.
names_by_code = data.groupby('TL_label', observed=True)["NEW_TL"]
result = names_by_code.apply(lambda names: list(names.unique())).to_dict()
print(result)

In [None]:
# Name of the target column.
label = 'NEW_TL'
# Names of the columns used as model inputs.
# NOTE(review): several entries differ only by underscores
# (e.g. 'aligned_AnalyticalHillshading_MEAN' vs 'aligned_Analytical_Hillshading_MEAN');
# confirm both variants really exist in the CSV and are not duplicated attributes.
# NOTE(review): 'slope_postion_101_smooth_MAJORITY' is spelled "postion" —
# presumably this matches the column name in the source data; verify.
features = ['DLMC','MZMC','Centroid_X','Centroid_Y','aligned_AnalyticalHillshading_MEAN','aligned_ChannelNetworkBaseLevel_MEAN','aligned_ChannelNetworkDistance_MEAN','aligned_ConvergenceIndex_MEAN',
 'aligned_Analytical_Hillshading_MEAN','aligned_Aspect_MEAN','aligned_Channel_Network_Base_Level_MEAN','aligned_Channel_Network_Distance_MEAN','aligned_Convergence_Index_MEAN',
 'aligned_dem_MEAN','aligned_ETP2022_3_MEAN','aligned_ETP2022_8_MEAN','aligned_ETP2022_mean_MEAN','aligned_evi_MEAN','aligned_LS_Factor_MEAN','aligned_lswi_MEAN','aligned_mndwi_MEAN',
 'aligned_ndmi_MEAN','aligned_ndvi_MEAN','aligned_ndwi_MEAN','aligned_NIGHT2022_MEAN','aligned_pca_1_MEAN','aligned_pca_2_MEAN','aligned_Plan_Curvature_MEAN','aligned_PRE2022_3_MEAN',
 'aligned_PRE2022_8_MEAN','aligned_PRE2022_mean_MEAN','aligned_Profile_Curvature_MEAN','aligned_Relative_Slope_Position_MEAN','aligned_savi_MEAN','aligned_Slope_MEAN','aligned_TMP2022_3_MEAN',
 'aligned_TMP2022_8_MEAN','aligned_TMP2022_mean_MEAN','aligned_Topographic_Wetness_Index_MEAN','aligned_Total_Catchment_Area_MEAN','aligned_Valley_Depth_MEAN','aligned_vari_MEAN',
 'MRRTF_MEAN','MRVBF_MEAN','slope_postion_101_smooth_MAJORITY']

In [None]:
# Keep only the label column followed by the feature columns.
result_df = data[[label]+features]

In [None]:
# Write the label + feature table to disk; index=False leaves the row index out of the file.
result_df.to_csv(r"C:\Users\Runker\Desktop\TEST.csv", index=False)

In [None]:
# Split features and label.
# .copy() gives X its own data: the imputation cell that follows assigns into X,
# and without the copy that assignment targets a slice of `data`
# (SettingWithCopyWarning, ambiguous write-through).
X = data[features].copy()
y = data[label]

In [None]:
# Identify the numeric feature columns. 'number' covers every integer/float
# width, not only float64/int64, so narrower numeric columns are imputed too.
numerical_cols = X.select_dtypes(include='number').columns

# Fill NaN in each numeric column with that column's mean.
# fillna() with a Series keyed by column name fills column by column and
# returns a new frame; rebinding X avoids assigning into a slice of `data`.
# Columns that are not in the Series (the categorical ones) are left as they are.
X = X.fillna(X[numerical_cols].mean())

In [None]:
# X is expected to be a pandas DataFrame.
# Categorical columns are those with object/category dtype; every other column
# is treated as numeric.
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
numerical_cols = X.columns.difference(categorical_cols)
# Column-wise preprocessing applied by the ColumnTransformer.
transformers = [
    ('num', 'passthrough', numerical_cols),  # numeric columns are passed through unchanged
    ('cat', OneHotEncoder(), categorical_cols)  # categorical columns are one-hot encoded
]
ct = ColumnTransformer(transformers)

# Apply the transformation.
X_encoded = ct.fit_transform(X)
# Train the decision tree on the encoded features.
# random_state is fixed (same seed as the other model cells in this notebook)
# so that re-running the cell reproduces the same tree.
clf = DecisionTreeClassifier(random_state=42)
clf = clf.fit(X_encoded, y)


In [None]:


# Visualize the decision tree.
# ColumnTransformer.fit_transform returns a NumPy array or a SciPy sparse
# matrix, neither of which has a `.columns` attribute; the names of the encoded
# columns are taken from the fitted transformer instead.
encoded_feature_names = list(ct.get_feature_names_out())
# One-hot output may be sparse; convert it so dtreeviz receives a plain 2-D array.
X_plot = X_encoded.toarray() if hasattr(X_encoded, "toarray") else X_encoded
# NOTE(review): y holds the NEW_TL strings here — confirm that dtreeviz accepts
# non-numeric class targets; otherwise pass integer codes instead.
viz = dtreeviz.model(clf,
               X_plot,
               y,
               target_name=label,
               feature_names=encoded_feature_names,
               class_names=list(clf.classes_))

# Show the tree.
viz.view()

# To save the tree image, the following can be used:
# viz.save("decision_tree.svg")


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import matplotlib.pyplot as plt

# Load the data
file_path = r"C:\Users\Runker\Desktop\TEST.csv"
data = pd.read_csv(file_path)

# Encode text columns to numerical values; each fitted encoder is kept so the
# original strings (encoder.classes_) stay available for display.
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Impute missing values in numerical columns with the median.
# The column selection is computed once and used on both sides of the assignment.
num_imputer = SimpleImputer(strategy='median')
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
data[numeric_columns] = num_imputer.fit_transform(data[numeric_columns])

# Split data into features and target variable
X = data.drop('NEW_TL', axis=1)
y = data['NEW_TL']

# Train the decision tree model
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X, y)

# Plot the decision tree.
# feature_names / class_names are passed as plain lists of str (same form the
# later dtreeviz cell uses) rather than a pandas Index and a NumPy array, which
# some scikit-learn releases reject during parameter validation.
plt.figure(figsize=(20,10))
tree.plot_tree(
    dt_model,
    feature_names=X.columns.tolist(),
    class_names=[str(cls) for cls in label_encoders['NEW_TL'].classes_],
    filled=True,
    rounded=True,
)
plt.show()


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
import dtreeviz
import warnings
# Ignore specific warnings (matplotlib "Glyph ... missing from current font").
warnings.filterwarnings("ignore", message="Glyph .* missing from current font.")
# Load the data
file_path = r"C:\Users\Runker\Desktop\TEST.csv"
data = pd.read_csv(file_path)

# Encode text columns to numerical values; each fitted encoder is kept so the
# original strings (encoder.classes_) stay available for display.
label_encoders = {}
for column in data.select_dtypes(include=['object','category']).columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Impute missing values in numerical columns with the median.
# The column selection is computed once and used on both sides of the assignment.
num_imputer = SimpleImputer(strategy='median')
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
data[numeric_columns] = num_imputer.fit_transform(data[numeric_columns])

# Split data into features and target variable
X = data.drop('NEW_TL', axis=1)
y = data['NEW_TL']

# Train the decision tree model
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X, y)


# Visualize the decision tree using dtreeviz.
# Names are handed over as plain lists of str (not a pandas Index / raw
# encoder values), matching the form used by the following cell.
feature_names = X.columns.tolist()
class_names = [str(cls) for cls in label_encoders['NEW_TL'].classes_]
viz = dtreeviz.model(dt_model, X, y, target_name='NEW_TL', feature_names=feature_names, class_names=class_names)

# Display the visualization
viz.view()


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
import dtreeviz
import warnings

# Silence the "Glyph ... missing from current font" warnings.
warnings.filterwarnings("ignore", message="Glyph .* missing from current font.")

# Read the exported label + feature table.
file_path = r"C:\Users\Runker\Desktop\TEST.csv"
data = pd.read_csv(file_path)

# Replace every text column by integer codes and remember the fitted encoder
# of each column under its column name.
label_encoders = {}
text_columns = data.select_dtypes(include=['object', 'category']).columns
for column in text_columns:
    le = LabelEncoder()
    label_encoders[column] = le
    data[column] = le.fit_transform(data[column])

# Median-impute the float64/int64 columns.
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
num_imputer = SimpleImputer(strategy='median')
data[numeric_columns] = num_imputer.fit_transform(data[numeric_columns])

# Target is NEW_TL; every remaining column is a feature.
y = data['NEW_TL']
X = data.drop(columns='NEW_TL')

# Fit the decision tree (fit() returns the estimator itself).
dt_model = DecisionTreeClassifier(random_state=42).fit(X, y)

# Names as plain Python lists of str.
feature_names = list(X.columns)
class_names = list(map(str, label_encoders['NEW_TL'].classes_))

# Build the dtreeviz model adaptor and render it.
viz = dtreeviz.model(
    dt_model,
    X,
    y,
    target_name='NEW_TL',
    feature_names=feature_names,
    class_names=class_names,
)
viz.view()


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
import plotly.graph_objects as go
import networkx as nx
import warnings
from plotly.offline import plot
import plotly.io as pio

# Silence the "Glyph ... missing from current font" warnings.
warnings.filterwarnings("ignore", message="Glyph .* missing from current font.")

# Read the exported label + feature table.
file_path = r"C:\Users\Runker\Desktop\TEST.csv"
data = pd.read_csv(file_path)

# Replace every text column by integer codes and remember the fitted encoder
# of each column under its column name.
label_encoders = {}
text_columns = data.select_dtypes(include=['object', 'category']).columns
for column in text_columns:
    le = LabelEncoder()
    label_encoders[column] = le
    data[column] = le.fit_transform(data[column])

# Median-impute the float64/int64 columns.
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
num_imputer = SimpleImputer(strategy='median')
data[numeric_columns] = num_imputer.fit_transform(data[numeric_columns])

# Target is NEW_TL; every remaining column is a feature.
y = data['NEW_TL']
X = data.drop(columns='NEW_TL')

# Fit the decision tree (fit() returns the estimator itself).
dt_model = DecisionTreeClassifier(random_state=42).fit(X, y)

# Pull the flat per-node arrays out of the fitted tree object.
fitted_tree = dt_model.tree_
n_nodes = fitted_tree.node_count
children_left = fitted_tree.children_left
children_right = fitted_tree.children_right
feature = fitted_tree.feature
threshold = fitted_tree.threshold
value = fitted_tree.value

# Names used when building node labels, as plain Python lists of str.
feature_names = list(X.columns)
class_names = list(map(str, label_encoders['NEW_TL'].classes_))

def create_node_label(node, feature, threshold, value):
    """Build the display text for tree node ``node``.

    A node whose ``feature`` entry is -2 is handled as a leaf and labelled with
    the class name at the argmax of ``value[node][0]``; any other node is
    labelled with its split as "<feature name> <= <threshold, 2 decimals>".
    Reads the module-level ``feature_names`` and ``class_names`` lists.
    """
    if feature[node] == -2:  # leaf marker
        winning_index = np.argmax(value[node][0])
        return f"Class: {class_names[winning_index]}"
    return f"{feature_names[feature[node]]} <= {threshold[node]:.2f}"

# One label per node id, in node-id order.
node_labels = []
for node_id in range(n_nodes):
    node_labels.append(create_node_label(node_id, feature, threshold, value))

# (parent, child) pairs, left child before right child. Nodes whose left and
# right child ids are equal are skipped (they are treated as leaves).
edges = [
    (parent, child)
    for parent in range(n_nodes)
    if children_left[parent] != children_right[parent]
    for child in (children_left[parent], children_right[parent])
]

# Function to compute node depths
def compute_node_depths(n_nodes, children_left, children_right):
    """Return an int64 array holding the depth of every node (root = node 0, depth 0).

    The tree is walked with an explicit stack starting at node 0. A node whose
    left and right child ids are equal is not expanded further.
    """
    depths = np.zeros(n_nodes, dtype=np.int64)
    pending = [(0, 0)]  # (node id, depth) pairs still to visit
    while pending:
        current, level = pending.pop()
        depths[current] = level
        left, right = children_left[current], children_right[current]
        if left == right:
            continue
        pending.append((left, level + 1))
        pending.append((right, level + 1))
    return depths

# Depth of every node, indexed by node id (the walk starts at node 0 with depth 0).
node_depths = compute_node_depths(n_nodes, children_left, children_right)

# Create node coordinates
def hierarchy_pos(G, root, width=1., vert_gap = 0.2, vert_loc = 0, xcenter = 0.5):
    """Return ``{node: (x, y)}`` for ``root`` and everything reachable below it.

    ``root`` is placed at ``(xcenter, vert_loc)``. The children of a node split
    that node's horizontal span ``width`` into equal slices, each child sits in
    the middle of its slice, and every level is ``vert_gap`` lower than its
    parent. For graphs that are not ``nx.DiGraph`` the parent is removed from
    the neighbour list so the walk only moves away from ``root``.
    """
    coords = {}

    def _place(node, span, y, x, parent):
        # Record this node, then recurse into its children left to right.
        coords[node] = (x, y)
        children = list(G.neighbors(node))
        if not isinstance(G, nx.DiGraph) and parent is not None:
            children.remove(parent)
        if not children:
            return
        dx = span/len(children)
        nextx = x - span/2 - dx/2
        for child in children:
            nextx += dx
            _place(child, dx, y-vert_gap, nextx, node)

    _place(root, width, vert_loc, xcenter, None)
    return coords

# Undirected graph of the tree.
# Every node id is registered before the edges are added: a tree that consists
# of the root only has no edges, and without this step node 0 would be missing
# from G and the layout call below would fail. Registering ids first also puts
# G.nodes() in node-id order.
G = nx.Graph()
G.add_nodes_from(range(n_nodes))
G.add_edges_from(edges)
pos = hierarchy_pos(G, 0)

# Create Plotly figure
# Edge coordinates: every edge contributes (start, end, None); the None entry
# breaks the line so consecutive edges are not joined.
edge_x, edge_y = [], []
for parent_id, child_id in edges:
    x0, y0 = pos[parent_id]
    x1, y1 = pos[child_id]
    edge_x += [x0, x1, None]
    edge_y += [y0, y1, None]

edge_trace = go.Scatter(
    x=edge_x, y=edge_y, line=dict(width=0.5, color='#888'), hoverinfo='none', mode='lines')

# Per-node attributes, all built from the same list of real node ids.
# G.nodes() follows insertion order, which is not node-id order when nodes are
# added through the edge list, so a positional index (enumerate) must not be
# used as a node id — doing so attaches colours/labels to the wrong points.
node_ids = list(G.nodes())
node_x = [pos[node_id][0] for node_id in node_ids]
node_y = [pos[node_id][1] for node_id in node_ids]
node_colors = [int(node_depths[node_id]) for node_id in node_ids]
node_texts = [
    f'节点 {node_id}<br>{node_labels[node_id]}<br>深度: {node_depths[node_id]}'
    for node_id in node_ids
]

node_trace = go.Scatter(
    x=node_x, y=node_y, text=node_texts, mode='markers+text', textposition="top center",
    hoverinfo='text', marker=dict(
        showscale=True,
        colorscale='YlGnBu',
        reversescale=True,
        color=node_colors,
        size=10,
        colorbar=dict(
            thickness=15,
            # Nested title form; replaces the flat `titleside` property.
            title=dict(text='节点深度', side='right'),
            xanchor='left'
        ),
        line_width=2))

fig = go.Figure(data=[edge_trace, node_trace],
             layout=go.Layout(
                # Nested title form; replaces the flat `titlefont_size` property.
                title=dict(text='<br>决策树可视化', font=dict(size=16)),
                showlegend=False,
                hovermode='closest',
                margin=dict(b=20,l=5,r=5,t=40),
                annotations=[ dict(
                    text="基于用户数据的决策树",
                    showarrow=False,
                    xref="paper", yref="paper",
                    x=0.005, y=-0.002 ) ],
                xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))
                )
# Save as an interactive HTML file
plot(fig, filename='decision_tree_interactive.html', auto_open=False)
# Display the visualization
fig.show()

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
import plotly.graph_objects as go
import networkx as nx
import warnings

# Silence the "Glyph ... missing from current font" warnings.
warnings.filterwarnings("ignore", message="Glyph .* missing from current font.")

# Read the exported label + feature table.
file_path = r"C:\Users\Runker\Desktop\TEST.csv"
data = pd.read_csv(file_path)

# Record which columns are text-like and which are float64/int64 *before* any
# encoding, so the distinction is still available once everything is numeric.
categorical_columns = data.select_dtypes(include=['object', 'category']).columns
continuous_columns = data.select_dtypes(include=['float64', 'int64']).columns

# Replace every text-like column by integer codes and remember the fitted
# encoder of each column under its column name.
label_encoders = {}
for column in categorical_columns:
    le = LabelEncoder()
    label_encoders[column] = le
    data[column] = le.fit_transform(data[column])

# Median-impute the columns recorded as continuous.
num_imputer = SimpleImputer(strategy='median')
imputed_values = num_imputer.fit_transform(data[continuous_columns])
data[continuous_columns] = imputed_values

# Target is NEW_TL; every remaining column is a feature.
y = data['NEW_TL']
X = data.drop(columns='NEW_TL')

# Fit the decision tree (fit() returns the estimator itself).
dt_model = DecisionTreeClassifier(random_state=42).fit(X, y)

# Pull the flat per-node arrays out of the fitted tree object.
fitted_tree = dt_model.tree_
n_nodes = fitted_tree.node_count
children_left = fitted_tree.children_left
children_right = fitted_tree.children_right
feature = fitted_tree.feature
threshold = fitted_tree.threshold
value = fitted_tree.value

# Names used when building node labels, as plain Python lists of str.
feature_names = list(X.columns)
class_names = list(map(str, label_encoders['NEW_TL'].classes_))

def create_node_label(node, feature, threshold, value):
    """Build the display text for tree node ``node``.

    A node whose ``feature`` entry is -2 is handled as a leaf and labelled with
    the class name at the argmax of ``value[node][0]``. Any other node is
    labelled with its split, prefixed by a tag telling whether the split
    feature is one of ``categorical_columns`` or not.

    Categorical features are label-encoded integers and the tree splits them
    with the same ``<= threshold`` test as every other feature, so the label
    shows ``<=`` with the unrounded threshold. The earlier "= {threshold:.0f}"
    form read like an equality test and rounded x.5 thresholds away.

    Reads the module-level ``feature_names``, ``class_names`` and
    ``categorical_columns``.
    """
    if feature[node] == -2:  # leaf marker
        class_counts = value[node][0]
        majority_class = class_names[np.argmax(class_counts)]
        return f"Class: {majority_class}"

    feature_name = feature_names[feature[node]]
    kind_tag = "[类别]" if feature_name in categorical_columns else "[连续]"
    return f"{kind_tag} {feature_name} <= {threshold[node]:.2f}"

# One label per node id, in node-id order.
node_labels = list(
    map(lambda node_id: create_node_label(node_id, feature, threshold, value), range(n_nodes))
)

# (parent, child) pairs, left child before right child. Nodes whose left and
# right child ids are equal are skipped (they are treated as leaves).
edges = []
for parent in range(n_nodes):
    left_child, right_child = children_left[parent], children_right[parent]
    if left_child == right_child:
        continue
    edges.append((parent, left_child))
    edges.append((parent, right_child))

# Function to compute node depths
def compute_node_depths(n_nodes, children_left, children_right):
    """Return an int64 array holding the depth of every node (root = node 0, depth 0).

    The tree is walked with an explicit stack starting at node 0. A node whose
    left and right child ids are equal is not expanded further.
    """
    depths = np.zeros(n_nodes, dtype=np.int64)
    todo = [(0, 0)]  # (node id, depth) pairs still to visit
    while todo:
        node_id, depth = todo.pop()
        depths[node_id] = depth
        is_internal = children_left[node_id] != children_right[node_id]
        if is_internal:
            below = depth + 1
            todo += [(children_left[node_id], below), (children_right[node_id], below)]
    return depths

# Depth of every node, indexed by node id (the walk starts at node 0 with depth 0).
node_depths = compute_node_depths(n_nodes, children_left, children_right)

# Create node coordinates
def hierarchy_pos(G, root, width=1., vert_gap = 0.2, vert_loc = 0, xcenter = 0.5):
    """Return ``{node: (x, y)}`` for ``root`` and everything reachable below it.

    ``root`` is placed at ``(xcenter, vert_loc)``. The children of a node split
    that node's horizontal span ``width`` into equal slices, each child sits in
    the middle of its slice, and every level is ``vert_gap`` lower than its
    parent. For graphs that are not ``nx.DiGraph`` the parent is removed from
    the neighbour list so the walk only moves away from ``root``.
    """
    layout = {}
    undirected = not isinstance(G, nx.DiGraph)

    def _descend(node, parent, span, x, y):
        # Record this node, then recurse into its children left to right.
        layout[node] = (x, y)
        children = list(G.neighbors(node))
        if undirected and parent is not None:
            children.remove(parent)
        if len(children) == 0:
            return
        dx = span/len(children)
        nextx = x - span/2 - dx/2
        for child in children:
            nextx += dx
            _descend(child, node, dx, nextx, y-vert_gap)

    _descend(root, None, width, xcenter, vert_loc)
    return layout

# Undirected graph of the tree.
# Every node id is registered before the edges are added: a tree that consists
# of the root only has no edges, and without this step node 0 would be missing
# from G and the layout call below would fail. Registering ids first also puts
# G.nodes() in node-id order.
G = nx.Graph()
G.add_nodes_from(range(n_nodes))
G.add_edges_from(edges)
pos = hierarchy_pos(G, 0)

# Create Plotly figure
# Edge coordinates: every edge contributes (start, end, None); the None entry
# breaks the line so consecutive edges are not joined.
edge_x, edge_y = [], []
for parent_id, child_id in edges:
    x0, y0 = pos[parent_id]
    x1, y1 = pos[child_id]
    edge_x += [x0, x1, None]
    edge_y += [y0, y1, None]

edge_trace = go.Scatter(
    x=edge_x, y=edge_y, line=dict(width=0.5, color='#888'), hoverinfo='none', mode='lines')

# Per-node attributes, all built from the same list of real node ids.
# G.nodes() follows insertion order, which is not node-id order when nodes are
# added through the edge list, so a positional index (enumerate) must not be
# used as a node id — doing so attaches colours/symbols/labels to the wrong points.
node_ids = list(G.nodes())
node_x = [pos[node_id][0] for node_id in node_ids]
node_y = [pos[node_id][1] for node_id in node_ids]
node_colors = [int(node_depths[node_id]) for node_id in node_ids]
node_texts = [
    f'节点 {node_id}<br>{node_labels[node_id]}<br>深度: {node_depths[node_id]}'
    for node_id in node_ids
]

# Marker shape: diamond for leaves (feature == -2), square when the split
# feature is one of categorical_columns, circle otherwise.
node_symbols = []
for node_id in node_ids:
    if feature[node_id] == -2:
        node_symbols.append('diamond')
    elif feature_names[feature[node_id]] in categorical_columns:
        node_symbols.append('square')
    else:
        node_symbols.append('circle')

node_trace = go.Scatter(
    x=node_x, y=node_y, text=node_texts, mode='markers+text', textposition="top center",
    hoverinfo='text', marker=dict(
        showscale=True,
        colorscale='YlOrRd',
        reversescale=True,
        color=node_colors,
        size=15,
        colorbar=dict(
            thickness=15,
            # Nested title form; replaces the flat `titleside` property.
            title=dict(text='节点深度', side='right'),
            xanchor='left'
        ),
        line_width=2,
        symbol=node_symbols,
    ))

fig = go.Figure(data=[edge_trace, node_trace],
             layout=go.Layout(
                # Nested title form; replaces the flat `titlefont_size` property.
                title=dict(text='<br>决策树可视化 (区分变量类型)', font=dict(size=16)),
                showlegend=False,
                hovermode='closest',
                margin=dict(b=20,l=5,r=5,t=40),
                annotations=[
                    dict(text="基于用户数据的决策树", showarrow=False, xref="paper", yref="paper", x=0.005, y=-0.002),
                    dict(text="○ 连续变量  □ 类别变量  ◇ 叶节点", showarrow=False, xref="paper", yref="paper", x=0.5, y=1.05, xanchor='center', yanchor='bottom')
                ],
                xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))
                )

# Display the visualization
fig.show()

# Save as interactive HTML file
from plotly.offline import plot
plot(fig, filename='decision_tree_variable_types.html', auto_open=False)

# Save as static PNG file
# NOTE(review): static image export presumably needs an extra image-export
# backend installed alongside plotly — confirm in the target environment.
import plotly.io as pio
pio.write_image(fig, 'decision_tree_variable_types.png')

print("可视化已保存为 'decision_tree_variable_types.html' 和 'decision_tree_variable_types.png'")