### 附加题: 对比不同 kernel 方法下的 SVM 分类器 （对完整SVM进行调参）

这一题本质上是让我们以 kernel 的选择作为目标元参数，其他参数作为冗余或固定元参数，进行调参实验，发现不同 kernel 方法下的 SVM 分类器的分类效果数值上的区别及其显著性，并且从可视化分析上也作出进一步解释。

这部分内容请见 [03svm_kernel_hpo.ipynb](./03svm_kernel_hpo.ipynb)

我们使用 dataclass 来定义配置：每个参数都带有明确的类型标注，并附带一个用于采样的概率分布，这样便于定义调参时的搜索空间。

In [None]:
#| exports
from scholarly_infrastructure.rv_args.nucleus import RandomVariable, experiment_setting
from optuna.distributions import IntDistribution, FloatDistribution, CategoricalDistribution
from typing import Optional, Union

In [None]:
#| exports
@experiment_setting
class SupportVectorClassifierConfig:
    """Typed hyperparameter configuration for a support vector classifier.

    Every field is declared through ``RandomVariable`` with a default value,
    a human-readable description and an Optuna distribution that describes
    how the field may be sampled during hyperparameter optimisation.

    The field names and descriptions appear to mirror the constructor
    arguments of scikit-learn's ``SVC`` -- confirm against the call site that
    consumes this configuration.
    """

    # Penalty coefficient C, sampled on a log scale.
    C: float = ~RandomVariable(
        default=1.0,
        description="Regularization parameter. The strength of the regularization is inversely proportional to C.",
        distribution=FloatDistribution(1e-5, 1e2, log=True)
    )

    # Kernel type.
    # "precomputed" is deliberately NOT part of the search space: that kernel
    # expects the input to already be a square Gram matrix, so sampling it on
    # ordinary feature data makes the fit fail.
    kernel: str = ~RandomVariable(
        default="rbf",
        description="Kernel type to be used in the algorithm.",
        distribution=CategoricalDistribution(choices=["linear", "poly", "rbf", "sigmoid"])
    )

    # Degree of the polynomial kernel.
    degree: int = ~RandomVariable(
        default=3,
        description="Degree of the polynomial kernel function ('poly').",
        distribution=IntDistribution(1, 10, log=False)
    )

    # Kernel coefficient gamma.
    gamma: Union[str, float] = ~RandomVariable(
        default="scale",
        description="Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.",
        distribution=CategoricalDistribution(choices=["scale", "auto"])  # a float distribution could be added if needed
    )

    # Independent term coef0 of the kernel function.
    coef0: float = ~RandomVariable(
        default=0.0,
        description="Independent term in kernel function. It is significant in 'poly' and 'sigmoid'.",
        distribution=FloatDistribution(0, 1)
    )

    # Shrinking heuristic.
    shrinking: bool = ~RandomVariable(
        default=True,
        description="Whether to use the shrinking heuristic.",
        distribution=CategoricalDistribution(choices=[True, False])
    )

    # Whether to enable probability estimates.
    probability: bool = ~RandomVariable(
        default=False,
        description="Whether to enable probability estimates. Slows down fit when enabled.",
        distribution=CategoricalDistribution(choices=[True, False])
    )

    # Tolerance tol of the stopping criterion, sampled on a log scale.
    tol: float = ~RandomVariable(
        default=1e-3,
        description="Tolerance for stopping criterion.",
        distribution=FloatDistribution(1e-6, 1e-1, log=True)
    )

    # Size of the kernel cache (MB).
    cache_size: float = ~RandomVariable(
        default=200,
        description="Specify the size of the kernel cache (in MB).",
        distribution=FloatDistribution(50, 500, log=False)
    )

    # Class weights.
    class_weight: Optional[Union[dict, str]] = ~RandomVariable(
        default=None,
        description="Set C of class i to class_weight[i]*C or use 'balanced' to adjust weights inversely to class frequencies.",
        distribution=CategoricalDistribution(choices=[None, "balanced"])
    )

    # Whether to enable verbose output.
    verbose: bool = ~RandomVariable(
        default=False,
        description="Enable verbose output (may not work properly in a multithreaded context).",
        distribution=CategoricalDistribution(choices=[True, False])
    )

    # Maximum number of solver iterations (-1 means no limit).
    # NOTE(review): the integer range [-1, 1000] also contains 0, which is not
    # a meaningful iteration budget -- consider freezing this field or
    # filtering that value when building the search space.
    max_iter: int = ~RandomVariable(
        default=-1,
        description="Hard limit on iterations within solver, or -1 for no limit.",
        distribution=IntDistribution(-1, 1000, log=False)
    )

    # Shape of the decision function.
    decision_function_shape: str = ~RandomVariable(
        default="ovr",
        description="Whether to return a one-vs-rest ('ovr') decision function or original one-vs-one ('ovo').",
        distribution=CategoricalDistribution(choices=["ovo", "ovr"])
    )

    # Whether to break ties in the decision function.
    # NOTE(review): scikit-learn is expected to reject break_ties=True combined
    # with decision_function_shape='ovo' at predict time -- confirm the search
    # never samples that combination.
    break_ties: bool = ~RandomVariable(
        default=False,
        description="If True, break ties according to the confidence values of decision_function when decision_function_shape='ovr'.",
        distribution=CategoricalDistribution(choices=[True, False])
    )

    # Random seed.
    random_state: Optional[int] = ~RandomVariable(
        default=None,
        description="Controls random number generation for probability estimates. Ignored when probability=False.",
        distribution=IntDistribution(0, 100)  # adjust the range as needed
    )

NameError: name 'experiment_setting' is not defined

In [None]:
# Function-style alternative, kept for reference:
# show_dataframe_doc(SupportVectorClassifierConfig)[:1]
# Build the per-field documentation table, then display only its first row.
field_docs = SupportVectorClassifierConfig.show_dataframe_doc()
field_docs[:1]
# Presumably derives an Optuna search space while keeping the listed fields fixed -- verify:
# SupportVectorClassifierConfig.get_optuna_search_space(frozen_rvs={"verbose", "cache_size", "random_state"})

Unnamed: 0,name,type,default,default_factory,init,repr,hash,compare,metadata,kw_only,description,distribution
0,C,<class 'float'>,1.0,<dataclasses._MISSING_TYPE object at 0x7f64b04...,True,True,,True,,<dataclasses._MISSING_TYPE object at 0x7f64b04...,Regularization parameter. The strength of the ...,"FloatDistribution(high=100.0, log=True, low=1e..."
