# SVM 控制线计算时间

In [1]:
import sys
sys.path.append("..")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification, load_iris
from sklearn.model_selection import train_test_split
from core.svm_smo_classifier import SVMClassifier
from sklearn.metrics import classification_report

import os
from scipy import stats
import time
from sklearn.svm import SVC
 
from core.svm_smo_classifier import SVMClassifier

In [None]:
# ! ipyparallel setup for parallel execution
import ipyparallel as ipp

c = ipp.Client()
# ! The engines must be started from a console beforehand: ipcluster start -n 10
print("Connected engines:", c.ids)

# View addressing every connected engine at once.
dview = c[:]
dview.push({'SVMClassifier': SVMClassifier})

# Run the imports the pushed functions rely on inside each engine's namespace.
for statement in (
    'import numpy as np',
    'import pandas as pd',
    'from sklearn.svm import SVC',
    'import os',
    'from scipy import stats',
    'import time',
):
    dview.execute(statement)

生成数据和对应高斯核

In [4]:
## Generate labelled synthetic data from multivariate normal distributions
def NormalData(dim, pnum, nnum, ab=0):
    """Draw a labelled two-class sample from unit-covariance Gaussians.

    Positive samples are drawn around the origin; negative samples are drawn
    around a mean whose first coordinate is ``ab`` and whose other coordinates
    are zero. Both classes use the identity covariance matrix.

    Args:
        dim: Number of feature dimensions.
        pnum: Number of positive samples (label ``1``).
        nnum: Number of negative samples (label ``-1``).
        ab: First coordinate of the negative-class mean.

    Returns:
        Array of shape ``(pnum + nnum, dim + 1)``: positive rows first, then
        negative rows, with the label stored in the last column.
    """
    identity_cov = np.eye(dim)

    # Positive class: centred at the origin, label +1 appended as last column.
    pos_mean = np.zeros(dim)
    pos_features = np.random.multivariate_normal(
        mean=pos_mean, cov=identity_cov, size=pnum
    )
    pos_block = np.hstack((pos_features, np.ones((pnum, 1))))

    # Negative class: same distribution shifted along the first axis, label -1.
    neg_mean = np.zeros(dim)
    neg_mean[0] = ab
    neg_features = np.random.multivariate_normal(
        mean=neg_mean, cov=identity_cov, size=nnum
    )
    neg_block = np.hstack((neg_features, -np.ones((nnum, 1))))

    # Stack both classes: shape (pnum + nnum, dim + 1).
    return np.vstack((pos_block, neg_block))


## Build the Gaussian kernel matrix
def Gaussian(x, gamma):
    """Return the pairwise Gaussian kernel matrix of the rows of ``x``.

    Entry ``(i, j)`` is ``exp(-||x[i] - x[j]||**2 / (2 * gamma**2))``.
    Note that ``gamma`` acts here as a bandwidth (sigma); it is not the
    ``gamma`` of scikit-learn's RBF kernel, which multiplies the squared
    distance directly.

    Args:
        x: Array-like of shape ``(n_samples, n_features)``.
        gamma: Kernel bandwidth.

    Returns:
        Array of shape ``(n_samples, n_samples)``.
    """
    x = np.asarray(x, dtype=float)
    n_samples = x.shape[0]
    denom = 2 * gamma**2
    kernel = np.empty((n_samples, n_samples))
    for i in range(n_samples):
        # Differences between row i and every row, handled in one vectorized
        # step; the squared norm is summed directly instead of taking a
        # square root and squaring it again.
        diff = x - x[i]
        kernel[i] = np.exp(-np.einsum('ij,ij->i', diff, diff) / denom)
    return kernel


## test
if __name__ == '__main__':
    # Make the data generator and kernel builder available on every engine.
    dview.push({
        'NormalData': NormalData,
        'Gaussian': Gaussian,
    })
    pnum, nnum, dim = 5, 3, 10
    dataset = NormalData(dim=dim, pnum=pnum, nnum=nnum)
    print(dataset.shape)
    # The last column of ``dataset`` holds the labels, so the kernel is built
    # from the feature columns only.
    kernel_matrix = Gaussian(dataset[:, :dim], 1)  # Gaussian kernel matrix
    print(kernel_matrix.shape)


(8, 11)
(8, 8)


评估SVM训练时间

In [51]:
# SVM training and timing routine
def run_svm_experiment(test_num, dim, pnum, nnum, use_precomputed_kernel):
    """Time repeated ``SVC.fit`` calls on freshly generated two-class data.

    Every repetition draws a new dataset with ``NormalData``, builds a C=1
    ``SVC`` capped at 80 iterations and measures the duration of ``fit`` only;
    data generation and kernel construction happen before the timer starts.

    Args:
        test_num: Number of repetitions.
        dim: Feature dimension of the generated data.
        pnum: Number of positive (+1) samples per repetition.
        nnum: Number of negative (-1) samples per repetition.
        use_precomputed_kernel: If true, the matrix returned by ``Gaussian``
            is passed to an SVC with ``kernel='precomputed'``; otherwise the
            SVC evaluates its own RBF kernel on the raw features.

    Returns:
        A list with one dict per repetition, with keys ``experiment_num``,
        ``dimension``, ``total_samples``, ``time`` (a ``time.perf_counter``
        difference, i.e. seconds) and ``iterations`` (the fitted model's
        ``n_iter_``).
    """
    # ``Gaussian`` evaluates exp(-||d||^2 / (2 * sigma^2)), whereas
    # scikit-learn's RBF kernel is exp(-gamma * ||d||^2). Using
    # gamma = 1 / (2 * sigma^2) makes both variants train on the same kernel,
    # so their timings are comparable.
    sigma = 1.0
    rbf_gamma = 1.0 / (2.0 * sigma ** 2)

    times = []

    for num in range(test_num):
        # Generate the data; the label is stored in the last column.
        dataset = NormalData(dim=dim, pnum=pnum, nnum=nnum)
        X = dataset[:, :dim]
        y = dataset[:, dim]

        if use_precomputed_kernel:
            # Precomputed Gram matrix, built outside the timed section.
            train_input = Gaussian(X, gamma=sigma)
            svm_model = SVC(kernel='precomputed', C=1.0, max_iter=80, tol=1e-4, shrinking=True)
        else:
            # Standard RBF kernel evaluated by SVC itself, with a very small
            # kernel cache.
            train_input = X
            svm_model = SVC(C=1.0, kernel='rbf', gamma=rbf_gamma, max_iter=80, tol=1e-4, shrinking=True, cache_size=0.1)

        start = time.perf_counter()
        svm_model.fit(train_input, y)
        end = time.perf_counter()

        times.append({
            "experiment_num": num + 1,
            "dimension": dim,
            "total_samples": pnum + nnum,
            "time": end - start,
            "iterations": svm_model.n_iter_
        })

    return times



if __name__ == '__main__':
    # Make the experiment routine available on every engine.
    dview.push({'run_svm_experiment': run_svm_experiment})

    # Experiment parameters.
    K, R = 1000, 1000
    total_num = K * R   # number of repetitions per configuration

    dim_list = [1, 10, 100, 1000]
    ref_size_list = [5, 25, 50]
    win_size_list = [5, 25, 50]

    # First pass uses the precomputed kernel ("with cache"), second pass the
    # standard RBF kernel ("without cache"); both sweep the same grid.
    for use_precomputed in (True, False):
        # One task per (dimension, sample-size pair) configuration.
        tasks = []
        for dim in dim_list:
            for pnum, nnum in zip(ref_size_list, win_size_list):
                tasks.append((total_num, dim, pnum, nnum, use_precomputed))

        # Run the tasks on the engines and wait for all of them.
        async_results = dview.map_async(lambda t: run_svm_experiment(*t), tasks)
        results = async_results.get()

        # Flatten the per-task lists into a single list of records.
        flat_results = [item for sublist in results for item in sublist]

        # Collect the records into the DataFrame of the matching variant.
        if use_precomputed:
            df_with_cache = pd.DataFrame(flat_results)
            # print("Results with cache:\n", df_with_cache)
        else:
            df_without_cache = pd.DataFrame(flat_results)
            # print("Results without cache:\n", df_without_cache)


计算平均耗时

In [None]:
def calculate_average_times(df_without_cache, df_with_cache):
    """Tabulate the mean ``time`` per configuration for both variants.

    The inputs are left untouched: the variant tag is added to copies, not to
    the caller's DataFrames.

    Args:
        df_without_cache: Records of the runs without a precomputed kernel;
            needs the columns ``dimension``, ``total_samples`` and ``time``.
        df_with_cache: Records of the runs with a precomputed kernel; same
            columns.

    Returns:
        DataFrame with one row per ``(dimension, total_samples)`` pair and the
        columns ``dimension``, ``total_samples``, ``time (with caching)`` and
        ``time (without caching)``.
    """
    # ``assign`` returns tagged copies, so the callers' frames are not mutated.
    combined_df = pd.concat(
        [
            df_without_cache.assign(type='without_cache'),
            df_with_cache.assign(type='with_cache'),
        ],
        ignore_index=True,
    )

    # Mean time per configuration, one column per variant.
    pivot_table = combined_df.pivot_table(
        values='time',
        index=['dimension', 'total_samples'],
        columns='type',
        aggfunc='mean',
    )

    # Rename by key rather than by position so the labels cannot be swapped
    # if the column order ever changes.
    pivot_table = pivot_table.rename(
        columns={
            'with_cache': 'time (with caching)',
            'without_cache': 'time (without caching)',
        }
    ).rename_axis(columns=None)

    return pivot_table.reset_index()

if __name__ == '__main__':
    # Mean fit time per configuration for both variants.
    result_table = calculate_average_times(df_without_cache, df_with_cache)

    # Rescale both timing columns by 1/3600 (seconds to hours).
    # NOTE(review): these are per-fit means, so after this division the values
    # rounded to 4 decimals below may all display as 0 -- confirm whether a
    # total over all fits was intended instead.
    for column in ('time (without caching)', 'time (with caching)'):
        result_table[column] /= 3600

    # Print the table.
    print("Average Time to Calculate the Control Limit:")
    print(result_table.round(4))

    # Export the table to an Excel file.
    result_table.to_excel('result.xlsx', index=False)

In [None]:
# Assumes df_without_cache has already been populated by the experiment cell.
# The configuration is named once so the filter and the message cannot drift
# apart.
target_dim, target_samples = 100, 50
filtered_df = df_without_cache[
    (df_without_cache['dimension'] == target_dim)
    & (df_without_cache['total_samples'] == target_samples)
]
# Mean run time for that configuration, in the units of the 'time' column
# (no conversion is applied here).
mean_run_time_without_cache = filtered_df['time'].mean()
# Print the computed mean.
print(
    f"Calculated mean run time (without caching) for dimension={target_dim} "
    f"and total_samples={target_samples}: {mean_run_time_without_cache}"
)