In [1]:
# 向量化DBSCAN实现
import numpy as np
import pandas as pd
import time
from collections import deque
from scipy.spatial import cKDTree
import warnings
warnings.filterwarnings('ignore')

print("=" * 50)
print("向量化DBSCAN实现")
print("=" * 50)

向量化DBSCAN实现


In [2]:
class VectorizedDBSCAN:
    """向量化优化的DBSCAN实现"""
    
    def __init__(self, eps=0.5, min_samples=5):
        self.eps = eps
        self.min_samples = min_samples
        self.labels_ = None
        self.tree_ = None
        
    def _build_kdtree(self, X):
        """构建KD树加速邻域查询"""
        self.tree_ = cKDTree(X)
        
    def _region_query_vectorized(self, point_idx):
        """使用KD树加速的邻域查询"""
        distances, indices = self.tree_.query(
            self.tree_.data[point_idx], 
            k=len(self.tree_.data),
            distance_upper_bound=self.eps
        )
        # 过滤掉无穷大距离的点
        return indices[distances < np.inf]
    
    def fit_predict(self, X):
        """执行向量化DBSCAN聚类"""
        n_samples = len(X)
        self.labels_ = np.full(n_samples, -1)
        
        # 构建KD树
        self._build_kdtree(X)
        
        cluster_id = 0
        
        for i in range(n_samples):
            if self.labels_[i] != -1:
                continue
                
            # 使用KD树查询邻域
            neighbors = self._region_query_vectorized(i)
            
            if len(neighbors) < self.min_samples:
                self.labels_[i] = -1
                continue
                
            # 发现新簇
            self.labels_[i] = cluster_id
            seed_set = deque(neighbors)
            seed_set.remove(i)
            
            # 扩展簇
            while seed_set:
                j = seed_set.popleft()
                
                if self.labels_[j] == -1:
                    self.labels_[j] = cluster_id
                    
                if self.labels_[j] != -1:
                    continue
                    
                self.labels_[j] = cluster_id
                
                # 查询j的邻域
                j_neighbors = self._region_query_vectorized(j)
                
                if len(j_neighbors) >= self.min_samples:
                    for n in j_neighbors:
                        if self.labels_[n] == -1:
                            seed_set.append(n)
                            
            cluster_id += 1
            
        return self.labels_


In [3]:
# 测试和对比
print("\n1. 加载数据...")
try:
    df = pd.read_csv("../data/processed/data_10000.csv")
    X = df[['LAT_scaled', 'LON_scaled']].values
    print(f"  加载 {len(X)} 条数据")
except:
    # 创建测试数据
    print("  创建模拟数据...")
    from sklearn.datasets import make_blobs
    X, _ = make_blobs(n_samples=10000, centers=5, random_state=42)
    print(f"  创建 {len(X)} 条模拟数据")

print("\n2. 运行向量化DBSCAN...")


1. 加载数据...
  加载 10000 条数据

2. 运行向量化DBSCAN...


In [4]:
vectorized_dbscan = VectorizedDBSCAN(eps=0.3, min_samples=5)

start_time = time.time()
vectorized_labels = vectorized_dbscan.fit_predict(X)
vectorized_time = time.time() - start_time

print(f"  运行时间: {vectorized_time:.2f} 秒")
print(f"  聚类数量: {len(set(vectorized_labels[vectorized_labels != -1]))}")
print(f"  噪声点数量: {sum(vectorized_labels == -1)}")

# 与基础版本对比
print("\n3. 与基础版本对比...")
from sklearn.cluster import DBSCAN as SklearnDBSCAN

sklearn_dbscan = SklearnDBSCAN(eps=0.3, min_samples=5)
start_time = time.time()
sklearn_labels = sklearn_dbscan.fit_predict(X)
sklearn_time = time.time() - start_time

print(f"  scikit-learn运行时间: {sklearn_time:.2f} 秒")
print(f"  我们的向量化版本运行时间: {vectorized_time:.2f} 秒")
print(f"  速度提升: {sklearn_time/vectorized_time:.2f}x")

  运行时间: 0.02 秒
  聚类数量: 6
  噪声点数量: 0

3. 与基础版本对比...
  scikit-learn运行时间: 0.90 秒
  我们的向量化版本运行时间: 0.02 秒
  速度提升: 48.75x


In [5]:
# 保存结果
np.save("../results/metrics/vectorized_labels.npy", vectorized_labels)
np.save("../results/metrics/sklearn_labels.npy", sklearn_labels)

print("\n 向量化DBSCAN实现完成！")


 向量化DBSCAN实现完成！
