In [42]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import numpy as np

# 读取数据
allinfo = pd.read_csv(r"dataset\allinfo.csv")
print(allinfo.columns)
allinfo = allinfo.copy()
data = allinfo.groupby('following_id').mean()

# 选择需要进行主成分分析的特征列
features = ['distance', 'front_speed', 'following_speed', 'front_a', 'following_a', 
             'front_width', 'following_width', 'deltaspeed']

# 将特征列进行标准化
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data[features])

# 创建 PCA 模型并拟合数据
pca = PCA(n_components=8)
principal_components = pca.fit_transform(scaled_data)

# 创建包含主成分的数据框
principal_df = pd.DataFrame(data=principal_components)

# 获取每个主成分的方差比例和累积贡献率
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance_ratio = np.cumsum(explained_variance_ratio)

# 创建总方差解释表
variance_explained_df = pd.DataFrame({
    'Principal Component': [f'PC{i+1}' for i in range(len(explained_variance_ratio))],
    'Explained Variance Ratio': explained_variance_ratio,
    'Cumulative Variance Ratio': cumulative_variance_ratio
})

component_names = [f"PC{i+1}" for i in range(len(pca.components_))]
component_df = pd.DataFrame(data=pca.components_, columns=features, index=component_names)


# 打印总方差解释表
print(variance_explained_df)

print(component_df.iloc[-3:,])



Index(['Unnamed: 0.1', 'Unnamed: 0', 'distance', 'following_x', 'front_x',
       'frame', 'front_speed', 'front_a', 'following_speed', 'following_a',
       'front_id', 'following_id', 'front_width', 'following_width',
       'front_class', 'following_class', 'front_feature', 'following_feature',
       'deltaspeed'],
      dtype='object')
  Principal Component  Explained Variance Ratio  Cumulative Variance Ratio
0                 PC1              3.293793e-01                   0.329379
1                 PC2              1.585099e-01                   0.487889
2                 PC3              1.305102e-01                   0.618399
3                 PC4              1.288682e-01                   0.747268
4                 PC5              1.104089e-01                   0.857676
5                 PC6              8.291683e-02                   0.940593
6                 PC7              5.940667e-02                   1.000000
7                 PC8              2.424473e-31          

In [27]:
data_evaluate = data[['distance', 'following_x', 'front_x', 'front_speed', 'following_speed','driving_style_hmm', 'driving_style_kmeans']]

In [45]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# 对主成分进行 KMeans 聚类
kmeans = KMeans(n_clusters=3)
cluster_labels = kmeans.fit_predict(principal_df.iloc[:, :3])  # 选择前三个主成分进行聚类
data['driving_style_kmeans'] = cluster_labels
data['deltaspeed'] = data['front_speed'] - data['following_speed']
data['deltaa'] = data['front_a'] - data['following_a']
cluster_means = data.groupby('driving_style_kmeans').mean()
# 计算轮廓系数
silhouette_avg = silhouette_score(principal_df.iloc[:, :3], cluster_labels)
print(f"Silhouette Score kmeans: {silhouette_avg}")



  super()._check_params_vs_input(X, default_n_init=10)


Silhouette Score kmeans: 0.29367983205443593


In [46]:
cluster_means

Unnamed: 0_level_0,Unnamed: 0.1,Unnamed: 0,distance,following_x,front_x,frame,front_speed,front_a,following_speed,following_a,front_id,front_width,following_width,front_class,following_class,front_feature,following_feature,deltaspeed,deltaa
driving_style_kmeans,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,200.5,201.5,47.022118,176.039095,223.061213,12665.928571,22.091147,4.481633,21.873904,4.306688,545.809524,11.874762,13.299048,0.833333,0.857143,,,0.217243,0.174944
1,194.736842,195.736842,33.27433,188.320809,221.595139,11780.710526,23.917115,4.466155,24.071077,4.682252,510.236842,6.999474,5.564211,0.394737,0.131579,,,-0.153962,-0.216097
2,194.871681,195.871681,46.321916,182.740698,229.062614,11279.560901,23.178634,4.190784,23.364616,4.508831,476.942698,15.183744,9.275148,0.901793,0.639648,,,-0.185982,-0.318047


In [47]:
data.to_csv(r'dataset/datawithcluster_pca.csv')