In [1]:
import pandas as pd
from sklearn.manifold import TSNE
import numpy as np


In [2]:
# Load the experiment-results workbook and write a CSV copy alongside it.
# The former bare `file.head(60)` was dropped: it was not the cell's last
# expression, so its result was discarded and nothing was ever displayed.
file = pd.read_excel('dataset_experiment_results.xlsx')
file.to_csv('dataset_experiment_results.csv')


In [103]:
# Keep a handle on the column index and print it so every available
# feature name can be looked up when building the feature groups below.
columns = file.columns
print(columns)


Index(['Unnamed: 0', 'dataset', 'size', 'type', 'task', '#columns',
       'ave_rows_with_missing_values_ratio_per_window',
       'max_rows_with_missing_values_ratio_per_window',
       'total_rows_with_missing_values_ratio', 'overall missing value ratio',
       'overall average null columns ratio',
       'average null columns ratio among all windows',
       'max average #null columns ratio among all windows',
       'average missing value ratio among all windows',
       'max missing value ration among all windows',
       'IForest_ave_anomaly_ratio', 'IForest_max_anomaly_ratio',
       'ECOD_ave_anomaly_ratio', 'ECOD_max_anomaly_ratio',
       'mean_ave_anomaly_ratio', 'max_ave_anomaly_ratio',
       'ECOD_overall_anomaly_ratio', 'IForest_overall_anomaly_ratio',
       'ave_overall_anomaly_ratio', 'hdddm_drift_percentage',
       'kdq_drift_percentage', 'ave_drift_percentage',
       'ks_max_drift_percentage', 'hdddm_ave_drift_percentage',
       'hdddm_max_drift_percentage', 'kd

In [104]:
# Dataset-level descriptors: two numeric columns ('size', '#columns') and
# two categorical ones ('type', 'task').
basic_info_columns = ['size', 'type', 'task', '#columns']
basic_info = file[basic_info_columns]
basic_info.head()


Unnamed: 0,size,type,task,#columns
0,188318,commerce,regression,132
1,10886,commerce,regression,12
2,9984,others,classification,5
3,18159,ecology,classification,9
4,494021,Science&technology,classification,42


In [105]:
# One-hot encode the categorical descriptors ('task', 'type'); the numeric
# columns pass through unchanged.
# dtype=int pins the indicator columns to 0/1 integers: the default dtype of
# get_dummies differs between pandas versions (uint8 vs. bool), which would
# otherwise change both the displayed table and the dtype fed downstream.
basic_info_one_hot = pd.get_dummies(basic_info, columns=['task', 'type'], dtype=int)
basic_info_one_hot.head()


Unnamed: 0,size,#columns,task_classification,task_regression,type_Science&technology,type_Social,type_commerce,type_ecology,type_others,type_power consumption
0,188318,132,0,1,0,0,1,0,0,0
1,10886,12,0,1,0,0,1,0,0,0
2,9984,5,1,0,0,0,0,0,1,0
3,18159,9,1,0,0,0,0,1,0,0
4,494021,42,1,0,1,0,0,0,0,0


In [106]:
# Z-score every column of the one-hot encoded metadata (column-wise mean/std).
mean = np.mean(basic_info_one_hot, axis=0)
std = np.std(basic_info_one_hot, axis=0)

# A constant column has std == 0; dividing by it would yield NaN (0/0) and make
# the subsequent PCA fit fail. Dividing by 1 instead leaves such a column at 0
# after centering. `std` itself is left untouched for anyone inspecting it.
safe_std = std.replace(0, 1)
standarized_basic_info_one_hot = (basic_info_one_hot - mean) / safe_std


In [107]:
# Alternative embedding, currently disabled (t-SNE instead of PCA):
# model = TSNE(n_components=4, init='pca', learning_rate='auto')
from sklearn.decomposition import PCA
# Single PCA estimator keeping 3 components; the cells below call
# `model.fit_transform(...)` on each feature group in turn, re-fitting it each time.
model = PCA(n_components=3)


In [108]:
# Project the standardized metadata with PCA and wrap the resulting score
# matrix in a DataFrame (default integer column labels).
basic_info_scores = model.fit_transform(standarized_basic_info_one_hot)
pca_basic_info = pd.DataFrame(basic_info_scores)
pca_basic_info.head()


Unnamed: 0,0,1,2
0,1.325391,1.141998,4.577284
1,-0.392923,1.497946,1.520403
2,1.480338,0.512162,-3.538516
3,0.369345,-0.982329,-0.63445
4,2.85274,-0.815927,0.030782


In [109]:
# Columns describing missing values (per-window and overall ratios).
missing_value_columns = [
    'ave_rows_with_missing_values_ratio_per_window',
    'max_rows_with_missing_values_ratio_per_window',
    'total_rows_with_missing_values_ratio',
    'overall missing value ratio',
    'overall average null columns ratio',
    'average null columns ratio among all windows',
    'max average #null columns ratio among all windows',
    'average missing value ratio among all windows',
    'max missing value ration among all windows',
]
missing_value_features = file[missing_value_columns]

# Re-fit the shared PCA estimator on this group and keep the scores as a DataFrame.
missing_value_scores = model.fit_transform(missing_value_features)
pca_missing_value_features = pd.DataFrame(missing_value_scores)


In [110]:
# Columns holding the anomaly ratios reported for IForest / ECOD and their aggregates.
outlier_columns = [
    'IForest_ave_anomaly_ratio',
    'IForest_max_anomaly_ratio',
    'ECOD_ave_anomaly_ratio',
    'ECOD_max_anomaly_ratio',
    'mean_ave_anomaly_ratio',
    'max_ave_anomaly_ratio',
    'ECOD_overall_anomaly_ratio',
    'IForest_overall_anomaly_ratio',
    'ave_overall_anomaly_ratio',
]
outliers_features = file[outlier_columns]

# Re-fit the shared PCA estimator on this group and keep the scores as a DataFrame.
outlier_scores = model.fit_transform(outliers_features)
pca_outliers_features = pd.DataFrame(outlier_scores)


In [111]:
# Columns holding data-drift and drift-warning percentages per detector,
# plus their average/max aggregates.
data_drift_columns = [
    'hdddm_drift_percentage',
    'kdq_drift_percentage',
    'ave_drift_percentage',
    'hdddm_warning_percentage',
    'kdq_warning_percentage',
    'ave_warning_percentage',
    'ks_ave_drift_percentage',
    'ks_max_drift_percentage',
    'hdddm_ave_drift_percentage',
    'hdddm_max_drift_percentage',
    'kdq_ave_drift_percentage',
    'kdq_max_drift_percentage',
    'cbdb_ave_drift_percentage',
    'cbdb_max_drift_percentage',
    'pca_ave_drift_percentage',
    'pca_max_drift_percentage',
    'ave_drift_percentage.1',
    'max_drift_percentage',
    'ks_ave_warning_percentage',
    'ks_max_warning_percentage',
    'hdddm_ave_warning_percentage',
    'hdddm_max_warning_percentage',
    'kdq_ave_warning_percentage',
    'kdq_max_warning_percentage',
    'cbdb_ave_warning_percentage',
    'cbdb_max_warning_percentage',
    'pca_ave_warning_percentage',
    'pca_max_warning_percentage',
    'ave_warning_percentage.1',
    'max_warning_percentage',
]
data_drift_features = file[data_drift_columns]

# Re-fit the shared PCA estimator on this group and keep the scores as a DataFrame.
data_drift_scores = model.fit_transform(data_drift_features)
pca_data_drift_features = pd.DataFrame(data_drift_scores)


In [112]:
# Columns holding the concept-drift detector results and their warning counterparts.
concept_drift_columns = [
    'perm',
    'adwin',
    'ddm',
    'eddm',
    'concept_drift_ave',
    'adwin_warning',
    'ddm_warning',
    'eddm_warning',
    'ave_concept_drift_warning',
]
concept_drift_features = file[concept_drift_columns]

# Re-fit the shared PCA estimator on this group and keep the scores as a DataFrame.
concept_drift_scores = model.fit_transform(concept_drift_features)
pca_concept_drift_features = pd.DataFrame(concept_drift_scores)


In [113]:
# Place the per-group PCA score frames side by side (column-wise concat).
# The order here fixes the column order of the clustering input.
pca_feature_blocks = [
    pca_basic_info,
    pca_missing_value_features,
    pca_concept_drift_features,
    pca_data_drift_features,
    pca_outliers_features,
]
combined_pca_features = pd.concat(pca_feature_blocks, axis=1)


In [114]:
from sklearn.cluster import KMeans

k = 5  # Choose the desired number of clusters

# Cluster on the PCA feature columns only. A later cell writes the labels back
# into `combined_pca_features` as a 'cluster' column; excluding it here keeps
# this cell safe to re-run (otherwise the previous labels would be fed in as a
# feature). errors='ignore' makes this a no-op on the first run.
kmeans_input = combined_pca_features.drop(columns='cluster', errors='ignore')

# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto', so leaving it implicit gives version-dependent clusters even with a
# fixed random_state.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
kmeans.fit(kmeans_input)


KMeans(n_clusters=5, random_state=42)

In [115]:
# Display the cluster id assigned to each row of the fitted input, in row order.
kmeans.labels_


array([0, 0, 3, 1, 1, 0, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 4, 4, 3, 0, 0, 0,
       2, 2, 4, 4, 2, 1, 0, 2, 2, 2, 2, 2], dtype=int32)

In [116]:
# Attach the cluster labels to the feature frame. NOTE: this mutates
# `combined_pca_features` in place; on first run the new 'cluster' column is
# appended as the last column, which the distance computation in the
# representative-selection cell relies on (`iloc[:, :-1]`).
combined_pca_features['cluster'] = kmeans.labels_


In [117]:
# For every cluster, pick the member row that lies closest (Euclidean distance)
# to that cluster's centroid and record its index label.
representative_datasets_indices = []

for cluster_id in range(k):
    # Rows assigned to this cluster.
    members = combined_pca_features.loc[combined_pca_features['cluster'] == cluster_id]
    centroid = kmeans.cluster_centers_[cluster_id]

    # Feature columns are everything except the trailing 'cluster' column.
    offsets = members.iloc[:, :-1].to_numpy() - centroid
    closest_position = np.linalg.norm(offsets, axis=1).argmin()

    # Translate the position within the cluster subset back to the frame's index label.
    representative_datasets_indices.append(members.index[closest_position])

print(representative_datasets_indices)


[5, 29, 16, 40, 6]


In [118]:
# Look up the dataset identifiers at the selected row positions.
dataset_names = file['dataset']
representative_datasets = dataset_names.iloc[representative_datasets_indices]
print(representative_datasets)


5            dataset_experiment_info/electricity_prices
29    dataset_experiment_info/insects/incremental_re...
16     dataset_experiment_info/beijing_multisite/shunyi
40               dataset_experiment_info/room_occupancy
6                       dataset_experiment_info/tetouan
Name: dataset, dtype: object
