In [None]:
import warnings

import dimod
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from dwave.embedding.chain_breaks import majority_vote
from dwave.embedding.chain_strength import uniform_torque_compensation
from dwave.system import DWaveSampler, EmbeddingComposite
from scipy.stats import ConstantInputWarning

In [None]:
class FeatureSelection(object):
    """QUBO formulation of a feature-selection problem.

    Every feature ``i`` is represented by one binary variable. Its linear
    bias is the negated influence coefficient, and every pair ``(i, j)`` with
    ``i < j`` receives a quadratic bias equal to its dependence coefficient.
    """

    def __init__(self, num_features, dependence_coefficients, influence_coefficients):
        """Build the linear and quadratic QUBO terms.

        Args:
            num_features: Number of binary variables (one per feature).
            dependence_coefficients: Object indexable as ``[i][j]`` giving the
                pairwise coefficient between features ``i`` and ``j``.
            influence_coefficients: Object indexable as ``[i]`` giving the
                coefficient between feature ``i`` and the target.
        """
        # Number of features
        self.num_features = num_features
        self.dependence_coefficients = dependence_coefficients
        self.influence_coefficients = influence_coefficients

        # Linear terms: a NaN influence (undefined coefficient) is stored as a
        # zero bias so that no NaN ever reaches the quadratic model.
        self.qubo_linear = {}
        for i in range(num_features):
            influence = influence_coefficients[i]
            self.qubo_linear[i] = 0.0 if np.isnan(influence) else -influence

        # Quadratic terms: upper triangle only; NaN and exact-zero couplings
        # are omitted.
        self.qubo_quadratic = {}
        for i in range(num_features):
            for j in range(i + 1, num_features):
                dependence = dependence_coefficients[i][j]
                if np.isnan(dependence) or dependence == 0:
                    continue
                self.qubo_quadratic[(i, j)] = dependence

    def compile(self):
        """Return a binary ``dimod.BinaryQuadraticModel`` with zero offset."""
        # Combine linear and quadratic terms
        return dimod.BinaryQuadraticModel(self.qubo_linear, self.qubo_quadratic, 0.0, vartype=dimod.BINARY)

In [None]:
# Run QUBO-based feature selection on the D-Wave QPU for each class file and
# record which columns were selected.

# Union of the column positions selected for any class
all_selected_features = set()

# Per-class result: class index -> list of selected column positions
class_selected_features = {}

# Build the sampler once; the same QPU connection is reused for every class
# instead of being re-created on each iteration.
qpu_advantage = DWaveSampler(solver={'chip_id': 'Advantage_system6.4'})
sampler = EmbeddingComposite(qpu_advantage)

for i in range(0, 6):
    # Load the class 0~5 CSV file
    file_path = f'../data_p/quantum_data.address_class{i}.csv'
    df = pd.read_csv(file_path)

    # # Keep only the first 1% of the rows
    # num_rows_to_keep = int(len(df) * 0.000001)  # this ratio can be adjusted
    # df = df.iloc[:num_rows_to_keep, :]

    # Always keep the first 100 rows only
    df = df.iloc[:100, :]

    # Drop every row that contains a missing value
    if df.isnull().sum().sum() > 0:
        print(f"Class {i}: Detected empty data, handling missing values...")
        df = df.dropna(axis=0, how='any')  # axis=1 would drop columns instead

    # features.iloc[0] below would raise on an empty frame
    if df.empty:
        print(f"Class {i}: No rows left after removing missing values, skipping.")
        continue

    # Split features / target and drop constant feature columns
    features = df.iloc[:, :-1]
    result = df.iloc[:, -1]
    features = features.loc[:, (features != features.iloc[0]).any()]

    # Names and count of the remaining (non-constant) features
    columns = features.columns
    n_features = features.shape[1]

    if n_features == 0:
        print(f"Class {i}: No non-constant features, skipping.")
        continue

    # Both correlation computations run under the warning filter: the
    # feature-vs-result call is the one expected to emit ConstantInputWarning
    # (Series.corr delegates Spearman to SciPy) when the result column is constant.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=ConstantInputWarning)
        # Feature-to-feature correlation matrix
        feature_correlation = features.corr(method='spearman')
        # Correlation of each feature with the result column
        result_correlation = features.apply(lambda x: x.corr(result, method='spearman'))

    # An undefined (NaN) correlation carries no information; use a zero bias
    # so that no NaN is handed to the QUBO.
    result_correlation = result_correlation.fillna(0.0)

    feature_qubo = FeatureSelection(n_features, feature_correlation.values, result_correlation.values)

    # Every feature pair must have produced a quadratic term
    expected_length = n_features * (n_features - 1) // 2
    if len(feature_qubo.qubo_quadratic) != expected_length:
        print(f"Unexpected quadratic length: {len(feature_qubo.qubo_quadratic)}, expected: {expected_length}")
        # NOTE: a skipped class gets no entry in class_selected_features
        continue

    bqm = feature_qubo.compile()

    # Solve the QUBO on the D-Wave quantum computer
    response = sampler.sample(
        bqm,
        num_reads=1000,
        chain_strength=uniform_torque_compensation(bqm),
        chain_break_method=majority_vote,
        auto_scale=True,
        reduce_intersample_correlation=True,
    )

    # Print results
    print("All energies:", response.record['energy'])

    # Lowest-energy sample
    best_sample = response.first.sample

    # Variable k of the BQM is the k-th *filtered* feature, so translate it
    # back to that column's position in the unfiltered frame before using it
    # with df.iloc (positions differ once a constant column has been dropped).
    selected_features = [
        int(df.columns.get_loc(columns[int(variable)]))
        for variable, value in best_sample.items()
        if value == 1
    ]

    # Keep only the selected columns
    filtered_df = df.iloc[:, selected_features]

    # Record this class's selection
    all_selected_features.update(selected_features)
    class_selected_features[i] = selected_features

    # Append the class column (the last column of df) to the selected features
    filtered_df = pd.concat([filtered_df, result], axis=1)

    # Save the filtered DataFrame to a new CSV file
    filtered_df.to_csv(f'../data_p/QA_data.address_class{i}.csv', index=False)

# Print non-duplicate selected features from all iterations
print("Combined Selected Features (No Duplicates):", sorted(all_selected_features))

# Per-class selections, for verification
for class_num, chosen_features in class_selected_features.items():
    print(f"Class {class_num} Selected Features:", chosen_features)


In [None]:
# Write a copy of the original dataset in which only the selected feature
# columns keep their values; every other feature column is filled with 0.

# Read the original data
file_path = "../data_p/data.address.csv"
df = pd.read_csv(file_path)

# Alternative: use the union of the features selected for all classes
# selected_column_indices = list(all_selected_features)

# Use only the features selected for the mixer class (class 4)
if 4 not in class_selected_features:
    raise KeyError("class_selected_features has no entry for class 4; "
                   "the selection loop did not produce a result for it")
selected_column_indices = list(class_selected_features[4])
# NOTE(review): these positions are applied to data.address.csv, which assumes
# it shares its column layout with the per-class files - confirm.

selected_column_indices.append(len(df.columns) - 1)  # also keep the last column

# Take an explicit copy so later operations never touch a slice of df
filtered_df = df.iloc[:, selected_column_indices].copy()

# Restore the original column order; columns that were not selected are
# created in the same step and filled with 0
filtered_df = filtered_df.reindex(columns=df.columns, fill_value=0)

# Replace any remaining NaN with 0
filtered_df = filtered_df.fillna(0)

# Save the resulting dataset
filtered_df.to_csv("../data_p/all_selected_features_QA_data.address.csv", index=False)

In [None]:
# import pandas as pd

# # Select features from the original data according to all_selected_features and save them

# # Read the original data
# file_path = "../data_p/data.address.csv"
# df = pd.read_csv(file_path)

# # Select the mixer features
# selected_column_indices = list(class_selected_features[4])
# selected_column_indices.append(len(df.columns) - 1)  # add the index of the last column

# # Create a new DataFrame initialised to 0
# zero_df = pd.DataFrame(0, index=df.index, columns=df.columns)

# # Keep the original data for the columns in selected_column_indices
# zero_df.iloc[:, selected_column_indices] = df.iloc[:, selected_column_indices]

# # Replace all NaN values with 0
# zero_df.fillna(0, inplace=True)

# # Save the modified dataset
# zero_df.to_csv("../data_p/all_selected_features_quantum_qubo_data.address.csv", index=False)


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Standardize the numeric feature columns of the zero-filled feature table and
# write the result, with the last column appended unchanged, to a new CSV.

# Input CSV
file_path = "../data_p/all_selected_features_QA_data.address.csv"

# Load the table and replace every NaN with 0
df = pd.read_csv(file_path)
df = df.fillna(0)

# Round std_balance_btc to 10 decimal places
df['std_balance_btc'] = df['std_balance_btc'].round(10)

# Everything except the last column is a candidate feature; the last column is
# set aside untouched
feature_part = df.iloc[:, :-1]
last_column = df.iloc[:, -1].copy()

# Only float64 / int64 feature columns are standardized
numeric_df = feature_part.select_dtypes(include=['float64', 'int64'])

# Fit the scaler on the numeric values and transform them in one step
scaler = StandardScaler()
scaled_features = scaler.fit_transform(numeric_df.values)

# Rebuild a labelled frame from the scaled array and append the last column
scaled_df = pd.concat(
    [
        pd.DataFrame(scaled_features, index=df.index, columns=numeric_df.columns),
        last_column,
    ],
    axis=1,
)

# Output CSV
new_file_path = "../data_p/normalization_all_selected_features_QA_data.address.csv"
scaled_df.to_csv(new_file_path, index=False)


In [None]:
# Plot the feature correlation heatmap
def plot_feature_correlation_heatmap(correlation_matrix, title='Feature Correlation Heatmap'):
    """Show an annotated seaborn heatmap of ``correlation_matrix``.

    Args:
        correlation_matrix: 2-D data accepted by ``sns.heatmap``.
        title: Title placed above the heatmap.
    """
    # Draw on an explicit Axes rather than on whatever figure is current
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
    ax.set_title(title)
    plt.show()

# Plot the feature-to-class correlation bar chart
def plot_feature_class_correlation_bar(correlations, title='Feature-Class Correlation'):
    """Show a bar chart of ``correlations`` with a dashed zero baseline.

    Args:
        correlations: Object with a pandas-style ``.plot`` method; one bar is
            drawn per entry.
        title: Title placed above the chart.
    """
    # Pass the Axes explicitly so the bars land on the figure created here
    fig, ax = plt.subplots(figsize=(12, 6))
    correlations.plot(kind='bar', color='skyblue', ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Features')
    ax.set_ylabel('Correlation with Class')
    ax.axhline(y=0, color='black', linestyle='--')
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    plt.show()

In [None]:
# Display name for each class index
class_names = {
    0: 'Exchange',
    1: 'Faucet',
    2: 'Gambling',
    3: 'Market',
    4: 'Mixer',
    5: 'Mining Pool'
}

# For every class file: plot the feature correlation heatmap and the
# feature-vs-class correlation bar chart
for i in range(0, 6):
    class_label = class_names[i]

    # Load the class 0~5 CSV file and drop every column that contains a missing value
    file_path = f'../data_p/quantum_data.address_class{i}.csv'
    df = pd.read_csv(file_path).dropna(axis=1)

    # Last column is the result; everything before it is a feature
    columns = df.columns
    features = df.iloc[:, :-1]
    result = df.iloc[:, -1]
    n_features = features.shape[1]

    # Feature-to-feature Spearman correlation, shown as a heatmap titled with the class name
    feature_correlation = features.corr(method='spearman')
    plot_feature_correlation_heatmap(
        feature_correlation,
        f'Feature Correlation Heatmap for {class_label}',
    )

    # Spearman correlation of each feature with the result, shown as a bar chart
    result_correlation = features.apply(lambda x: x.corr(result, method='spearman'))
    plot_feature_class_correlation_bar(
        result_correlation,
        f'Feature-Class Correlation for {class_label}',
    )

In [None]:
import pandas as pd

# Merge the per-class files into one table with a single 'class' column,
# add any column of the reference dataset that is missing, and save the result.

# Collect the per-class frames and concatenate once after the loop instead of
# growing a DataFrame inside it
class_frames = []

# Loop through class0 to class5
for i in range(0, 6):
    # Read each file
    # NOTE(review): these inputs are named quantum_qubo_data.*, while the
    # selection loop in this notebook writes QA_data.* - confirm the source.
    file_path = f'../data_p/quantum_qubo_data.address_class{i}.csv'
    df = pd.read_csv(file_path)

    # Remove rows where the last column (class{i}) has a value of 0; the copy
    # keeps the following assignment off a slice of the loaded frame
    label_column = df.columns[-1]
    df = df.loc[df[label_column] != 0].copy()

    # Rename the last column to 'class' and replace the value 1 with the
    # current class number (other values are left as they are)
    df = df.rename(columns={label_column: 'class'})
    df['class'] = df['class'].mask(df['class'] == 1, i)

    class_frames.append(df)

merged_df = pd.concat(class_frames, ignore_index=True)

# Fill all NaN values with 0
merged_df = merged_df.fillna(0)

# Only the header of the comparison file is needed
data_df = pd.read_csv('../data_p/data.address.csv', nrows=0)

# Missing columns, in the order of the comparison file so the output layout is
# the same on every run (iterating a set gave an arbitrary order)
missing_columns = [col for col in data_df.columns if col not in merged_df.columns]

# Add the missing columns filled with 0 and move 'class' to the end
ordered_columns = [col for col in merged_df.columns if col != 'class']
ordered_columns.extend(missing_columns)
ordered_columns.append('class')
merged_df = merged_df.reindex(columns=ordered_columns, fill_value=0)

# Save the merged DataFrame to a new CSV file
merged_df.to_csv('../data_p/quantum_qubo_data.address.csv', index=False)


In [None]:
import pandas as pd

# Write a copy of the original dataset in which only the selected feature
# columns keep their values; every other feature column is filled with 0.

# Read the original data
file_path = "../data_p/data.address.csv"
df = pd.read_csv(file_path)

# Alternative: use the union of the features selected for all classes
# selected_column_indices = list(all_selected_features)

# Use only the features selected for the mixer class (class 4)
if 4 not in class_selected_features:
    raise KeyError("class_selected_features has no entry for class 4; "
                   "the selection loop did not produce a result for it")
selected_column_indices = list(class_selected_features[4])
# NOTE(review): these positions are applied to data.address.csv, which assumes
# it shares its column layout with the per-class files - confirm.

selected_column_indices.append(len(df.columns) - 1)  # also keep the last column

# Take an explicit copy so later operations never touch a slice of df
filtered_df = df.iloc[:, selected_column_indices].copy()

# Restore the original column order; columns that were not selected are
# created in the same step and filled with 0
filtered_df = filtered_df.reindex(columns=df.columns, fill_value=0)

# Replace any remaining NaN with 0
filtered_df = filtered_df.fillna(0)

# Save the resulting dataset
filtered_df.to_csv("../data_p/all_selected_features_quantum_qubo_data.address.csv", index=False)
