In [1]:
import numpy as np
import pandas as pd

from pyqubo import Array
import neal
import matplotlib.pyplot as plt
import requests
import seaborn as sns


In [2]:
# Project: feature selection for credit scoring
class FeatureSelection_v1(object):
    def __init__(self, num_features, dependence_coefficients, influence_coefficients, alpha):
        self.alpha = alpha
        
        # Number of features
        self.num_features = num_features
        self.dependence_coefficients = dependence_coefficients
        self.influence_coefficients = influence_coefficients
        
        # Create binary variables for the features
        self.array = Array.create('feature', shape=self.num_features, vartype='BINARY')

    def influence(self):
        # Objective: Maximize influence of the features
        H = sum(-self.influence_coefficients[i] * self.array[i] for i in range(self.num_features))
        return self.alpha * H
        
    def dependency(self):
        # Objective: Minimize dependency among the features
        H = sum(self.dependence_coefficients[i][j] * self.array[i] * self.array[j] 
                for i in range(self.num_features) for j in range(i + 1, self.num_features))
        return (1 - self.alpha) * H



In [6]:
# 初始化一個集合來儲存所有選中的特徵
all_selected_features = set()

# 初始化一個字典來儲存每個 class 的 selected_features
class_selected_features = {}

# Load the CSV file
for i in range(0, 6):
    # Load the class 0~6 CSV file
    file_path = f'../data_p/quantum_data.address_class{i}.csv'
    df = pd.read_csv(file_path)

    # Extracting each column as an array
    columns = df.columns
    features = df[columns[:-1]]  # All columns except the last one
    result = df[columns[-1]]    # The last column
    
    # Remove constant columns
    constant_columns = features.columns[features.nunique() == 1]
    if len(constant_columns) > 0:
        print(f"Constant columns in class {i}: {constant_columns}")
        features = features.drop(columns=constant_columns)

    n_features = features.shape[1]

    # Calculate the correlation matrix for features
    feature_correlation = features.corr(method='spearman')

    # Calculate the correlation of each feature with the result
    result_correlation = features.apply(lambda x: x.corr(result, method='spearman'))
    
    # Define alpha for the QUBO problem
    alpha = 0.5

    feature_qubo = FeatureSelection_v1(n_features, feature_correlation.values, result_correlation.values, alpha)

    # Feature influence coefficients: Correlation of each feature with the result
    objective = feature_qubo.influence() + feature_qubo.dependency()
    model = objective.compile()
    qubo, offset = model.to_qubo()

    # # Check for problematic QUBO coefficients
    # problematic_coeffs = {k: v for k, v in qubo.items() if np.isnan(v) or abs(v) > 1e10}
    # if problematic_coeffs:
    #     print(f"Problematic QUBO coefficients in class {i}: {problematic_coeffs}")

    # Solve QUBO using Simulated Annealing Sampler
    sampler = neal.SimulatedAnnealingSampler()
    response = sampler.sample_qubo(qubo)
    print("qubo:", qubo)
        
    # Print results
    for sample, energy in response.data(['sample', 'energy']):
        print(sample, energy)
        
    # Find the best sample (modify this as per your criteria)
    # For simplicity, we're taking the first sample as an example
    best_sample = list(response.samples())[0]

    # Identify selected features
    selected_features = [int(key.split('[')[1].split(']')[0]) for key, value in best_sample.items() if value == 1]
    # print("Selected Features:", selected_features)

    # Filter the DataFrame to keep only the selected columns
    filtered_df = df.iloc[:, selected_features]
    
    # 將本次迭代選中的特徵添加到集合中
    all_selected_features.update(selected_features)
    
    # 將本次迭代選中的特徵儲存到字典中
    class_selected_features[i] = selected_features

    # Add the index of the last column (class) to the selected features
    last_column = df[columns[-1]]
    filtered_df = pd.concat([filtered_df, last_column], axis=1)

    # # Save the filtered DataFrame to a new CSV file
    filtered_df.to_csv(f'../data_p/quantum_qubo_data.address_class{i}.csv', index=False)

# Print non-duplicate selected features from all iterations
print("Combined Selected Features (No Duplicates):", sorted(all_selected_features))

# 也可以打印出每個 class 的 selected_features 來確認
for class_num, features in class_selected_features.items():
    print(f"Class {class_num} Selected Features:", features)


Constant columns in class 0: Index(['f_received_digits_0', 'r_payback'], dtype='object')
qubo: {('feature[2]', 'feature[1]'): 0.21900543261247887, ('feature[33]', 'feature[17]'): -0.07311002628018329, ('feature[38]', 'feature[29]'): 0.0004931717160897263, ('feature[33]', 'feature[16]'): 0.0954217297619923, ('feature[32]', 'feature[9]'): 0.45483825660198135, ('feature[16]', 'feature[9]'): 0.4598761489763672, ('feature[46]', 'feature[26]'): 0.012545967693945295, ('feature[20]', 'feature[18]'): -0.0008550236958669844, ('feature[39]', 'feature[11]'): 0.0021442352393282677, ('feature[41]', 'feature[2]'): 0.04811634560419883, ('feature[38]', 'feature[22]'): 0.26233699474836564, ('feature[41]', 'feature[33]'): 0.5, ('feature[36]', 'feature[11]'): 0.0020382750648169825, ('feature[38]', 'feature[9]'): 0.4525096553818661, ('feature[44]', 'feature[33]'): 0.004357928609426155, ('feature[43]', 'feature[4]'): -0.014611260142826933, ('feature[25]', 'feature[14]'): 0.46308964548429105, ('feature[44]',

In [None]:
# # 初始化一個集合來儲存所有選中的特徵
# all_selected_features = set()

# # 初始化一個字典來儲存每個 class 的 selected_features
# class_selected_features = {}

# # Load the CSV file
# for i in range(0, 6):
#     # Load the class 0~6 CSV file
#     file_path = f'../data_p/quantum_data.address_class{i}.csv'
#     df = pd.read_csv(file_path)

#     # Extracting each column as an array
#     columns = df.columns
#     features = df[columns[:-1]]  # All columns except the last one
#     result = df[columns[-1]]    # The last column
    
#     # Remove constant columns
#     constant_columns = features.columns[features.nunique() == 1]
#     if len(constant_columns) > 0:
#         print(f"Constant columns in class {i}: {constant_columns}")
#         features = features.drop(columns=constant_columns)

#     n_features = features.shape[1]

#     # Calculate the correlation matrix for features
#     feature_correlation = features.corr(method='spearman')

#     # Calculate the correlation of each feature with the result
#     result_correlation = features.apply(lambda x: x.corr(result, method='spearman'))
    
#     # Define alpha for the QUBO problem
#     alpha = 0.5

#     feature_qubo = FeatureSelection_v1(n_features, feature_correlation.values, result_correlation.values, alpha)

#     # Feature influence coefficients: Correlation of each feature with the result
#     objective = feature_qubo.influence() + feature_qubo.dependency()
#     model = objective.compile()
#     qubo, offset = model.to_qubo()

#     # Solve QUBO using Simulated Annealing Sampler
#     sampler = neal.SimulatedAnnealingSampler()
#     response = sampler.sample_qubo(qubo)
#     # print("qubo:", qubo)
        
#     # Print results
#     for sample, energy in response.data(['sample', 'energy']):
#         print(sample, energy)
        
#     # Find the best sample (modify this as per your criteria)
#     # For simplicity, we're taking the first sample as an example
#     best_sample = list(response.samples())[0]

#     # Identify selected features
#     selected_features = [int(key.split('[')[1].split(']')[0]) for key, value in best_sample.items() if value == 1]
#     # print("Selected Features:", selected_features)

#     # Filter the DataFrame to keep only the selected columns
#     filtered_df = df.iloc[:, selected_features]
    
#     # 將本次迭代選中的特徵添加到集合中
#     all_selected_features.update(selected_features)
    
#     # 將本次迭代選中的特徵儲存到字典中
#     class_selected_features[i] = selected_features

#     # Add the index of the last column (class) to the selected features
#     last_column = df[columns[-1]]
#     filtered_df = pd.concat([filtered_df, last_column], axis=1)

#     # # Save the filtered DataFrame to a new CSV file
#     filtered_df.to_csv(f'../data_p/quantum_qubo_data.address_class{i}.csv', index=False)

# # Print non-duplicate selected features from all iterations
# print("Combined Selected Features (No Duplicates):", sorted(all_selected_features))

# # 也可以打印出每個 class 的 selected_features 來確認
# for class_num, features in class_selected_features.items():
#     print(f"Class {class_num} Selected Features:", features)


In [4]:
# 從原始資料根據 all_selected_features 選擇特徵並保存

# 讀取原始數據
file_path = "../data_p/data.address.csv"
df = pd.read_csv(file_path)

# 將索引轉換為列選擇器
# selected_column_indices = list(all_selected_features)

# 只選擇 mixer 的特徵
selected_column_indices = list(class_selected_features[4])

selected_column_indices.append(len(df.columns) - 1)  # 添加最後一列的索引

# 使用 .iloc 來選擇指定索引的列
filtered_df = df.iloc[:, selected_column_indices]

# 確定原始數據集中哪些列不在處理過的數據集中
missing_columns = [col for col in df.columns[:-1] if col not in filtered_df.columns]  # 排除最後一列（class）

# 為缺失的列創建全為 0 的數據，並添加到處理過的數據集中
for col in missing_columns:
    filtered_df[col] = 0

# 重新排列列的順序以匹配原始數據集
filtered_df = filtered_df[df.columns]

# 將所有 NaN 值替換為 0
filtered_df.fillna(0, inplace=True)

# 保存修改後的數據集
filtered_df.to_csv("../data_p/all_selected_features_quantum_qubo_data.address.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df[col] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df[col] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df[col] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in

In [4]:
# import pandas as pd

# # 從原始數據根據 all_selected_features 選擇特徵並保存

# # 讀取原始數據
# file_path = "../data_p/data.address.csv"
# df = pd.read_csv(file_path)

# # 選擇 mixer 的特徵
# selected_column_indices = list(class_selected_features[4])
# selected_column_indices.append(len(df.columns) - 1)  # 添加最後一列的索引

# # 創建一個新的 DataFrame，初始值設為0
# zero_df = pd.DataFrame(0, index=df.index, columns=df.columns)

# # 保留 selected_column_indices 中的列的原始數據
# zero_df.iloc[:, selected_column_indices] = df.iloc[:, selected_column_indices]

# # 將所有 NaN 值替換為 0
# zero_df.fillna(0, inplace=True)

# # 保存修改後的數據集
# zero_df.to_csv("../data_p/all_selected_features_quantum_qubo_data.address.csv", index=False)


In [1]:
# import pandas as pd

# # 讀取原始數據的列名（不加載所有數據到內存）
# df_columns = pd.read_csv("../data_p/data.address.csv", nrows=0).columns
# print("Columns in original data:", df_columns.tolist())  # 原始列名

# # 讀取 QA 檔案，提取其中的列名
# qa_columns = pd.read_csv(f"../data_p/quantum_qubo_data.address_class{4}.csv", nrows=0).columns
# print("QA Columns:", qa_columns.tolist())  # 查看 QA 文件的列名

# # 從原始數據中選擇與 QA 文件匹配的列
# selected_columns = [col for col in df_columns if col in qa_columns]
# print("Selected columns before adding the last column:", selected_columns)

# # 確保選擇最後一列（通常是類別標籤）
# if df_columns[-1] not in selected_columns:
#     selected_columns.append(df_columns[-1])

# print("Final selected columns:", selected_columns)

# # 初始化空的 DataFrame 用於控制 CSV 寫入模式
# header_written = False  # 新增一個旗標，初始為 False

# # 逐列處理並節省內存
# chunk_size = 10000  # 設定一個合理的分塊大小
# for chunk in pd.read_csv("../data_p/data.address.csv", chunksize=chunk_size, usecols=selected_columns):
#     # 將未選擇的列補全並設置為 0
#     for col in df_columns:
#         if col not in selected_columns:
#             chunk[col] = 0
    
#     # 填充 NaN 值為 0
#     chunk = chunk.fillna(0)

#     # 按原始數據列順序排列
#     chunk = chunk[df_columns]
    
#     # 只對字符串類型的列進行 strip 操作
#     for col in chunk.select_dtypes(include=['object']).columns:
#         chunk[col] = chunk[col].str.strip()

#     # 保存每一塊處理過的數據到 CSV 文件
#     chunk.to_csv("../data_p/all_selected_features_quantum_qubo_data.address.csv", mode='a', header=not header_written, index=False)

#     # 設置標誌旗幟為 True，確保標題只在第一個 chunk 中寫入
#     if not header_written:
#         header_written = True


Columns in original data: ['n_tx', 'total_days', 'total_spent_btc', 'total_received_btc', 'total_spent_usd', 'total_received_usd', 'mean_balance_btc', 'std_balance_btc', 'mean_balance_usd', 'std_balance_usd', 'n_received', 'n_spent', 'n_coinbase', 'n_payback', 'n_received_inclusive', 'n_spent_inclusive', 'n_coinbase_inclusive', 'n_payback_inclusive', 'f_tx', 'f_received', 'f_coinbase', 'f_spent_digits_-3', 'f_spent_digits_-2', 'f_spent_digits_-1', 'f_spent_digits_0', 'f_spent_digits_1', 'f_spent_digits_2', 'f_spent_digits_3', 'f_spent_digits_4', 'f_spent_digits_5', 'f_spent_digits_6', 'f_received_digits_-3', 'f_received_digits_-2', 'f_received_digits_-1', 'f_received_digits_0', 'f_received_digits_1', 'f_received_digits_2', 'f_received_digits_3', 'f_received_digits_4', 'f_received_digits_5', 'f_received_digits_6', 'r_payback', 'n_inputs_in_spent', 'n_outputs_in_spent', 'interval_1st_moment', 'interval_2nd_moment', 'interval_3rd_moment', 'interval_4th_moment', 'dist_total_1st_moment', 'd

In [6]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# 指定 CSV 檔案的路徑
file_path = "../data_p/all_selected_features_quantum_qubo_data.address.csv"

# 使用 pandas 的 read_csv 函數讀取 CSV 檔案
df = pd.read_csv(file_path)

# 將所有的 NaN 值替換為 0
df.fillna(0, inplace=True)

# 將 std_balance_btc 列中的數值只取到小數點後 10 位
df['std_balance_btc'] = df['std_balance_btc'].round(10)

# 保存最後一列
last_column = df.iloc[:, -1].copy()

# 初始化 StandardScaler
scaler = StandardScaler()

# 選擇除了最後一列之外的數值型列進行標準化
numeric_df = df.iloc[:, :-1].select_dtypes(include=['float64', 'int64'])

# 使用 StandardScaler 對選定的數值型數據進行標準化
scaled_features = scaler.fit_transform(numeric_df.values)

# 將標準化後的數據轉換回 DataFrame
scaled_df = pd.DataFrame(scaled_features, index=df.index, columns=numeric_df.columns)

# 將最後一列加回 DataFrame
scaled_df = pd.concat([scaled_df, last_column], axis=1)

# 定義新檔案的路徑
new_file_path = "../data_p/normalization_all_selected_features_quantum_qubo_data.address.csv"

# 儲存修改後的 DataFrame 到新檔案
scaled_df.to_csv(new_file_path, index=False)


In [None]:
# 繪製特徵相關性熱圖
def plot_feature_correlation_heatmap(correlation_matrix, title='Feature Correlation Heatmap'):
    plt.figure(figsize=(10, 8))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
    plt.title(title)
    plt.show()

# 繪製特徵與類別相關性柱狀圖
def plot_feature_class_correlation_bar(correlations, title='Feature-Class Correlation'):
    plt.figure(figsize=(12, 6))
    correlations.plot(kind='bar', color='skyblue')
    plt.title(title)
    plt.xlabel('Features')
    plt.ylabel('Correlation with Class')
    plt.axhline(y=0, color='black', linestyle='--')
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.show()

In [None]:
# 定義 class 索引和名稱的對應
class_names = {
    0: 'Exchange',
    1: 'Faucet',
    2: 'Gambling',
    3: 'Market',
    4: 'Mixer',
    5: 'Mining Pool'
}

# Load the CSV file
for i in range(0, 6):
    # Load the class 0~5 CSV file
    file_path = f'../data_p/quantum_data.address_class{i}.csv'
    df = pd.read_csv(file_path)

    # 移除包含空值的列
    df = df.dropna(axis=1)

    # Extracting each column as an array
    columns = df.columns
    features = df[columns[:-1]]  # All columns except the last one
    result = df[columns[-1]]    # The last column
    n_features = features.shape[1]

    # Calculate the correlation matrix for features
    feature_correlation = features.corr(method='spearman')
    # 繪製特徵相關性熱圖，並使用 class 名稱作為標題
    plot_feature_correlation_heatmap(feature_correlation, f'Feature Correlation Heatmap for {class_names[i]}')

    # Calculate the correlation of each feature with the result
    result_correlation = features.apply(lambda x: x.corr(result, method='spearman'))
    # 繪製特徵與結果類別相關性柱狀圖，並使用 class 名稱作為標題
    plot_feature_class_correlation_bar(result_correlation, f'Feature-Class Correlation for {class_names[i]}')

In [None]:
import pandas as pd

# Initialize an empty DataFrame for merging
merged_df = pd.DataFrame()

# Loop through class0 to class5
for i in range(0, 6):
    # Read each file
    file_path = f'../data_p/quantum_qubo_data.address_class{i}.csv'
    df = pd.read_csv(file_path)

    # Remove rows where the last column (class{i}) has a value of 0
    df = df[df[df.columns[-1]] != 0]

    # Rename the last column to 'class', and set its value to the current class number for rows with 1
    df.rename(columns={df.columns[-1]: 'class'}, inplace=True)
    df['class'] = df['class'].apply(lambda x: i if x == 1 else x)

    # Add the DataFrame to the merged DataFrame
    merged_df = pd.concat([merged_df, df], ignore_index=True)

# Fill all NaN values with 0
merged_df.fillna(0, inplace=True)

# Read the comparison DataFrame
data_df = pd.read_csv('../data_p/data.address.csv')

# Identify missing columns in the merged DataFrame
missing_columns = set(data_df.columns) - set(merged_df.columns)

# Add missing columns to the merged DataFrame with default value 0
for col in missing_columns:
    merged_df[col] = 0

# Move the 'class' column to the end
class_column = merged_df.pop('class')
merged_df['class'] = class_column

# Save the merged DataFrame to a new CSV file
merged_df.to_csv('../data_p/quantum_qubo_data.address.csv', index=False)


In [None]:
import pandas as pd

# 讀取原始數據
file_path = "../data_p/data.address.csv"
df = pd.read_csv(file_path)

# 將索引轉換為列選擇器
# selected_column_indices = list(all_selected_features)

# 只選擇 mixer 的特徵
selected_column_indices = list(class_selected_features[4])

selected_column_indices.append(len(df.columns) - 1)  # 添加最後一列的索引

# 使用 .iloc 來選擇指定索引的列
filtered_df = df.iloc[:, selected_column_indices]

# 確定原始數據集中哪些列不在處理過的數據集中
missing_columns = [col for col in df.columns[:-1] if col not in filtered_df.columns]  # 排除最後一列（class）

# 為缺失的列創建全為 0 的數據，並添加到處理過的數據集中
for col in missing_columns:
    filtered_df[col] = 0

# 重新排列列的順序以匹配原始數據集
filtered_df = filtered_df[df.columns]

# 將所有 NaN 值替換為 0
filtered_df.fillna(0, inplace=True)

# 保存修改後的數據集
filtered_df.to_csv("../data_p/all_selected_features_quantum_qubo_data.address.csv", index=False)
