In [1]:
import numpy as np
import pandas as pd

from pyqubo import Array
import neal
import matplotlib.pyplot as plt
import requests


In [2]:

# Load the CSV file
file_path = '../data_p/quantum_data.address_class2.csv'
df = pd.read_csv(file_path)

# Extracting each column as an array
columns = df.columns
features = df[columns[:-1]]  # All columns except the last one
result = df[columns[-1]]    # The last column
n_features = features.shape[1]

# Calculate the correlation matrix for features
feature_correlation = features.corr(method='spearman')

# Calculate the correlation of each feature with the result
result_correlation = features.apply(lambda x: x.corr(result))

feature_correlation, result_correlation, n_features



(                             n_tx  total_days  total_spent_btc  \
 n_tx                     1.000000    0.716942         0.457638   
 total_days               0.716942    1.000000         0.381055   
 total_spent_btc          0.457638    0.381055         1.000000   
 total_received_btc       0.457638    0.381055         1.000000   
 total_spent_usd          0.463463    0.384557         0.994190   
 ...                           ...         ...              ...   
 dist_receive_4th_moment  0.999536    0.719445         0.459794   
 dist_payback_1st_moment  0.991284    0.773083         0.468831   
 dist_payback_2nd_moment  0.987512    0.773083         0.471885   
 dist_payback_3rd_moment -0.322281   -0.599076        -0.190734   
 dist_payback_4th_moment  0.999536    0.719445         0.459794   
 
                          total_received_btc  total_spent_usd  \
 n_tx                               0.457638         0.463463   
 total_days                         0.381055         0.384557   

In [3]:
# not used

# Assuming alpha is given and set to 0.5 for demonstration purposes
alpha = 0.5

# Define n_features as the number of features (excluding the last column which is the result)
n_features = df.shape[1] - 1

# Feature influence coefficients: Correlation of each feature with the result
influence_coefficients = np.abs(result_correlation.values)

# Independence coefficients: Absolute pairwise correlations among features
# We only need the unique pairwise coefficients, which are in the upper triangle of the correlation matrix
# excluding the diagonal. We use np.triu_indices to get the indices of the upper triangle.
independence_indices = np.triu_indices(n_features, k=1)
independence_coefficients = np.abs(feature_correlation.values)[independence_indices]

# Now we need to construct the QUBO matrix with the influence coefficients on the diagonal
# and the independence coefficients on the off-diagonal elements.
qubo_matrix = np.zeros((n_features, n_features))

# Set the linear coefficients on the diagonal for influence terms
np.fill_diagonal(qubo_matrix, alpha * influence_coefficients)

# Set the quadratic coefficients for independence terms
for i, (row, col) in enumerate(zip(*independence_indices)):
    qubo_matrix[row, col] = qubo_matrix[col, row] = (1 - alpha) * independence_coefficients[i]

# The QUBO matrix is now ready to be used with a solver
qubo_matrix

array([[0.10349186, 0.35847095, 0.22881901, ..., 0.49375613, 0.16114057,
        0.49976825],
       [0.35847095, 0.02758046, 0.19052741, ..., 0.38654133, 0.29953779,
        0.35972269],
       [0.22881901, 0.19052741, 0.11489372, ..., 0.23594259, 0.0953669 ,
        0.22989691],
       ...,
       [0.49375613, 0.38654133, 0.23594259, ..., 0.05668283, 0.15711782,
        0.49423668],
       [0.16114057, 0.29953779, 0.0953669 , ..., 0.15711782, 0.05197271,
        0.16428566],
       [0.49976825, 0.35972269, 0.22989691, ..., 0.49423668, 0.16428566,
        0.05446429]])

In [4]:
# Project: feature selection for credit scoring
class FeatureSelection_v1(object):
    def __init__(self, num_features, dependence_coefficients, influence_coefficients, alpha):
        self.alpha = alpha
        
        # Number of features
        self.num_features = num_features
        self.dependence_coefficients = dependence_coefficients
        self.influence_coefficients = influence_coefficients
        
        # Create binary variables for the features
        self.array = Array.create('feature', shape=self.num_features, vartype='BINARY')

    def influence(self):
        # Objective: Maximize influence of the features
        H = sum(-self.influence_coefficients[i] * self.array[i] for i in range(self.num_features))
        return self.alpha * H
        
    def dependency(self):
        # Objective: Minimize dependency among the features
        H = sum(self.dependence_coefficients[i][j] * self.array[i] * self.array[j] 
                for i in range(self.num_features) for j in range(i + 1, self.num_features))
        return (1 - self.alpha) * H



In [5]:
# Define alpha for the QUBO problem
alpha = 0.5

feature_qubo = FeatureSelection_v1(n_features, feature_correlation.values, result_correlation.values, alpha)

# Feature influence coefficients: Correlation of each feature with the result
#feature_qubo = FeatureSelection_v1(features, independence_coefficients, influence_coefficients, alpha)
objective = feature_qubo.influence() + feature_qubo.dependency()
model = objective.compile()
qubo, offset = model.to_qubo()

# Solve QUBO using Simulated Annealing Sampler
sampler = neal.SimulatedAnnealingSampler()
response = sampler.sample_qubo(qubo)
# print("qubo:", qubo)
    
# Print results
for sample, energy in response.data(['sample', 'energy']):
    print(sample, energy)
    
# Find the best sample (modify this as per your criteria)
# For simplicity, we're taking the first sample as an example
best_sample = list(response.samples())[0]

# Identify selected features
selected_features = [int(key.split('[')[1].split(']')[0]) for key, value in best_sample.items() if value == 1]

# Filter the DataFrame to keep only the selected columns
filtered_df = df.iloc[:, selected_features]

# Add the index of the last column (class) to the selected features
last_column = df[columns[-1]]
filtered_df = pd.concat([filtered_df, last_column], axis=1)

# Save the filtered DataFrame to a new CSV file
filtered_df.to_csv('../data_p/quantum_qubo_data.address_class2.csv', index=False)

{'feature[0]': 0, 'feature[10]': 1, 'feature[11]': 0, 'feature[12]': 1, 'feature[13]': 0, 'feature[14]': 1, 'feature[15]': 0, 'feature[16]': 0, 'feature[17]': 0, 'feature[18]': 0, 'feature[19]': 0, 'feature[1]': 1, 'feature[20]': 0, 'feature[21]': 0, 'feature[22]': 1, 'feature[23]': 1, 'feature[24]': 1, 'feature[25]': 0, 'feature[26]': 1, 'feature[27]': 0, 'feature[28]': 1, 'feature[29]': 0, 'feature[2]': 1, 'feature[30]': 0, 'feature[31]': 1, 'feature[32]': 0, 'feature[33]': 1, 'feature[34]': 0, 'feature[35]': 1, 'feature[36]': 0, 'feature[37]': 0, 'feature[38]': 0, 'feature[39]': 0, 'feature[3]': 1, 'feature[40]': 0, 'feature[41]': 0, 'feature[42]': 0, 'feature[43]': 0, 'feature[44]': 0, 'feature[45]': 0, 'feature[46]': 0, 'feature[47]': 0, 'feature[48]': 0, 'feature[49]': 0, 'feature[4]': 1, 'feature[50]': 1, 'feature[51]': 1, 'feature[52]': 1, 'feature[53]': 0, 'feature[54]': 0, 'feature[55]': 0, 'feature[56]': 0, 'feature[57]': 1, 'feature[58]': 1, 'feature[59]': 1, 'feature[5]': 

  cold_beta = np.log(number_min_gaps/max_single_qubit_excitation_rate) / (2*min_effective_field)


In [6]:
independence_coefficients

array([0.7169419 , 0.45763803, 0.45763803, ..., 0.31423564, 0.98847335,
       0.32857131])