# Getting Datasets
Datasets from https://github.com/BrunoMog/Pibic-Quantum-Computing

In [1]:
import os
import pandas as pd
from local.listDatabases import get_csv_path_list, get_pkl_path_list, get_dataframes_and_samples
import pickle

# Load the datasets via get_dataframes_and_samples, then keep only the entries
# whose name does not contain 'high_noise'.
ALL_DATASETS = get_dataframes_and_samples('./local')
NEW_DATASET = {
    dt_name: ALL_DATASETS[dt_name]
    for dt_name in ALL_DATASETS
    if 'high_noise' not in dt_name
}
DATASETS = NEW_DATASET

print(list(DATASETS.keys()))
print(len(DATASETS))

['blobs_2classes_16features_500samples_low_noise', 'blobs_2classes_16features_500samples_no_noise', 'blobs_2classes_16features_50samples_low_noise', 'blobs_2classes_16features_50samples_no_noise', 'blobs_2classes_2features_500samples_low_noise', 'blobs_2classes_2features_500samples_no_noise', 'blobs_2classes_2features_50samples_low_noise', 'blobs_2classes_2features_50samples_no_noise', 'blobs_2classes_4features_500samples_low_noise', 'blobs_2classes_4features_500samples_no_noise', 'blobs_2classes_4features_50samples_low_noise', 'blobs_2classes_4features_50samples_no_noise', 'blobs_2classes_8features_500samples_low_noise', 'blobs_2classes_8features_500samples_no_noise', 'blobs_2classes_8features_50samples_low_noise', 'blobs_2classes_8features_50samples_no_noise', 'blobs_3classes_16features_500samples_low_noise', 'blobs_3classes_16features_500samples_no_noise', 'blobs_3classes_16features_50samples_low_noise', 'blobs_3classes_16features_50samples_no_noise', 'blobs_3classes_2features_500sa

In [2]:
# Inspect one dataset entry: how many index samples it holds, the first 100
# indices of the first sample, that sample's length, and the target column.
example = DATASETS['blobs_2classes_2features_500samples_low_noise']
first_sample = example['samples'][0]
print(len(example['samples']))
print(first_sample[:100])
print(len(first_sample))
example['df']['target']

30
[426 917  84 852 968 931 213 703 998 614 338  48 232 759  96 914  23 485
 428 413 727 895 702 844 886 107 355 869 401 699 219 267 118  47 740 430
 855 650 304 228 970 554 195 290 745 964 336 615 559 487 847 421 503 447
 974 455 919 996 540 536  93 125 979 957 763  32 164 751 162 306 509 551
 716 523  44 479 320 275 286 905 532 773 651 143 152 307 557 122 175 947
 738 414 266 861 325 261 368 117 802 294]
700


0      0
1      1
2      0
3      1
4      0
      ..
995    0
996    0
997    0
998    0
999    0
Name: target, Length: 1000, dtype: int64

In [3]:
# Group the datasets by number of features, then by number of classes:
# DATASET_BY_FEATURES[num_features][num_classes][dataset_name] -> dataset entry
DATASET_BY_FEATURES = {}

for name, data in DATASETS.items():
    frame = data['df']
    num_features = len(frame.columns) - 1      # all columns but one (the 'target' column)
    num_classes = len(set(frame['target']))    # distinct values found in 'target'

    by_classes = DATASET_BY_FEATURES.setdefault(num_features, {})
    by_classes.setdefault(num_classes, {})[name] = data

# One line per feature count, listing the (features, classes) pairs available
for feat, by_classes in DATASET_BY_FEATURES.items():
    print([(feat, classes) for classes in by_classes.keys()])
list(DATASET_BY_FEATURES[2][2].keys())

[(16, 2), (16, 3), (16, 4)]
[(2, 2), (2, 3), (2, 4)]
[(4, 2), (4, 3), (4, 4)]
[(8, 2), (8, 3), (8, 4)]


['blobs_2classes_2features_500samples_low_noise',
 'blobs_2classes_2features_500samples_no_noise',
 'blobs_2classes_2features_50samples_low_noise',
 'blobs_2classes_2features_50samples_no_noise',
 'circles_500samples_low_noise',
 'circles_500samples_no_noise',
 'circles_50samples_low_noise',
 'circles_50samples_no_noise',
 'moons_500samples_low_noise',
 'moons_500samples_no_noise',
 'moons_50samples_low_noise',
 'moons_50samples_no_noise',
 'xor_500samples_low_noise',
 'xor_500samples_no_noise',
 'xor_50samples_low_noise',
 'xor_50samples_no_noise']

# Getting Circuits

In [4]:
from circuits.listCircuits import get_circuit_path_list
from circuits.pennylaneCircuitParse import get_pennylane_circuit_from_file
PATH = 'circuits/CostaSH'


def _load_circuit_entries(folder, *parser_args):
    """Parse every circuit file listed for `folder` into a descriptor dict.

    Any extra positional arguments are forwarded unchanged to
    get_pennylane_circuit_from_file after the file path.
    """
    entries = []
    for path in get_circuit_path_list(folder):
        # Last path component, truncated at its first '.'
        name = path.split('/')[-1].split('.')[0]
        circuit, dev, nparams = get_pennylane_circuit_from_file(path, *parser_args)
        entries.append({'name': name, 'circuit': circuit, 'dev': dev, 'nparams': nparams, 'path': path})
    return entries


# Circuit list by number of qubits: CIRCUITS[n] holds the n-qubit circuits,
# so index 0 is an empty placeholder.
CIRCUITS = [[]]
CIRCUITS.append(_load_circuit_entries(PATH + '/1qubit'))
for i in range(2, 5):
    CIRCUITS.append(_load_circuit_entries(PATH + f'/{i}qubits', {'&L': 1}))

# Summary: number of circuits and their names for each qubit count
for Nqubits in CIRCUITS:
    print(len(Nqubits), [circuit['name'] for circuit in Nqubits])
print()

0 []
6 ['hr', 'ru', 'ruu', 'rx', 'ry', 'rz']
5 ['alpha', 'beta', 'gamma', 'phi', 'theta']
10 ['0_zero', '1_mochi', '2_yadi', '3_tatu', '4_wana', '5_tano', '6_samanu', '7_sambwadi', '8_nake', '9_divwe']
15 ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O']



# Training

In [5]:
from pennylane.optimize import NesterovMomentumOptimizer, GradientDescentOptimizer, SPSAOptimizer
from qmlHelper.metrics import square_loss_silhouette, square_loss_calinski_harabasz_score, square_loss_davies_bouldin_score

# Input embeddings to evaluate ('phasey' is currently switched off)
EMBEDDING = [
    'phasex',
    # 'phasey',
    'amplitude',
]

# Optimizers to evaluate: label -> (optimizer class, positional constructor arguments)
OPTIMIZERS = {
    'SPSAOptimizer(10)': (SPSAOptimizer, [10]),
    # 'GradientDescentOptimizer(1)': (GradientDescentOptimizer, [1]),
    # 'NesterovMomentumOptimizer(0.1)': (NesterovMomentumOptimizer, [0.1]),
}

# Cost metrics to evaluate: label -> loss function ('davies bouldin' is currently switched off)
METRICS = {
    'silhouette': square_loss_silhouette,
    'calinski harabasz': square_loss_calinski_harabasz_score,
    # 'davies bouldin': square_loss_davies_bouldin_score,
}


In [6]:
import traceback
from qmlHelper.utils import train_ansatz, cost
from qmlHelper.metrics import unsupervised_accuracy
from sklearn.model_selection import train_test_split

# Seed passed to both the train/test split and train_ansatz
SEED = 157
# Batch size and number of optimisation steps passed to train_ansatz
BATCH_SIZE = 1
STEPS = 0
# Baseline circuit arguments; run_one_task sets 'encoding', 'meas' and
# 'measwire' for each task.
CIRCUIT_ARGS = dict(
    encoding='phase',
    meas='expval',
    measwire=[0],
    run_quiet=True,
)


def run_one_task(use_bias, qubits, circuit_path, circuit_name, nparams, n_feat, n_classes, 
                  df_name, sample_index, dataset,
                  encoding, opt_name, optimizer, opt_params, 
                  metrc_name, metric, wires_to_measure, measure_type ):
    """Train one circuit on one dataset sample and build its report row.

    The circuit is re-parsed from ``circuit_path`` inside the function, so a
    task only needs to carry the path rather than the circuit object.

    Parameters
    ----------
    use_bias : forwarded to ``train_ansatz``; recorded as 'YES'/'NO'.
    qubits, n_feat : accepted for the task-tuple layout; not used here.
    circuit_path : file handed to ``get_pennylane_circuit_from_file``.
    circuit_name : label stored under 'ARQUITETURA_ANSATZ'.
    nparams : ignored; replaced by the parameter count the parser returns.
    n_classes : forwarded to training, cost and accuracy helpers.
    df_name, sample_index : identifiers copied into the report row.
    dataset : table with a 'target' column supporting ``drop(columns=...)``
        and ``to_numpy()``; the other columns are used as features.
    encoding, measure_type, wires_to_measure : written into the circuit
        arguments as 'encoding', 'meas' and 'measwire' (``range(wires_to_measure)``).
    opt_name, optimizer, opt_params : report label, optimizer class and the
        positional arguments used to instantiate it.
    metrc_name, metric : report label and cost function used for training.

    Returns
    -------
    dict
        The full report row on success. If training or evaluation raises,
        the error is printed and only the identifying columns are returned.
    """
    # `nparams` from the caller is overwritten by what the parser reports.
    circuit, _dev, nparams = get_pennylane_circuit_from_file(circuit_path, {'&L' : 1})

    # Work on a copy: assigning into CIRCUIT_ARGS itself would leak this task's
    # settings into the shared module-level defaults.
    circuit_args = dict(CIRCUIT_ARGS)
    circuit_args['encoding'] = encoding
    circuit_args['meas'] = measure_type
    circuit_args['measwire'] = range(wires_to_measure)

    data = dataset.drop(columns=['target']).to_numpy()
    labels = dataset['target'].to_numpy()
    X_train, X_test, y_train, y_test = train_test_split(
        data, labels, test_size=0.3, random_state=SEED, stratify=labels
    )

    # Columns that identify the task; shared by the success and failure rows so
    # both use the circuit *name* (not the circuit object) as the ansatz label.
    entry_id = {
        'ARQUITETURA_ANSATZ': circuit_name,
        'INPUT_EMBEDDING': encoding,
        'DATASET': df_name,
        'DATASET_DIVISION_INDEX': sample_index,
        'OPTIMIZER': opt_name,
        'UNSUPERVISED_METRIC': metrc_name,
        'MEASURED_WIRES': wires_to_measure,
        'MEASURE_TYPE': measure_type,
    }

    try:
        weights, bias = train_ansatz(
            circuit=circuit,
            n_params=nparams,
            circuit_args=circuit_args,
            data=X_train,
            labels=y_train,
            batch_size=BATCH_SIZE,
            Steps=STEPS,
            cost_metric=metric,
            opt=optimizer(*opt_params),
            seed=SEED,
            threshold_n_classes=n_classes,
            use_bias=use_bias,
        )

        return {
            **entry_id,
            'TRAIN_METRIC_COST': cost(circuit, weights, bias, metric, X_train, y_train, n_classes, circuit_args),
            'TEST_METRIC_COST':  cost(circuit, weights, bias, metric, X_test,  y_test,  n_classes, circuit_args),
            'TRAIN_ACCURACY': unsupervised_accuracy(circuit, weights, bias, X_train, y_train, n_classes, circuit_args),
            'TEST_ACCURACY': unsupervised_accuracy(circuit, weights, bias, X_test, y_test, n_classes, circuit_args),
            'WEIGHT': weights,
            'BIAS': bias,
            'USE_BIAS': 'YES' if use_bias else 'NO',
        }

    except Exception as e:
        # Best effort: report the failure and still return a row that says
        # which task it was, so one bad task does not abort the whole sweep.
        print('Error at:', entry_id)
        print(f"Error: {str(e)}\nTraceback:\n{traceback.format_exc()}")
        return entry_id

def run_one_task_args(args):
    """Call ``run_one_task(*args)`` and never raise to the caller.

    Parameters
    ----------
    args : iterable
        Positional arguments for ``run_one_task``, in its parameter order.

    Returns
    -------
    dict
        Whatever ``run_one_task`` returns, or an empty dict if it raised.
    """
    import traceback  # local import so the wrapper does not rely on cell order

    try:
        return run_one_task(*args)
    except Exception as e:
        # Keep the best-effort contract (return {}), but say what failed:
        # a bare "ERROR" gives no way to diagnose the task.
        print(f"ERROR: {e!r}\n{traceback.format_exc()}")
        return {}


In [7]:
# At most this many index samples are used per dataset
MAX_SAMPLES = 10
# One argument tuple per run_one_task call, in run_one_task's parameter order
TASKS = []

for use_bias in [True]:
    for qubits in range(1, 5):
        # (wires, measurement type) pairs to try; a pair is dropped when it asks
        # for more measured wires than the circuit has qubits.
        measurements = [
            (wires, kind)
            for wires in [1]
            for kind in ['expval']
            if wires <= qubits
        ]

        for circuit in CIRCUITS[qubits]:
            for n_feat, datasets_by_class in DATASET_BY_FEATURES.items():
                # Amplitude embedding is only used when n_feat == 2**qubits;
                # phase embeddings only when there are enough qubits to load
                # the features (n_feat <= qubits).
                encodings = [
                    candidate for candidate in EMBEDDING
                    if not (candidate == 'amplitude' and n_feat != 2**qubits)
                    and not (candidate.startswith('phase') and n_feat > qubits)
                ]

                for n_classes, datasets_by_name in datasets_by_class.items():
                    for df_name, dataset in datasets_by_name.items():
                        for encoding in encodings:
                            for sample_index, sample in zip(range(MAX_SAMPLES), dataset['samples']):
                                for opt_name, (optimizer, opt_params) in OPTIMIZERS.items():
                                    for metrc_name, metric in METRICS.items():
                                        for wires_to_measure, measure_type in measurements:
                                            TASKS.append((
                                                use_bias, qubits, circuit['path'], circuit['name'], circuit['nparams'],
                                                n_feat, n_classes,
                                                df_name, sample_index, dataset['df'].loc[sample],
                                                encoding, opt_name, optimizer, opt_params,
                                                metrc_name, metric, wires_to_measure, measure_type,
                                            ))

print(len(TASKS))

28080


In [11]:
from local.listDatabases import save_dataframe
import concurrent.futures
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from tqdm import tqdm
import os
# Use all CPUs but one, with a floor of one worker. os.cpu_count() may return
# None when the count cannot be determined; treat that case as a single CPU
# instead of failing on `None - 1`.
MAX_WORKERS = max((os.cpu_count() or 1) - 1, 1)

# Path handed to save_dataframe for the training report
FILENAME = 'reports_data/Ansatz_reduced_training_reports(CostaSH).csv'

def run_task_parallel(tasks):
    """Run every task through ``run_one_task_args`` in a process pool.

    Parameters
    ----------
    tasks : sequence
        Argument tuples, one per ``run_one_task_args`` call. ``len(tasks)``
        sizes the progress bar.

    Returns
    -------
    list
        One result per task, in completion order (not submission order).
        An empty ``tasks`` yields an empty list.

    Notes
    -----
    An exception raised by ``future.result()`` (for instance
    ``BrokenProcessPool``) propagates to the caller.
    NOTE(review): the BrokenProcessPool recorded for this cell may come from
    worker processes being unable to import functions defined in the notebook
    itself -- confirm; if so, move ``run_one_task_args`` to an importable module.
    """
    results = []
    with tqdm(total=len(tasks)) as pbar:
        with ProcessPoolExecutor(max_workers=MAX_WORKERS) as executor:
            futures = [executor.submit(run_one_task_args, task) for task in tasks]
            for future in concurrent.futures.as_completed(futures):
                results.append(future.result())
                pbar.update(1)
    # Without this return the caller receives None and builds an empty report.
    return results

# Run the first four tasks through the process pool, wrap the outcome in a
# DataFrame and hand it to save_dataframe together with the report path.
parallel_entries = run_task_parallel(TASKS[:4])
final_df = pd.DataFrame(parallel_entries)
save_dataframe(final_df, FILENAME)

  0%|          | 0/4 [00:00<?, ?it/s]


BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.

In [10]:
# Same four tasks run one after another in this process, with a progress bar
result = list(map(run_one_task_args, tqdm(TASKS[:4])))

100%|██████████| 4/4 [00:13<00:00,  3.49s/it]
