In [None]:
def print_section(text):
    """Print ``text`` framed above and below by a banner line of ``#`` characters."""
    banner = '##############################################'
    # The prefix and the text are passed as separate arguments so that print's
    # default single-space separator is inserted between them.
    for line_parts in ((banner,), ('###  ', text), (banner,)):
        print(*line_parts)


print_section('Start imports')
import importlib
import os
import sys
import pathlib
sys.path.append(str(pathlib.PurePath(pathlib.Path.cwd().parent)))

import tqdm
import numpy as np
import scipy
import scipy.optimize
import pickle
import pandas as pd
import matplotlib
import matplotlib.cm as cm
import matplotlib.axes as am
import matplotlib.pyplot as plt
%matplotlib widget
import statsmodels.api as sm
import SALib
import SALib.analyze.sobol
import itertools

import dimod
import dwave
import dwave.system
from dwave.system import DWaveSampler, EmbeddingComposite, FixedEmbeddingComposite
import dwave.inspector
import dwave_networkx as dnx
import minorminer

from src.particle_funcs import distance_matrix as distance_matrix
from src.particle_funcs import io as particles_io
import src.leap_funcs.qubo.q_matrix as q_matrix

from src import leap_funcs as leap_funcs
from src.leap_funcs import embedding_quality
from src.leap_funcs.qubo import parameterstudy

from src import h5py_funcs
from src.h5py_funcs import inspections, discoveries, init_custom_getstates, io, parameterstudy_using_info_file

print_section('Finished imports')


In [None]:
# NOTE(review): the imports cell binds `io` via `from src.h5py_funcs import ..., io`;
# this `import io` rebinds the name to the standard-library module from here on.
import io
# Show the default buffer size used by the standard-library io module.
print('io.DEFAULT_BUFFER_SIZE', io.DEFAULT_BUFFER_SIZE)

In [None]:
# Serialization backend of the cached study files. Only 'pickle' is supported:
# 'marshal' unmarshals numpy.ndarray as a bytes object and raises an error when
# marshaling pathlib.Path objects.
read_from = 'pickle'
_supported_deserializers = {'pickle': pickle}
if read_from not in _supported_deserializers:
    # An explicit exception (instead of an assert) also fires when Python runs with -O.
    raise ValueError(f'Unknown read_from value: {read_from}. Only "pickle" is supported '
                     '(marshal unmarshals numpy.ndarray as bytes-object and raises error '
                     'when marshaling pathlib.Path objects).')
deserializer = _supported_deserializers[read_from]

# Directory holding the serialized dictionaries (relative to the working directory).
data_dir = pathlib.Path('03_inspect/01_data')

# Containers that the loading cell fills.
dict_files_information, dict_infos_read, dict_study, dict_success = {}, {}, {}, {}

# Which files to load. 'dict_study' and 'dict_success' are stored as one file per
# study id; flip 'is_to_load' per entry to include or skip that file.
files_to_read = {
    'dict_files_information': {'is_to_load': True},
    'dict_infos_read':        {'is_to_load': True},
    'dict_study': {
        1: {'is_to_load': True},
        2: {'is_to_load': True},
        3: {'is_to_load': True},
        4: {'is_to_load': True},
        5: {'is_to_load': True},
        6: {'is_to_load': True},
        7: {'is_to_load': True},
        8: {'is_to_load': True},
    },
    'dict_success': {
        1: {'is_to_load': False},
        2: {'is_to_load': False},
        3: {'is_to_load': False},
        4: {'is_to_load': False},
        5: {'is_to_load': False},
        6: {'is_to_load': False},
        7: {'is_to_load': False},
        8: {'is_to_load': False},
    },
}

In [None]:
# Load the dictionaries selected in `files_to_read` from `data_dir` and bind them
# to the notebook-level variables of the same name. Entries that are not selected
# are set to None (per study id for 'dict_study' / 'dict_success').
def _load_serialized(file_path):
    """Return the object stored in ``file_path``, read with the configured ``deserializer``."""
    # NOTE: pickle can execute arbitrary code while loading; only read trusted files.
    with open(file_path, 'rb') as f:
        return deserializer.load(f)


for key, val in files_to_read.items():
    if key in ('dict_files_information', 'dict_infos_read'):
        # Stored as a single file named '<key>.pickle'.
        loaded = None
        if val['is_to_load']:
            file_name_path = data_dir.joinpath(key + '.' + 'pickle')
            print(f'Reading {key} from {file_name_path}')
            loaded = _load_serialized(file_name_path)
    elif key in ('dict_study', 'dict_success'):
        # Stored as one file per study id, named '<key>_<study id>.pickle'.
        loaded = {}
        for sub_key, sub_val in val.items():
            if sub_val['is_to_load']:
                file_name_path = data_dir.joinpath(f'{key}_{sub_key}' + '.' + 'pickle')
                print(f'Reading {key}[{sub_key}] from {file_name_path}')
                loaded[sub_key] = _load_serialized(file_name_path)
            else:
                loaded[sub_key] = None
    else:
        raise ValueError(f'Unknown key: {key}. Check files_to_read')
    # Bind the result to the variable named by `key` without resorting to exec().
    globals()[key] = loaded


In [None]:
def compute_sim_annealing_solution(study:int|None=None, substudy:int|None=None, num_particles:int|None=None, num_neighbours:int|None=None, qubos:dict|None=None, num_reads:int=10000, seed:int|None=None):
    """Sample one stored QUBO with simulated annealing and return the results sorted by energy.

    The QUBO is looked up in ``qubos`` if given, otherwise in
    ``dict_infos_read[study][substudy]['qubos']``. The lookup key is built from
    ``num_particles`` and ``num_neighbours``; variants with a leading ``0`` on either
    number are tried as fallbacks.

    Parameters
    ----------
    study, substudy : int, optional
        Indices into ``dict_infos_read``; required when ``qubos`` is not given.
    num_particles : int
        First component of the QUBO key. Required.
    num_neighbours : int, optional
        Second component of the QUBO key; defaults to ``num_particles``.
    qubos : dict, optional
        Mapping from QUBO key to a QUBO description. Each description maps the string
        form of a variable pair to a dict whose ``'data'`` entry is the coefficient.
    num_reads : int
        Number of simulated-annealing reads (default 10000).
    seed : int, optional
        Seed forwarded to the sampler for reproducible runs; omitted when None.

    Returns
    -------
    The ``record`` of the aggregated sample set, sorted in place by its ``'energy'`` field.

    Raises
    ------
    ValueError
        If ``num_particles`` is missing, if neither ``qubos`` nor both ``study`` and
        ``substudy`` are given, or if no matching QUBO key exists.
    """
    print_section('Obtain exact/correct solution via simulated annealing - Start')
    import ast
    import dwave.samplers

    if num_particles is None:
        raise ValueError('num_particles must be given to select a QUBO.')
    if num_neighbours is None:
        num_neighbours = num_particles

    if qubos is None:
        if study is None or substudy is None:
            raise ValueError('Provide either qubos or both study and substudy.')
        _qubos = dict_infos_read[study][substudy]['qubos']
    else:
        _qubos = qubos

    # Keys are tried in this order: plain, both zero-prefixed, only the first
    # zero-prefixed, only the second zero-prefixed.
    qubos_key = f'{num_particles}_{num_neighbours}'
    candidate_keys = (qubos_key,
                      f'0{num_particles}_0{num_neighbours}',
                      f'0{num_particles}_{num_neighbours}',
                      f'{num_particles}_0{num_neighbours}')
    for candidate_key in candidate_keys:
        if candidate_key in _qubos:
            qubo = _qubos[candidate_key]
            break
    else:
        raise ValueError(f'QUBO key {qubos_key} not found in provided qubo dict. Available keys: {list(_qubos.keys())}')

    # The stored QUBO uses string keys; literal_eval turns them back into Python objects.
    qubo_coefficients = {ast.literal_eval(key): value['data'] for key, value in qubo.items()}

    sampler_kwargs = {'num_reads': num_reads}
    if seed is not None:
        sampler_kwargs['seed'] = seed
    sim_annealing_sample = dwave.samplers.SimulatedAnnealingSampler().sample_qubo(
        qubo_coefficients, **sampler_kwargs)
    sim_annealing_sample = sim_annealing_sample.aggregate()  # accumulates number of occurrences
    exact_sol = sim_annealing_sample.record
    exact_sol.sort(order='energy')
    print_section('Obtain correct solution via simulated annealing - Finished')
    return exact_sol
# Try both lookup paths: QUBOs resolved from dict_infos_read via (study, substudy),
# and an explicitly passed QUBO dict with differing particle / neighbour counts.
print(compute_sim_annealing_solution(study=2, substudy=0, num_particles=5, num_neighbours=5))
print(compute_sim_annealing_solution(qubos=dict_infos_read[1][0]['qubos'], num_particles=5, num_neighbours=3))

In [None]:
# Print, for every element of study 2, the element index together with each of its keys.
for i, v in enumerate(dict_study[2]):
    for k2, v2 in v.items():
        print(i, k2)

In [None]:
# Replace every entry of every loaded substudy by its 'custom' sub-dictionary.
# Entries without a 'custom' key are left untouched, so re-running this cell is a
# no-op instead of a KeyError.
# NOTE(review): assumes the unwrapped payload itself has no 'custom' key — confirm.
for study_key in dict_study.keys():
    if dict_study[study_key] is None:
        continue
    for i, v in enumerate(dict_study[study_key]):
        print(study_key, i)
        n_left_untouched = 0
        for k2, v2 in v.items():
            if isinstance(v2, dict) and 'custom' in v2:
                # Only values of existing keys are replaced, so the dict size stays
                # constant while iterating.
                dict_study[study_key][i][k2] = v2['custom']
            else:
                n_left_untouched += 1
        if n_left_untouched:
            print(f'  {n_left_untouched} entries without a custom key left untouched')
# for k2, v2 in dict_study[2][0].items():
#     print(i, k2, list(v2.keys()), list(v2['custom'].keys()))
#     dict_study[2][0][k2] = v2['custom']
    #del v2['custom']

In [None]:

# Print the two-level key structure of study 2: element index and entry key,
# followed by the keys found inside that entry.
for i, v in enumerate(dict_study[2]):
    for k2, v2 in v.items():
        print(i, k2)
        for k3, v3 in v2.items():
            print(' ', k3)

In [None]:
# Re-import the inspections module and its stats attribute so that source edits
# take effect without restarting the kernel.
importlib.reload(inspections)
importlib.reload(inspections.stats)

In [None]:
# Build a DataFrame from study 2, substudy 0: one row per top-level key, with that
# key exposed as the column 'set_id'.
df = pd.DataFrame.from_dict(dict_study[2][0], orient='index').reset_index(names=['set_id'])
#df.head()

In [None]:
# Per study: build one parameter table per substudy from the info files, and group
# consecutive substudies whose parameter tables are identical. The groups are stored
# as lists of substudy ids in dict_substudies_need_merge.
dict_df_params_from_info = {}
dict_substudies_need_merge = {}
for key, val in dict_study.items():
    print(key)
    if val is None:
        # Study not loaded: keep placeholders.
        dict_df_params_from_info[key] = None
        dict_substudies_need_merge[key] = None
        continue

    study_id = key
    dict_df_params_from_info[key] = []
    dict_substudies_need_merge[key] = [[0]]  # substudy 0 always opens the first group
    for substudy_id in range(len(dict_infos_read[study_id])):
        study_info = dict_infos_read[study_id][substudy_id]['study']['data']
        study_info_data_sets = study_info['sets']
        dtype_names = study_info_data_sets.dtype.names
        print(dtype_names)
        if study_info_data_sets.ndim == 1:
            # np.atleast_2d maps (n,) -> (1, n); transpose to (n, 1) to be consistent.
            study_info_data_sets = np.atleast_2d(study_info_data_sets).T

        # Identifier column joined with the first column of the parameter sets by row position.
        identifiers_df = pd.DataFrame(study_info['identifiers'], columns=['identifiers'])
        first_set_df = pd.DataFrame(study_info_data_sets[:,0])
        params_df = identifiers_df.merge(first_set_df, left_index=True, right_index=True)
        # Identifiers are stored as bytes; decode them to str.
        params_df['identifiers'] = params_df['identifiers'].apply(lambda raw_id: raw_id.decode('utf-8'))
        dict_df_params_from_info[key].append(params_df)

        _shape = params_df.shape
        if substudy_id == 0:
            print(_shape)
            continue
        _is_equal_to_previous = params_df.equals(dict_df_params_from_info[study_id][substudy_id-1])
        if _is_equal_to_previous:
            dict_substudies_need_merge[study_id][-1].append(substudy_id)
        else:
            dict_substudies_need_merge[study_id].append([substudy_id])
        print(_shape, '  is equal to previous substudy:', _is_equal_to_previous)
dict_substudies_need_merge

In [None]:


# Merge the result dictionaries of all substudies that belong to the same group
# (see dict_substudies_need_merge) into one dictionary per group.
dict_study_merged_substudies = {}
for key, val in dict_study.items():
    print(key)
    dict_study_merged_substudies[key] = []
    if val is None:
        # Study not loaded: leave the list of merged groups empty.
        continue
    if len(dict_substudies_need_merge[key]) == 0:
        print(f'Study {key} does not require merge.')
        dict_study_merged_substudies[key].append(dict_study[key][0])
        continue

    for ids_to_merge in dict_substudies_need_merge[key]:
        merged_ids = []
        merged_dict = {}
        print(f'IDs to merge: {ids_to_merge}')
        # `merge_id` instead of `id`, so the builtin id() is not shadowed in the notebook.
        for merge_id in ids_to_merge:
            print(f'ID {merge_id} not merged, merge now to', merged_ids)
            substudy_dict = dict_study[key][merge_id]
            # Refuse to overwrite: a key may only come from one substudy of the group.
            for existing_key in merged_dict:
                if existing_key in substudy_dict:
                    raise ValueError(f'Problem merging substudy {merge_id} because {existing_key} already exists in merged_dict.')
            merged_dict.update(substudy_dict)
            merged_ids.append(merge_id)
        dict_study_merged_substudies[key].append(merged_dict)

In [None]:
# Print the number of merged substudy groups per study.
for key in dict_study_merged_substudies.keys():
    print(len(dict_study_merged_substudies[key]))

In [None]:
# Success statistics for study 2, substudy 0 (unmerged), compared against a
# simulated-annealing reference for 5 particles; the last line displays the resulting keys.
dict_success_2_local = inspections.extract_success_dict(dict_for_df=dict_study[2][0],
                                                        exact_sols=compute_sim_annealing_solution(study=2, substudy=0, num_particles=5),
                                                        n_samples_to_compare=10, n_exact_sols_to_compare=10, is_skip_custom_key_in_dict_for_df=True)
list(dict_success_2_local.keys())

In [None]:
# Cache of simulated-annealing reference solutions, keyed first by study id.
# Re-running this cell empties the cache.
_cache_dict_sim_anneling_solutions = {}

In [None]:
def extract_success_dict_study_1(dict_for_df:dict=None, n_samples_to_compare:int=0, n_exact_sols_to_compare:int=0,
                         is_skip_custom_key_in_dict_for_df:bool=False, is_print_sols:bool=False, is_print_meta:bool=False, print_prefix:str=' '):
    """Build the success dictionaries for study 1, one parameter set (row) at a time.

    For every row of ``dict_infos_read[1][0]['study']['data']`` the particle count,
    the nearest-neighbour count and the identifier are read. The matching
    simulated-annealing reference solution is computed once and cached in
    ``_cache_dict_sim_anneling_solutions[1]`` under the row index. Then
    ``inspections.extract_success_dict`` is called on the single entry of
    ``dict_for_df`` that belongs to the decoded identifier.

    Parameters
    ----------
    dict_for_df : dict
        Study results keyed by the decoded (str) set identifier.
    n_samples_to_compare, n_exact_sols_to_compare, is_skip_custom_key_in_dict_for_df,
    is_print_sols, is_print_meta : forwarded unchanged to
        ``inspections.extract_success_dict``.
    print_prefix : str
        Forwarded with one extra trailing space.

    Returns
    -------
    dict
        Union of the dictionaries returned by ``inspections.extract_success_dict``.
    """
    study_data = dict_infos_read[1][0]['study']['data']
    n_rows = study_data.shape[0]
    # Create the study-1 cache on first use instead of relying on another cell.
    cache_study_1 = _cache_dict_sim_anneling_solutions.setdefault(1, {})
    dict_success_dicts = {}
    # n_rows is computed beforehand: nesting the same quote type inside an f-string
    # is a syntax error before Python 3.12.
    print(f'Extract {n_rows} success dicts from study 1...')
    for row_id in range(n_rows):
        _num_particles = study_data['sets'][row_id]['num_particles'][0]
        _num_neighbours = study_data['sets'][row_id]['num_nearest_neighbours'][0]
        _id = study_data['identifiers'][row_id]
        print(f'  Processing num_particles={_num_particles}, num_neighbours={_num_neighbours} for id={_id}')
        if row_id in cache_study_1:
            print(f'  Found cached solution for id={_id} (row_id: {row_id})')
        else:
            cache_study_1[row_id] = compute_sim_annealing_solution(qubos=dict_infos_read[1][0]['qubos'], num_particles=_num_particles, num_neighbours=_num_neighbours)

        # Identifiers are stored as bytes; dict_for_df is keyed by the decoded str.
        _id_decoded = _id.decode('utf-8')
        dict_success_dicts.update(inspections.extract_success_dict(dict_for_df={_id_decoded: dict_for_df[_id_decoded]},
                                                                   exact_sols=cache_study_1[row_id],
                                                                   n_samples_to_compare=n_samples_to_compare,
                                                                   n_exact_sols_to_compare=n_exact_sols_to_compare,
                                                                   is_skip_custom_key_in_dict_for_df=is_skip_custom_key_in_dict_for_df,
                                                                   is_print_sols=is_print_sols, is_print_meta=is_print_meta, print_prefix=print_prefix + ' '))
    return dict_success_dicts
    

In [None]:
# samples_data = dict_study_merged_substudies[2][0]['zz_4593727025']['sampleset']['000_0142']['_record']['data']
# list_dfs = []
# for run_sub_key, run_sub in  dict_study_merged_substudies[2][0]['zz_4593727025']['sampleset'].items():
#     list_dfs.append(pd.DataFrame(run_sub['_record']['data'].tolist(), columns=run_sub['_record']['data'].dtype.names))
# combined_df = pd.concat(list_dfs)
# combined_df['sample'] = combined_df['sample'].apply(lambda x: tuple(x))
# res_df = combined_df.groupby('sample', as_index=False).aggregate({'energy':'first',	'num_occurrences':'sum','chain_break_fraction': 'first'})
# res_df.sort_values('energy')

In [None]:
# Compute the success dictionaries for every merged substudy group of every study.
# Study 1 varies the problem size per parameter set and is therefore handled by
# extract_success_dict_study_1; all other studies use one 5-particle reference
# solution per merged group.
dict_success_local_from_merged_studies = {}
# Numbers of best samples / best reference solutions to compare,
# e.g. (1,2,3,4) for several combinations.
n_samples_to_compare = (4,)
n_exact_sols_to_compare = (4,)
is_skip_custom_key_in_dict_for_df = True
for study_key in dict_study_merged_substudies.keys():
    _cache_dict_sim_anneling_solutions.setdefault(study_key, {})
    dict_success_local_from_merged_studies[study_key] = {}
    print(f'Processing study {study_key}...')
    if dict_study_merged_substudies[study_key] is None:
        dict_success_local_from_merged_studies[study_key] = None
        continue

    if study_key != 1:
        for merged_substudies_key in range(len(dict_study_merged_substudies[study_key])):
            print(f'  Processing merged substudies {merged_substudies_key}...i.e. substudies {dict_substudies_need_merge[study_key][merged_substudies_key]}')
            if merged_substudies_key not in _cache_dict_sim_anneling_solutions[study_key]:
                # The first substudy of the group supplies the QUBO for the reference solution.
                _cache_dict_sim_anneling_solutions[study_key][merged_substudies_key] = compute_sim_annealing_solution(
                    study=study_key,
                    substudy=dict_substudies_need_merge[study_key][merged_substudies_key][0],
                    num_particles=5)

            for i, (_n_sam, _n_ex) in enumerate(itertools.product(n_samples_to_compare, n_exact_sols_to_compare)):
                if _n_sam > _n_ex:
                    continue

                print(f'  Comparing samples: {_n_sam} with exact solutions: {_n_ex}')
                _kwargs = {'dict_for_df': dict_study_merged_substudies[study_key][merged_substudies_key],
                           'exact_sols': _cache_dict_sim_anneling_solutions[study_key][merged_substudies_key],
                           'n_samples_to_compare': _n_sam, 'n_exact_sols_to_compare': _n_ex,
                           'is_skip_custom_key_in_dict_for_df': is_skip_custom_key_in_dict_for_df}

                if i != 0:
                    # Previously guarded by `assert False` followed by unreachable code.
                    raise NotImplementedError(
                        'Combining several (n_samples_to_compare, n_exact_sols_to_compare) pairs '
                        'is not implemented for studies other than study 1.')
                dict_success_local_from_merged_studies[study_key][merged_substudies_key] = inspections.extract_success_dict(**_kwargs)
    else:
        for merged_substudies_key in range(len(dict_study_merged_substudies[study_key])):
            dict_success_local_from_merged_studies[study_key][merged_substudies_key] = {}

            for i, (_n_sam, _n_ex) in enumerate(itertools.product(n_samples_to_compare, n_exact_sols_to_compare)):
                if _n_sam > _n_ex:
                    continue

                print(f'  Comparing samples: {_n_sam} with exact solutions: {_n_ex}')
                # NOTE(review): always reads merged group 0, not merged_substudies_key —
                # confirm this is intended for study 1.
                _kwargs = {'dict_for_df': dict_study_merged_substudies[study_key][0],
                           'n_samples_to_compare': _n_sam,
                           'n_exact_sols_to_compare': _n_ex,
                           'is_skip_custom_key_in_dict_for_df': is_skip_custom_key_in_dict_for_df}
                if i == 0:
                    dict_success_local_from_merged_studies[study_key][merged_substudies_key] = extract_success_dict_study_1(**_kwargs)
                else:
                    # Later combinations only add keys that are not present yet.
                    for row_id, val in extract_success_dict_study_1(**_kwargs).items():
                        _existing = dict_success_local_from_merged_studies[study_key][merged_substudies_key][row_id]
                        _existing.update({k: v for k, v in val.items() if k not in _existing})

In [None]:
# Largest 'num_samples_matched_corrected_1_2' value over all entries of study 5, merged group 0.
max(val['num_samples_matched_corrected_1_2'] for val in dict_success_local_from_merged_studies[5][0].values())

In [None]:
# Per entry of study 5 / merged group 0: print the sample count, the best-solution
# fraction and all 'matched_corrected' counters and fractions.
_summary_keys = ('num_samples', 'fraction_samples_is_found_best')
_summary_prefixes = ('num_samples_matched_corrected_', 'fraction_samples_matched_corrected_')
for k, v in dict_success_local_from_merged_studies[5][0].items():
    print(k)
    for kk, vv in v.items():
        if kk in _summary_keys or kk.startswith(_summary_prefixes):
            print('...', kk, vv)

In [None]:
# Per entry of study 2 / merged group 0: print the sample count, the best-solution
# fraction and all 'matched_corrected' counters and fractions.
dict_ids = {}
_summary_keys = ('num_samples', 'fraction_samples_is_found_best')
_summary_prefixes = ('num_samples_matched_corrected_', 'fraction_samples_matched_corrected_')
for k, v in dict_success_local_from_merged_studies[2][0].items():
    print(k)
    for kk, vv in v.items():
        if kk in _summary_keys or kk.startswith(_summary_prefixes):
            print('...', kk, vv)


In [None]:
# dict_success_local_from_merged_studies = {}
# n_samples_to_compare = 10
# n_exact_sols_to_compare = 10
# is_skip_custom_key_in_dict_for_df = True
# for study_key in dict_study_merged_substudies.keys():
#     dict_success_local_from_merged_studies[study_key] = {}
#     print(f'Processing study {study_key}...')
#     if dict_study_merged_substudies[study_key] is not None and study_key != 1:
#         for merged_substudies_key in range(len(dict_study_merged_substudies[study_key])):
#             print(f'  Processing merged substudies {merged_substudies_key}...i.e. substudies {dict_substudies_need_merge[study_key][merged_substudies_key]}')
#             dict_success_local_from_merged_studies[study_key][merged_substudies_key] = inspections.extract_success_dict(dict_for_df=dict_study_merged_substudies[study_key][merged_substudies_key],
#                                                         exact_sols=compute_sim_annealing_solution(study=study_key, substudy=dict_substudies_need_merge[study_key][merged_substudies_key][0], num_particles=5),
#                                                         n_samples_to_compare=n_samples_to_compare, n_exact_sols_to_compare=n_exact_sols_to_compare, is_skip_custom_key_in_dict_for_df=is_skip_custom_key_in_dict_for_df)
#     elif dict_study_merged_substudies[study_key] is not None and study_key == 1:
        
#         for merged_substudies_key in range(len(dict_study_merged_substudies[study_key])):
#             dict_success_local_from_merged_studies[study_key][merged_substudies_key] = {}
#             dict_success_local_from_merged_studies[study_key][merged_substudies_key] = extract_success_dict_study_1(dict_for_df=dict_study_merged_substudies[study_key][0], n_samples_to_compare=n_samples_to_compare, 
#                                                                                 n_exact_sols_to_compare=n_exact_sols_to_compare, is_skip_custom_key_in_dict_for_df=is_skip_custom_key_in_dict_for_df)
#     else:
#         dict_success_local_from_merged_studies[study_key] = None

In [None]:
# Set identifiers available for study 1 / merged group 0, and the result keys of one
# hard-coded identifier.
print(list(dict_success_local_from_merged_studies[1][0].keys()))
print(list(dict_success_local_from_merged_studies[1][0]['zz_5703083384'].keys()))

In [None]:
# Set identifiers of study 3 / merged group 0, and the success entry of one
# hard-coded identifier.
print(list(dict_study_merged_substudies[3][0].keys()))
print(dict_success_local_from_merged_studies[3][0]['zz_0594500236'])

In [None]:
# Number of success entries for study 3 / merged group 0.
len(dict_success_local_from_merged_studies[3][0])

In [None]:
# Tabulate the unmerged study-2 success dictionary: one row per set, key exposed as 'set_id'.
df_success_2_local = pd.DataFrame.from_dict(dict_success_2_local, orient='index').reset_index(names=['set_id'])
print(df_success_2_local.head())

In [None]:
# Tabulate the success dictionary of a merged group: one row per set, key exposed as 'set_id'.
# NOTE(review): the variable name says study 2, but the data is read from study 3,
# merged group 0 — confirm which is intended.
df_success_2_merged_local = pd.DataFrame.from_dict(dict_success_local_from_merged_studies[3][0], orient='index').reset_index(names=['set_id'])
print(df_success_2_merged_local.head())

In [None]:
# dict_df_success_local =  {}
# for study_key in dict_study.keys():
#     dict_df_success_local[study_key] = {}
#     print(f'Processing study {study_key}...')
#     if dict_success_local[study_key] is not None:
#         for substudy_key in range(len(dict_success_local[study_key])):
#             print(f'  Processing substudy {substudy_key}...')
#             dict_df_success_local[study_key][substudy_key] = pd.DataFrame.from_dict(dict_success_local[study_key][substudy_key], orient='index').reset_index(names=['set_id'])
#             print(dict_df_success_local[study_key][substudy_key].head())
#     else:
#         dict_df_success_local[study_key] = None

In [None]:
# Convert every merged-group success dictionary into a DataFrame
# (one row per set, key exposed as 'set_id'); studies without results map to None.
dict_df_success_local_from_merged_studies = {}
for study_key in dict_study:
    _dfs_per_group = {}
    dict_df_success_local_from_merged_studies[study_key] = _dfs_per_group
    print(f'Processing study {study_key}...')
    _success_per_group = dict_success_local_from_merged_studies[study_key]
    if _success_per_group is None:
        dict_df_success_local_from_merged_studies[study_key] = None
        continue
    for merged_substudies_key in range(len(_success_per_group)):
        print(f'  Processing merged substudies {merged_substudies_key}...')
        _df_group = pd.DataFrame.from_dict(_success_per_group[merged_substudies_key], orient='index')
        _df_group = _df_group.reset_index(names=['set_id'])
        _dfs_per_group[merged_substudies_key] = _df_group
        print(_df_group.shape)
#

In [None]:
# Inspect the best-performing rows of study 5 / merged group 0.
# NOTE: the value of the first expression is discarded, because a notebook cell
# only displays its last expression.
dict_df_success_local_from_merged_studies[5][0]['fraction_samples_is_found_best'].max()
print(dict_df_success_local_from_merged_studies[5][0]['fraction_samples_matched_corrected_1_2'].idxmax())
print(dict_df_success_local_from_merged_studies[5][0]['fraction_samples_matched_corrected_1_1'].idxmax())
# NOTE(review): 335 is a hard-coded row position — presumably copied from an earlier
# idxmax output; confirm it still matches the data.
dict_df_success_local_from_merged_studies[5][0]['fraction_samples_matched_corrected_1_2'].iloc[335]


In [None]:
# Attach the parameters of study 2 / substudy 0 to the success table. The right join
# on identifiers == set_id keeps every row of the success table.
df_merged_params_success_2 = dict_df_params_from_info[2][0].merge(df_success_2_local, how='right', left_on='identifiers', right_on='set_id')
print(df_merged_params_success_2.shape)
df_merged_params_success_2.head()

In [None]:
#dict_df_merged_params_success= {}
##dict_df_success_local =  {}
#for study_key in dict_df_success_local.keys():
#    dict_df_merged_params_success[study_key] = {}
#    print(f'Processing study {study_key}...')
#    if dict_df_success_local[study_key] is not None:
#        for substudy_key in range(len(dict_df_success_local[study_key])):
#            print(f'  Processing substudy {substudy_key}...')
#            dict_df_merged_params_success[study_key][substudy_key] = dict_df_params_from_info[study_key][0].merge(dict_df_success_local[study_key][substudy_key], how='right', left_on='identifiers', right_on='set_id')
#            print(dict_df_merged_params_success[study_key][substudy_key].shape)
#            #print(dict_df_merged_params_success[study_key][substudy_key].head())
#    else:
#        dict_df_merged_params_success[study_key] = None



In [None]:
# Preview the parameter table of study 2, substudy 0.
dict_df_params_from_info[2][0].head()

In [None]:
# Preview the success table of study 2, merged group 0.
dict_df_success_local_from_merged_studies[2][0].head()

In [None]:
# Join, per study and merged group, the parameter table of the group's first substudy
# onto the group's success table (right join on identifiers == set_id, so every
# success row is kept). Studies without success tables map to None.
dict_df_merged_params_success_from_merged_studies = {}
for study_key in dict_df_success_local_from_merged_studies.keys():
    print(f'Processing study {study_key}...')
    _df_success_per_group = dict_df_success_local_from_merged_studies[study_key]
    # Test the source tables, not the freshly created (never None) target dict.
    if _df_success_per_group is None:
        dict_df_merged_params_success_from_merged_studies[study_key] = None
        continue

    dict_df_merged_params_success_from_merged_studies[study_key] = {}
    for merged_substudies_key in range(len(_df_success_per_group)):
        print(f'  Processing substudy {merged_substudies_key}...')
        # All substudies of a group share the same parameter table; use the first one.
        _first_substudy_id = dict_substudies_need_merge[study_key][merged_substudies_key][0]
        _df_merged = dict_df_params_from_info[study_key][_first_substudy_id].merge(
            _df_success_per_group[merged_substudies_key],
            how='right', left_on='identifiers', right_on='set_id')
        dict_df_merged_params_success_from_merged_studies[study_key][merged_substudies_key] = _df_merged
        print(_df_merged.shape)


In [None]:
# Preview, for study 7 / merged group 0, the best-solution fraction together with the
# 'fraction_samples_matched_{i}_{j}' columns of every configured (i, j) pair with j >= i.
dict_df_merged_params_success_from_merged_studies[7][0][['fraction_samples_is_found_best'] + [f'fraction_samples_matched_{i}_{j}' for i,j in itertools.product(n_samples_to_compare, n_exact_sols_to_compare) if j>=i]].head()

In [None]:
# Scale the listed columns of a copy of the merged study-2 table by their column
# maximum, so that the largest value of each becomes 1.
df_merged_params_success_2_normalized = df_merged_params_success_2.copy()
for _column in ('annealing_time',
                'programming_thermalization',
                'readout_thermalization',
                'estimated_runtime',
                'fraction_samples_is_found_best'):
    df_merged_params_success_2_normalized[_column] /= df_merged_params_success_2_normalized[_column].max()
df_merged_params_success_2_normalized.head()

In [None]:
# Pearson correlation matrix of the numeric columns.
df_merged_params_success_2_normalized.corr(method='pearson', numeric_only=True)

In [None]:
# Kendall rank correlation matrix of the numeric columns.
df_merged_params_success_2_normalized.corr(method='kendall', numeric_only=True)

In [None]:
# Spearman rank correlation matrix of the numeric columns.
df_merged_params_success_2_normalized.corr(method='spearman', numeric_only=True)

In [None]:
# Covariance matrix of the numeric columns, with ddof=0 (divisor N).
df_merged_params_success_2_normalized.cov(numeric_only=True, ddof=0)

In [None]:
# Per-column variance of the numeric columns, with ddof=0 (divisor N).
df_merged_params_success_2_normalized.var(numeric_only=True, ddof=0)

In [None]:
# Keep only the numeric columns of the normalized table for the statistics below.
df_merged_params_success_2_normalized_only_numeric_cols = df_merged_params_success_2_normalized.select_dtypes(include=[np.number])
df_merged_params_success_2_normalized_only_numeric_cols.head()

In [None]:
# Print each numeric column name next to the corresponding element of the
# Durbin-Watson result computed on the whole table.
for i, j, in zip(df_merged_params_success_2_normalized_only_numeric_cols.columns,
                                   sm.stats.stattools.durbin_watson(df_merged_params_success_2_normalized_only_numeric_cols)):
    print(f'{i}: {j}')

In [None]:
# Jarque-Bera normality test per numeric column.
# jarque_bera returns four results (test statistic, p-value, skewness, kurtosis),
# each holding one value per column. They are unpacked first and then indexed per
# column, rather than zipping the 4-tuple against the column names.
_jb_stat, _jb_pvalue, _jb_skew, _jb_kurtosis = (
    np.atleast_1d(_result)
    for _result in sm.stats.stattools.jarque_bera(df_merged_params_success_2_normalized_only_numeric_cols))
for _col_idx, _col in enumerate(df_merged_params_success_2_normalized_only_numeric_cols.columns):
    print(f'{_col}: JB={_jb_stat[_col_idx]}, p-value={_jb_pvalue[_col_idx]}, '
          f'skew={_jb_skew[_col_idx]}, kurtosis={_jb_kurtosis[_col_idx]}')

In [None]:
# Create the project's univariate_statistics helper for the numeric columns and let it
# compute all of its statistics.
unistats= inspections.stats.univariate_statistics(df_merged_params_success_2_normalized_only_numeric_cols)
unistats.compute_all_statistics()

In [None]:
# Show which statistics are available, then the entry stored under key 3 of the
# biased moments and the variance entry.
print(list(unistats.statistics.keys()))
print(unistats.statistics['nth_moment_biased'][3])
print(unistats.statistics['variance'])

In [None]:
# Per numeric column: print all stored biased moments, followed by all stored L-moments.
with np.printoptions(precision=3, linewidth=200, edgeitems=3):
    _moments_by_order = unistats.statistics['nth_moment_biased']
    _lmoments_by_order = unistats.statistics['nth_lmoment']
    # `col_idx` instead of `id`, so the builtin id() is not shadowed in the notebook.
    for col_idx, col in enumerate(df_merged_params_success_2_normalized_only_numeric_cols.columns):
        moments = np.array(tuple(_moments_by_order[nth][col_idx] for nth in _moments_by_order.keys()))
        lmoments = np.array(tuple(_lmoments_by_order[nth][col_idx] for nth in _lmoments_by_order.keys()))
        print(f'{col}:\n', moments, '\n', lmoments)
    

In [None]:
# Per numeric column: print the number of geometric means available, then the
# column's geometric and harmonic mean.
with np.printoptions(precision=3, linewidth=200, edgeitems=3):
    # Loop-invariant, so built once outside the loop.
    gmeans = np.array(tuple(unistats.statistics['gmean']))
    # `col_idx` instead of `id`, so the builtin id() is not shadowed in the notebook.
    for col_idx, col in enumerate(df_merged_params_success_2_normalized_only_numeric_cols.columns):
        print(f'{col}:', len(gmeans), np.array([unistats.statistics['gmean'][col_idx], unistats.statistics['hmean'][col_idx]]))

In [None]:
# Per numeric column: print the number of entropy values available, the column's
# entropy and the column's cross-entropy entry.
with np.printoptions(precision=3, linewidth=200, edgeitems=3):
    # Loop-invariant, so built once outside the loop.
    entropy = np.array(unistats.statistics['entropy'])
    # `col_idx` instead of `id`, so the builtin id() is not shadowed in the notebook.
    for col_idx, col in enumerate(df_merged_params_success_2_normalized_only_numeric_cols.columns):
        crossentropy = np.array(unistats.statistics['cross_entropy'][col_idx])
        print(f'{col}:', len(entropy), entropy[col_idx], crossentropy)

In [None]:
# Count how many 'fraction_samples_is_found_best' values fall into each bin (default
# binning); `values` is None because only counts are requested.
# NOTE(review): sp_stats is presumably scipy.stats re-exported by inspections.stats — confirm.
inspections.stats.sp_stats.binned_statistic(df_merged_params_success_2['fraction_samples_is_found_best'], None, statistic='count')

In [None]:
# 10 x 10 two-dimensional count of parameter sets over (annealing_time, readout_thermalization),
# with the bin range spanning the observed minimum and maximum of each axis.
inspections.stats.sp_stats.binned_statistic_2d(x = df_merged_params_success_2['annealing_time'],
                                               y = df_merged_params_success_2['readout_thermalization'], 
                                               values= df_merged_params_success_2['fraction_samples_is_found_best'],
                                               bins=[10,10],
                                               range=[[df_merged_params_success_2['annealing_time'].min(), df_merged_params_success_2['annealing_time'].max()],
                                                       [df_merged_params_success_2['readout_thermalization'].min(), df_merged_params_success_2['readout_thermalization'].max()]],
                                               statistic='count')

In [None]:
# dict_df_merged_params_success[5][0].select_dtypes(include=[np.number]).to_numpy().shape

In [None]:
# inspections.stats.sp_stats.binned_statistic_dd(sample = dict_df_merged_params_success[5][0].select_dtypes(include=[np.number]).to_numpy(),
#                                                values = dict_df_merged_params_success[5][0]['fraction_samples_is_found_best'], 
#                                                #bins=[10,10],
#                                                statistic='count')

In [None]:
# Display the two parameter columns that were binned above.
df_merged_params_success_2[['annealing_time', 'readout_thermalization']]

In [None]:
# Build one SALib problem definition ('num_vars', 'names', 'bounds') per study
# and merged substudy. Studies whose merged frame is None get a None entry.
dict_salib_problem = {}
for study_key in dict_df_merged_params_success_from_merged_studies.keys():
    dict_salib_problem[study_key] = {}
    print(f'Processing study {study_key}...')
    _merged_frames = dict_df_merged_params_success_from_merged_studies[study_key]
    if _merged_frames is not None and study_key > 1:
        for merged_substudies_key in range(len(_merged_frames)):
            print(f'  Processing merged substudies {merged_substudies_key}...')
            # The first substudy of every merge group supplies the attributes.
            _initial_substudy_key_for_info = dict_substudies_need_merge[study_key][merged_substudies_key][0]
            _attrs = dict_infos_read[study_key][_initial_substudy_key_for_info]['attrs']
            if study_key not in (6, 7, 8):
                _problem = {'num_vars': _attrs['num_vars'],
                            'names':    [n for n in _attrs['names'] if n != 'estimated_runtime'],
                            'bounds':   _attrs['bounds']}
            else:
                # Studies 6-8: drop every name starting with 't' or 's' and add
                # the two curve-fit parameters 'a' and 'b' instead.
                _names = [n for n in _attrs['names']
                          if (n != 'estimated_runtime') and (n[0] not in ('t', 's'))]
                _names.extend(['a', 'b'])
                # Keep the bounds of the retained leading parameters, then append the (a, b) bounds.
                _bounds = np.append(_attrs['bounds'][:len(_names) - 2],
                                    np.array([[-1.0, 1.0], [0.0625, 1.0]]), axis=0)
                _problem = {'num_vars': len(_names), 'names': _names, 'bounds': _bounds}
            dict_salib_problem[study_key][merged_substudies_key] = _problem

            assert _attrs['names'][-1] == 'estimated_runtime'
            for kkk, vvv in _problem.items():
                print(f'    {kkk}: {vvv}')
    elif _merged_frames is not None and study_key == 1:
        print('There is only a single substudy.')
        _attrs = dict_infos_read[study_key][0]['attrs']
        _num_particles = dict_infos_read[study_key][0]['study']['data']['sets']['num_particles']
        _first_name = _attrs['names'][0]
        # The name is bound to a variable first: quoting it inside the f-string
        # with the same quote character only parses on Python >= 3.12.
        dict_salib_problem[study_key][0] = {
            'num_vars': _attrs['names'].shape[0],
            'names':    [_first_name, f'Fraction_of_{_first_name}'],
            'bounds':   np.array([[_num_particles.min(), _num_particles.max()],
                                  [0.4, 1.0]])}
        for kkk, vvv in dict_salib_problem[study_key][0].items():
            print(f'    {kkk}: {vvv}')
    else:
        dict_salib_problem[study_key] = None
    

In [None]:


def _curvefit_annealing_schedule(t, s):
    """
    Fit a curve to the annealing schedule data.
    cuve is of the form: a * exp(b * t) + c, with a {-1.0, 1.0}, b {0.0625, 1.0}, c =-a
    :param t: time points {0.0, anneal_time}
    :param s: anneal magnitude {0.0, 1.0}
    :return: Fitted parameters
    """
    def _func(x, a, b):
        return a * np.exp(-b * x) - a
    ab_array = np.zeros((t.shape[0], 2))
    for i in range(t.shape[0]):
        ab_array[i, :] = scipy.optimize.curve_fit(_func, t[i,:], s[i,:], bounds=([-1.0, 0.0625], [1.0, 1.0]))[0]
    #print('Fitted parameters (a, b):', ab_array)
    return ab_array
    

# Run a Sobol analysis per study and merged substudy.
# dict_salib_analyses[study_key] is a list with one result per merged substudy,
# or None when no success data or no problem definition exists for the study.
dict_salib_analyses = {}
for study_key in dict_salib_problem.keys():
    dict_salib_analyses[study_key] = []
    print(f'Processing study {study_key}...')
    if dict_df_success_local_from_merged_studies[study_key] is None or dict_salib_problem[study_key] is None:
        # Flag the *analyses* entry; the problem definitions are left untouched
        # so that cells reading dict_salib_problem keep working.
        dict_salib_analyses[study_key] = None
        continue

    for merged_substudies_key in range(len(dict_df_success_local_from_merged_studies[study_key])):
        print(f'  Processing merged_substudies {merged_substudies_key}...i.e. substudies {dict_substudies_need_merge[study_key][merged_substudies_key]}')
        _a = SALib.ProblemSpec(dict_salib_problem[study_key][merged_substudies_key])
        _num_vars = _a['num_vars']
        # NOTE(review): the samples are looked up by the merged-substudy index,
        # not by the first substudy id of the merge group - confirm this is intended.
        _samples = dict_infos_read[study_key][merged_substudies_key]['study']['data']['sets']
        if _samples.ndim == 1:
            _samples = np.atleast_2d(_samples).T  # np.atleast_2d gives (1, n); (n, 1) is needed

        if study_key == 1:
            _samples = _samples.view(np.int32)
            # Append one all-zero row (padding kept as in the original workflow).
            _samples = np.pad(_samples, ((0, 1), (0, 0)), 'constant', constant_values=(0, 0))
            print(_samples.view(np.int32).shape)
            _a.set_samples(_samples.view(np.int32))
        elif study_key in (6, 7, 8):
            _samples_float = _samples.view(np.float64)
            print(_samples_float.shape)
            offset = 8 if study_key != 6 else 0
            # Two consecutive groups of 12 columns are passed as (t, s) to the curve fit
            # (presumably the t00..t11 / s00..s11 schedule points - confirm).
            ab_array = _curvefit_annealing_schedule(_samples_float[:, offset:offset+12],
                                                    _samples_float[:, offset+12:offset+12+12])
            _df_params = dict_df_merged_params_success_from_merged_studies[study_key][merged_substudies_key]
            _col_id_to_insert = _df_params.columns.get_loc('t00')
            # Insert 'a' and 'b' in front of 't00'. On a re-run of this cell the
            # columns already exist, so they are overwritten instead of inserted
            # (DataFrame.insert raises on duplicate column names).
            for _shift, _fit_name in enumerate(('a', 'b')):
                if _fit_name in _df_params.columns:
                    _df_params[_fit_name] = ab_array[:, _shift]
                else:
                    _df_params.insert(_col_id_to_insert + _shift, _fit_name, ab_array[:, _shift])
            # Keep the leading num_vars-2 parameter columns and append the fitted (a, b).
            _samples = np.append(_samples_float[:, :_num_vars-2], ab_array, axis=1)
            _a.set_samples(_samples)
        else:
            # Drop the last column; the problem names exclude 'estimated_runtime'.
            _samples = _samples.view(np.float64)[:, :-1]
            if study_key == 5 and merged_substudies_key in (0, 1):
                # Append two all-zero rows (padding kept as in the original workflow).
                _samples = np.pad(_samples, ((0, 2), (0, 0)), 'constant', constant_values=(0, 0))
            _a.set_samples(_samples)

        _Y = dict_df_success_local_from_merged_studies[study_key][merged_substudies_key]['fraction_samples_is_found_best'].to_numpy()
        print(_a)
        if _Y.shape[0] != _a.samples.shape[0]:
            print(f'Padding _Y from {_Y.shape[0]} to {_a.samples.shape[0]}')
            # Pad _Y with zeros to match the number of samples in _a
            _Y = np.pad(_Y, (0, _a.samples.shape[0]-_Y.shape[0]), 'constant', constant_values=(0, 0))
        # Second-order indices are only requested when the number of outputs is
        # a multiple of 2*num_vars+2.
        _is_calc_second_order = bool(_Y.shape[0] % (2*_num_vars+2) == 0)
        if study_key == 5 and merged_substudies_key in (0, 1):
            _is_calc_second_order = False
        if study_key == 7:
            print('  ', _Y.size % (2 * _num_vars + 2) == 0)
            print('  ', _Y.size % (_num_vars + 2) == 0)
            print(f'  _Y shape: {_Y.shape}, _Y.size: {_Y.size}, _a.num_vars: {_num_vars}, _is_calc_second_order: {_is_calc_second_order}')

        dict_salib_analyses[study_key].append(SALib.analyze.sobol.analyze(
            problem=_a,
            Y=_Y,
            calc_second_order=_is_calc_second_order,
            print_to_console=False
        ))
        print(dict_salib_analyses[study_key][merged_substudies_key])


In [None]:
# Close all open matplotlib figures before creating new ones.
plt.close('all')

In [None]:
# Draw SALib's built-in Sobol plot for every available analysis and save one
# PDF per (study, substudy).
for key, _analyses in dict_salib_analyses.items():
    print(key)
    if _analyses is None:
        print('  No analyses available for this study.')
        continue
    print('  ', len(_analyses), list(range(len(_analyses))))
    for subkey, _analysis in enumerate(_analyses):
        _num_vars = dict_salib_problem[key][subkey]['num_vars']
        print('  ', key, subkey, _num_vars)
        if not _num_vars:
            continue
        _plot = _analysis.plot()
        _figure = _plot[0].figure
        _figure.set_size_inches(14, 10)
        _figure.suptitle('Sobol indices', fontsize=16)
        _panel_titles = ['Total Sobol indices', 'First order Sobol indices']
        if _plot.shape[0] == 3:
            _panel_titles.append('Second order Sobol indices')
        for _panel_id, _panel_title in enumerate(_panel_titles):
            _plot[_panel_id].set_title(_panel_title, fontsize=14)
        for ax in _plot:
            ax.tick_params(axis='both', which='major', labelsize=14)
            ax.tick_params(axis='both', which='minor', labelsize=12)
            ax.set_ylim((-1.0, 1.0))
        plt.tight_layout()

        plt.savefig(f'03_inspect/02_figs/salib_analysis_{key}_{subkey}.pdf')

In [None]:
# Convert the Sobol result of study 7, merged substudy 0, via its to_df() method.
# The cells below index the result as [0], [1], [2] (total / first / second order).
df_salib_analysis_7 = dict_salib_analyses[7][0].to_df()
df_salib_analysis_7

In [None]:
# Manual three-panel bar plot of the study-7 Sobol indices with error bars:
# total and first order in the top row, second order across the bottom row.
plt.close('all')
fig = plt.figure(figsize=(10, 10))
gs = fig.add_gridspec(2, 2)
axs0, axs1, axs2 = (fig.add_subplot(_cell) for _cell in (gs[0, 0], gs[0, 1], gs[1, :]))

_panels = ((0, axs0, 'ST_conf', 'Total Sobol Indices'),
           (1, axs1, 'S1_conf', 'First Order Sobol Indices'),
           (2, axs2, 'S2_conf', 'Second Order Sobol Indices'))
for _frame_id, _axis, _conf_column, _panel_title in _panels:
    df_salib_analysis_7[_frame_id].plot(kind='bar', ax=_axis, yerr=_conf_column)
    _axis.set_title(_panel_title)

for ax in fig.axes:
    ax.grid(True, axis='y', linestyle='--', linewidth=0.25)
fig.tight_layout()
plt.savefig('03_inspect/02_figs/salib_analysis_7_manual.pdf')

In [None]:
# Display the SALib problem definition of study 7, merged substudy 0.
dict_salib_problem[7][0]

In [None]:
# One figure per (study, substudy) with a bar chart of the first-order Sobol indices.
plt.close('all')
dict_plots = {}
bar_width = 0.1
bar_spacing = 0.15
for key in dict_salib_analyses.keys():
    print(key)
    dict_plots[key] = []
    if dict_salib_analyses[key] is not None:
        print('  ', key, len(dict_salib_analyses[key]), list(range(len(dict_salib_analyses[key]))))
        for subkey in range(len(dict_salib_analyses[key])):
            _num_vars = dict_salib_problem[key][subkey]['num_vars']
            print('    ', key, subkey, _num_vars)
            _fig, _axs = plt.subplots(1, 2, figsize=(12, 6))
            _axs[0].set_title(f'Sobol indices for study {key}, substudy {subkey}')
            _axs[0].set_ylim(None, 1.0)
            # Integer arange scaled by the spacing always yields exactly _num_vars
            # positions; a float-step arange can be off by one due to rounding.
            _bar_positions = np.arange(_num_vars) * bar_spacing
            _axs[0].bar(x=_bar_positions,
                        height=dict_salib_analyses[key][subkey]['S1'],
                        width=bar_width,
                        tick_label=list(dict_salib_problem[key][subkey]['names']))
            _axs[0].set_xticklabels(_axs[0].get_xticklabels(), rotation=60, ha='right')
            _fig.tight_layout()
    else:
        print('  No analyses available for this study.')

In [None]:
#####
# Compare 2 & 3 & 4
#####
plt.close('all')

x = np.arange(3)  # the label locations
width = 0.25  # the width of the bars
multiplier = 0

_names_2 = dict_salib_problem[2][0]['names']
# Index pairs of the second-order matrix that are plotted (upper triangle of 3 variables).
_s2_pairs = ((0, 1), (0, 2), (1, 2))
s2_ticklabels = [_names_2[a]+',\n'+_names_2[b] for a, b in _s2_pairs]
legend_labels = ['Study 2', 'Study 3 flux_drift_comp=false', 'Study 3 flux_drift_comp=true']
fig, axs = plt.subplots(nrows=1, ncols=3, layout='constrained', figsize=(12, 6))
for i, id_name in enumerate(['ST', 'S1', 'S2']):
    multiplier = 0
    for study_id in range(2,5):
        offset = width * multiplier
        _indices = dict_salib_analyses[study_id][0][id_name]
        if id_name != 'S2':
            values = _indices
            _ticklabels = _names_2
        else:
            # Always extract the pair values here; the previous code only did so
            # when an unrelated global `id` happened to equal 0, 1 or 2.
            values = [_indices[a, b] for a, b in _s2_pairs]
            _ticklabels = s2_ticklabels
        rects = axs[i].bar(x + offset, values, width, label=legend_labels[study_id-2])
        axs[i].set_xticks(x + width, _ticklabels, rotation=90)

        axs[i].set_ylim(-0.5, 1.0)
        axs[i].grid(axis='y', linestyle='--', alpha=0.7)
        multiplier += 1

# Add some text for labels, title and custom x-axis tick labels, etc.
axs[0].set_ylabel('Index value')
axs[0].set_title('Total Sobol indices')
axs[1].set_title('First order Sobol indices')
axs[2].set_title('Second order Sobol indices')
axs[1].legend(loc='upper right')
fig.suptitle('Comparison of Sobol indices for studies 2, 3, 4', fontsize=16)

# Save before showing: some backends release the figure in plt.show(), which
# would leave plt.savefig() writing an empty canvas.
fig.savefig('03_inspect/02_figs/sobol_indices_comparison_2_3_4.pdf')
plt.show()

In [None]:
# Display the raw Sobol result of study 7, merged substudy 0.
dict_salib_analyses[7][0]

In [None]:
#####
# Compare 7 & 8
#####

plt.close('all')

_names_7 = list(dict_salib_problem[7][0]['names'])
# Derive the variable count from the problem definition instead of hard-coding it.
_num_vars_7 = len(_names_7)
# Upper-triangle index pairs (a < b), in the same order as the tick labels.
_s2_pairs = list(itertools.combinations(range(_num_vars_7), 2))

x = np.arange(_num_vars_7)  # the label locations
width = 0.25  # the width of the bars
multiplier = 0

s2_ticklabels = [_names_7[a]+',\n'+_names_7[b] for a, b in _s2_pairs]
print(s2_ticklabels)
legend_labels = ['Study 7', 'Study 8']
fig, axs = plt.subplots(nrows=1, ncols=3, layout='constrained', figsize=(12, 6))
for i, id_name in enumerate(['ST', 'S1', 'S2']):
    multiplier = 0
    for study_id in range(7,9):
        offset = width * multiplier
        _indices = dict_salib_analyses[study_id][0][id_name]
        if id_name != 'S2':
            values = _indices
            rects = axs[i].bar(x + offset, values, width, label=legend_labels[study_id-7])
            axs[i].set_xticks(x + width, _names_7, rotation=90)
        else:
            values = [_indices[a, b] for a, b in _s2_pairs]
            print(values)
            _s2_positions = np.arange(len(values))
            rects = axs[i].bar(_s2_positions + offset, values, width, label=legend_labels[study_id-7])
            axs[i].set_xticks(_s2_positions + width, s2_ticklabels, rotation=90)

        axs[i].set_ylim(-0.5, 1.0)
        axs[i].grid(axis='y', linestyle='--', alpha=0.7)
        multiplier += 1

# Add some text for labels, title and custom x-axis tick labels, etc.
axs[0].set_ylabel('Index value')
axs[0].set_title('Total Sobol indices')
axs[1].set_title('First order Sobol indices')
axs[2].set_title('Second order Sobol indices')
axs[1].legend(loc='upper right')
fig.suptitle('Comparison of Sobol indices for studies 7, 8', fontsize=16)

# Save before showing: some backends release the figure in plt.show(), which
# would leave plt.savefig() writing an empty canvas.
fig.savefig('03_inspect/02_figs/sobol_indices_comparison_7_8.pdf')
plt.show()

In [None]:
# Peek at the study-7 frames. Only the last expression of a cell is rendered,
# so the first .head() call below produces no visible output.
#dict_salib_problem[7][0]
dict_df_success_local_from_merged_studies[7][0].head()
dict_df_merged_params_success_from_merged_studies[7][0].head()
#dict_df_success_local_from_merged_studies[7][0].head()

In [None]:
# List the column names of the merged study-7 frame.
dict_df_merged_params_success_from_merged_studies[7][0].columns

In [None]:
# Columns entering the SVD: ten parameter columns (including the fitted 'a' and 'b')
# followed by the response column 'fraction_samples_is_found_best'.
cols_for_svd = ['annealing_time', 'programming_thermalization',
       'readout_thermalization', 'flux_drift_compensation', 'chain_strength',
       'anneal_offsets_1_qubits', 'anneal_offsets_2_qubits',
       'anneal_offsets_3_qubits', 'a', 'b', 'fraction_samples_is_found_best']
# 'a' and 'b' only exist after the Sobol-analysis cell inserted them into this frame.
array_for_svd = dict_df_merged_params_success_from_merged_studies[7][0][cols_for_svd].to_numpy()
print(array_for_svd.shape)

In [None]:
# Full SVD of the selected columns; the rectangular sigma matrix is rebuilt
# from the singular values to verify that U @ sigma @ Vh reproduces the input.
U, S, Vh = scipy.linalg.svd(array_for_svd, lapack_driver='gesvd')
sigma = np.zeros(array_for_svd.shape)
_diag_ids = np.arange(min(array_for_svd.shape))
sigma[_diag_ids, _diag_ids] = S[_diag_ids]
_reconstruction = U @ (sigma @ Vh)
assert np.allclose(array_for_svd, _reconstruction)
S

In [None]:
# Print the first row of Vh with its Euclidean norm, then its components
# labelled with the corresponding column names.
with np.printoptions(linewidth=400):
    print(Vh[0, :], np.linalg.norm(Vh[0, :]))
    for c, vh in zip(cols_for_svd, Vh[0, :]):
        print(f'{c}:\t {vh}')

In [None]:
#####
# Compare cases in 1
#####
plt.close('all')

x = np.arange(2)  # the label locations
width = 0.25  # the width of the bars
multiplier = 0

_names_1 = dict_salib_problem[1][0]['names']
s2_ticklabels = [_names_1[0]+',\n'+_names_1[1]]
legend_labels = ['Study 1']
_first_study_id = 1
fig, axs = plt.subplots(nrows=1, ncols=3, layout='constrained', figsize=(12, 6))
for i, id_name in enumerate(['ST', 'S1', 'S2']):
    multiplier = 0
    for study_id in range(_first_study_id, 2):
        offset = width * multiplier
        # Index relative to the first study id; the former `study_id-2` evaluated
        # to -1 and only hit the right label because the list has one entry.
        _label = legend_labels[study_id - _first_study_id]
        if id_name != 'S2':
            values = dict_salib_analyses[study_id][0][id_name]
            rects = axs[i].bar(x + offset, values, width, label=_label)
            axs[i].set_xticks(x + width, _names_1, rotation=90)
        else:
            # Two variables give a single second-order pair (0, 1).
            values = [dict_salib_analyses[study_id][0][id_name][0,1]]

            rects = axs[i].bar((x[0] + offset,), values, width, label=_label)
            axs[i].set_xticks((x[0] + offset,), s2_ticklabels, rotation=90)

        axs[i].set_ylim(-0.5, 1.0)
        axs[i].grid(axis='y', linestyle='--', alpha=0.7)
        multiplier += 1

# Add some text for labels, title and custom x-axis tick labels, etc.
axs[0].set_ylabel('Index value')
axs[0].set_title('Total Sobol indices')
axs[1].set_title('First order Sobol indices')
axs[2].set_title('Second order Sobol indices')
axs[1].legend(loc='upper right')
fig.suptitle('Comparison of Sobol indices for variations in study 1', fontsize=16)

# Save before showing: some backends release the figure in plt.show(), which
# would leave plt.savefig() writing an empty canvas.
fig.savefig('03_inspect/02_figs/sobol_indices_comparison_1.pdf')
plt.show()

In [None]:
# List the column names of the merged study-1 frame.
dict_df_merged_params_success_from_merged_studies[1][0].columns#[['num_samples_is_found_best', 'num_samples_matched_1_3', 'num_samples_matched_2_4']]

In [None]:
#####
# Heatmap plot for cases in 1
#####

_df_sorted = dict_df_merged_params_success_from_merged_studies[1][0].sort_values(
    by=['num_particles','num_nearest_neighbours'])

# The unique axis values are computed once instead of inside every loop iteration.
_particle_values = _df_sorted['num_particles'].unique()
_neighbour_values = _df_sorted['num_nearest_neighbours'].unique()
_neighbour_column = {numn: _col_id for _col_id, numn in enumerate(_neighbour_values)}

# -1 marks (num_particles, num_nearest_neighbours) combinations without data.
data_imshow = -1*np.ones((len(_particle_values), len(_neighbour_values)))
data_num_samples = np.zeros_like(data_imshow, dtype=int)
_has_data = np.zeros(data_imshow.shape, dtype=bool)
for np_id, nump in enumerate(_particle_values):
    _rows_for_nump = _df_sorted[_df_sorted['num_particles'] == nump]
    for numn in _rows_for_nump['num_nearest_neighbours'].unique():
        _nn_id = _neighbour_column[numn]
        # The first matching row supplies the cell values.
        _matching_rows = _rows_for_nump[_rows_for_nump['num_nearest_neighbours'] == numn]
        data_imshow     [np_id, _nn_id] = _matching_rows['fraction_samples_is_found_best'].values[0]
        data_num_samples[np_id, _nn_id] = _matching_rows['num_samples_is_found_best'].values[0]
        _has_data[np_id, _nn_id] = True
print(data_imshow.shape, data_imshow)
print(data_num_samples.shape, data_num_samples)
fig, axs = plt.subplots(nrows=1, ncols=1, figsize=(10, 6))
cmap = matplotlib.colormaps.get_cmap("viridis")
cmap.set_bad(color='white')  # masked cells (no data) are drawn white

im = axs.imshow(np.ma.masked_where(data_imshow == -1, data_imshow), cmap=cmap, vmin=0.0)
axs.set_title('Fraction of samples found best for different number of particles and nearest neighbours')
axs.set_xlabel('Number of nearest neighbours')
axs.set_ylabel('Number of particles')
axs.set_xticks(range(data_imshow.shape[1]), labels=_neighbour_values,
                rotation=45, ha="right", rotation_mode="anchor")
axs.set_yticks(range(data_imshow.shape[0]), labels=_particle_values)
# Annotate every populated cell with its fraction and its sample count.
for i in range(data_imshow.shape[0]):
    for j in range(data_imshow.shape[1]):
        if _has_data[i, j]:
            # Black text only in cell (0, 0), white everywhere else.
            color = 'w' if i!=0 or j!=0 else 'k'
            text = axs.text(j, i, format(data_imshow[i, j], ".2e") + '\n' + str(data_num_samples[i, j]),
                    ha="center", va="center", color=color, fontsize=6)
cbar = axs.figure.colorbar(im, ax=axs, fraction=0.02, pad=0.04)
fig.tight_layout()
fig.savefig('03_inspect/02_figs/heatmap_study_1.pdf', bbox_inches='tight')

In [None]:
#####
# Simple plots for cases in 1
#####

# Inspect the element type of the masked heatmap array at the hard-coded position [0, 5].
type(np.ma.masked_where(data_imshow == -1, data_imshow)[0,5])

In [None]:
# Print one masked heatmap element and every stored SALib problem definition.
with np.printoptions(precision=3, linewidth=400):
    print(np.ma.masked_where(data_imshow == -1, data_imshow)[0,5])
    for k, v in dict_salib_problem.items():
        print('study:', k)
        if v is None:
            # Studies without data carry a None entry instead of a dict.
            print(' no problem definition available')
            continue
        for kk, vv in v.items():
            print(' substudy:', kk)
            if vv is None:
                continue
            for kkk, vvv in vv.items():
                print('  ', kkk, vvv)

In [None]:
# Study 2, merged substudy 0: largest 'fraction_samples_is_found_best' value.
print(dict_df_merged_params_success_from_merged_studies[2][0]['fraction_samples_is_found_best'].max())
#print(dict_df_merged_params_success_from_merged_studies[2][0]['fraction_samples_matched_1_1'].max())
# Largest 'num_samples_matched_corrected_1_2' count divided by the total of 'num_samples'.
print(dict_df_merged_params_success_from_merged_studies[2][0]['num_samples_matched_corrected_1_2'].max()/dict_df_merged_params_success_from_merged_studies[2][0]['num_samples'].sum())
# Total of the 'num_samples' column.
print(dict_df_merged_params_success_from_merged_studies[2][0]['num_samples'].sum())
#print(dict_df_merged_params_success_from_merged_studies[2][0]['fraction_samples_matched_2_1'].max())

In [None]:
# Display the cached entry [2][0] of `_cache_dict_sim_anneling_solutions`
# (presumably simulated-annealing solutions for study 2 - confirm).
_cache_dict_sim_anneling_solutions[2][0]

In [None]:
def _fallback_found_0_update_ij(dict_df,i,j,_key0):
    _newj = str(int(j)-1)
    _key = _key0 + '_' + i + '_' + _newj
    _returnval = dict_df[study_id][substudy_id][_key].max()
    if _returnval == 0.0:
        _returnval = _fallback_found_0_update_ij(dict_df,i,_newj,_key0)
    return _returnval

def get_fractions_for_barplot(dict_df, study_id, substudy_id, key):
    """
    Return the maximum of column ``key``, falling back to lower ``j`` while it is 0.0.

    ``key`` is expected to end in ``_{i}_{j}``. While the current column maximum
    equals 0.0, ``j`` is decremented and the corresponding column is tried;
    once ``i == '0'`` and ``j == '0'`` the column
    ``'fraction_samples_is_found_best'`` is used instead.

    :param dict_df: nested mapping ``dict_df[study_id][substudy_id][column]``
    :param study_id: study to look at
    :param substudy_id: substudy to look at
    :param key: column name ending in ``_{i}_{j}``
    :return: the first non-zero column maximum found
    """
    parts = key.split('_')
    i, j = parts[-2], parts[-1]
    prefix = '_'.join(parts[:-2])
    columns = dict_df[study_id][substudy_id]

    _key = prefix + '_' + i + '_' + j
    returnval = columns[_key].max()
    while returnval == 0.0:
        j = str(int(j)-1)
        if i == '0' and j == '0':
            fallback_key = 'fraction_samples_is_found_best'
        else:
            fallback_key = prefix + '_' + i + '_' + j
        returnval = columns[fallback_key].max()

    # Diagnostic output for study 5, substudy 0 only.
    if study_id == 5 and substudy_id == 0:
        final_key = prefix + '_' + i + '_' + j
        print(_key, returnval)
        print(final_key)
        print(columns[final_key].max())

    return returnval

In [None]:
# Grouped bar chart: one group per (study, substudy) with the best matched-sample
# fractions in percent plus the number of parameter sets in thousands.
plt.close('all')
studies = list(dict_df_merged_params_success_from_merged_studies.keys())
# Filter on the study id *before* touching its entry so that skipped or empty
# (None) studies are never dereferenced.
substudies = [(i, j)
              for i in studies
              if i > 1 and dict_df_merged_params_success_from_merged_studies[i] is not None
              for j in dict_df_merged_params_success_from_merged_studies[i].keys()]
species = tuple((s[0],s[1]+1) for s in substudies) # groups
penguin_means = {
    f'fraction_samples_matched_corrected_{i+1}_{j+1}': None for i in range(n_samples_to_compare[0]) for j in range(n_exact_sols_to_compare[0]) if j>=i and j<5}
for k in list(penguin_means.keys()):
    penguin_means[k] = [100*get_fractions_for_barplot(dict_df_merged_params_success_from_merged_studies, i, j, k) for (i,j) in substudies]
penguin_means['num_data'] = [dict_df_merged_params_success_from_merged_studies[i][j]['identifiers'].size/1000 for (i,j) in substudies]

x = 5.5*np.arange(len(species))  # the label locations
width = 0.45  # the width of the bars
multiplier = 0

fig, ax = plt.subplots(figsize=(17, 6))

for attribute, measurement in penguin_means.items():
    offset = width * multiplier
    label = attribute.split('_')
    if label[0] == 'fraction':
        label = f"QA: {label[-2]}, classical: {label[-1]}"
    elif label[0] == 'num':
        label = "number parameter sets"
    rects = ax.bar(x + offset, measurement, width, label=label)
    ax.bar_label(rects, padding=3, fontsize=7, rotation=90)
    multiplier += 1

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Fraction of samples [%],\nNumber of successfully sampled parameter sets [x1000]')
ax.set_xlabel('(Study, substudy)')
ax.set_title('Fractions of samples of all annealer parameter studies and substudies')
# Centre each tick under its group: the centre depends on the number of bars
# per group, not on the number of groups.
ax.set_xticks(x + (len(penguin_means) - 1) * width / 2, species)
ax.legend()
ax.set_ylim(0, 12.75)
fig.savefig('03_inspect/02_figs/fraction_samples_all_studies.pdf', bbox_inches='tight')
plt.show()

In [None]:
# Show the five largest 'fraction_samples_is_found_best' values per substudy.
for k, v in dict_df_merged_params_success_from_merged_studies.items():
    print('study:', k)
    if v is None:
        # Studies without merged data carry None instead of a mapping.
        print(' no data available')
        continue
    for kk, vv in v.items():
        print(' substudy:', kk)
        print('  fraction_samples_is_found_best:')
        print(vv['fraction_samples_is_found_best'].sort_values(ascending=False).head())
        

In [None]:
# Print every column name of the merged study-7 frame on its own line.
for name in dict_df_merged_params_success_from_merged_studies[7][0].columns:
    print(name)

In [None]:
# Pearson correlation of the study-7 frame after dropping bookkeeping columns,
# the schedule points t00..t11 / s00..s11 and all per-run / per-submission columns.
_schedule_columns = [f'{_axis}{_point:02d}' for _axis in ('t', 's') for _point in range(12)]
_pair_suffixes = [f'{i}_{j}' for i in range(1, 6) for j in range(1, 6)]
_per_pair_prefixes = ('num_samples_matched_per_run',
                      'num_matched_per_run',
                      'fraction_samples_matched_per_run',
                      'num_samples_matched_per_sub_per_run',
                      'num_matched_per_sub_per_run')
names_to_exclude = (
    ['identifiers', 'num_data']
    + _schedule_columns
    + ['set_id', 'num_samples_is_found_best_per_run',
       'is_found_best_per_run', 'num_subs_per_run', 'num_samples_per_run', 'num_samples_per_sub_per_run', 'submissions',
       'fraction_samples_is_found_best_per_run']
    + [f'{_prefix}_{_suffix}' for _prefix in _per_pair_prefixes for _suffix in _pair_suffixes]
)
_frame_study_7 = dict_df_merged_params_success_from_merged_studies[7][0]
cols_to_keep = [col for col in _frame_study_7.columns if col not in names_to_exclude]
dict_df_for_corr = _frame_study_7[cols_to_keep]
dict_df_for_corr.corr(method='pearson')
# for name in dict_df_for_corr.columns:
#     print(name, dict_df_for_corr[name].dtype)

In [None]:
# Standardise every column (subtract its mean, divide by its standard deviation)
# and show the Pearson correlation of the standardised frame.
dict_df_for_corr_normalized =(dict_df_for_corr-dict_df_for_corr.mean())/dict_df_for_corr.std()
dict_df_for_corr_normalized.corr(method='pearson')