# Break down of `run_experiment()`

In [1]:
import os
os.chdir('C:/Users/rokka/GH-repos/idea-lab-sw-isax')

import glob
import datetime as dt
import numpy as np
import pandas as pd
from collections import defaultdict
from itertools import product
from concurrent.futures import ProcessPoolExecutor as Pool
from tqdm import tqdm

# import local libraries
import fdl21.data.prototyping_metrics as pm
import fdl21.data.generate_catalog as gc
from fdl21 import isax_model
from fdl21.experiments import run_isax_experiments_sf as isax_exp_sf
import fdl21.visualization.isax_visualization as isax_vis

#catalog_fname = 'data/omni_master_catalog_1995_2019.csv'

ERROR [jupyter:notebook_metadata:224] Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjasmine-kobayashi[0m ([33midea-lab[0m). Use [1m`wandb login --relogin`[0m to force relogin


## Parameters

In [2]:
# init default parameters
input_file = None
start_date=dt.datetime(2018, 11, 21)
stop_date=dt.datetime(2018, 12, 31)
min_cardinality = 8
max_cardinality = 32
word_size = 10
threshold = 200
mu_x = 0.
std_x = 3.5
mu_y = 0.
std_y = 3.4
mu_z = 0.
std_z = 3.4
cadence=dt.timedelta(seconds=1)
chunk_size=dt.timedelta(seconds=300)
smooth_window=dt.timedelta(seconds=2)
detrend_window=dt.timedelta(seconds=1800)
node_level_depth = 2
min_node_size = 5
overlap = dt.timedelta(seconds=0)
min_cluster_size = 5
min_samples = 5
cache=False
cache_folder= '/cache/'
transliterate = False
instrument='psp'
cluster_selection_epsilon=None
plot_nodes=False
failsafe=False
n_processes=4

In [3]:
# non-default init parameters
input_file= None                     #if None, cut of catalog is used
start_date= dt.datetime(2018, 1, 1)  # start date of catalog cut
stop_date= dt.datetime(2018, 2, 1)   # end date of catalog cut
cadence=dt.timedelta(seconds=60)
cache=True
instrument='omni'

## Function code from line 582-731
Right before the line that has description `"Running pipelin..."`

In [4]:
# Cluster selection epsilon text (for pdf filename)
if cluster_selection_epsilon is None:
    cse_text = 'NA'
else:
    cse_text = str(int(cluster_selection_epsilon*10))

# File and directory path naming
cache_folder = f'CS{chunk_size.seconds}_C{cadence.seconds}_SW{smooth_window.seconds}_DW{detrend_window.seconds}_O{overlap.seconds}_{instrument}'
pdf_file = cache_folder + f'_WS{word_size}_CA{min_cardinality}_{max_cardinality}_MCS{min_cluster_size}_MS{min_samples}_T{threshold}_NLD{node_level_depth}_CSE{cse_text}'
cache_folder =  '/cache/' + cache_folder + '/'
v = isax_vis.iSaxVisualizer()

# Data catalog file name to access based on instrument
if instrument == 'psp':
    catalog_fname = 'psp_master_catalog_2018_2021_rads_norm.csv' 
elif instrument == 'wind':
    catalog_fname = 'wind_master_catalog_2006_2022.csv'
elif instrument == 'omni':
    catalog_fname = 'omni_master_catalog_1995_2022.csv'

# Orbit file
if instrument == 'omni':
    orbit_fname = None
else: 
    orbit_fname = 'psp_orbit.csv'

# Instantiate iSax model Pipeline
isax_pipe = isax_model.iSaxPipeline(
    orbit_fname = orbit_fname,
    catalog_fname = catalog_fname,
    threshold = threshold,
    word_size = word_size,
    min_cardinality = min_cardinality,
    max_cardinality = max_cardinality,
    mu_x = mu_x,
    std_x = std_x,
    mu_y = mu_y,
    std_y = std_y,
    mu_z = mu_z,
    std_z = std_z,
    instrument=instrument
)   

if failsafe:
    isax_pipe_dummy = isax_model.iSaxPipeline(
        orbit_fname = orbit_fname,
        catalog_fname = catalog_fname,
        threshold = threshold,
        word_size = word_size,
        min_cardinality = min_cardinality,
        max_cardinality = max_cardinality,
        mu_x = mu_x,
        std_x = std_x,
        mu_y = mu_y,
        std_y = std_y,
        mu_z = mu_z,
        std_z = std_z,
        instrument=instrument
    ) 

if input_file is None:
    catalog_cut = isax_pipe.catalog[start_date:stop_date]
    flist = list(catalog_cut['fname'].values)
    #LOG.info(f'Found {len(flist)} between {start_date} {stop_date}')
else:
    catalog_cut = pd.read_csv(input_file, header=0, index_col=0, parse_dates=True)
    flist = list(catalog_cut['fname'].values)
    #LOG.info(f'Analyzing {len(flist)} between {catalog_cut.index[0]} {catalog_cut.index[-1]}')

# Running the cache once to build the cache files
if cache:

    cache_list = list(product(
                        flist,
                        [cadence],
                        [chunk_size],
                        [overlap],
                        [smooth_window],
                        [detrend_window],
                        [cache_folder],
                        [instrument]                            
                        ))

    (flist_mp,
    cadence_mp,
    chunk_size_mp,
    overlap_mp,
    smooth_window_mp,
    detrend_window_mp,
    cache_folder_mp,
    instrument_mp) = map(list, zip(*cache_list))                                

    with Pool(max_workers=n_processes) as pool:
        good_files = pool.map(isax_exp_sf.build_cache,
                            flist_mp,
                            cadence_mp,
                            chunk_size_mp,
                            overlap_mp,
                            smooth_window_mp,
                            detrend_window_mp,
                            cache_folder_mp,
                            instrument_mp)

# Running the cache twice to calculate the histogram, means and stds
if cache:
    bad_files = []
    good_files = []
    for file in tqdm(flist, desc=f'Creating file caches and histograms...'):
        isax_pipe.mag_df = None

        try:
            isax_pipe.build_cache(
                file=file,
                cadence=cadence,
                chunk_size=chunk_size,
                overlap = overlap,
                rads_norm=True,
                smooth=True,
                smooth_window=smooth_window,
                detrend=True,
                detrend_window=detrend_window,
                optimized=True,
                cache_folder=cache_folder,
                instrument=instrument
            )
            good_files.append(True)
        except:
            bad_files.append(file)
            good_files.append(False)

    #LOG.info('Recalculating mean and standard deviations.')
    bins = isax_pipe.bins
    delta = np.nanmedian(bins[1:]-bins[0:-1])
    centers = (bins[1:]+bins[0:-1])/2
    for component in ['x', 'y', 'z']:
        hist = isax_pipe.hist[component]
        mu = np.sum(centers*hist*delta)/np.sum(hist*delta)
        
        sig = np.sum(np.power(centers-mu, 2)*hist*delta)
        sig = sig/np.sum(hist*delta)
        sig = np.sqrt(sig)

        isax_pipe._mu[component] = mu
        isax_pipe._std[component] = sig            
        #LOG.info(f'mu = {mu} and sig={sig} for ' + component + ' component')

flist = np.array(flist)
good_files = np.array(good_files)
flist = flist[good_files]

Creating file caches and histograms...:   0%|          | 0/2 [00:00<?, ?it/s]INFO [isax_model:build_cache:865] Loading cached file...
  X_transformed[i_ts, i_seg, :] = segment.mean(axis=0)
  ret = um.true_divide(
  X_transformed[i_ts, i_seg, :] = segment.mean(axis=0)
  ret = um.true_divide(
  X_transformed[i_ts, i_seg, :] = segment.mean(axis=0)
  ret = um.true_divide(
Creating file caches and histograms...:  50%|█████     | 1/2 [00:09<00:09,  9.04s/it]INFO [isax_model:build_cache:865] Loading cached file...
  X_transformed[i_ts, i_seg, :] = segment.mean(axis=0)
  ret = um.true_divide(
  X_transformed[i_ts, i_seg, :] = segment.mean(axis=0)
  ret = um.true_divide(
  X_transformed[i_ts, i_seg, :] = segment.mean(axis=0)
  ret = um.true_divide(
Creating file caches and histograms...: 100%|██████████| 2/2 [00:16<00:00,  8.26s/it]
  mu = np.sum(centers*hist*delta)/np.sum(hist*delta)


In [5]:
good_files

array([ True,  True])

In [6]:
flist  #list of good files

array(['/2018/omni_hro_1min_20180101_v01.cdf',
       '/2018/omni_hro_1min_20180201_v01.cdf'], dtype='<U36')

## Function code from line 732-847

Starting to run the iSAX pipeline right before the clustering. 

In [7]:
for file in tqdm(flist, desc=f'Running pipeline...'):
    isax_pipe.mag_df = None
    try:

        if failsafe:
            isax_pipe_dummy.run_pipeline(
                flist=[file],
                cadence=cadence,
                chunk_size=chunk_size,
                overlap = overlap,
                rads_norm=True,
                smooth=True,
                smooth_window=smooth_window,
                detrend=True,
                detrend_window=detrend_window,
                optimized=True,
                cache_folder=cache_folder,
                cache=cache,
                instrument=instrument
            )

        isax_pipe.run_pipeline(
            flist=[file],
            cadence=cadence,
            chunk_size=chunk_size,
            overlap = overlap,
            rads_norm=True,
            smooth=True,
            smooth_window=smooth_window,
            detrend=True,
            detrend_window=detrend_window,
            optimized=True,
            cache_folder=cache_folder,
            cache=cache,
            instrument=instrument
        )

        # reset dummie's forests
        isax_pipe_dummy._sw_forest = {'x': None, 'y': None, 'z': None, 'all': None}
    except:
        bad_files.append(file)


print('Bad Files------------------------------------------------------------')
print(bad_files)
bad_files_df = pd.DataFrame(data={'Bad Files': bad_files})
bad_files_df.to_csv(f'bad_files_{instrument}.csv')
print('------------------------------------------------------------')

node_sizes = defaultdict(list)
#LOG.info('Getting nodes for files')
for component in ['x', 'y', 'z']:
    isax_pipe.get_nodes_at_level(
        component=component,
        node_level=node_level_depth
    )
    for node in isax_pipe.nodes_at_level[component][node_level_depth]:
        node_sizes[component].append(node.get_annotations().shape[0])

for component in ['x', 'y', 'z']:
    node_sizes[component] = pd.Series(node_sizes[component])
    node_sizes[component].sort_values(ascending=False, inplace=True)

date_time = str(dt.datetime.now().strftime('%Y%m%d%H%M'))
# wandb.init(
#     entity='solar-wind', 
#     name=f'{pdf_file}_{date_time}',
#     project='CB_week_8_60_full_'+instrument, 
#     job_type='plot-isax-node',
#     config=isax_pipe.input_parameters
# )

if not os.path.exists('runs'):
    os.makedirs('runs')


parameter_file = isax_pipe.save(
    fname= 'runs/' + pdf_file + '.json',
    overwrite=True
)


dirname = pdf_file

# push_to_cloud(parameter_file.split('/')[1], dirname=dirname + '_' + date_time, relative_folder='runs/')
# example_table = wandb.Table(columns=[
#                 "Chunk Size",
#                 "Word Size",
#                 "Min Cardinality",
#                 "Max Cardinality",
#                 "Threshold",
#                 "Smooth_Window",
#                 "Detrend Window",
#                 "Overlap",
#                 "Component",
#                 "Min Samples",
#                 "Min Cluster Size",
#                 "Cluster Epsilon",
#                 "Number of Clusters", 
#                 "Number of Nodes", 
#                 "Cluster Image", 
#                 "Cluster PDF", 
#                 "Node PDF",
#                 "Tree",
#                 "Bucket Link"
#                 ]
# ) 

if transliterate:
    component_annotations = {'x': pd.DataFrame(),'y': pd.DataFrame(), 'z': pd.DataFrame()}
    transliteration_file = pdf_file + '_transliteration.csv'

for component in ['x','y','z']:

    nodes_at_level = isax_pipe.sw_forest[component].forest[0].get_nodes_of_level_or_terminal(node_level_depth)


  X_transformed[i_ts, i_seg, :] = segment.mean(axis=0)
  ret = um.true_divide(
  X_transformed[i_ts, i_seg, :] = segment.mean(axis=0)
  ret = um.true_divide(
  X_transformed[i_ts, i_seg, :] = segment.mean(axis=0)
  ret = um.true_divide(
  X_transformed[i_ts, i_seg, :] = segment.mean(axis=0)
  ret = um.true_divide(
  X_transformed[i_ts, i_seg, :] = segment.mean(axis=0)
  ret = um.true_divide(
  X_transformed[i_ts, i_seg, :] = segment.mean(axis=0)
  ret = um.true_divide(
  X_transformed[i_ts, i_seg, :] = segment.mean(axis=0)
  ret = um.true_divide(
  X_transformed[i_ts, i_seg, :] = segment.mean(axis=0)
  ret = um.true_divide(
  X_transformed[i_ts, i_seg, :] = segment.mean(axis=0)
  ret = um.true_divide(
  X_transformed[i_ts, i_seg, :] = segment.mean(axis=0)
  ret = um.true_divide(
Running pipeline...: 100%|██████████| 2/2 [00:21<00:00, 10.87s/it]
INFO [isax_model:save:731] Saving input parameters to following output file,
runs/CS300_C60_SW2_DW1800_O0_omni_WS10_CA8_32_MCS5_MS5_T200_NLD2_C

Bad Files------------------------------------------------------------
['/2018/omni_hro_1min_20180101_v01.cdf', '/2018/omni_hro_1min_20180201_v01.cdf']
------------------------------------------------------------


In [8]:
bad_files

['/2018/omni_hro_1min_20180101_v01.cdf',
 '/2018/omni_hro_1min_20180201_v01.cdf']

In [28]:
isax_pipe.sw_forest

{'x': <pyCFOFiSAX._forest_iSAX.ForestISAX at 0x21d3ea39b10>,
 'y': <pyCFOFiSAX._forest_iSAX.ForestISAX at 0x21d3ea3a310>,
 'z': <pyCFOFiSAX._forest_iSAX.ForestISAX at 0x21d3ea3b6d0>,
 'all': None}

In [9]:
isax_pipe.run_pipeline(
            flist=[file],
            cadence=cadence,
            chunk_size=chunk_size,
            overlap = overlap,
            rads_norm=True,
            smooth=True,
            smooth_window=smooth_window,
            detrend=True,
            detrend_window=detrend_window,
            optimized=True,
            cache_folder=cache_folder,
            cache=cache,
            instrument=instrument
        )

  X_transformed[i_ts, i_seg, :] = segment.mean(axis=0)
  ret = um.true_divide(
  X_transformed[i_ts, i_seg, :] = segment.mean(axis=0)
  ret = um.true_divide(
  X_transformed[i_ts, i_seg, :] = segment.mean(axis=0)
  ret = um.true_divide(


type: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2,) + inhomogeneous part.

# Full test of `run_experiment()`

In [26]:
import os
os.chdir('C:/Users/rokka/GH-repos/idea-lab-sw-isax')

import glob
import datetime as dt
import numpy as np
from tqdm import tqdm

# import local libraries
import fdl21.data.prototyping_metrics as pm
import fdl21.data.generate_catalog as gc
from fdl21 import isax_model
from fdl21.experiments import run_isax_experiments_sf as isax_exp_sf

#catalog_fname = 'data/omni_master_catalog_1995_2019.csv'

In [27]:
# isax_exp_sf.run_experiment(input_file= None,                   #if None, cut of catalog is used
#                            start_date= dt.datetime(2018, 1, 1),  # start date of catalog cut
#                            stop_date= dt.datetime(2018, 1, 2),   # end date of catalog cut
#                            cadence=dt.timedelta(seconds=60),
#                            cache=True,
#                            instrument='omni')