## Sandbox notebook


In [None]:
import os
import numpy as np
import time
import shutil
import pandas as pd

from lib.script_01_00 import generate_initial_variables
from lib.script_01_01 import generate_raw_confounds
from lib.script_01_02 import generate_nonlin_confounds

from src.nets import nets_load_match, nets_inverse_normal, nets_normalise, nets_demean, nets_deconfound
from src.duplicate import duplicate_categorical, duplicate_demedian_norm_by_site
from src.preproc import datenum, days_in_year

from src.memmap.MemoryMappedDF import MemoryMappedDF

In [None]:

import os
import shutil
import numpy as np
import pandas as pd
from dask.distributed import Client, as_completed
from lib.script_01_05 import func_01_05_gen_nonlin_conf

# Cluster configuration: the backend type to launch and the number of nodes
# to scale it to.
cluster_cfg = dict(cluster_type='slurm', num_nodes=3)

# --------------------------------------------------------------------------------
# Handle empty configuration
# --------------------------------------------------------------------------------
# With no configuration at all, fall back to a single-node local cluster.
if cluster_cfg is None:
    cluster_cfg = dict(cluster_type='local', num_nodes=1)

# --------------------------------------------------------------------------------
# Set up cluster
# --------------------------------------------------------------------------------
# Refuse to continue when the configuration does not name a cluster type.
if 'cluster_type' not in cluster_cfg:
    raise ValueError('Please specify "cluster_type" in the cluster configuration.')

# The cluster type is matched case-insensitively, so lower it once up front.
requested_type = cluster_cfg['cluster_type'].lower()

# Each backend class is imported only in the branch that selects it, so only
# the package for the requested backend has to be importable.
if requested_type == 'htcondor':
    # HTCondor cluster
    from dask_jobqueue import HTCondorCluster
    cluster = HTCondorCluster()

elif requested_type == 'lsf':
    # LSF cluster
    from dask_jobqueue import LSFCluster
    cluster = LSFCluster()

elif requested_type == 'moab':
    # Moab cluster
    from dask_jobqueue import MoabCluster
    cluster = MoabCluster()

elif requested_type == 'oar':
    # OAR cluster
    from dask_jobqueue import OARCluster
    cluster = OARCluster()

elif requested_type == 'pbs':
    # PBS cluster
    from dask_jobqueue import PBSCluster
    cluster = PBSCluster()

elif requested_type == 'sge':
    # SGE cluster
    from dask_jobqueue import SGECluster
    cluster = SGECluster()

elif requested_type == 'slurm':
    # SLURM cluster
    from dask_jobqueue import SLURMCluster
    cluster = SLURMCluster()

elif requested_type == 'local':
    # Local cluster
    from dask.distributed import LocalCluster
    cluster = LocalCluster()

else:
    # Anything else is unsupported; report the type exactly as it was given.
    raise ValueError('The cluster type, ' + cluster_cfg['cluster_type'] + ', is not recognized.')

# --------------------------------------------------------------------------------
# Connect to client
# --------------------------------------------------------------------------------

# Attach a client to the cluster object created above.
client = Client(cluster)

# Scale the cluster to the number of nodes named in the configuration.
num_nodes = int(cluster_cfg['num_nodes'])
cluster.scale(num_nodes)

# The dashboard port is the text after the last ':' of the dashboard link,
# up to the first '/' that follows it.
dashboard_link = client.cluster.dashboard_link
dashboard_port = dashboard_link.rsplit(':', 1)[-1].partition('/')[0]

# Print the dashboard address together with an example ssh port-forwarding
# command for viewing it from another machine.
print(f"Dask is now running at the following address on your cluster: {dashboard_link}"
      f". If you wish to run locally, port {dashboard_port} to your machine. "
      f"e.g. run something like: \n \n ssh -L {dashboard_port}:localhost:{dashboard_port}"
      f" username@cluster_address \n \nand then navigate to http://localhost:{dashboard_port}"
      "/status to view the console.")

# --------------------------------------------------------------------------------
# Run cluster jobs
# --------------------------------------------------------------------------------

# Get the number of IDPs (the size of the second dimension of IDPs_deconf);
# one job is submitted for each.
num_IDPs = IDPs_deconf.shape[1]

# Submit the i^{th} job for every IDP index and collect the futures, in
# submission order. pure=False is passed through to client.submit unchanged.
futures = [client.submit(func_01_05_gen_nonlin_conf,
                         data_dir, out_dir, i, nonlinear_confounds,
                         IDPs_deconf, pure=False)
           for i in np.arange(num_IDPs)]

# Iterator over the futures as they complete
completed = as_completed(futures)

# Wait for every job to finish. The loop variable is bound beforehand so the
# clean-up below does not raise a NameError when there are no IDPs (and hence
# no futures to iterate over).
finished = None
for finished in completed:
    finished.result()

# Delete the future objects (original author's NOTE: This is important! If you
# don't delete the futures dask tries to rerun them every time you call the
# result function).
del finished, completed, futures


In [None]:
# Read saved_memmaps/IDPs_deconf.npz (relative to the current working
# directory) with pandas' pickle reader; as the last expression of the cell,
# the loaded object is what the notebook displays.
# NOTE(review): the file carries an .npz extension but is read as a pickle -
# confirm this matches how the file was written.
pd.read_pickle(os.path.join(os.getcwd(),'saved_memmaps','IDPs_deconf.npz'))

In [None]:

from src.nets.nets_load_match import nets_load_match 
# Directory holding the input files read in this cell.
data_dir = '/well/win/projects/ukbiobank/fbp/confounds/data/72k_data/'

# Subject ids, read as integers from subj.txt
sub_ids = np.loadtxt(os.path.join(data_dir, 'subj.txt'), dtype=int)

# dtypes for the six fmrib info columns: columns 0, 1 and 4 are int32, the
# remaining ones float32
dtypes = {col: 'int32' if col in (0, 1, 4) else 'float32' for col in range(6)}

# Read in the fmrib info, handing nets_load_match the subject ids and the
# per-column dtypes (presumably rows are matched to sub_ids - confirm against
# nets_load_match)
fmrib_info = nets_load_match(os.path.join(data_dir, 'ID_initial_workspace.txt'),
                             sub_ids, dtypes=dtypes)


In [None]:
# Display the column labels of the fmrib info loaded in the previous cell.
fmrib_info.columns

In [None]:
# Draw a 10x10 array of standard-normal samples and print its dtype.
x = np.random.standard_normal(size=(10, 10))
print(x.dtype)