# Deploy a condor cluster and run Pycbc-Inference workflow

This notebook shows how to create a Condor cluster controlled by Pegasus and run the PyCBC Inference workflow on it.


## Import the FABlib Library


In [None]:
from ipaddress import ip_address, IPv4Address, IPv6Address, IPv4Network, IPv6Network
import ipaddress

from fabrictestbed_extensions.fablib.fablib import FablibManager as fablib_manager

# Instantiate the FABlib manager; later cells use it to create and fetch the slice.
fablib = fablib_manager()
                     
# Display the active FABlib configuration. The trailing ';' keeps the notebook
# from echoing the call's return value below the cell.
fablib.show_config();

## Create the Experiment Slice

The following creates three nodes with basic NICs connected to an isolated local Ethernet.  
- poseidon-submit
- poseidon-data
- poseidon-worker1

The slice is set up as a Condor cluster controlled by Pegasus. The notebook then installs all the required software and runs the inference workflow.


In [None]:
# Names shared by the cells below: the slice itself and its single L2 network.
slice_name = 'Pycbc-Inference'
network_name = 'net1'

# One node name per cluster role (submit host, data host, one worker).
node1_name, node2_name, node3_name = (
    'poseidon-submit',
    'poseidon-data',
    'poseidon-worker1',
)

# Pick a random site; the slice-creation cell places every node on it.
site = fablib.get_random_site()
print(f"Site: {site}")

In [None]:
def add_cluster_node(experiment_slice, name, site, network,
                     image="default_ubuntu_20", cores=4, ram=16, disk=100):
    """Add one node to the slice and attach it to the shared L2 network.

    Args:
        experiment_slice: Slice object the node is added to.
        name: Name of the new node.
        site: Site on which the node is placed.
        network: L2 network the node's NIC is attached to.
        image: Image name passed to ``add_node``.
        cores: Core count passed to ``add_node``.
        ram: RAM size passed to ``add_node``.
        disk: Disk size passed to ``add_node``.

    Returns:
        The newly added node.
    """
    node = experiment_slice.add_node(name=name, site=site, image=image, cores=cores, ram=ram, disk=disk)
    # Each node gets a single basic NIC; its first interface joins the cluster network.
    iface = node.add_component(model='NIC_Basic', name='nic1').get_interfaces()[0]
    iface.set_mode('auto')
    network.add_interface(iface)
    return node


# Create Slice
slice = fablib.new_slice(name=slice_name)

# Network
net1 = slice.add_l2network(name=network_name, subnet=IPv4Network("192.168.1.0/24"))

# Nodes: identical hardware, all on the same site and the same network
node1 = add_cluster_node(slice, node1_name, site, net1)
node2 = add_cluster_node(slice, node2_name, site, net1)
node3 = add_cluster_node(slice, node3_name, site, net1)

# Run the appropriate boot scripts: every node clones the repository, then runs
# the script matching the role encoded in its name.
for node in slice.get_nodes():
    node.add_post_boot_execute('git clone https://github.com/PoSeiDon-Workflows/pycbc.git')
    node_name = node.get_name()
    if "data" in node_name:
        role_script = 'data.sh'
    elif "submit" in node_name:
        role_script = 'submit-data.sh'
    else:
        role_script = 'worker.sh'
    node.add_post_boot_execute(f'sudo pycbc/fabric/scripts/{role_script}')

# Submit Slice Request
slice.submit();

## Fetch the slice and wait for nodes to be back up post reboot

In [None]:
# Look up the previously submitted slice by its name.
slice = fablib.get_slice(slice_name)

In [None]:
# Wait for SSH access to the slice's nodes; per the section heading the nodes
# reboot after provisioning, so this is expected to take a while.
slice.wait_ssh()

## Reconfigure IP addresses for the interfaces post reboot

In [None]:
# After the reboot, re-apply each node's configuration so the interfaces
# get their IP addresses back.
for node in slice.get_nodes():
    node.config()

## Setup the keys between submit and worker nodes

In [None]:
# Generate SSH keys for the root and poseidon users on every node.
# ssh-keygen asks whether to overwrite when the key file already exists, so each
# call is guarded with `test -f`. This keeps the cell safe to re-run and leaves
# keys that were already distributed to the other nodes untouched.
KEY_OWNERS = (
    ('sudo', '/root/.ssh/id_rsa'),
    ('sudo -u poseidon', '/home/poseidon/.ssh/id_rsa'),
)

for n1 in slice.get_nodes():
    for run_as, key_path in KEY_OWNERS:
        # -N '' requests an empty passphrase, as in the unguarded command.
        n1.execute(f'{run_as} sh -c "test -f {key_path} || ssh-keygen -t rsa -N \'\' -f {key_path}"', quiet=True)

In [None]:
def append_line_once(node, line, path, run_as='sudo'):
    """Append ``line`` to ``path`` on ``node`` unless that exact line is already there.

    Args:
        node: Node on which the shell command is executed.
        line: Text to add. It is embedded in a quoted shell command, so it
            must not contain quote characters.
        path: File on the node that receives the line.
        run_as: Command prefix selecting the user that runs the shell
            (``'sudo'`` for root, ``'sudo -u poseidon'`` for poseidon).
    """
    if not line:
        # An empty entry means the source value could not be read; do not
        # write blank lines into the target file.
        print(f"Skipping empty entry for {path} on {node.get_name()}")
        return
    # grep -qxF: quiet, whole-line, fixed-string match. 2>/dev/null hides the
    # "no such file" message when the target file does not exist yet.
    node.execute(f'{run_as} sh -c \'grep -qxF "{line}" {path} 2>/dev/null || echo "{line}" >> {path}\'')


# Update /etc/hosts and authorized_keys
cluster_nodes = slice.get_nodes()

# Read each node's address and public keys once, instead of once per peer pair.
peers = []
for node in cluster_nodes:
    addr = node.get_interface(network_name=network_name).get_ip_addr()
    root_key, _ = node.execute("sudo cat /root/.ssh/id_rsa.pub", quiet=True)
    poseidon_key, _ = node.execute("sudo -u poseidon cat /home/poseidon/.ssh/id_rsa.pub", quiet=True)
    # Strip surrounding whitespace so the key is appended as exactly one line.
    peers.append((node.get_name(), addr, root_key.strip(), poseidon_key.strip()))

# Give every node the host entry and both public keys of every other node.
for node in cluster_nodes:
    own_name = node.get_name()
    for peer_name, peer_addr, root_key, poseidon_key in peers:
        if peer_name == own_name:
            continue

        # Update /etc/hosts
        append_line_once(node, f"{peer_addr} {peer_name}", "/etc/hosts")

        # Setup authorized_keys for root user
        append_line_once(node, root_key, "/root/.ssh/authorized_keys")

        # Setup authorized_keys for poseidon user
        append_line_once(node, poseidon_key, "/home/poseidon/.ssh/authorized_keys", run_as="sudo -u poseidon")

## Update condor config to use the correct interface name

In [None]:
# Look up the slice by name again before querying its nodes' interfaces.
slice = fablib.get_slice(slice_name)

In [None]:
# Submit node: replace the NETWORK_INTERFACE value "ens7" in the Condor config
# with the device name of the NIC attached to the experiment network.
node1 = slice.get_node(name=node1_name)
node1_if_device_name = node1.get_interface(network_name=network_name).get_device_name()
condor_config = "/etc/condor/config.d/50-main.config"
sed_expr = f"s/NETWORK_INTERFACE = ens7/NETWORK_INTERFACE = {node1_if_device_name}/"
node1.execute(f"sudo sed -i '{sed_expr}' {condor_config}")

In [None]:
# Worker node: replace the NETWORK_INTERFACE value "ens7" in the Condor config
# with the device name of the NIC attached to the experiment network.
node3 = slice.get_node(name=node3_name)
node3_if_device_name = node3.get_interface(network_name=network_name).get_device_name()
condor_config = "/etc/condor/config.d/50-main.config"
sed_expr = f"s/NETWORK_INTERFACE = ens7/NETWORK_INTERFACE = {node3_if_device_name}/"
node3.execute(f"sudo sed -i '{sed_expr}' {condor_config}")

## Check condor status

After the config update, you may have to re-run this cell a couple of times before Condor picks up the changes.

In [None]:
# Run `condor_status` on node1 (the submit node).
node1.execute("condor_status")

## Install the required packages on submit and worker nodes

The setup is now ready to deploy the inference changes. Log in to the submit and worker nodes and run `setup.sh`.

In [None]:
# Print the SSH command for every node in the slice.
for node in slice.get_nodes():
    node_name = node.get_name()
    ssh_command = node.get_ssh_command()
    print(f"Login to {node_name} using: {ssh_command}")

NOTE: This step is done manually because Ubuntu still shows prompts during the installation even when noninteractive mode is enabled.

```
cd /home/ubuntu/pycbc/inference/pycbc
./setup.sh
```

## Start the workflow on the submit node

In [None]:
# Get the submit node (named by node1_name) for the workflow steps below.
node1 = slice.get_node(name=node1_name)

### Fetch the workflow
Clone the workflow repository

In [None]:
# Clone the workflow repository into the poseidon user's workflows directory on node1.
# `git clone` fails when the destination already holds a checkout, so the clone
# is skipped if the repository is already there; this keeps the cell re-runnable.
workflows_dir = "/home/poseidon/workflows"
repo_url = "https://github.com/PoSeiDon-Workflows/pycbc.git"
node1.execute(
    f'sudo -u poseidon sh -c "mkdir -p {workflows_dir}" && '
    f'sudo -u poseidon sh -c "test -d {workflows_dir}/pycbc/.git || git clone {repo_url} {workflows_dir}/pycbc"'
)

### Download the data files

In [None]:
# Get the data node (named by node2_name); the data files are fetched on it.
node2 = slice.get_node(name=node2_name)

In [None]:
# Clone the workflow repository into the poseidon user's workflows directory on node2.
# `git clone` fails when the destination already holds a checkout, so the clone
# is skipped if the repository is already there; this keeps the cell re-runnable.
workflows_dir = "/home/poseidon/workflows"
repo_url = "https://github.com/PoSeiDon-Workflows/pycbc.git"
node2.execute(
    f'sudo -u poseidon sh -c "mkdir -p {workflows_dir}" && '
    f'sudo -u poseidon sh -c "test -d {workflows_dir}/pycbc/.git || git clone {repo_url} {workflows_dir}/pycbc"'
)

In [None]:
# Run the repository's get.sh as the poseidon user, passing the data directory
# itself as the script's only argument.
data_dir = "/home/poseidon/workflows/pycbc/inference/pycbc/data"
node2.execute(f'sudo -u poseidon sh -c "{data_dir}/get.sh {data_dir}"')

### Run the workflow

In [None]:
# Mark the workflow's run.sh as executable (as the poseidon user) before it is launched.
run_script = "/home/poseidon/workflows/pycbc/inference/pycbc/gw_output/run.sh"
node1.execute(f'sudo -u poseidon sh -c "chmod +x {run_script}"')

In [None]:
# Launch the workflow's run.sh on node1 as the poseidon user.
run_script = "/home/poseidon/workflows/pycbc/inference/pycbc/gw_output/run.sh"
node1.execute(f'sudo -u poseidon sh -c "{run_script}"')

## Delete the Slice

Please delete your slice when you are done with your experiment.

In [None]:
# Delete the experiment slice used by this notebook.
slice.delete()