# LLM Performance Supercomputing

In [None]:
import sys
import os
import json
import traceback
import time
from datetime import datetime, timedelta
from dateutil import tz
from ipaddress import ip_network, ip_address, IPv4Address, IPv6Address, IPv4Network, IPv6Network

## Fabric Imports

In [None]:
from fabrictestbed_extensions.fablib.fablib import FablibManager as fablib_manager

# Single FablibManager instance; every later cell reaches the testbed through `fablib`.
fablib = fablib_manager()

In [None]:
# Columns requested for the site overview: general capacity counters plus
# the NIC, NVMe, GPU and FPGA columns.
site_fields = [
    "name",
    "cores_available",
    "ram_available",
    "disk_available",
    "nic_basic_available",
    "nic_connectx_5_available",
    "nic_connectx_6_available",
    "nvme_available",
    "tesla_t4_available",
    "rtx6000_available",
    "a30_available",
    "a40_available",
    "a40_capacity",
    "fpga_u280_available",
]

try:
    # force_refresh=True is passed so the listing is presumably not served
    # from cached data -- confirm against the FABlib documentation.
    fablib.list_sites(force_refresh=True, fields=site_fields)
except Exception as e:
    print(f"Exception: {e}")

## Fabric Variables

In [None]:
# FABRIC configuration shared by the cells below.
# Common prefix for the slice name and the node names.
fabric_prefix = "poseidon-supercomputing-"
# Name under which the slice is created and later looked up.
fabric_slice_name = f"{fabric_prefix}testing"
# OS image passed to add_node() when the slice is created.
fabric_os_image = "default_ubuntu_20"

## Create Fabric Slice

In [None]:
try:
    # Start a new slice request under the configured name.
    fabric_slice = fablib.new_slice(name=fabric_slice_name)

    # One node at site RUTG (24 cores, ram=64, disk=500) carrying a single
    # A30 GPU registered under the component name 'gpu1'.
    fabric_a30 = fabric_slice.add_node(
        name=f"{fabric_prefix}a30",
        site="RUTG",
        image=fabric_os_image,
        cores=24,
        ram=64,
        disk=500,
    )
    fabric_a30.add_component(model="GPU_A30", name="gpu1")
    # Disabled: only needed if multiple nodes must communicate with each
    # other through FABRIC's infrastructure.
    # fabric_a30.add_fabnet()

    # Submit the request.
    fabric_slice.submit()
except Exception as e:
    print(f"Exception: {e}")
    traceback.print_exc()

## NUMA Optimization

In [None]:
try:
    # Look up the existing slice by name (keyword form, as in the other cells).
    fabric_slice = fablib.get_slice(name=fabric_slice_name)

    for fabric_node in fabric_slice.get_nodes():
        print(f'----- Pinning vCPUs for node {fabric_node.get_name()} ------')
        # Pin all vCPUs of the VM to the same NUMA node as the 'gpu1'
        # component (the name given to the GPU when the slice was created).
        fabric_node.pin_cpu(component_name='gpu1')

        # Pin the VM's memory to the same NUMA node as its components.
        fabric_node.numa_tune()

        # Reboot the VM; the following cell waits for SSH to come back.
        fabric_node.os_reboot()
except Exception as e:
    print(f"Exception: {e}")
    traceback.print_exc()

In [None]:
# Look up the slice by name (the previous cell rebooted its nodes).
fabric_slice = fablib.get_slice(fabric_slice_name)

# Wait for SSH connectivity to come back after the reboot.
fabric_slice.wait_ssh()

## Install Software on Fabric

In [None]:
try:
    # Where the setup script lives on the notebook host, and where it is
    # placed on each node.
    local_setup_script = "/home/fabric/work/poseidon/LLM_Supercomputing_Test/setup.sh"
    remote_setup_script = "/home/ubuntu/setup.sh"

    fabric_slice = fablib.get_slice(name=fabric_slice_name)

    # Copy the script onto every node of the slice.
    for fabric_node in fabric_slice.get_nodes():
        fabric_node.upload_file(local_setup_script, remote_setup_script)
        # Disabled workaround: on nodes whose management IP is IPv6, insert
        # additional nameserver entries into /etc/resolv.conf.
        #if type(ip_address(fabric_node.get_management_ip())) is IPv6Address:
        #    fabric_node.execute("sudo sed -i 's/nameserver/nameserver 2a01:4f9:c010:3f02::1\nnameserver 2a00:1098:2c::1\nnameserver 2a00:1098:2b::1\nnameserver/' /etc/resolv.conf", quiet=True)

except Exception as e:
    print(f"Exception: {e}")
    traceback.print_exc()

In [None]:
try:
    fabric_slice = fablib.get_slice(name=fabric_slice_name)

    # Phase 1: start the setup script on every node without waiting, keeping
    # the handle returned by execute_thread() keyed by node.
    pending = {}
    for node in fabric_slice.get_nodes():
        setup_command = "sudo bash /home/ubuntu/setup.sh"
        print(f"Starting config on node {node.get_name()}")
        pending[node] = node.execute_thread(setup_command)

    # Phase 2: wait for each node in turn. The (stdout, stderr) pair is
    # unpacked but not printed or inspected here.
    for node, future in pending.items():
        print(f"Waiting for result from node {node.get_name()}")
        stdout, stderr = future.result()

    print("All Done!")
except Exception as e:
    print(f"Exception: {e}")
    traceback.print_exc()

In [None]:
try:
    fabric_slice = fablib.get_slice(name=fabric_slice_name)

    # Run nvidia-smi on every node, one after another -- presumably a sanity
    # check that the GPU is usable once the setup script has finished.
    for fabric_node in fabric_slice.get_nodes():
        fabric_node.execute("nvidia-smi")

    print("All Done!")
except Exception as e:
    print(f"Exception: {e}")
    traceback.print_exc()

## Extend Fabric Slice

In [None]:
# Prolong Fabric Slice For 14 Days
# New lease end: current UTC time plus 14 days, rendered as
# "YYYY-MM-DD HH:MM:SS +ZZZZ".
end_date = (datetime.now(tz=tz.tzutc()) + timedelta(days=14)).strftime("%Y-%m-%d %H:%M:%S %z")
try:
    fabric_slice = fablib.get_slice(name=fabric_slice_name)
    # renew() is called for its effect only. Per the FABlib API it does not
    # return a slice, so fabric_slice must not be rebound to its result.
    fabric_slice.renew(end_date)

    # Fetch the slice again to read back the updated lease end time.
    fabric_slice = fablib.get_slice(name=fabric_slice_name)
    print(f'New lease end time: {fabric_slice.get_lease_end()}')
except Exception as e:
    print(f"Fail: {e}")
    traceback.print_exc()

## Cleanup Fabric (This Deletes The Deployment)

In [None]:
try:
    # Look up the slice by name (keyword form, as in the other cells) and
    # delete it.
    fabric_slice = fablib.get_slice(name=fabric_slice_name)
    fabric_slice.delete()
except Exception as e:
    print(f"Exception: {e}")
    # Show the full traceback, matching the error reporting of the other cells.
    traceback.print_exc()