In [None]:
# Prerequisites
# - OCI Credentials: Ensure you have an OCI configuration file (~/.oci/config) with valid credentials.
# - SSH Key: Have an SSH private key for accessing the VM.
# - Terraform Installed: Install Terraform on the machine where the Jupyter Notebook runs.
# - Python Libraries: Install required libraries for OCI SDK, SSH, and visualization
%pip install --upgrade pip oci paramiko pandas matplotlib jupyter tqdm seaborn

In [None]:
import os

# Centralized Variables Cell - Edit all values here to configure the entire notebook.
# Users only need to update this cell and run the notebook sequentially.
# Security note: Do not hardcode sensitive info; load from secure sources if possible (e.g., env vars, secret manager).
# For each variable, replace the placeholder with your value. See comments for guidance on what to put and where to find it.

# OCI Credentials and Config
# NOTE: every empty string and every "your_..._here" value below is a placeholder that must be replaced
# before the later cells are run; these names are read directly by the Terraform and test cells.
tenancy_ocid = ""  # Your OCI tenancy OCID. Find it in OCI Console: Profile > Tenancy (copy the OCID).
user_ocid = ""  # Your OCI user OCID. Find it in OCI Console: Profile > User Settings (copy the OCID).
fingerprint = ""  # Fingerprint of your OCI API key. Generated when you upload your public key to OCI (Profile > API Keys).
# region is written to terraform.tfvars and is also forced onto the OCI SDK config in the test cell.
region = "your_region_here"  # OCI region code (e.g., "us-sanjose-1"). Find available regions in OCI Console or via OCI CLI: oci iam region list.
compartment_id = ""  # OCID of the compartment for resources. Find it in OCI Console: Identity > Compartments (copy OCID).
availability_domain = "your_ad_here"  # Availability domain name (e.g., "MVVx:US-SANJOSE-1-AD-1"). List via OCI Console: Compute > Limits or OCI CLI: oci iam availability-domain list --compartment-id <compartment_ocid>.
image_id = ""  # OCID of the VM image. Find it in OCI Console: Compute > Images (search for desired image, copy OCID).
# oci_config_file and oci_profile are passed to oci.config.from_file() in the test cell.
oci_config_file = "~/.oci/config"  # Path to your OCI config file. Default is ~/.oci/config; change if using a custom location.
oci_profile = "your_profile_here"  # Profile name in OCI config file (e.g., "DEFAULT" or "SAN JOSE"). Check your ~/.oci/config file for sections like [DEFAULT].
oci_private_key_path = "your_key_path_here"  # Full path to your OCI API private key file (e.g., "/path/to/oci_api_key.pem"). This is the PEM file paired with your API key fingerprint.

# SSH Keys
# The public key content is injected into the VM metadata; the private key is used both by the Terraform
# remote-exec provisioner and by the paramiko SSH client in the test cell.
ssh_public_key_path = "your_pub_key_path_here"  # Full path to your SSH public key file (e.g., "/home/user/.ssh/id_rsa.pub"). Used for VM access; generate with ssh-keygen if needed.
ssh_private_key_path = "your_priv_key_path_here"  # Full path to your SSH private key file (e.g., "/home/user/.ssh/id_rsa"). Must be unencrypted and match the public key.

# VM Configuration
# vm_shape, vm_ocpus and vm_memory_in_gbs are interpolated into the instance resource in main.tf.
# The 0 values are placeholders, not usable settings.
vm_shape = "your_shape_here"  # VM shape name (e.g., "VM.Standard.E5.Flex"). Find available shapes in OCI Console: Compute > Shapes or OCI CLI: oci compute shape list --compartment-id <compartment_ocid>.
vm_ocpus = 0  # Number of OCPUs for the VM (e.g., 16). Must be compatible with the chosen shape; check limits in OCI Console.
vm_memory_in_gbs = 0  # Memory in GB for the VM (e.g., 32). Must be compatible with the shape and OCPUs; check in OCI Console.

# Test Parameters
# run_tests() iterates sizes in the outer loop and vpus in the inner loop, so every size is tested with every VPU value.
# NOTE(review): keep sizes ascending — block volumes presumably cannot be shrunk; confirm against the OCI documentation.
sizes = range(100, 1050, 50)  # Volume sizes in GB as a Python range (start, stop, step). Example: range(100, 1050, 50) tests 100GB to 1000GB in 50GB increments. Adjust numbers as needed.
vpus = range(30, 130, 10)  # VPUs per GB as a Python range (start, stop, step). Example: range(30, 130, 10) tests 30 to 120 in 10 increments. Adjust based on desired performance testing.
# The fio_* values below are substituted verbatim into the fio command lines built in run_fio_tests().
fio_file_size = "10G"  # FIO test file size (e.g., "10G" for 10GB, "5G" for 5GB). Specifies the data volume for benchmarks.
fio_block_size = "1M"  # FIO block size (e.g., "1M" for 1MB, "128k" for 128KB). Affects IO pattern in tests.
fio_runtime = 60  # FIO runtime in seconds (e.g., 60 for 1 minute). Controls how long each test runs.
fio_iodepth = 32  # FIO IO queue depth (e.g., 32). Higher values can test deeper queues for performance.
fio_numjobs = 1  # FIO number of parallel jobs (e.g., 1). Increase for multi-threaded testing.
# run_tests() opens this file in write mode, so an existing file with the same name is overwritten.
results_file = "fio_results.csv"  # Output CSV file name (e.g., "fio_results.csv"). Where test results will be saved.

# Mirror the settings above into the process environment in one step, so that subprocesses
# (for example Terraform) can read them if needed. Keys are the upper-case variable names.
os.environ.update(
    {
        'TENANCY_OCID': tenancy_ocid,
        'USER_OCID': user_ocid,
        'FINGERPRINT': fingerprint,
        'REGION': region,
        'COMPARTMENT_ID': compartment_id,
        'AVAILABILITY_DOMAIN': availability_domain,
        'IMAGE_ID': image_id,
        'OCI_PRIVATE_KEY_PATH': oci_private_key_path,
        'SSH_PUBLIC_KEY_PATH': ssh_public_key_path,
        'SSH_PRIVATE_KEY_PATH': ssh_private_key_path,
    }
)
# The notebook cells themselves use the Python variables directly; the environment copies are an optional backup.

In [None]:
# Cell 2: Write Terraform Configuration (original Cell 1)
# This cell creates a Terraform configuration file to provision OCI resources (VCN, VM, Block Volume).

import os

# Terraform configuration rendered from the notebook settings.
# - Only vm_shape, vm_ocpus and vm_memory_in_gbs are interpolated by Python here; everything else is a
#   Terraform variable supplied through terraform.tfvars. Literal HCL braces are doubled ({{ }}) because
#   this is an f-string.
# - The subnet references the security list declared below. Without that reference the list (and its
#   SSH ingress rule) would be created but never applied to the subnet.
# - The remote-exec "mount /dev/sdb" step is best-effort: the volume attachment depends on the instance,
#   so it is created after this provisioner runs. The test cell formats and mounts the volume later.
terraform_config = f"""
terraform {{
  required_providers {{
    oci = {{
      source  = "oracle/oci"
      version = ">= 5.0.0"
    }}
  }}
}}

provider "oci" {{
  tenancy_ocid     = var.tenancy_ocid
  user_ocid        = var.user_ocid
  private_key_path = var.oci_private_key_path
  fingerprint      = var.fingerprint
  region           = var.region
}}

variable "tenancy_ocid" {{}}
variable "user_ocid" {{}}
variable "oci_private_key_path" {{}}
variable "fingerprint" {{}}
variable "region" {{}}
variable "compartment_id" {{}}
variable "availability_domain" {{}}
variable "image_id" {{}}
variable "ssh_public_key_content" {{}}
variable "ssh_private_key_path" {{}}

# Create VCN
resource "oci_core_vcn" "test_vcn" {{
  cidr_block     = "10.0.0.0/16"
  compartment_id = var.compartment_id
  display_name   = "TestVCN"
}}

# Create Internet Gateway
resource "oci_core_internet_gateway" "test_igw" {{
  compartment_id = var.compartment_id
  vcn_id         = oci_core_vcn.test_vcn.id
  display_name   = "TestInternetGateway"
}}

# Create Route Table
resource "oci_core_route_table" "test_route_table" {{
  compartment_id = var.compartment_id
  vcn_id         = oci_core_vcn.test_vcn.id
  display_name   = "TestRouteTable"
  route_rules {{
    destination       = "0.0.0.0/0"
    destination_type  = "CIDR_BLOCK"
    network_entity_id = oci_core_internet_gateway.test_igw.id
  }}
}}

# Create Subnet
resource "oci_core_subnet" "test_subnet" {{
  cidr_block        = "10.0.1.0/24"
  compartment_id    = var.compartment_id
  vcn_id            = oci_core_vcn.test_vcn.id
  display_name      = "TestSubnet"
  route_table_id    = oci_core_route_table.test_route_table.id
  security_list_ids = [oci_core_security_list.test_security_list.id]
}}

# Create Security List
resource "oci_core_security_list" "test_security_list" {{
  compartment_id = var.compartment_id
  vcn_id         = oci_core_vcn.test_vcn.id
  display_name   = "TestSecurityList"
  ingress_security_rules {{
    protocol    = "6" # TCP
    source      = "0.0.0.0/0"
    source_type = "CIDR_BLOCK"
    tcp_options {{
      min = 22
      max = 22
    }}
  }}
  egress_security_rules {{
    protocol    = "all"
    destination = "0.0.0.0/0"
  }}
}}

# Create VM
resource "oci_core_instance" "test_instance" {{
  availability_domain = var.availability_domain
  compartment_id      = var.compartment_id
  shape               = "{vm_shape}"
  shape_config {{
    ocpus         = {vm_ocpus}
    memory_in_gbs = {vm_memory_in_gbs}
  }}
  source_details {{
    source_type = "image"
    source_id   = var.image_id
  }}
  create_vnic_details {{
    subnet_id        = oci_core_subnet.test_subnet.id
    assign_public_ip = true
  }}
  display_name = "TestVM"
  metadata = {{
    ssh_authorized_keys = var.ssh_public_key_content
  }}

  provisioner "remote-exec" {{
    inline = [
      "sudo dnf install -y fio",
      "sudo mkdir -p /export",
      "sudo mount /dev/sdb /export || echo 'Mount failed, ensure block volume is attached'",
      "sudo chown opc:opc /export"
    ]
    connection {{
      type        = "ssh"
      user        = "opc"
      private_key = file(var.ssh_private_key_path)
      host        = self.public_ip
      timeout     = "5m"
    }}
  }}
}}

# Create Block Volume
resource "oci_core_volume" "test_volume" {{
  compartment_id      = var.compartment_id
  availability_domain = var.availability_domain
  display_name        = "TestBlockVolume"
  size_in_gbs         = 100
  vpus_per_gb         = 30
}}

# Attach Block Volume to VM
resource "oci_core_volume_attachment" "test_volume_attachment" {{
  attachment_type = "paravirtualized"
  instance_id     = oci_core_instance.test_instance.id
  volume_id       = oci_core_volume.test_volume.id
}}

# Outputs
output "instance_id" {{
  value = oci_core_instance.test_instance.id
}}
output "volume_id" {{
  value = oci_core_volume.test_volume.id
}}
output "vm_public_ip" {{
  value = oci_core_instance.test_instance.public_ip
}}
"""

# Save to file (written to the working directory, where the terraform commands are run)
with open("main.tf", "w") as f:
    f.write(terraform_config)
print("Terraform configuration saved as main.tf")

In [None]:
# Cell 3: Write Terraform Variables (original Cell 2)
# This cell creates a Terraform variables file with OCI credentials and SSH key paths.

import os

import json  # cell-local import: json.dumps produces a correctly escaped, double-quoted string literal

# Read SSH public key content from path defined in Variables Cell ("~" is expanded for convenience)
try:
    with open(os.path.expanduser(ssh_public_key_path), "r") as f:
        ssh_public_key_content = f.read().strip()
except FileNotFoundError:
    raise ValueError(
        f"Public key file not found at {ssh_public_key_path}. Verify the file exists or create it."
    )

# Name -> value for every variable declared in main.tf. Insertion order is the order written to the file.
tfvars_values = {
    "tenancy_ocid": tenancy_ocid,
    "user_ocid": user_ocid,
    "oci_private_key_path": oci_private_key_path,
    "fingerprint": fingerprint,
    "region": region,
    "compartment_id": compartment_id,
    "availability_domain": availability_domain,
    "image_id": image_id,
    "ssh_public_key_content": ssh_public_key_content,
    "ssh_private_key_path": ssh_private_key_path,
}

# Each value is emitted through json.dumps so that backslashes and double quotes (for example in a
# Windows key path) are escaped instead of producing an invalid tfvars file. For plain values the
# output is unchanged: name = "value".
# NOTE(review): a value containing "${" or "%{" would still be treated as a template sequence by
# Terraform; none of the expected values (OCIDs, paths, key material) should contain one.
terraform_vars = "\n" + "".join(
    f"{name} = {json.dumps(str(value))}\n" for name, value in tfvars_values.items()
)


# Save to file
with open("terraform.tfvars", "w") as f:
    f.write(terraform_vars)
print("Terraform variables saved as terraform.tfvars")

In [None]:
# Cell 4: Run Terraform Init and Apply (original Cell 3)
# This cell initializes and applies the Terraform configuration to create the OCI resources.
# Note: This step may take several minutes to complete.
# Ensure that the OCI credentials and SSH key paths in terraform.tfvars are correct before running.

!terraform init

!terraform apply -auto-approve -var-file=terraform.tfvars

In [None]:
# Cell 5: Extract Terraform Outputs (original Cell 4)
# This cell reads the Terraform outputs (by running `terraform output -json`) to get the instance ID, volume ID, and VM public IP.

import json
import subprocess

def get_terraform_outputs():
    """Read the instance ID, volume ID and VM public IP from ``terraform output -json``.

    Returns:
        tuple: ``(instance_id, volume_id, vm_public_ip)`` on success, or
        ``(None, None, None)`` when the command fails, its stdout is not valid
        JSON, or any of the three outputs is missing (for example when
        ``terraform apply`` did not complete and the outputs are empty).
        The reason is printed in every failure case.
    """
    no_outputs = (None, None, None)
    wanted = ("instance_id", "volume_id", "vm_public_ip")

    result = subprocess.run(
        ["terraform", "output", "-json"], capture_output=True, text=True
    )
    if result.returncode != 0:
        print("Error getting Terraform outputs:", result.stderr)
        return no_outputs

    try:
        outputs = json.loads(result.stdout)
    except ValueError as exc:  # json.JSONDecodeError is a ValueError subclass
        print("Error getting Terraform outputs: could not parse JSON:", exc)
        return no_outputs

    # Terraform exits with 0 and prints "{}" when the state has no outputs, so a successful
    # return code alone does not guarantee the keys are present.
    if not isinstance(outputs, dict):
        print("Error getting Terraform outputs: unexpected JSON structure")
        return no_outputs
    missing = [
        name
        for name in wanted
        if not isinstance(outputs.get(name), dict) or "value" not in outputs[name]
    ]
    if missing:
        print("Error getting Terraform outputs: missing", ", ".join(missing))
        return no_outputs

    return tuple(outputs[name]["value"] for name in wanted)

# Unpack the three Terraform outputs; later cells read these names directly.
instance_id, volume_id, vm_public_ip = get_terraform_outputs()

# Echo the values so a failed lookup (all None) is visible before moving on.
print(
    "\n".join(
        [
            f"Instance ID: {instance_id}",
            f"Volume ID: {volume_id}",
            f"VM Public IP: {vm_public_ip}",
        ]
    )
)

In [None]:
# Cell 6: Python Script for Volume Management and FIO Testing (original Cell 5)
# This script updates the block volume size and VPUs, refreshes the volume on the VM,
# runs FIO tests, and logs the results to a CSV file.

import oci
import paramiko
import time
import csv
import re
from tqdm import tqdm

# OCI Configuration - load the profile named in the Variables Cell, then force the region so the SDK
# talks to the same region the Terraform resources were created in.
config = oci.config.from_file(oci_config_file, oci_profile)
config["region"] = region
block_storage_client = oci.core.BlockstorageClient(config)


# VM and Volume Details (from Terraform outputs)
ssh_host = vm_public_ip
ssh_user = "opc"
# Expand "~" here, as is already done for the public key path when terraform.tfvars is written.
# The path is handed to paramiko as key_filename; a "~/..." value would otherwise only fail at
# connection time.
ssh_key = os.path.expanduser(ssh_private_key_path)


# SSH Client Setup
ssh = paramiko.SSHClient()

# Unknown host keys are accepted automatically (the VM is freshly created, so its key cannot be
# known in advance). NOTE(review): this disables host-key verification for this client.
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())


# Function to run SSH commands with retry and exit status
def run_ssh_command(command, timeout=60, retries=5, delay=10):
    """Run ``command`` on the VM over SSH, retrying on any failure.

    A fresh connection is opened for every attempt using the module-level
    ``ssh`` client and ``ssh_host`` / ``ssh_user`` / ``ssh_key`` settings, and is
    always closed again, including when connecting, executing or reading fails.

    Args:
        command: Shell command line to execute remotely.
        timeout: Seconds used for both the connection and the command channel.
        retries: Maximum number of attempts.
        delay: Seconds to sleep after a failed attempt.

    Returns:
        tuple: ``(stdout_text, exit_status)`` with stdout decoded as UTF-8 and
        stripped, or ``(None, 1)`` when every attempt raised an exception.
        A non-zero remote exit status is returned as-is and is not retried.
    """
    for attempt in range(retries):
        try:
            try:
                ssh.connect(
                    ssh_host, username=ssh_user, key_filename=ssh_key, timeout=timeout
                )
                stdin, stdout, stderr = ssh.exec_command(command, timeout=timeout)
                output = stdout.read().decode("utf-8").strip()
                error = stderr.read().decode("utf-8").strip()
                exit_status = stdout.channel.recv_exit_status()  # Get exit code
            finally:
                # Close on success and on failure, so a failed attempt does not leave a
                # half-open connection behind for the next retry.
                ssh.close()
        except Exception as e:
            print(
                f"SSH Attempt {attempt+1} Failed for command '{command}': {e}. Retrying in {delay}s..."
            )
            time.sleep(delay)
            continue

        if error:
            print(f"SSH Error for command '{command}': {error}")
        print(f"SSH Output for command '{command}': {output}")
        print(f"Exit status: {exit_status}")
        return output, exit_status

    return None, 1  # Failure status


# Function to update Block Volume size and VPUs with retry on 429
def update_block_volume(size_in_gbs, vpus_per_gb, retries=3, delay=60):
    """Resize the test volume and change its VPUs/GB, then wait for it to become AVAILABLE.

    Uses the module-level ``block_storage_client`` and ``volume_id``.

    Args:
        size_in_gbs: Requested volume size in GB.
        vpus_per_gb: Requested VPUs per GB.
        retries: Maximum number of attempts; only HTTP 429 responses are retried.
        delay: Seconds to sleep after a 429 before trying again.

    Returns:
        bool: True once the volume reports AVAILABLE; False on any other service
        error, on any unexpected exception, or when every attempt was rate limited.
    """
    for _ in range(retries):
        try:
            requested = oci.core.models.UpdateVolumeDetails(
                size_in_gbs=size_in_gbs, vpus_per_gb=vpus_per_gb
            )
            block_storage_client.update_volume(volume_id, requested)
            print(
                f"Updating Block Volume to {size_in_gbs} GB and {vpus_per_gb} VPUs/GB"
            )

            # Fetch the current volume and poll until its lifecycle state is AVAILABLE (10 min cap).
            current_volume = block_storage_client.get_volume(volume_id)
            oci.wait_until(
                block_storage_client,
                current_volume,
                "lifecycle_state",
                "AVAILABLE",
                max_wait_seconds=600,
            )
            print("Block Volume updated successfully.")
            return True
        except oci.exceptions.ServiceError as service_error:
            if service_error.status != 429:
                print(f"Failed to update Block Volume: {service_error}")
                return False
            # Rate limited: back off and let the loop try again.
            print(f"Rate limit hit (429). Retrying in {delay}s...")
            time.sleep(delay)
        except Exception as unexpected:
            print(f"Unexpected error: {unexpected}")
            return False

    return False


# Function to refresh Block Volume on VM with improved detection
def refresh_block_volume(size_in_gbs, retries=5, delay=30):
    """Re-detect the resized volume on the VM, re-create its filesystem and mount it at /export.

    Every attempt unmounts /export, looks for a non-"sda" disk whose size is within
    10 GiB of the requested size (falling back to a SCSI rescan when none is found),
    rescans that device, formats it with XFS (destroying its contents), mounts it and
    hands ownership to ``opc``. The mount is then confirmed via ``df``.

    Args:
        size_in_gbs: Expected volume size in GB, used to pick the device.
        retries: Maximum number of attempts.
        delay: Seconds to sleep after a failed attempt.

    Returns:
        bool: True when /export is mounted and verified, False when all attempts fail.
    """
    gib = 1024 * 1024 * 1024
    target_bytes = size_in_gbs * gib  # GB to bytes
    lower_bound = target_bytes - 10 * gib
    upper_bound = target_bytes + 10 * gib

    # Prints the name of the first disk (other than sda) whose byte size lies strictly inside the window.
    find_device_cmd = (
        "lsblk -bno NAME,SIZE,TYPE | awk '"
        f"$2 > {lower_bound} && $2 < {upper_bound}"
        ' && $3 == "disk" && $1 != "sda" {print $1}\''
        " | head -1"
    )

    for attempt_number in range(1, retries + 1):
        # Start from a clean state; "|| true" keeps this from failing when nothing is mounted.
        run_ssh_command("sudo umount /export || true")

        device = run_ssh_command(find_device_cmd)[0]
        if not device:
            print("No suitable block device found. Attempting global rescan with sudo.")
            run_ssh_command(
                "sudo sh -c 'for d in /sys/class/scsi_device/*; do echo 1 > $d/device/rescan; done'"
            )
            time.sleep(10)  # Give the rescan time to complete
            device = run_ssh_command(find_device_cmd)[0]

        if not device:
            print(f"Detection attempt {attempt_number} failed. Retrying in {delay}s...")
            time.sleep(delay)
            continue

        print(f"Detected block device: /dev/{device}")

        setup_steps = (
            f"sudo sh -c 'echo 1 > /sys/class/block/{device}/device/rescan'",
            f"sudo mkfs.xfs -f /dev/{device}",
            f"sudo mount /dev/{device} /export",
            "sudo chown opc:opc /export",
        )
        for step in setup_steps:
            _, step_status = run_ssh_command(step)
            if step_status != 0:
                break  # Abandon this attempt at the first failing step
            time.sleep(2)
        else:
            # Every step succeeded: confirm that /export actually shows up as mounted.
            mount_listing, _ = run_ssh_command("df -h | grep /export")
            if mount_listing:
                print("Mount verified.")
                return True
            print("Mount check failed.")

        print(f"Refresh attempt {attempt_number} failed. Retrying in {delay}s...")
        time.sleep(delay)

    return False


# Function to run FIO tests and parse results with log parsing
def run_fio_tests(size, vpus, retries=5, delay=30):
    """Run a sequential-write then a sequential-read fio job on /export and return the bandwidths.

    The fio parameters come from the ``fio_*`` settings in the Variables Cell. Each job
    writes its report to a log file on the VM, which is then read back and parsed.

    Args:
        size: Volume size under test (not used by the commands; kept for the caller's signature).
        vpus: VPUs/GB under test (not used by the commands; kept for the caller's signature).
        retries: SSH retry count for the fio commands.
        delay: Seconds between SSH retries for the fio commands.

    Returns:
        tuple: ``(write_mib, read_mib, write_mb, read_mb)`` as floats. A value is NaN when
        the pre-check fails, when the corresponding fio job exits non-zero, or when
        its bandwidth line cannot be parsed. The read job is skipped if the write job fails.
    """
    nan = float("nan")

    # Pre-check: Clear old files and ensure writable
    pre_cmd = "sudo rm -f /export/test_file /export/fio_*.log; touch /export/test_write && rm /export/test_write && echo 'OK'"
    output, status = run_ssh_command(pre_cmd)
    if status != 0 or output != "OK":
        print("Pre-FIO check failed.")
        return nan, nan, nan, nan

    write_bandwidth = _run_fio_job("seqwrite", "write", retries, delay)
    if write_bandwidth is None:
        return nan, nan, nan, nan
    write_mib, write_mb = write_bandwidth

    read_bandwidth = _run_fio_job("seqread", "read", retries, delay)
    if read_bandwidth is None:
        # Keep the documented column order: only the read slots are NaN.
        return write_mib, nan, write_mb, nan
    read_mib, read_mb = read_bandwidth

    return write_mib, read_mib, write_mb, read_mb


def _run_fio_job(job_name, rw_mode, retries, delay):
    """Run one fio job remotely; return ``(MiB/s, MB/s)`` or None if fio exited non-zero."""
    log_path = f"/export/fio_{rw_mode}.log"
    # No trailing "; echo $?": the remote exit status must be fio's own, otherwise a failed
    # run is indistinguishable from a successful one.
    fio_cmd = (
        f"sudo fio --name={job_name} --filename=/export/test_file --size={fio_file_size}"
        f" --bs={fio_block_size} --rw={rw_mode} --ioengine=libaio --iodepth={fio_iodepth}"
        f" --direct=1 --numjobs={fio_numjobs} --runtime={fio_runtime} --time_based"
        f" --group_reporting --output={log_path}"
    )
    _, status = run_ssh_command(fio_cmd, timeout=300, retries=retries, delay=delay)

    log_text = run_ssh_command(f"cat {log_path}")[0]
    if status != 0:
        print(f"FIO {rw_mode} failed (status {status}). Log: {log_text}")
        return None
    return _parse_fio_bandwidth(log_text, rw_mode.upper())


def _parse_fio_bandwidth(log_text, direction):
    """Extract ``(MiB/s, MB/s)`` from fio's ``READ:``/``WRITE:`` summary line; NaNs if absent."""
    nan = float("nan")
    if not log_text:  # None when the log could not be fetched over SSH
        return nan, nan

    # fio scales the unit with the magnitude (e.g. "bw=1.5GiB/s (1.61GB/s)"), so accept any of
    # the binary/decimal prefixes and normalise to MiB/s and MB/s.
    match = re.search(
        rf"{direction}: bw=(\d+\.?\d*)((?:[KMGT]i)?)B/s \((\d+\.?\d*)([kKMGT]?)B/s\)",
        log_text,
    )
    if not match:
        return nan, nan

    to_mib = {"": 1.0 / 1024 ** 2, "Ki": 1.0 / 1024, "Mi": 1.0, "Gi": 1024.0, "Ti": 1024.0 ** 2}
    to_mb = {"": 1e-6, "k": 1e-3, "K": 1e-3, "M": 1.0, "G": 1e3, "T": 1e6}
    mib_per_s = float(match.group(1)) * to_mib[match.group(2)]
    mb_per_s = float(match.group(3)) * to_mb[match.group(4)]
    return mib_per_s, mb_per_s


# Main loop for testing with tqdm (using sizes and vpus from Variables Cell)
def run_tests():
    """Benchmark every size/VPU combination and record the results in ``results_file``.

    For each size in ``sizes`` (outer loop) and each value in ``vpus`` (inner loop) the
    volume is updated, refreshed on the VM and benchmarked with fio. One CSV row is
    written per combination: the four bandwidth figures on success, or an
    "Error: ..." marker in all four columns when the update or refresh step fails.

    The file is opened in write mode (an existing file is overwritten) and is flushed
    after the header and after every row, so results gathered so far are on disk if a
    long run is interrupted.
    """
    results_file_path = results_file  # From Variables Cell

    with open(results_file_path, mode="w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(
            [
                "Size (GB)",
                "VPUs/GB",
                "Write Bandwidth (MiB/s)",
                "Read Bandwidth (MiB/s)",
                "Write Bandwidth (MB/s)",
                "Read Bandwidth (MB/s)",
            ]
        )
        file.flush()

        # Convert ranges to lists for count
        total_tests = len(list(sizes)) * len(list(vpus))

        with tqdm(total=total_tests, desc="Running FIO Tests") as pbar:
            for size in sizes:
                for vpu in vpus:
                    print(f"Testing Size: {size} GB, VPUs: {vpu}")

                    completed = False
                    if not update_block_volume(size, vpu):
                        row = [size, vpu] + ["Error: Update Failed"] * 4
                    elif not refresh_block_volume(size):  # Pass size for detection
                        row = [size, vpu] + ["Error: Refresh Failed"] * 4
                    else:
                        time.sleep(10)  # Short delay for stability
                        write_mib, read_mib, write_mb, read_mb = run_fio_tests(
                            size, vpu
                        )
                        row = [size, vpu, write_mib, read_mib, write_mb, read_mb]
                        completed = True

                    writer.writerow(row)
                    file.flush()  # Persist each result as soon as it is available

                    if completed:
                        print(f"Completed test for {size} GB and {vpu} VPUs/GB")

                    pbar.update(1)


# Run the tests: executes the full size x VPU matrix defined in the Variables Cell and writes one
# CSV row per combination to results_file. This call blocks until every combination has been attempted.
run_tests()

In [None]:
# Cell 7: Cleanup Resources (original Cell 7)

!terraform destroy -var-file=terraform.tfvars -auto-approve