Merged
29 commits
b36e12d
Add Flash Decode Example (#110)
octatrifan-amd Sep 18, 2025
0cedf1b
Make sure we capture errors of CI (#173)
mawad-amd Sep 19, 2025
cae6c83
Fix atomic_add validation by correcting expected values and enhancing…
Copilot Sep 29, 2025
86e8df4
Implement pytest tests for 06_message_passing examples (#183)
erieaton-amd Sep 30, 2025
b53aad1
Remove research purposes note (#186)
mawad-amd Oct 3, 2025
7d43fca
Add All Gather + GEMM Example (Push and Pull) (#172)
octatrifan-amd Oct 4, 2025
24a3781
Replace hardcoded XCC and CU count values with iris.hip API calls (#182)
Copilot Oct 4, 2025
ba17467
Remove fine-grained allocator (#180)
Copilot Oct 4, 2025
9cec1f0
Enable parallel CI execution for multi-rank tests using HIP_VISIBLE_D…
Copilot Oct 4, 2025
6b0a29a
Implements copy function (#113)
astroC86 Oct 4, 2025
121219e
Add pip install test workflow with Apptainer and rank matrix (#171)
mawad-amd Oct 5, 2025
4c4742c
Add directional copy tests (#191)
mawad-amd Oct 5, 2025
4ae6d73
Set device in `init_process_group` (#192)
mawad-amd Oct 5, 2025
3f5d8fd
Use setuptools for version (#166)
mawad-amd Oct 5, 2025
1dbd12b
Add iris.copy to documentation (#194)
mawad-amd Oct 5, 2025
888b48d
Enhance broadcast and docs (#195)
mawad-amd Oct 5, 2025
f974883
Update package metadata to PEP 621 format with backward compatibility…
mawad-amd Oct 5, 2025
36b912c
Update examples list (#201)
mawad-amd Oct 7, 2025
6ba9c79
Add CUDA backend support for NVIDIA GPUs with automatic detection (#200)
Copilot Oct 8, 2025
5dbc2fc
Hoist `num_xcds` query (#205)
mawad-amd Oct 9, 2025
cdc05dc
Add regression CI (#206)
mawad-amd Oct 9, 2025
4b181a3
Replace hardcoded CU/SM values with dynamic detection using PyTorch (…
Copilot Oct 9, 2025
69d365f
Add device_id parameter to all torch distributed initialization calls…
Copilot Oct 9, 2025
0f5c8a8
Add process and port cleanup step to all CI workflow files to handle …
Copilot Oct 10, 2025
0aa9ba8
Remove post-job port cleanup to prevent killing active processes (#223)
Copilot Oct 10, 2025
8c07e04
Initial plan
Copilot Oct 11, 2025
3427c38
Merge main into cache-modifiers branch
Copilot Oct 11, 2025
b2bf42f
Add cache modifiers to load, store, get, put, and copy functions
Copilot Oct 11, 2025
d5dc6f0
Add test for copy cache modifiers
Copilot Oct 11, 2025
78 changes: 78 additions & 0 deletions .github/scripts/cleanup_ports.sh
@@ -0,0 +1,78 @@
#!/bin/bash
# SPDX-License-Identifier: MIT
# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.

set -e

# Script to clean up any lingering test processes and ports
# This is useful when tests segfault and leave processes/ports open

echo "========================================"
echo "Port Cleanup Script - Starting"
echo "========================================"

# Show initial state of listening ports
echo ""
echo "Initial state - Listening TCP ports:"
echo "------------------------------------"
ss -tulpn 2>/dev/null | grep LISTEN | grep -E "python|pt_main_thread" || echo "No Python/PyTorch processes listening on ports"
echo ""

echo "Cleaning up lingering test processes and ports..."

# Clean up Python test processes that might be stuck
# Look for processes related to run_tests_distributed.py, pytest, and torch distributed tests
echo "Checking for lingering Python test processes..."
PYTHON_TEST_PIDS=$(pgrep -f "run_tests_distributed.py|pytest.*test_|torch.distributed" 2>/dev/null || true)

if [ -n "$PYTHON_TEST_PIDS" ]; then
echo "Found Python test processes: $PYTHON_TEST_PIDS"
echo "Killing Python test processes..."
echo "$PYTHON_TEST_PIDS" | xargs kill -9 2>/dev/null || true
echo "Cleaned up Python test processes"
fi

# Clean up pt_main_thread processes (PyTorch multiprocessing spawned processes)
echo "Checking for lingering PyTorch processes (multiprocessing.spawn)..."
PT_PIDS=$(pgrep -f "multiprocessing.spawn" 2>/dev/null || true)

if [ -n "$PT_PIDS" ]; then
echo "Found PyTorch processes: $PT_PIDS"
echo "Killing PyTorch processes..."
echo "$PT_PIDS" | xargs kill -9 2>/dev/null || true
echo "Cleaned up PyTorch processes"
fi

# Clean up any processes listening on TCP ports in the common test range
# PyTorch distributed typically uses ports in the 29500+ range, but can use any available port
echo "Checking for processes using TCP ports..."
LISTENING_PIDS=$(lsof -ti tcp -sTCP:LISTEN 2>/dev/null | sort -u || true)

if [ -n "$LISTENING_PIDS" ]; then
# Filter to only Python/PyTorch processes to avoid killing system services
for PID in $LISTENING_PIDS; do
PROCESS_NAME=$(ps -p $PID -o comm= 2>/dev/null || true)
# Check for python or pt_main_thread processes
if [[ "$PROCESS_NAME" == *"python"* ]] || [[ "$PROCESS_NAME" == *"pt_main_thread"* ]]; then
PORT=$(lsof -Pan -p $PID -i tcp -sTCP:LISTEN 2>/dev/null | awk 'NR>1 {print $9}' | cut -d':' -f2 | head -1)
if [ -n "$PORT" ]; then
echo "Found process $PROCESS_NAME (PID $PID) listening on port $PORT"
kill -9 $PID 2>/dev/null || true
echo "Cleaned up process $PID on port $PORT"
fi
fi
done
fi

echo ""
echo "========================================"
echo "Port Cleanup Script - Completed"
echo "========================================"

# Show final state of listening ports
echo ""
echo "Final state - Listening TCP ports:"
echo "------------------------------------"
ss -tulpn 2>/dev/null | grep LISTEN | grep -E "python|pt_main_thread" || echo "No Python/PyTorch processes listening on ports"
echo ""
echo "Port cleanup complete."
63 changes: 63 additions & 0 deletions .github/scripts/run_perf_benchmark.sh
@@ -0,0 +1,63 @@
#!/bin/bash
set -e

# Arguments
EXAMPLE_PATH=$1
TFLOPS_THRESHOLD=$2
shift 2
BENCHMARK_ARGS="$@"

# Create overlay image in workspace (will be auto-cleaned by GitHub Actions)
OVERLAY="iris_overlay_perf_${EXAMPLE_PATH//\//_}.img"

echo "::group::Creating overlay image"
apptainer overlay create --size 1024 --create-dir /var/cache/iris "${OVERLAY}"
echo "::endgroup::"

echo "::group::Running performance benchmark"
apptainer exec --overlay "${OVERLAY}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
    --bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
    ~/apptainer/iris-dev.sif bash -c "
        set -e
        pip install -e .
        python examples/${EXAMPLE_PATH}/benchmark.py \
            --benchmark \
            --validate \
            -r 8 \
            ${BENCHMARK_ARGS} \
            --output_file perf_result.json
    "
echo "::endgroup::"

# Parse JSON and check performance
echo "::group::Validating performance"

# Check if benchmark succeeded
SUCCESS=$(jq -r '.success' perf_result.json)
if [ "$SUCCESS" != "true" ]; then
echo "::error::Benchmark failed (success: $SUCCESS)"
jq '.' perf_result.json
exit 1
fi

TFLOPS=$(jq -r '.tflops' perf_result.json)

if [ -z "$TFLOPS" ] || [ "$TFLOPS" = "null" ]; then
echo "::error::Failed to extract tflops from benchmark output"
jq '.' perf_result.json
exit 1
fi

echo "::notice::Achieved TFLOPs: $TFLOPS"

# Bash arithmetic cannot compare floats, so truncate the fractional part for the threshold check
TFLOPS_INT=${TFLOPS%.*}
if (( TFLOPS_INT < TFLOPS_THRESHOLD )); then
    echo "::error::Performance regression detected! TFLOPs ($TFLOPS) is below threshold ($TFLOPS_THRESHOLD)"
    jq '.' perf_result.json
    exit 1
fi

echo "✅ Performance test passed! TFLOPs: $TFLOPS (threshold: >$TFLOPS_THRESHOLD)"
echo "::endgroup::"

28 changes: 28 additions & 0 deletions .github/scripts/run_tests.sh
@@ -0,0 +1,28 @@
#!/bin/bash
# SPDX-License-Identifier: MIT
# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.

set -e # Exit on any error

# Get num_ranks from command line argument
NUM_RANKS=$1

if [ -z "$NUM_RANKS" ]; then
echo "Error: NUM_RANKS not provided"
echo "Usage: $0 <num_ranks>"
exit 1
fi

# Run examples tests one at a time using distributed wrapper
echo 'Running examples tests one at a time...'
for test_file in tests/examples/test_*.py; do
    echo "Testing: $test_file with $NUM_RANKS ranks"
    python tests/run_tests_distributed.py --num_ranks $NUM_RANKS "$test_file" -v --tb=short --durations=10
done

# Run unit tests one at a time using distributed wrapper
echo 'Running unit tests one at a time...'
for test_file in tests/unittests/test_*.py; do
    echo "Testing: $test_file with $NUM_RANKS ranks"
    python tests/run_tests_distributed.py --num_ranks $NUM_RANKS "$test_file" -v --tb=short --durations=10
done
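
Note: as the usage message above indicates, the script takes the rank count as its only argument; for example, on an 8-GPU node:

bash .github/scripts/run_tests.sh 8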
83 changes: 83 additions & 0 deletions .github/workflows/iris-external-validation-test.yml
@@ -0,0 +1,83 @@
name: Iris External Validation Test

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
  workflow_dispatch:

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

jobs:
  build-apptainer-image:
    runs-on: [self-hosted, mi3008x]
    timeout-minutes: 90

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Setup Apptainer
        run: |
          apt-get update && apt-get install -y software-properties-common
          add-apt-repository -y ppa:apptainer/ppa
          apt-get update && apt-get install -y apptainer

      - name: Build Iris Apptainer container
        run: |
          # Create persistent Apptainer directory
          mkdir -p ~/apptainer

          # Build Apptainer image from definition file (only if it doesn't exist)
          if [ ! -f ~/apptainer/iris-dev.sif ]; then
            echo "Building new Apptainer image..."
            apptainer build ~/apptainer/iris-dev.sif apptainer/iris.def
          else
            echo "Using existing Apptainer image"
          fi

  external-validation-test:
    name: External Validation Test
    needs: build-apptainer-image
    runs-on: [self-hosted, mi3008x]
    timeout-minutes: 30

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Cleanup lingering ports before tests
        run: |
          bash .github/scripts/cleanup_ports.sh

      - name: Run External Validation Test with Apptainer
        run: |
          set -e

          # Create unique overlay image for isolation
          OVERLAY="/tmp/iris_overlay_$(whoami)_external_$(date +%s%N).img"

          echo "::group::Creating overlay image"
          apptainer overlay create --size 1024 --create-dir /var/cache/iris "${OVERLAY}"
          echo "::endgroup::"

          echo "::group::Running external validation test"
          apptainer exec --overlay "${OVERLAY}" --no-home --cleanenv \
            --bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
            ~/apptainer/iris-dev.sif bash -c "
              set -e
              pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }}
              wget -O test_iris_distributed.py https://gist.githubusercontent.com/mawad-amd/6375dc078e39e256828f379e03310ec7/raw/a527c3192bee4615292769e340b1c73676f6945a/test_iris_distributed.py
              python test_iris_distributed.py
            "
          echo "::endgroup::"

          # Cleanup overlay image
          echo "::group::Cleaning up overlay image"
          rm -f "${OVERLAY}"
          echo "::endgroup::"

          echo "✅ External validation test passed!"
86 changes: 86 additions & 0 deletions .github/workflows/iris-performance-regression-test.yml
@@ -0,0 +1,86 @@
name: Iris Performance Regression Test

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
  workflow_dispatch:

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

jobs:
  build-apptainer-image:
    runs-on: [self-hosted, mi3008x]
    timeout-minutes: 20

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Setup Apptainer
        run: |
          apt-get update && apt-get install -y software-properties-common
          add-apt-repository -y ppa:apptainer/ppa
          apt-get update && apt-get install -y apptainer

      - name: Build Iris Apptainer container
        run: |
          # Create persistent Apptainer directory
          mkdir -p ~/apptainer

          # Build Apptainer image from definition file (only if it doesn't exist)
          if [ ! -f ~/apptainer/iris-dev.sif ]; then
            echo "Building new Apptainer image..."
            apptainer build ~/apptainer/iris-dev.sif apptainer/iris.def
          else
            echo "Using existing Apptainer image"
          fi

  performance-test:
    name: ${{ matrix.example_name }}
    needs: build-apptainer-image
    runs-on: [self-hosted, mi3008x]
    timeout-minutes: 30
    strategy:
      fail-fast: false
      matrix:
        # Performance baselines measured on AMD Instinct MI325X (8 GPUs)
        include:
          - example_name: "GEMM All-Scatter WG Specialization"
            example_path: "10_gemm_all_scatter_wg_specialization"
            tflops_threshold: 1600 # Actual: ~2182 TFLOPs
            benchmark_args: "-m 16384 -n 16384 -k 16384 --BLK_M 128 --BLK_N 128 --BLK_K 64 --gsize_m 6 --gemm_sms 256"

          - example_name: "GEMM All-Scatter"
            example_path: "07_gemm_all_scatter"
            tflops_threshold: 1000 # Actual: ~1407 TFLOPs
            benchmark_args: "-m 16384 -n 16384 -k 16384 --BLK_M 256 --BLK_N 64 --BLK_K 64 --gsize_m 6 --gemm_sms 256"

          - example_name: "GEMM All-Scatter Producer-Consumer"
            example_path: "11_gemm_all_scatter_producer_consumer"
            tflops_threshold: 1600 # Actual: ~2190 TFLOPs
            benchmark_args: "-m 16384 -n 16384 -k 16384 --BLK_M 128 --BLK_N 128 --BLK_K 64 --gsize_m 6 --gemm_sms 256 --comm_sms 48"

          - example_name: "GEMM All-Scatter Bulk Synchronous"
            example_path: "12_gemm_all_scatter_bulk_synchronous"
            tflops_threshold: 900 # Actual: ~1262 TFLOPs
            benchmark_args: "-m 16384 -n 16384 -k 16384 --BLK_M 128 --BLK_N 128 --BLK_K 64 --gsize_m 6 --gemm_sms 256"

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Cleanup lingering ports before tests
        run: |
          bash .github/scripts/cleanup_ports.sh

      - name: Run ${{ matrix.example_name }} Benchmark (8 ranks)
        run: |
          bash .github/scripts/run_perf_benchmark.sh \
            "${{ matrix.example_path }}" \
            "${{ matrix.tflops_threshold }}" \
            ${{ matrix.benchmark_args }}
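
Note: the "GEMM All-Scatter" matrix entry, for instance, expands to the following call into the benchmark script above (run on the 8-GPU host with the prebuilt container image):

bash .github/scripts/run_perf_benchmark.sh \
    "07_gemm_all_scatter" \
    "1000" \
    -m 16384 -n 16384 -k 16384 --BLK_M 256 --BLK_N 64 --BLK_K 64 --gsize_m 6 --gemm_sms 256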
