Merged
29 commits
b36e12d
Add Flash Decode Example (#110)
octatrifan-amd Sep 18, 2025
0cedf1b
Make sure we capture errors of CI (#173)
mawad-amd Sep 19, 2025
cae6c83
Fix atomic_add validation by correcting expected values and enhancing…
Copilot Sep 29, 2025
86e8df4
Implement pytest tests for 06_message_passing examples (#183)
erieaton-amd Sep 30, 2025
b53aad1
Remove research purposes note (#186)
mawad-amd Oct 3, 2025
7d43fca
Add All Gather + GEMM Example (Push and Pull) (#172)
octatrifan-amd Oct 4, 2025
24a3781
Replace hardcoded XCC and CU count values with iris.hip API calls (#182)
Copilot Oct 4, 2025
ba17467
Remove fine-grained allocator (#180)
Copilot Oct 4, 2025
9cec1f0
Enable parallel CI execution for multi-rank tests using HIP_VISIBLE_D…
Copilot Oct 4, 2025
6b0a29a
Implements copy function (#113)
astroC86 Oct 4, 2025
121219e
Add pip install test workflow with Apptainer and rank matrix (#171)
mawad-amd Oct 5, 2025
4c4742c
Add directional copy tests (#191)
mawad-amd Oct 5, 2025
4ae6d73
Set device in `init_process_group` (#192)
mawad-amd Oct 5, 2025
3f5d8fd
Use setuptools for version (#166)
mawad-amd Oct 5, 2025
1dbd12b
Add iris.copy to documentation (#194)
mawad-amd Oct 5, 2025
888b48d
Enhance broadcast and docs (#195)
mawad-amd Oct 5, 2025
f974883
Update package metadata to PEP 621 format with backward compatibility…
mawad-amd Oct 5, 2025
36b912c
Update examples list (#201)
mawad-amd Oct 7, 2025
6ba9c79
Add CUDA backend support for NVIDIA GPUs with automatic detection (#200)
Copilot Oct 8, 2025
5dbc2fc
Hoist `num_xcds` query (#205)
mawad-amd Oct 9, 2025
cdc05dc
Add regression CI (#206)
mawad-amd Oct 9, 2025
4b181a3
Replace hardcoded CU/SM values with dynamic detection using PyTorch (…
Copilot Oct 9, 2025
69d365f
Add device_id parameter to all torch distributed initialization calls…
Copilot Oct 9, 2025
0f5c8a8
Add process and port cleanup step to all CI workflow files to handle …
Copilot Oct 10, 2025
0aa9ba8
Remove post-job port cleanup to prevent killing active processes (#223)
Copilot Oct 10, 2025
8c07e04
Initial plan
Copilot Oct 11, 2025
3427c38
Merge main into cache-modifiers branch
Copilot Oct 11, 2025
b2bf42f
Add cache modifiers to load, store, get, put, and copy functions
Copilot Oct 11, 2025
d5dc6f0
Add test for copy cache modifiers
Copilot Oct 11, 2025
78 changes: 78 additions & 0 deletions .github/scripts/cleanup_ports.sh
@@ -0,0 +1,78 @@
#!/bin/bash
# SPDX-License-Identifier: MIT
# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.

set -e

# Script to clean up any lingering test processes and ports
# This is useful when tests segfault and leave processes/ports open

echo "========================================"
echo "Port Cleanup Script - Starting"
echo "========================================"

# Show initial state of listening ports
echo ""
echo "Initial state - Listening TCP ports:"
echo "------------------------------------"
ss -tulpn 2>/dev/null | grep LISTEN | grep -E "python|pt_main_thread" || echo "No Python/PyTorch processes listening on ports"
echo ""

echo "Cleaning up lingering test processes and ports..."

# Clean up Python test processes that might be stuck
# Look for processes related to run_tests_distributed.py, pytest, and torch distributed tests
echo "Checking for lingering Python test processes..."
PYTHON_TEST_PIDS=$(pgrep -f "run_tests_distributed.py|pytest.*test_|torch.distributed" 2>/dev/null || true)

if [ -n "$PYTHON_TEST_PIDS" ]; then
echo "Found Python test processes: $PYTHON_TEST_PIDS"
echo "Killing Python test processes..."
echo "$PYTHON_TEST_PIDS" | xargs kill -9 2>/dev/null || true
echo "Cleaned up Python test processes"
fi

# Clean up pt_main_thread processes (PyTorch multiprocessing spawned processes)
echo "Checking for lingering PyTorch processes (multiprocessing.spawn)..."
PT_PIDS=$(pgrep -f "multiprocessing.spawn" 2>/dev/null || true)

if [ -n "$PT_PIDS" ]; then
echo "Found PyTorch processes: $PT_PIDS"
echo "Killing PyTorch processes..."
echo "$PT_PIDS" | xargs kill -9 2>/dev/null || true
echo "Cleaned up PyTorch processes"
fi

# Clean up any processes listening on TCP ports in the common test range
# PyTorch distributed typically uses ports in the 29500+ range, but can use any available port
echo "Checking for processes using TCP ports..."
LISTENING_PIDS=$(lsof -ti tcp -sTCP:LISTEN 2>/dev/null | sort -u || true)

if [ -n "$LISTENING_PIDS" ]; then
# Filter to only Python/PyTorch processes to avoid killing system services
for PID in $LISTENING_PIDS; do
PROCESS_NAME=$(ps -p $PID -o comm= 2>/dev/null || true)
# Check for python or pt_main_thread processes
if [[ "$PROCESS_NAME" == *"python"* ]] || [[ "$PROCESS_NAME" == *"pt_main_thread"* ]]; then
PORT=$(lsof -Pan -p $PID -i tcp -sTCP:LISTEN 2>/dev/null | awk 'NR>1 {print $9}' | cut -d':' -f2 | head -1)
if [ -n "$PORT" ]; then
echo "Found process $PROCESS_NAME (PID $PID) listening on port $PORT"
kill -9 $PID 2>/dev/null || true
echo "Cleaned up process $PID on port $PORT"
fi
fi
done
fi

echo ""
echo "========================================"
echo "Port Cleanup Script - Completed"
echo "========================================"

# Show final state of listening ports
echo ""
echo "Final state - Listening TCP ports:"
echo "------------------------------------"
ss -tulpn 2>/dev/null | grep LISTEN | grep -E "python|pt_main_thread" || echo "No Python/PyTorch processes listening on ports"
echo ""
echo "Port cleanup complete."
63 changes: 63 additions & 0 deletions .github/scripts/run_perf_benchmark.sh
@@ -0,0 +1,63 @@
#!/bin/bash
set -e

# Arguments
EXAMPLE_PATH=$1
TFLOPS_THRESHOLD=$2
shift 2
BENCHMARK_ARGS="$@"

# Create overlay image in workspace (will be auto-cleaned by GitHub Actions)
OVERLAY="iris_overlay_perf_${EXAMPLE_PATH//\//_}.img"

echo "::group::Creating overlay image"
apptainer overlay create --size 1024 --create-dir /var/cache/iris "${OVERLAY}"
echo "::endgroup::"

echo "::group::Running performance benchmark"
apptainer exec --overlay "${OVERLAY}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
    --bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
    ~/apptainer/iris-dev.sif bash -c "
        set -e
        pip install -e .
        python examples/${EXAMPLE_PATH}/benchmark.py \
            --benchmark \
            --validate \
            -r 8 \
            ${BENCHMARK_ARGS} \
            --output_file perf_result.json
    "
echo "::endgroup::"

# Parse JSON and check performance
echo "::group::Validating performance"

# Check if benchmark succeeded
SUCCESS=$(jq -r '.success' perf_result.json)
if [ "$SUCCESS" != "true" ]; then
echo "::error::Benchmark failed (success: $SUCCESS)"
jq '.' perf_result.json
exit 1
fi

TFLOPS=$(jq -r '.tflops' perf_result.json)

if [ -z "$TFLOPS" ] || [ "$TFLOPS" = "null" ]; then
echo "::error::Failed to extract tflops from benchmark output"
jq '.' perf_result.json
exit 1
fi

echo "::notice::Achieved TFLOPs: $TFLOPS"

# Bash arithmetic cannot compare floats, so truncate the fractional part for the threshold check
TFLOPS_INT=${TFLOPS%.*}
if (( TFLOPS_INT < TFLOPS_THRESHOLD )); then
    echo "::error::Performance regression detected! TFLOPs ($TFLOPS) is below threshold ($TFLOPS_THRESHOLD)"
    jq '.' perf_result.json
    exit 1
fi

echo "✅ Performance test passed! TFLOPs: $TFLOPS (threshold: >$TFLOPS_THRESHOLD)"
echo "::endgroup::"

28 changes: 28 additions & 0 deletions .github/scripts/run_tests.sh
@@ -0,0 +1,28 @@
#!/bin/bash
# SPDX-License-Identifier: MIT
# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.

set -e # Exit on any error

# Get num_ranks from command line argument
NUM_RANKS=$1

if [ -z "$NUM_RANKS" ]; then
echo "Error: NUM_RANKS not provided"
echo "Usage: $0 <num_ranks>"
exit 1
fi

# Run examples tests one at a time using distributed wrapper
echo 'Running examples tests one at a time...'
for test_file in tests/examples/test_*.py; do
    echo "Testing: $test_file with $NUM_RANKS ranks"
    python tests/run_tests_distributed.py --num_ranks $NUM_RANKS "$test_file" -v --tb=short --durations=10
done

# Run unit tests one at a time using distributed wrapper
echo 'Running unit tests one at a time...'
for test_file in tests/unittests/test_*.py; do
    echo "Testing: $test_file with $NUM_RANKS ranks"
    python tests/run_tests_distributed.py --num_ranks $NUM_RANKS "$test_file" -v --tb=short --durations=10
done
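
Note: as the usage message above indicates, the script takes the rank count as its only argument; for example, on an 8-GPU node:

bash .github/scripts/run_tests.sh 8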
83 changes: 83 additions & 0 deletions .github/workflows/iris-external-validation-test.yml
@@ -0,0 +1,83 @@
name: Iris External Validation Test

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
  workflow_dispatch:

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

jobs:
  build-apptainer-image:
    runs-on: [self-hosted, mi3008x]
    timeout-minutes: 90

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Setup Apptainer
        run: |
          apt-get update && apt-get install -y software-properties-common
          add-apt-repository -y ppa:apptainer/ppa
          apt-get update && apt-get install -y apptainer

      - name: Build Iris Apptainer container
        run: |
          # Create persistent Apptainer directory
          mkdir -p ~/apptainer

          # Build Apptainer image from definition file (only if it doesn't exist)
          if [ ! -f ~/apptainer/iris-dev.sif ]; then
            echo "Building new Apptainer image..."
            apptainer build ~/apptainer/iris-dev.sif apptainer/iris.def
          else
            echo "Using existing Apptainer image"
          fi

  external-validation-test:
    name: External Validation Test
    needs: build-apptainer-image
    runs-on: [self-hosted, mi3008x]
    timeout-minutes: 30

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Cleanup lingering ports before tests
        run: |
          bash .github/scripts/cleanup_ports.sh

      - name: Run External Validation Test with Apptainer
        run: |
          set -e

          # Create unique overlay image for isolation
          OVERLAY="/tmp/iris_overlay_$(whoami)_external_$(date +%s%N).img"

          echo "::group::Creating overlay image"
          apptainer overlay create --size 1024 --create-dir /var/cache/iris "${OVERLAY}"
          echo "::endgroup::"

          echo "::group::Running external validation test"
          apptainer exec --overlay "${OVERLAY}" --no-home --cleanenv \
            --bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
            ~/apptainer/iris-dev.sif bash -c "
              set -e
              pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }}
              wget -O test_iris_distributed.py https://gist.githubusercontent.com/mawad-amd/6375dc078e39e256828f379e03310ec7/raw/a527c3192bee4615292769e340b1c73676f6945a/test_iris_distributed.py
              python test_iris_distributed.py
            "
          echo "::endgroup::"

          # Cleanup overlay image
          echo "::group::Cleaning up overlay image"
          rm -f "${OVERLAY}"
          echo "::endgroup::"

          echo "✅ External validation test passed!"
86 changes: 86 additions & 0 deletions .github/workflows/iris-performance-regression-test.yml
@@ -0,0 +1,86 @@
name: Iris Performance Regression Test

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
  workflow_dispatch:

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

jobs:
  build-apptainer-image:
    runs-on: [self-hosted, mi3008x]
    timeout-minutes: 20

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Setup Apptainer
        run: |
          apt-get update && apt-get install -y software-properties-common
          add-apt-repository -y ppa:apptainer/ppa
          apt-get update && apt-get install -y apptainer

      - name: Build Iris Apptainer container
        run: |
          # Create persistent Apptainer directory
          mkdir -p ~/apptainer

          # Build Apptainer image from definition file (only if it doesn't exist)
          if [ ! -f ~/apptainer/iris-dev.sif ]; then
            echo "Building new Apptainer image..."
            apptainer build ~/apptainer/iris-dev.sif apptainer/iris.def
          else
            echo "Using existing Apptainer image"
          fi

  performance-test:
    name: ${{ matrix.example_name }}
    needs: build-apptainer-image
    runs-on: [self-hosted, mi3008x]
    timeout-minutes: 30
    strategy:
      fail-fast: false
      matrix:
        # Performance baselines measured on AMD Instinct MI325X (8 GPUs)
        include:
          - example_name: "GEMM All-Scatter WG Specialization"
            example_path: "10_gemm_all_scatter_wg_specialization"
            tflops_threshold: 1600 # Actual: ~2182 TFLOPs
            benchmark_args: "-m 16384 -n 16384 -k 16384 --BLK_M 128 --BLK_N 128 --BLK_K 64 --gsize_m 6 --gemm_sms 256"

          - example_name: "GEMM All-Scatter"
            example_path: "07_gemm_all_scatter"
            tflops_threshold: 1000 # Actual: ~1407 TFLOPs
            benchmark_args: "-m 16384 -n 16384 -k 16384 --BLK_M 256 --BLK_N 64 --BLK_K 64 --gsize_m 6 --gemm_sms 256"

          - example_name: "GEMM All-Scatter Producer-Consumer"
            example_path: "11_gemm_all_scatter_producer_consumer"
            tflops_threshold: 1600 # Actual: ~2190 TFLOPs
            benchmark_args: "-m 16384 -n 16384 -k 16384 --BLK_M 128 --BLK_N 128 --BLK_K 64 --gsize_m 6 --gemm_sms 256 --comm_sms 48"

          - example_name: "GEMM All-Scatter Bulk Synchronous"
            example_path: "12_gemm_all_scatter_bulk_synchronous"
            tflops_threshold: 900 # Actual: ~1262 TFLOPs
            benchmark_args: "-m 16384 -n 16384 -k 16384 --BLK_M 128 --BLK_N 128 --BLK_K 64 --gsize_m 6 --gemm_sms 256"

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Cleanup lingering ports before tests
        run: |
          bash .github/scripts/cleanup_ports.sh

      - name: Run ${{ matrix.example_name }} Benchmark (8 ranks)
        run: |
          bash .github/scripts/run_perf_benchmark.sh \
            "${{ matrix.example_path }}" \
            "${{ matrix.tflops_threshold }}" \
            ${{ matrix.benchmark_args }}
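
Note: the "GEMM All-Scatter" matrix entry, for instance, expands to the following call into the benchmark script above (run on the 8-GPU host with the prebuilt container image):

bash .github/scripts/run_perf_benchmark.sh \
    "07_gemm_all_scatter" \
    "1000" \
    -m 16384 -n 16384 -k 16384 --BLK_M 256 --BLK_N 64 --BLK_K 64 --gsize_m 6 --gemm_sms 256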
