-
Notifications
You must be signed in to change notification settings - Fork 743
[FDConfig] Enable distributed communication environment variables by default #7746
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
dbb1c5c
cc5a7b5
c5e6c14
5275248
217c1f4
4b240e3
9006b16
962c426
0bbd051
38a870e
dfe3762
5d11e06
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -848,3 +848,13 @@ def cleanup(self): | |
| """ | ||
| if self.manager is not None and self.is_server: | ||
| self.manager.shutdown() | ||
|
|
||
This comment was marked as outdated.
Sorry, something went wrong. |
||
def is_broken(self):
    """
    Report whether the engine worker queue manager can no longer be reached.

    The check is a single `self.manager.connect()` attempt.

    Returns:
        bool: True when the connect attempt fails with a connection-level
        error (`OSError` and its subclasses, or `EOFError`); False when the
        connect succeeds or when an unrelated, unexpected error is raised.
    """
    try:
        self.manager.connect()
        return False
    except (EOFError, OSError):
        # ConnectionRefusedError, ConnectionResetError and BrokenPipeError are
        # all subclasses of OSError, so this pair covers the same errors as
        # listing each of them explicitly.
        llm_logger.error("Failed to connect to engine worker queue")
        return True
    except Exception as e:
        # Unexpected failures are not treated as a broken connection, but they
        # are logged so that they do not disappear silently.
        llm_logger.warning(f"Unexpected error in is_broken check: {e}")
        return False
This comment was marked as outdated.
Sorry, something went wrong. |
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,7 +16,6 @@ | |
| import queue | ||
| import shutil | ||
| import signal | ||
| import socket | ||
| import subprocess | ||
| import sys | ||
| import time | ||
|
|
@@ -30,6 +29,7 @@ | |
| sys.path.insert(0, project_root) | ||
|
|
||
| from ci_use.EB_Lite_with_adapter.zmq_client import LLMControlClient, LLMReqClient | ||
| from e2e.utils.serving_utils import clean_ports, is_port_open | ||
|
|
||
| env = os.environ.copy() | ||
|
|
||
|
|
@@ -79,88 +79,6 @@ def zmq_control_client(): | |
| return client | ||
|
|
||
|
|
||
def is_port_open(host: str, port: int, timeout=1.0):
    """
    Probe a TCP endpoint with a single connection attempt.

    Args:
        host: Host name or address to connect to.
        port: TCP port to probe.
        timeout: Connection timeout passed to `socket.create_connection`.

    Returns:
        True if the connection could be established (the probe socket is
        closed again right away), False if any exception was raised.
    """
    reachable = False
    try:
        with socket.create_connection((host, port), timeout):
            reachable = True
    except Exception:
        reachable = False
    return reachable
|
|
||
|
|
||
def _capture_stdout(argv):
    """Run `argv` without a shell and return its stdout text ('' if the tool is missing)."""
    try:
        completed = subprocess.run(argv, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
    except FileNotFoundError:
        return ""
    return completed.stdout.decode(errors="replace")


def kill_process_on_port(port: int):
    """
    Kill processes that are using the given port.

    Three independent best-effort methods are tried in order (lsof, netstat,
    fuser); a missing tool or an unkillable process never aborts the cleanup.
    The current process and its parent are skipped by the lsof and netstat
    methods.

    Args:
        port: TCP port number whose owning processes should be terminated.
    """
    protected = (os.getpid(), os.getppid())

    # Method 1: Use lsof to find processes
    for line in _capture_stdout(["lsof", f"-i:{port}", "-t"]).splitlines():
        line = line.strip()
        if not line.isdigit():
            continue
        pid = int(line)
        if pid in protected:
            print(f"Skip killing current process (pid={pid}) on port {port}")
            continue
        try:
            # First try SIGTERM for graceful shutdown
            os.kill(pid, signal.SIGTERM)
            time.sleep(1)
            # Then SIGKILL if still running
            os.kill(pid, signal.SIGKILL)
            print(f"Killed process on port {port}, pid={pid}")
        except ProcessLookupError:
            pass  # Process already terminated
        except PermissionError as exc:
            # Not ours to kill; keep going so the remaining methods still run.
            print(f"No permission to kill pid={pid} on port {port}: {exc}")

    # Method 2: Use netstat as backup. The rows are parsed here instead of
    # being piped through `grep :{port}`, because a substring match would also
    # hit other ports that merely start with the same digits (8313 vs 83130).
    wanted = str(port)
    for row in _capture_stdout(["netstat", "-tulpn"]).splitlines():
        fields = row.split()
        if len(fields) < 7:
            continue
        # fields[3] is the local address ("host:port"); compare the port exactly.
        if fields[3].rpartition(":")[2] != wanted:
            continue
        # fields[6] is "PID/Program name".
        pid_text = fields[6].split("/", 1)[0]
        if not pid_text.isdigit():
            continue
        pid = int(pid_text)
        if pid in protected:
            continue
        try:
            os.kill(pid, signal.SIGKILL)
            print(f"Killed process (netstat) on port {port}, pid={pid}")
        except ProcessLookupError:
            pass
        except PermissionError as exc:
            print(f"No permission to kill pid={pid} on port {port}: {exc}")

    # Method 3: Use fuser if available
    # NOTE(review): this step does not apply the current/parent pid skip used above.
    try:
        subprocess.run(["fuser", "-k", f"{port}/tcp"], timeout=5)
    except (subprocess.TimeoutExpired, subprocess.CalledProcessError, FileNotFoundError):
        pass
|
|
||
|
|
||
def clean_ports():
    """
    Free every port listed in PORTS_TO_CLEAN.

    Runs one kill pass over all ports, waits two seconds, then re-probes each
    port on 127.0.0.1 and repeats the kill (followed by a one second pause)
    for any port that still accepts connections.
    """
    print(f"Cleaning ports: {PORTS_TO_CLEAN}")
    for target in PORTS_TO_CLEAN:
        kill_process_on_port(target)

    # Double check and retry if ports are still in use
    time.sleep(2)
    for target in PORTS_TO_CLEAN:
        if not is_port_open("127.0.0.1", target, timeout=0.1):
            continue
        print(f"Port {target} still in use, retrying cleanup...")
        kill_process_on_port(target)
        time.sleep(1)
|
|
||
|
|
||
| @pytest.fixture(scope="session", autouse=True) | ||
| def setup_and_run_server(): | ||
| """ | ||
|
|
@@ -170,8 +88,15 @@ def setup_and_run_server(): | |
| - Waits for server port to open (up to 30 seconds) | ||
| - Tears down server after all tests finish | ||
| """ | ||
| # 清理/dev/shm中的临时文件 | ||
| try: | ||
| subprocess.run("rm -rf /dev/shm/*", shell=True) | ||
This comment was marked as outdated.
Sorry, something went wrong.
This comment was marked as outdated.
Sorry, something went wrong. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ❓ 疑问 建议改为只清理本框架生成的 socket 文件: rm -f /dev/shm/fd_task_queue_*.sock |
||
| print("Successfully cleaned up /dev/shm.") | ||
| except Exception as e: | ||
| print(f"Failed to cleanup /dev/shm: {e}") | ||
|
|
||
| print("Pre-test port cleanup...") | ||
| clean_ports() | ||
| clean_ports(PORTS_TO_CLEAN) | ||
|
|
||
| base_path = os.getenv("MODEL_PATH") | ||
| if base_path: | ||
|
|
@@ -236,7 +161,7 @@ def setup_and_run_server(): | |
| print("\n===== Post-test server cleanup... =====") | ||
| try: | ||
| os.killpg(process.pid, signal.SIGTERM) | ||
| clean_ports() | ||
| clean_ports(PORTS_TO_CLEAN) | ||
| print(f"API server (pid={process.pid}) terminated") | ||
| except Exception as e: | ||
| print(f"Failed to terminate API server: {e}") | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -13,31 +13,25 @@ | |
| # limitations under the License. | ||
|
|
||
| import os | ||
| import signal | ||
| import socket | ||
| import subprocess | ||
| import sys | ||
| import time | ||
| import traceback | ||
|
|
||
| import pytest | ||
|
|
||
| from fastdeploy import LLM, SamplingParams | ||
|
|
||
| FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8313)) | ||
| FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8333)) | ||
| MAX_WAIT_SECONDS = 60 | ||
|
|
||
| current_dir = os.path.dirname(os.path.abspath(__file__)) | ||
| project_root = os.path.abspath(os.path.join(current_dir, "..", "..")) | ||
| sys.path.insert(0, project_root) | ||
| from e2e.utils.serving_utils import ( | ||
| FD_API_PORT, | ||
| FD_CACHE_QUEUE_PORT, | ||
| FD_ENGINE_QUEUE_PORT, | ||
| clean_ports, | ||
| ) | ||
|
|
||
def is_port_open(host: str, port: int, timeout=1.0):
    """
    Tell whether something accepts TCP connections on `host:port`.

    A single connection is attempted with the given timeout and closed
    immediately. Any exception raised along the way is reported as False.
    """
    endpoint = (host, port)
    try:
        with socket.create_connection(endpoint, timeout):
            pass
    except Exception:
        return False
    return True
| MAX_WAIT_SECONDS = 60 | ||
|
|
||
|
|
||
| def format_chat_prompt(messages): | ||
|
|
@@ -74,35 +68,23 @@ def llm(model_path): | |
| """ | ||
| Fixture to initialize the LLM model with a given model path | ||
| """ | ||
| try: | ||
| output = subprocess.check_output(f"lsof -i:{FD_ENGINE_QUEUE_PORT} -t", shell=True).decode().strip() | ||
| for pid in output.splitlines(): | ||
| os.kill(int(pid), signal.SIGKILL) | ||
| print(f"Killed process on port {FD_ENGINE_QUEUE_PORT}, pid={pid}") | ||
| except subprocess.CalledProcessError: | ||
| pass | ||
| # Clean ports before starting the test | ||
This comment was marked as outdated.
Sorry, something went wrong. |
||
| clean_ports() | ||
|
|
||
| try: | ||
| start = time.time() | ||
| llm = LLM( | ||
| model=model_path, | ||
| tensor_parallel_size=1, | ||
| port=FD_API_PORT, | ||
| engine_worker_queue_port=FD_ENGINE_QUEUE_PORT, | ||
| cache_queue_port=FD_CACHE_QUEUE_PORT, | ||
| max_model_len=32768, | ||
| quantization="wint8", | ||
| logits_processors=["LogitBiasLogitsProcessor"], | ||
| ) | ||
|
|
||
| # Wait for the port to be open | ||
| wait_start = time.time() | ||
| while not is_port_open("127.0.0.1", FD_ENGINE_QUEUE_PORT): | ||
| if time.time() - wait_start > MAX_WAIT_SECONDS: | ||
| pytest.fail( | ||
| f"Model engine did not start within {MAX_WAIT_SECONDS} seconds on port {FD_ENGINE_QUEUE_PORT}" | ||
| ) | ||
| time.sleep(1) | ||
|
|
||
| time.sleep(2) | ||
This comment was marked as outdated.
Sorry, something went wrong.
This comment was marked as outdated.
Sorry, something went wrong. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🟡 建议 原逻辑轮询 `is_port_open("127.0.0.1", FD_ENGINE_QUEUE_PORT)` 直到端口就绪(最长等待 `MAX_WAIT_SECONDS` 秒),现在改为固定的 `time.sleep(2)`。建议改为轮询 SHM socket 文件或设合理上限重试(如 30s),以保持与原逻辑等价的健壮性。 |
||
| print(f"Model loaded successfully from {model_path} in {time.time() - start:.2f}s.") | ||
| yield llm | ||
| except Exception: | ||
|
|
||
This comment was marked as outdated.
Sorry, something went wrong.
Uh oh!
There was an error while loading. Please reload this page.