9 changes: 7 additions & 2 deletions fastdeploy/model_executor/layers/embeddings.py
@@ -164,7 +164,9 @@ def __init__(
),
)
if self.world_size > 1:
set_weight_attrs(self.embeddings.weight, {"output_dim": False, "weight_loader": self.weight_loader})
set_weight_attrs(self.embeddings.weight, {"output_dim": False})
if num_embeddings % self.world_size != 0:
set_weight_attrs(self.embeddings.weight, {"weight_loader", self.weight_loader})
else:
# column cut embedding
self.embeddings = nn.Embedding(
@@ -236,6 +238,9 @@ def weight_loader(self, param, loaded_weight, shard_id=None):
output_dim = getattr(param, "output_dim", None)
packed_dim = getattr(param, "packed_dim", None)

if not param._is_initialized():
param.initialize()

loaded_weight = get_tensor(loaded_weight)
if param.dtype != loaded_weight.dtype:
if loaded_weight.dtype == paddle.int8 and param.dtype == paddle.float8_e4m3fn:
@@ -247,7 +252,7 @@ def weight_loader(self, param, loaded_weight, shard_id=None):
assert (
param.shape == loaded_weight.shape
), f"Shape mismatch: param {param.shape} vs loaded_weight {loaded_weight.shape}"
param.set_value(loaded_weight)
param.copy_(loaded_weight, False)
return

start_idx = self.shard_indices.org_vocab_start_index
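The embeddings.py changes do two things: under tensor parallelism the custom weight_loader is now attached only when num_embeddings does not split evenly across ranks, and the loader materialises lazily created parameters before copying into them with copy_ instead of set_value. Below is a minimal, self-contained sketch of that behaviour; set_weight_attrs and default_loader are illustrative stand-ins for the real FastDeploy helpers, and the sizes are made up for the example.

# A minimal sketch of the intended behaviour, not the FastDeploy implementation:
# `set_weight_attrs` and `default_loader` are illustrative stand-ins, and the
# sizes below are made up for the example.
import paddle
from paddle import nn


def set_weight_attrs(param, attrs):
    # Attach loader metadata directly onto the parameter object.
    for key, value in attrs.items():
        setattr(param, key, value)


def default_loader(param, loaded_weight):
    # Lazily created parameters must be materialised before copying into them.
    if not param._is_initialized():
        param.initialize()
    assert param.shape == loaded_weight.shape, (
        f"Shape mismatch: param {param.shape} vs loaded_weight {loaded_weight.shape}"
    )
    # In-place copy rather than set_value, mirroring the change above.
    param.copy_(loaded_weight, False)


num_embeddings, hidden_size, world_size = 103, 8, 4
embeddings = nn.Embedding(num_embeddings, hidden_size)

if world_size > 1:
    set_weight_attrs(embeddings.weight, {"output_dim": False})
    # The custom loader is only needed when the vocabulary does not split
    # evenly across tensor-parallel ranks.
    if num_embeddings % world_size != 0:
        set_weight_attrs(embeddings.weight, {"weight_loader": default_loader})

# Example load; real weights would come from a checkpoint shard.
loader = getattr(embeddings.weight, "weight_loader", default_loader)
loader(embeddings.weight, paddle.randn([num_embeddings, hidden_size]))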
3 changes: 2 additions & 1 deletion fastdeploy/model_executor/models/adapters.py
@@ -22,7 +22,6 @@

from fastdeploy.config import ModelConfig
from fastdeploy.model_executor.layers.activation import get_act_fn
from fastdeploy.model_executor.models.interfaces_base import is_pooling_model
from fastdeploy.transformer_utils.config import get_hf_file_to_dict

_T = TypeVar("_T", bound=type[nn.Layer])
@@ -191,6 +190,8 @@ def as_embedding_model(cls: _T) -> _T:
please implement your own model if this is not the case.
"""
# Avoid modifying existing embedding models
from fastdeploy.model_executor.models.interfaces_base import is_pooling_model

if is_pooling_model(cls):
return cls

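In adapters.py the module-level import of is_pooling_model is moved into the body of as_embedding_model. The diff does not say why, but this is the usual way to break a circular import or defer a dependency until first use; a sketch of the pattern, with the motivation explicitly an assumption:

# Illustrative pattern only; the circular-import motivation is an assumption,
# not something the diff states explicitly.
def as_embedding_model(cls):
    # Deferred import: the dependency is resolved on first call, so the module
    # import order between adapters.py and interfaces_base.py no longer matters.
    from fastdeploy.model_executor.models.interfaces_base import is_pooling_model

    # Avoid modifying models that are already pooling/embedding models.
    if is_pooling_model(cls):
        return cls
    # ... otherwise wrap `cls` with embedding-specific behaviour ...
    return cls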
3 changes: 2 additions & 1 deletion fastdeploy/worker/gpu_model_runner.py
@@ -1321,6 +1321,7 @@ def _dummy_run(

logits = None
if hasattr(self.model, "is_pooling_model") and self.model.is_pooling_model:
# TODO(lizexu123) Preheating (warm-up) for the pooling function has not been implemented yet.
pass
else:
# 4. Execute spec decode
@@ -1632,9 +1633,9 @@ class at the server level, which is too granular for ModelRunner.
logits = None
# 4. Compute logits, Sample
if hasattr(self.model, "is_pooling_model") and self.model.is_pooling_model:
# TODO(lizexu123) The execution of the pooling function has not been implemented yet.
pass
else:
# 4. Execute spec decode
logits = self.model.compute_logits(hidden_states)

if not self.speculative_decoding:
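Both gpu_model_runner.py hunks add the same guard: models that advertise is_pooling_model bypass compute_logits (and therefore sampling), leaving logits as None. A self-contained sketch of how that guard behaves, using hypothetical stand-in model classes rather than FastDeploy's own:

# Sketch only: `GenerativeModel` and `PoolingModel` are hypothetical stand-ins
# used to show how the `is_pooling_model` guard behaves, not FastDeploy classes.
class GenerativeModel:
    def compute_logits(self, hidden_states):
        return [2 * h for h in hidden_states]  # placeholder logits


class PoolingModel:
    is_pooling_model = True  # pooling models expose this flag


def maybe_compute_logits(model, hidden_states):
    logits = None
    if hasattr(model, "is_pooling_model") and model.is_pooling_model:
        # Pooling path: logits/sampling are skipped; per the TODOs above,
        # warm-up and execution for pooling are still unimplemented.
        pass
    else:
        logits = model.compute_logits(hidden_states)
    return logits


print(maybe_compute_logits(GenerativeModel(), [1, 2, 3]))  # [2, 4, 6]
print(maybe_compute_logits(PoolingModel(), [1, 2, 3]))     # None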