Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions fastdeploy/model_executor/layers/linear.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ def __init__(
self.with_bias = with_bias
self.add_bias = add_bias
self.prefix = prefix
+ self.is_quantized = fd_config.model_config.is_quantized
# key
if weight_key:
self.weight_key = f"{prefix}.{weight_key}"
Expand Down
5 changes: 4 additions & 1 deletion fastdeploy/model_executor/layers/quantization/weight_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

import paddle
from paddle.nn.quant import weight_quantize
+ from paddleformers.utils.log import logger

from fastdeploy import envs
from fastdeploy.model_executor.layers.linear import (
Expand Down Expand Up @@ -159,9 +160,11 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
if (
_ENABLE_MACHETE
and envs.FD_USE_MACHETE == "1"
+ and not layer.is_quantized
and layer.weight_shape[1]
and layer.weight_shape[1] % 128 == 0
):
+ logger.info("Using Machete kernel for WeightOnlyLinearMethod")
return MacheteWeightOnlyLinearMethod(self)
return GPUWeightOnlyLinearMethod(self)

Expand Down Expand Up @@ -399,7 +402,7 @@ def __init__(
super().__init__(quant_config)

def process_prequanted_weights(self, layer, state_dict) -> None:
- pass
+ raise NotImplementedError("Machete kernel doesn't support prequant. Please set FD_USE_MACHETE to 0.")

def process_loaded_weights(self, layer, weight) -> None:
from fastdeploy.model_executor.layers.quantization.ops import (
Expand Down
Loading